## Framework

- Data cleaning and formatting 
- Exploratory data analysis
- Feature engineering and selection
- Compare several machine learning models on a performance metric
- Perform hyper-parameter tuning on the best model 
- Evaluate the best model on the testing set
- Interpret the model results
- Draw conclusions and document work

In [1]:
#load packages

#lm pacakges
from sklearn import tree 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from statsmodels.nonparametric.smoothers_lowess import lowess
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.preprocessing import normalize, scale, Normalizer, StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR, LinearSVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.dummy import DummyClassifier

#other 
import numpy as np
import pandas as pd
import pickle 
import graphviz
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import pandas_profiling
pd.set_option("display.max_colwidth", 200)

import altair as alt
import time

import autotime

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
#ignore warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)

In [3]:
df1 = pd.read_csv('data/01_oscar_data.csv')
df2 = pd.read_csv('data/02_oscar_data.csv')

In [4]:
#df1.profile_report(style={'full_width':True})

- remove all the categorical for wins and nominations so model only is concern with # of nominations and wins 
    - might have to return to this later

In [5]:
df1.head(1)

Unnamed: 0,year,movie,movie_id,certificate,duration,genre,rate,metascore,synopsis,votes,gross,release_date,user_reviews,critic_reviews,popularity,awards_wins,awards_nominations,Oscar_Best_Picture_won,Oscar_Best_Picture_nominated,Oscar_Best_Director_won,Oscar_Best_Director_nominated,Oscar_Best_Actor_won,Oscar_Best_Actor_nominated,Oscar_Best_Actress_won,Oscar_Best_Actress_nominated,Oscar_Best_Supporting_Actor_won,Oscar_Best_Supporting_Actor_nominated,Oscar_Best_Supporting_Actress_won,Oscar_Best_Supporting_Actress_nominated,Oscar_Best_AdaScreen_won,Oscar_Best_AdaScreen_nominated,Oscar_Best_OriScreen_won,Oscar_Best_OriScreen_nominated,Oscar_nominated,Oscar_nominated_categories,Golden_Globes_won,Golden_Globes_won_categories,Golden_Globes_nominated,Golden_Globes_nominated_categories,BAFTA_won,BAFTA_won_categories,BAFTA_nominated,BAFTA_nominated_categories,Screen_Actors_Guild_won,Screen_Actors_Guild_won_categories,Screen_Actors_Guild_nominated,Screen_Actors_Guild_nominated_categories,Critics_Choice_won,Critics_Choice_won_categories,Critics_Choice_nominated,Critics_Choice_nominated_categories,Directors_Guild_won,Directors_Guild_won_categories,Directors_Guild_nominated,Directors_Guild_nominated_categories,Producers_Guild_won,Producers_Guild_won_categories,Producers_Guild_nominated,Producers_Guild_nominated_categories,Art_Directors_Guild_won,Art_Directors_Guild_won_categories,Art_Directors_Guild_nominated,Art_Directors_Guild_nominated_categories,Writers_Guild_won,Writers_Guild_won_categories,Writers_Guild_nominated,Writers_Guild_nominated_categories,Costume_Designers_Guild_won,Costume_Designers_Guild_won_categories,Costume_Designers_Guild_nominated,Costume_Designers_Guild_nominated_categories,Online_Film_Television_Association_won,Online_Film_Television_Association_won_categories,Online_Film_Television_Association_nominated,Online_Film_Television_Association_nominated_categories,Online_Film_Critics_Society_won,Online_Film_Critics_Society_won_categories,Online_Film_Critics_Society_nominated,Online_Film_Critics_Society_nominated_categories,People_Choice_won,People_Choice_won_categories,People_Choice_nominated,People_Choice_nominated_categories,London_Critics_Circle_Film_won,London_Critics_Circle_Film_won_categories,London_Critics_Circle_Film_nominated,London_Critics_Circle_Film_nominated_categories,American_Cinema_Editors_won,American_Cinema_Editors_won_categories,American_Cinema_Editors_nominated,American_Cinema_Editors_nominated_categories,Hollywood_Film_won,Hollywood_Film_won_categories,Hollywood_Film_nominated,Hollywood_Film_nominated_categories,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_won_categories,Austin_Film_Critics_Association_nominated,Austin_Film_Critics_Association_nominated_categories,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_won_categories,Denver_Film_Critics_Society_nominated,Denver_Film_Critics_Society_nominated_categories,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_won_categories,Boston_Society_of_Film_Critics_nominated,Boston_Society_of_Film_Critics_nominated_categories,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_won_categories,New_York_Film_Critics_Circle_nominated,New_York_Film_Critics_Circle_nominated_categories,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_won_categories,Los_Angeles_Film_Critics_Association_nominated,Los_Angeles_Film_Critics_Association_nominated_categories,release_date.year,release_date.month,release_date.day-of-month,release_date.day-of-week
0,2001,Kate & Leopold,tt0035423,PG-13,118,Comedy|Fantasy|Romance,6.4,44.0,An English Duke from 1876 is inadvertedly dragged to modern day New York where he falls for a plucky advertising executive.,66660,47100000.0,2001-12-25,318.0,125.0,2363.0,1,4,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,1,"Best Music, Original Song",1,Best Original Song - Motion Picture,2,Best Original Song - Motion Picture|Best Performance by an Actor in a Motion Picture - Comedy or Musical,0,,0,,0,,0,,0,,1,Best Song,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,1,"Best Music, Original Song",0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,2001.0,12.0,25.0,2.0


### Fill best picture nomination 2018

In [6]:
df1.query("movie == 'Roma'")

Unnamed: 0,year,movie,movie_id,certificate,duration,genre,rate,metascore,synopsis,votes,gross,release_date,user_reviews,critic_reviews,popularity,awards_wins,awards_nominations,Oscar_Best_Picture_won,Oscar_Best_Picture_nominated,Oscar_Best_Director_won,Oscar_Best_Director_nominated,Oscar_Best_Actor_won,Oscar_Best_Actor_nominated,Oscar_Best_Actress_won,Oscar_Best_Actress_nominated,Oscar_Best_Supporting_Actor_won,Oscar_Best_Supporting_Actor_nominated,Oscar_Best_Supporting_Actress_won,Oscar_Best_Supporting_Actress_nominated,Oscar_Best_AdaScreen_won,Oscar_Best_AdaScreen_nominated,Oscar_Best_OriScreen_won,Oscar_Best_OriScreen_nominated,Oscar_nominated,Oscar_nominated_categories,Golden_Globes_won,Golden_Globes_won_categories,Golden_Globes_nominated,Golden_Globes_nominated_categories,BAFTA_won,BAFTA_won_categories,BAFTA_nominated,BAFTA_nominated_categories,Screen_Actors_Guild_won,Screen_Actors_Guild_won_categories,Screen_Actors_Guild_nominated,Screen_Actors_Guild_nominated_categories,Critics_Choice_won,Critics_Choice_won_categories,Critics_Choice_nominated,Critics_Choice_nominated_categories,Directors_Guild_won,Directors_Guild_won_categories,Directors_Guild_nominated,Directors_Guild_nominated_categories,Producers_Guild_won,Producers_Guild_won_categories,Producers_Guild_nominated,Producers_Guild_nominated_categories,Art_Directors_Guild_won,Art_Directors_Guild_won_categories,Art_Directors_Guild_nominated,Art_Directors_Guild_nominated_categories,Writers_Guild_won,Writers_Guild_won_categories,Writers_Guild_nominated,Writers_Guild_nominated_categories,Costume_Designers_Guild_won,Costume_Designers_Guild_won_categories,Costume_Designers_Guild_nominated,Costume_Designers_Guild_nominated_categories,Online_Film_Television_Association_won,Online_Film_Television_Association_won_categories,Online_Film_Television_Association_nominated,Online_Film_Television_Association_nominated_categories,Online_Film_Critics_Society_won,Online_Film_Critics_Society_won_categories,Online_Film_Critics_Society_nominated,Online_Film_Critics_Society_nominated_categories,People_Choice_won,People_Choice_won_categories,People_Choice_nominated,People_Choice_nominated_categories,London_Critics_Circle_Film_won,London_Critics_Circle_Film_won_categories,London_Critics_Circle_Film_nominated,London_Critics_Circle_Film_nominated_categories,American_Cinema_Editors_won,American_Cinema_Editors_won_categories,American_Cinema_Editors_nominated,American_Cinema_Editors_nominated_categories,Hollywood_Film_won,Hollywood_Film_won_categories,Hollywood_Film_nominated,Hollywood_Film_nominated_categories,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_won_categories,Austin_Film_Critics_Association_nominated,Austin_Film_Critics_Association_nominated_categories,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_won_categories,Denver_Film_Critics_Society_nominated,Denver_Film_Critics_Society_nominated_categories,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_won_categories,Boston_Society_of_Film_Critics_nominated,Boston_Society_of_Film_Critics_nominated_categories,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_won_categories,New_York_Film_Critics_Circle_nominated,New_York_Film_Critics_Circle_nominated_categories,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_won_categories,Los_Angeles_Film_Critics_Association_nominated,Los_Angeles_Film_Critics_Association_nominated_categories,release_date.year,release_date.month,release_date.day-of-month,release_date.day-of-week
1202,2018,Roma,tt6155172,R,135,Drama,8.1,96.0,A year in the life of a middle-class family's maid in Mexico City in the early 1970s.,54490,,,497.0,290.0,24.0,29,60,No,Yes,No,Yes,No,No,No,Yes,No,No,No,Yes,No,No,No,Yes,10,Best Foreign Language Film of the Year|Best Achievement in Directing|Best Achievement in Cinematography|Best Motion Picture of the Year|Best Performance by an Actress in a Leading Role|Best Perfor...,2,Best Director - Motion Picture|Best Motion Picture - Foreign Language,3,Best Director - Motion Picture|Best Motion Picture - Foreign Language|Best Screenplay - Motion Picture,4,Best Film|Best Film Not in the English Language|Best Cinematography|David Lean Award for Direction,7,Best Film|Best Film Not in the English Language|Best Cinematography|David Lean Award for Direction|Best Screenplay (Original)|Best Editing|Best Production Design,0,,0,,4,Best Picture|Best Director|Best Foreign Language Film|Best Cinematography,8,Best Picture|Best Director|Best Foreign Language Film|Best Cinematography|Best Actress|Best Original Screenplay|Best Production Design|Best Editing,1,Outstanding Directorial Achievement in Feature Film,1,Outstanding Directorial Achievement in Feature Film,0,,1,Outstanding Producer of Theatrical Motion Pictures,0,,1,Period Film,0,,1,Original Screenplay,0,,0,,2,Best Foreign Language Film|Best Cinematography,9,"Best Foreign Language Film|Best Cinematography|Best Picture|Best Breakthrough Performance: Female|Best Director|Best Writing, Screenplay Written Directly for the Screen|Best Film Editing|Best Soun...",4,Best Picture|Best Director|Best Cinematography|Best Film Not in the English Language,7,Best Picture|Best Director|Best Cinematography|Best Film Not in the English Language|Best Actress|Best Original Screenplay|Best Editing,0,,0,,2,Film of the Year|Director of the Year,5,Film of the Year|Director of the Year|Actress of the Year|Screenwriter of the Year|Foreign Language Film of the Year,0,,1,Best Edited Feature Film - Dramatic,1,New Hollywood Award,1,New Hollywood Award,0,,3,Best Original Screenplay|Best Film|Breakthrough Artist Award,3,Best Picture|Best Director|Best Foreign Language Film,4,Best Picture|Best Director|Best Foreign Language Film|Best Actress,1,Best Cinematography,1,Best Cinematography,3,Best Film|Best Director|Best Cinematographer,3,Best Film|Best Director|Best Cinematographer,2,Best Picture|Best Cinematography,4,Best Picture|Best Cinematography|Best Director|Best Editing,,,,


In [7]:
df1.loc[df1['movie'].str.contains('Roma', case=False), 'release_date'] = "2018-11-21"

In [8]:
df1.loc[df1['movie'].str.contains('Roma', case=False), 'gross'] = 5.206600e+07


In [9]:
df1.loc[df1['movie'].str.contains('Roma', case=False), 'release_date.month'] = 11

In [10]:
df1.query("movie == 'Roma'")

Unnamed: 0,year,movie,movie_id,certificate,duration,genre,rate,metascore,synopsis,votes,gross,release_date,user_reviews,critic_reviews,popularity,awards_wins,awards_nominations,Oscar_Best_Picture_won,Oscar_Best_Picture_nominated,Oscar_Best_Director_won,Oscar_Best_Director_nominated,Oscar_Best_Actor_won,Oscar_Best_Actor_nominated,Oscar_Best_Actress_won,Oscar_Best_Actress_nominated,Oscar_Best_Supporting_Actor_won,Oscar_Best_Supporting_Actor_nominated,Oscar_Best_Supporting_Actress_won,Oscar_Best_Supporting_Actress_nominated,Oscar_Best_AdaScreen_won,Oscar_Best_AdaScreen_nominated,Oscar_Best_OriScreen_won,Oscar_Best_OriScreen_nominated,Oscar_nominated,Oscar_nominated_categories,Golden_Globes_won,Golden_Globes_won_categories,Golden_Globes_nominated,Golden_Globes_nominated_categories,BAFTA_won,BAFTA_won_categories,BAFTA_nominated,BAFTA_nominated_categories,Screen_Actors_Guild_won,Screen_Actors_Guild_won_categories,Screen_Actors_Guild_nominated,Screen_Actors_Guild_nominated_categories,Critics_Choice_won,Critics_Choice_won_categories,Critics_Choice_nominated,Critics_Choice_nominated_categories,Directors_Guild_won,Directors_Guild_won_categories,Directors_Guild_nominated,Directors_Guild_nominated_categories,Producers_Guild_won,Producers_Guild_won_categories,Producers_Guild_nominated,Producers_Guild_nominated_categories,Art_Directors_Guild_won,Art_Directors_Guild_won_categories,Art_Directors_Guild_nominated,Art_Directors_Guild_nominated_categories,Writers_Guild_won,Writers_Guild_won_categories,Writers_Guild_nominated,Writers_Guild_nominated_categories,Costume_Designers_Guild_won,Costume_Designers_Guild_won_categories,Costume_Designers_Guild_nominated,Costume_Designers_Guild_nominated_categories,Online_Film_Television_Association_won,Online_Film_Television_Association_won_categories,Online_Film_Television_Association_nominated,Online_Film_Television_Association_nominated_categories,Online_Film_Critics_Society_won,Online_Film_Critics_Society_won_categories,Online_Film_Critics_Society_nominated,Online_Film_Critics_Society_nominated_categories,People_Choice_won,People_Choice_won_categories,People_Choice_nominated,People_Choice_nominated_categories,London_Critics_Circle_Film_won,London_Critics_Circle_Film_won_categories,London_Critics_Circle_Film_nominated,London_Critics_Circle_Film_nominated_categories,American_Cinema_Editors_won,American_Cinema_Editors_won_categories,American_Cinema_Editors_nominated,American_Cinema_Editors_nominated_categories,Hollywood_Film_won,Hollywood_Film_won_categories,Hollywood_Film_nominated,Hollywood_Film_nominated_categories,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_won_categories,Austin_Film_Critics_Association_nominated,Austin_Film_Critics_Association_nominated_categories,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_won_categories,Denver_Film_Critics_Society_nominated,Denver_Film_Critics_Society_nominated_categories,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_won_categories,Boston_Society_of_Film_Critics_nominated,Boston_Society_of_Film_Critics_nominated_categories,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_won_categories,New_York_Film_Critics_Circle_nominated,New_York_Film_Critics_Circle_nominated_categories,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_won_categories,Los_Angeles_Film_Critics_Association_nominated,Los_Angeles_Film_Critics_Association_nominated_categories,release_date.year,release_date.month,release_date.day-of-month,release_date.day-of-week
1202,2018,Roma,tt6155172,R,135,Drama,8.1,96.0,A year in the life of a middle-class family's maid in Mexico City in the early 1970s.,54490,52066000.0,2018-11-21,497.0,290.0,24.0,29,60,No,Yes,No,Yes,No,No,No,Yes,No,No,No,Yes,No,No,No,Yes,10,Best Foreign Language Film of the Year|Best Achievement in Directing|Best Achievement in Cinematography|Best Motion Picture of the Year|Best Performance by an Actress in a Leading Role|Best Perfor...,2,Best Director - Motion Picture|Best Motion Picture - Foreign Language,3,Best Director - Motion Picture|Best Motion Picture - Foreign Language|Best Screenplay - Motion Picture,4,Best Film|Best Film Not in the English Language|Best Cinematography|David Lean Award for Direction,7,Best Film|Best Film Not in the English Language|Best Cinematography|David Lean Award for Direction|Best Screenplay (Original)|Best Editing|Best Production Design,0,,0,,4,Best Picture|Best Director|Best Foreign Language Film|Best Cinematography,8,Best Picture|Best Director|Best Foreign Language Film|Best Cinematography|Best Actress|Best Original Screenplay|Best Production Design|Best Editing,1,Outstanding Directorial Achievement in Feature Film,1,Outstanding Directorial Achievement in Feature Film,0,,1,Outstanding Producer of Theatrical Motion Pictures,0,,1,Period Film,0,,1,Original Screenplay,0,,0,,2,Best Foreign Language Film|Best Cinematography,9,"Best Foreign Language Film|Best Cinematography|Best Picture|Best Breakthrough Performance: Female|Best Director|Best Writing, Screenplay Written Directly for the Screen|Best Film Editing|Best Soun...",4,Best Picture|Best Director|Best Cinematography|Best Film Not in the English Language,7,Best Picture|Best Director|Best Cinematography|Best Film Not in the English Language|Best Actress|Best Original Screenplay|Best Editing,0,,0,,2,Film of the Year|Director of the Year,5,Film of the Year|Director of the Year|Actress of the Year|Screenwriter of the Year|Foreign Language Film of the Year,0,,1,Best Edited Feature Film - Dramatic,1,New Hollywood Award,1,New Hollywood Award,0,,3,Best Original Screenplay|Best Film|Breakthrough Artist Award,3,Best Picture|Best Director|Best Foreign Language Film,4,Best Picture|Best Director|Best Foreign Language Film|Best Actress,1,Best Cinematography,1,Best Cinematography,3,Best Film|Best Director|Best Cinematographer,3,Best Film|Best Director|Best Cinematographer,2,Best Picture|Best Cinematography,4,Best Picture|Best Cinematography|Best Director|Best Editing,,11.0,,


# Data cleaning and formatting and Exploratory data analysis

In [11]:
df1_temp = df1.drop(['movie_id', 'release_date', 'synopsis', 'release_date.year','release_date.month', 'release_date.day-of-month', 'release_date.day-of-week', 'American_Cinema_Editors_nominated_categories', 'American_Cinema_Editors_won_categories', 'Art_Directors_Guild_nominated_categories', 'Art_Directors_Guild_won_categories', 'Austin_Film_Critics_Association_nominated_categories', 'Austin_Film_Critics_Association_won_categories', 'BAFTA_nominated_categories', 'BAFTA_won_categories', 'Boston_Society_of_Film_Critics_nominated_categories', 'Boston_Society_of_Film_Critics_won_categories', 'Costume_Designers_Guild_nominated_categories', 'Costume_Designers_Guild_won_categories', 'Critics_Choice_nominated_categories','Critics_Choice_won_categories', 'Denver_Film_Critics_Society_nominated_categories', 'Denver_Film_Critics_Society_won_categories', 'Directors_Guild_nominated_categories', 'Directors_Guild_won_categories', 'Golden_Globes_nominated_categories', 'Golden_Globes_won_categories', 'Hollywood_Film_nominated_categories', 'Hollywood_Film_won_categories', 'London_Critics_Circle_Film_nominated_categories', 'London_Critics_Circle_Film_won_categories', 'Los_Angeles_Film_Critics_Association_nominated_categories', 'Los_Angeles_Film_Critics_Association_won_categories', 'New_York_Film_Critics_Circle_nominated_categories', 'New_York_Film_Critics_Circle_won_categories', 'Online_Film_Critics_Society_nominated_categories', 'Online_Film_Critics_Society_won_categories', 'Online_Film_Television_Association_nominated_categories', 'Online_Film_Television_Association_won_categories', 'Oscar_nominated_categories', 'People_Choice_nominated_categories', 'People_Choice_won_categories', 'Producers_Guild_nominated_categories', 'Screen_Actors_Guild_nominated_categories', 'Screen_Actors_Guild_won_categories', 'Writers_Guild_nominated_categories', 'Writers_Guild_won_categories', 'Producers_Guild_won_categories'], axis = 1)

In [12]:
#df1_temp.profile_report(style={'full_width':True})

- Remove Hollywood_Film_won and Hollywood_Film_nominated do to high correlation 
- Can keep Online_Film_Television_Association_nominated and awards_nominations because it does not look like they are related
- give gross and popularity missing values the mediumn value
- remove the missing metascore, certificate, and release_date.month values 
- Deal with the categorical nature of genre using MultiLabelBinarizer

In [13]:
df1_temp = df1_temp.drop(['Hollywood_Film_won', 'Hollywood_Film_nominated'], axis = 1)

In [14]:
df1_temp = df1_temp.dropna(subset=['metascore', 'certificate'])

In [15]:
df1_temp['gross'].fillna((df1_temp['gross'].median()), inplace=True)
df1_temp['popularity'].fillna((df1_temp['popularity'].median()), inplace=True)

In [16]:
df1_temp['genre'] = [x.split('|') for x in df1_temp['genre']]

In [17]:
mlb = MultiLabelBinarizer()
df1_temp = df1_temp.join(pd.DataFrame(mlb.fit_transform(df1_temp.pop('genre')),
                          columns=mlb.classes_,
                          index=df1_temp.index))

In [18]:
# df1_temp.profile_report(style={'full_width':True})

- Fix the histor category history 

In [19]:
df1_temp = df1_temp.reset_index()

In [20]:
df1_temp.query("Histor == 1")

Unnamed: 0,index,year,movie,certificate,duration,rate,metascore,votes,gross,user_reviews,critic_reviews,popularity,awards_wins,awards_nominations,Oscar_Best_Picture_won,Oscar_Best_Picture_nominated,Oscar_Best_Director_won,Oscar_Best_Director_nominated,Oscar_Best_Actor_won,Oscar_Best_Actor_nominated,Oscar_Best_Actress_won,Oscar_Best_Actress_nominated,Oscar_Best_Supporting_Actor_won,Oscar_Best_Supporting_Actor_nominated,Oscar_Best_Supporting_Actress_won,Oscar_Best_Supporting_Actress_nominated,Oscar_Best_AdaScreen_won,Oscar_Best_AdaScreen_nominated,Oscar_Best_OriScreen_won,Oscar_Best_OriScreen_nominated,Oscar_nominated,Golden_Globes_won,Golden_Globes_nominated,BAFTA_won,BAFTA_nominated,Screen_Actors_Guild_won,Screen_Actors_Guild_nominated,Critics_Choice_won,Critics_Choice_nominated,Directors_Guild_won,Directors_Guild_nominated,Producers_Guild_won,Producers_Guild_nominated,Art_Directors_Guild_won,Art_Directors_Guild_nominated,Writers_Guild_won,Writers_Guild_nominated,Costume_Designers_Guild_won,Costume_Designers_Guild_nominated,Online_Film_Television_Association_won,Online_Film_Television_Association_nominated,Online_Film_Critics_Society_won,Online_Film_Critics_Society_nominated,People_Choice_won,People_Choice_nominated,London_Critics_Circle_Film_won,London_Critics_Circle_Film_nominated,American_Cinema_Editors_won,American_Cinema_Editors_nominated,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_nominated,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_nominated,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_nominated,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_nominated,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_nominated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Histor,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
682,694,2014,Selma,PG-13,128,7.5,89.0,64976,52066000.0,214.0,367.0,489.0,2,20,No,Yes,No,No,No,No,No,No,No,No,No,No,No,No,No,No,2,1,4,0,0,0,0,1,5,0,0,0,0,0,0,0,0,0,1,0,5,0,3,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [21]:
df1_temp.iloc[682]["History"] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
df1_temp = df1_temp.drop(["Histor", "index"], axis=1) 

- fix the certified column to move tv-ma to r and unrated to not rated

In [23]:
df1_temp = df1_temp.reset_index()

In [24]:
df1_temp.query("certificate == 'TV-MA'")

Unnamed: 0,index,year,movie,certificate,duration,rate,metascore,votes,gross,user_reviews,critic_reviews,popularity,awards_wins,awards_nominations,Oscar_Best_Picture_won,Oscar_Best_Picture_nominated,Oscar_Best_Director_won,Oscar_Best_Director_nominated,Oscar_Best_Actor_won,Oscar_Best_Actor_nominated,Oscar_Best_Actress_won,Oscar_Best_Actress_nominated,Oscar_Best_Supporting_Actor_won,Oscar_Best_Supporting_Actor_nominated,Oscar_Best_Supporting_Actress_won,Oscar_Best_Supporting_Actress_nominated,Oscar_Best_AdaScreen_won,Oscar_Best_AdaScreen_nominated,Oscar_Best_OriScreen_won,Oscar_Best_OriScreen_nominated,Oscar_nominated,Golden_Globes_won,Golden_Globes_nominated,BAFTA_won,BAFTA_nominated,Screen_Actors_Guild_won,Screen_Actors_Guild_nominated,Critics_Choice_won,Critics_Choice_nominated,Directors_Guild_won,Directors_Guild_nominated,Producers_Guild_won,Producers_Guild_nominated,Art_Directors_Guild_won,Art_Directors_Guild_nominated,Writers_Guild_won,Writers_Guild_nominated,Costume_Designers_Guild_won,Costume_Designers_Guild_nominated,Online_Film_Television_Association_won,Online_Film_Television_Association_nominated,Online_Film_Critics_Society_won,Online_Film_Critics_Society_nominated,People_Choice_won,People_Choice_nominated,London_Critics_Circle_Film_won,London_Critics_Circle_Film_nominated,American_Cinema_Editors_won,American_Cinema_Editors_nominated,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_nominated,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_nominated,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_nominated,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_nominated,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_nominated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
1164,1164,2016,Jim: The James Foley Story,TV-MA,111,7.8,73.0,631,52822418.0,3.0,19.0,1048.0,0,0,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
df1_temp.loc[df1_temp['certificate'].str.contains('TV-MA', case=False), 'certificate'] = 'R'

In [26]:
df1_temp.iloc[1159]['certificate'] 

'R'

In [27]:
df1_temp.loc[df1_temp['certificate'].str.contains('Unrated', case=False), 'certificate'] = 'Not Rated'

In [28]:
df1_temp.query("certificate == 'Unrated'")

Unnamed: 0,index,year,movie,certificate,duration,rate,metascore,votes,gross,user_reviews,critic_reviews,popularity,awards_wins,awards_nominations,Oscar_Best_Picture_won,Oscar_Best_Picture_nominated,Oscar_Best_Director_won,Oscar_Best_Director_nominated,Oscar_Best_Actor_won,Oscar_Best_Actor_nominated,Oscar_Best_Actress_won,Oscar_Best_Actress_nominated,Oscar_Best_Supporting_Actor_won,Oscar_Best_Supporting_Actor_nominated,Oscar_Best_Supporting_Actress_won,Oscar_Best_Supporting_Actress_nominated,Oscar_Best_AdaScreen_won,Oscar_Best_AdaScreen_nominated,Oscar_Best_OriScreen_won,Oscar_Best_OriScreen_nominated,Oscar_nominated,Golden_Globes_won,Golden_Globes_nominated,BAFTA_won,BAFTA_nominated,Screen_Actors_Guild_won,Screen_Actors_Guild_nominated,Critics_Choice_won,Critics_Choice_nominated,Directors_Guild_won,Directors_Guild_nominated,Producers_Guild_won,Producers_Guild_nominated,Art_Directors_Guild_won,Art_Directors_Guild_nominated,Writers_Guild_won,Writers_Guild_nominated,Costume_Designers_Guild_won,Costume_Designers_Guild_nominated,Online_Film_Television_Association_won,Online_Film_Television_Association_nominated,Online_Film_Critics_Society_won,Online_Film_Critics_Society_nominated,People_Choice_won,People_Choice_nominated,London_Critics_Circle_Film_won,London_Critics_Circle_Film_nominated,American_Cinema_Editors_won,American_Cinema_Editors_nominated,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_nominated,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_nominated,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_nominated,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_nominated,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_nominated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western


In [29]:
df1_temp = df1_temp.dropna()

In [30]:
df_clean = df1_temp

In [31]:
# #structure features 
# df_clean = df_clean.drop(['index'], axis = 1)
# df_clean.head(1)

### Pull the winners of 2018 for back testing 

In [32]:
best_picture_2018_identifiable = df_clean.query("Oscar_Best_Picture_nominated == 'Yes' & year == 2018")
best_director_2018_identifiable = df_clean.query("Oscar_Best_Director_nominated == 'Yes' & year == 2018")
best_actor_2018_identifiable = df_clean.query("Oscar_Best_Actor_nominated == 'Yes' & year == 2018")
best_actress_2018_identifiable = df_clean.query("Oscar_Best_Actress_nominated == 'Yes' & year == 2018")
best_supporting_actor_2018_identifiable = df_clean.query("Oscar_Best_Supporting_Actor_nominated == 'Yes' & year == 2018")
best_supporting_actress_2018_identifiable = df_clean.query("Oscar_Best_Supporting_Actress_nominated == 'Yes' & year == 2018")
best_oriscreen_2018_identifiable = df_clean.query("Oscar_Best_OriScreen_nominated == 'Yes' & year == 2018")
best_adascreen_2018_identifiable = df_clean.query("Oscar_Best_AdaScreen_nominated == 'Yes' & year == 2018")


In [33]:
t = pd.concat([best_actor_2018_identifiable, best_actress_2018_identifiable, best_adascreen_2018_identifiable, best_director_2018_identifiable, best_oriscreen_2018_identifiable, best_picture_2018_identifiable, best_supporting_actor_2018_identifiable, best_supporting_actress_2018_identifiable], axis=0)
best_back_test_2018_identifiable = t.drop_duplicates(keep='last')

df_ml = df_clean.drop(list(best_back_test_2018_identifiable.index))


In [34]:
#final drop of identifable information
df_ml = df_ml.drop(['year', 'movie', 'index'], axis = 1)
best_back_test_2018_unidentifiable = best_back_test_2018_identifiable.drop(['year', 'movie', 'index'], axis = 1)

In [70]:
df_ml

Unnamed: 0,certificate,duration,rate,metascore,votes,gross,user_reviews,critic_reviews,popularity,awards_wins,awards_nominations,Oscar_Best_Picture_won,Oscar_Best_Picture_nominated,Oscar_Best_Director_won,Oscar_Best_Director_nominated,Oscar_Best_Actor_won,Oscar_Best_Actor_nominated,Oscar_Best_Actress_won,Oscar_Best_Actress_nominated,Oscar_Best_Supporting_Actor_won,Oscar_Best_Supporting_Actor_nominated,Oscar_Best_Supporting_Actress_won,Oscar_Best_Supporting_Actress_nominated,Oscar_Best_AdaScreen_won,Oscar_Best_AdaScreen_nominated,Oscar_Best_OriScreen_won,Oscar_Best_OriScreen_nominated,Oscar_nominated,Golden_Globes_won,Golden_Globes_nominated,BAFTA_won,BAFTA_nominated,Screen_Actors_Guild_won,Screen_Actors_Guild_nominated,Critics_Choice_won,Critics_Choice_nominated,Directors_Guild_won,Directors_Guild_nominated,Producers_Guild_won,Producers_Guild_nominated,Art_Directors_Guild_won,Art_Directors_Guild_nominated,Writers_Guild_won,Writers_Guild_nominated,Costume_Designers_Guild_won,Costume_Designers_Guild_nominated,Online_Film_Television_Association_won,Online_Film_Television_Association_nominated,Online_Film_Critics_Society_won,Online_Film_Critics_Society_nominated,People_Choice_won,People_Choice_nominated,London_Critics_Circle_Film_won,London_Critics_Circle_Film_nominated,American_Cinema_Editors_won,American_Cinema_Editors_nominated,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_nominated,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_nominated,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_nominated,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_nominated,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_nominated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,PG-13,118,6.4,44.0,66660,47100000.0,318.0,125.0,2363.0,1,4,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,1,1,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
1,G,84,7.0,88.0,144475,106790000.0,361.0,186.0,2859.0,5,11,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,0,1,0,2,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,2,1,1,0,0,0,2,0,0,0,0,0,0,0,0,1,1,1,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,PG-13,106,5.7,40.0,273203,154700000.0,1008.0,278.0,1876.0,0,0,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,R,123,7.4,61.0,63852,25780000.0,272.0,126.0,2508.0,2,12,No,No,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,6,1,2,1,4,0,2,0,2,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
4,PG-13,178,8.8,92.0,1286275,313840000.0,5078.0,296.0,204.0,26,67,No,Yes,No,Yes,No,No,No,No,No,Yes,No,No,No,Yes,No,No,13,0,4,5,14,1,2,3,5,0,1,0,1,0,1,0,1,0,0,13,22,1,8,2,3,0,0,0,1,0,1,0,0,0,1,0,0,1,2,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1185,PG,98,7.2,81.0,1430,730000.0,15.0,67.0,3108.0,0,5,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1190,PG,98,7.6,71.0,4781,14020000.0,40.0,91.0,2023.0,0,7,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,2,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1191,PG-13,100,8.6,83.0,3802,13170000.0,60.0,53.0,272.0,2,8,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,1,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1192,R,121,8.1,93.0,11471,2250000.0,42.0,198.0,231.0,3,14,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,2,0,0,0,0,0,1,2,5,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


### Create the X and Ys

In [35]:
X = df_ml.drop(['Oscar_Best_Actor_won', 'Oscar_Best_Actress_won', 'Oscar_Best_AdaScreen_won', 'Oscar_Best_Director_won', 'Oscar_Best_OriScreen_won', 'Oscar_Best_Picture_won', "Oscar_Best_Supporting_Actor_won", "Oscar_Best_Supporting_Actress_won"], axis = 1)
y_best_actor = df_ml['Oscar_Best_Actor_won']
y_best_actress = df_ml['Oscar_Best_Actress_won']
y_best_ada_screen = df_ml['Oscar_Best_AdaScreen_won']
y_best_director = df_ml['Oscar_Best_Director_won']
y_best_ori_screen = df_ml['Oscar_Best_OriScreen_won']
y_best_picture = df_ml['Oscar_Best_Picture_won']
y_best_supporting_actor = df_ml['Oscar_Best_Supporting_Actor_won']
y_best_supporting_actress = df_ml['Oscar_Best_Supporting_Actress_won']

### Split X and Y 

In [36]:
#best actor
X_train_best_actor, X_test_best_actor, y_train_best_actor, y_test_best_actor = train_test_split(X,
                                                                          y_best_actor,
                                                                          test_size=0.2)
#best actoress 
X_train_best_actress, X_test_best_actress, y_train_best_actress, y_test_best_actress = train_test_split(X,
                                                                          y_best_actress,
                                                                          test_size=0.2)

#best adaptive screen play 
X_train_best_ada_screen, X_test_best_ada_screen, y_train_best_ada_screen, y_test_best_ada_screen = train_test_split(X,
                                                                          y_best_ada_screen,
                                                                          test_size=0.2)
#best director
X_train_best_director, X_test_best_director, y_train_best_director, y_test_best_director = train_test_split(X,
                                                                          y_best_director,
                                                                          test_size=0.2)

#best orginal screen play 
X_train_best_ori_screen, X_test_best_ori_screen, y_train_best_ori_screen, y_test_best_ori_screen = train_test_split(X,
                                                                          y_best_ori_screen,
                                                                          test_size=0.2)

#best picture 
X_train_best_picture, X_test_best_picture, y_train_best_picture, y_test_best_picture = train_test_split(X,
                                                                          y_best_picture,
                                                                          test_size=0.2)

#best supporting actor 
X_train_best_supporting_actor, X_test_best_supporting_actor, y_train_best_supporting_actor, y_test_best_supporting_actor = train_test_split(X,
                                                                          y_best_supporting_actor,
                                                                          test_size=0.2)

#best suporting actress 
X_train_best_supporting_actress, X_test_best_supporting_actress, y_train_best_supporting_actress, y_test_best_supporting_actress = train_test_split(X,
                                                                          y_best_supporting_actress,
                                                                          test_size=0.2)


In [37]:
#split 
numeric_features = ['duration', 'rate', 'metascore', 'gross', 'user_reviews', 'critic_reviews', 'popularity']
    
    
categorical_features = ['certificate', 'Oscar_Best_Picture_nominated', 'Oscar_Best_Director_nominated', 'Oscar_Best_Actor_nominated', 'Oscar_Best_Actress_nominated', 'Oscar_Best_Supporting_Actress_nominated', 'Oscar_Best_OriScreen_nominated']

### Processing Pipeline 

In [38]:
# transform the data to standardize the values in the data 
preprocessor = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), numeric_features),
        ('ohe', OneHotEncoder(drop="first"), categorical_features)])


In [39]:
def get_scores(model, 
                X_train, y_train,
                X_test, y_test, 
                show = True
               ):
    """
    Returns train and validation error given a model
    train and validation X and y portions
    Parameters
    ----------
    model: sklearn classifier model
        The sklearn model
    X_train: numpy.ndarray        
        The X part of the train set
    y_train: numpy.ndarray
        The y part of the train set    
    X_valid: numpy.ndarray        
        The X part of the validation set
    y_valid: numpy.ndarray
        The y part of the validation set    
    Returns
    -------
        train_err: float
        test_err: float
            
    """ 
    
    if show: 
        print("Training error:   %.2f" % (1-model.score(X_train, y_train)))
        print("Validation error: %.2f" % (1-model.score(X_test, y_test)))
        print('\n')
    return (1-model.score(X_train, y_train)), (1-model.score(X_test, y_test))

In [40]:
def diff_class_ml(X_train, X_test, y_train, y_test):
    """
    Returns train error, validation error and time given an ensemble of models 
    Parameters
    ----------
    X_train: numpy.ndarray        
        The X part of the train set
    y_train: numpy.ndarray
        The y part of the train set    
    X_valid: numpy.ndarray        
        The X part of the validation set
    y_valid: numpy.ndarray
        The y part of the validation set    
    Returns
    -------
        pd.DataFrame
            
    """ 
    # Lets create an empty dictionary to store all the results
    results_dict = {}
    
    models = {
          'dummy': DummyClassifier(), 
          'decision tree': DecisionTreeClassifier(),
          'kNN': KNeighborsClassifier(),
          'logistic regression': LogisticRegression(),
          'random forest' : RandomForestClassifier(), 
          'xgboost' : XGBClassifier(),
          'lgbm': LGBMClassifier()
         }

    for model_name, model in models.items():
        t = time.time()
        #print(model_name, ":")    
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', model)])
        clf.fit(X_train, y_train);
        tr_err, valid_err = get_scores(clf, X_train, y_train, 
                                       X_test, y_test, show = False)
        elapsed_time = time.time() - t
        results_dict[model_name] = [round(tr_err,3), round(valid_err,3), round(elapsed_time,4)]
        #print("Elapsed time: %.1f s" % elapsed_time)
    
    results_df = pd.DataFrame(results_dict).T
    results_df.columns = ["Train error", "Validation error", "Time in seconds"]
    return results_df

In [41]:
def oscar_predict(X_train, y_train, dataframe, model):
    """
    Shows Classifier and Probability for the Oscar nomination (or other movie)
    
    Parameters
    ----------
    X_train: numpy.ndarray 
        The X training set of the selected catagory 
    y_train: numpy.ndarray
        The y training set the selected catagory 
    dataframe: numpy.ndarray
        The movies that need to be predicted if they won their category 
    model: sklearn classifier model
        The sklearn model such as LogisticRegression()
    
    Returns
    -------
        pd.DataFrame
    
    """
    # Lets create an empty dictionary to store all the results
    results_dict = {}
    
    movies = list(dataframe.index)
    movies_names = list(dataframe["movie"])
    dataframe_X = dataframe.drop(['year', 'movie', 'index', 'Oscar_Best_Actor_won', 'Oscar_Best_Actress_won', 'Oscar_Best_AdaScreen_won', 'Oscar_Best_Director_won', 'Oscar_Best_OriScreen_won', 'Oscar_Best_Picture_won', "Oscar_Best_Supporting_Actor_won", "Oscar_Best_Supporting_Actress_won"], axis = 1)

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', model)])
    model = clf.fit(X_train, y_train);
        
    for movie, movie_name in zip(movies, movies_names):
        movie_predict = dataframe_X.loc[[movie]]
        classifier = model.predict(movie_predict)
        classifier = classifier[0]
        prob = model.predict_proba(movie_predict)
        prob = prob[0][1]
        results_dict[movie_name] = [classifier, round(prob,5)]
    
    results_df = pd.DataFrame(results_dict).T
    results_df.columns = ["Classifier", "Probability of it Winning"]
    return results_df

In [42]:
def normalized_winner(predict_df):
    """
    Normalized the chance of a movie winning an oscar
    
    Parameters
    ----------
    predict_df: pd.DataFram from oscar_predict()
        
    Returns
    -------
        pd.DataFrame
    
    """
    predict_df = predict_df.drop(["Classifier"], axis = 1)
    predict_df["Chance of Winning"] = predict_df["Probability of it Winning"]/predict_df["Probability of it Winning"].sum()
    predict_df = predict_df.drop(["Probability of it Winning"], axis = 1)
    predict_df ["Chance of Winning"] = pd.Series(["{0:.2f}%".format(val * 100) for val in predict_df ["Chance of Winning"]], index = predict_df.index)
    return predict_df

### Base Model Best Picture

In [43]:
diff_class_ml(X_train_best_picture, X_test_best_picture, y_train_best_picture, y_test_best_picture)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.037,0.047,0.0206
decision tree,0.0,0.034,0.0194
kNN,0.015,0.009,0.07
logistic regression,0.015,0.009,0.0294
random forest,0.003,0.009,0.032
xgboost,0.0,0.009,0.1117
lgbm,0.0,0.017,0.0838


In [69]:
#winner was Green Book 
predict_picture_2018 = oscar_predict(X_train_best_picture, y_train_best_picture, best_picture_2018_identifiable, LGBMClassifier())
predict_picture_2018

Unnamed: 0,Classifier,Probability of it Winning
A Star Is Born,No,0.00018
Bohemian Rhapsody,No,6e-05
Black Panther,No,7e-05
The Favourite,Yes,0.84858
Roma,No,0.28867
Vice,No,0.14596
Green Book,No,0.00012
BlacKkKlansman,No,0.00215


In [68]:
normalized_winner(predict_picture_2018)

Unnamed: 0,Chance of Winning
A Star Is Born,0.85%
Bohemian Rhapsody,0.07%
Black Panther,1.03%
The Favourite,50.65%
Roma,46.32%
Vice,0.22%
Green Book,0.00%
BlacKkKlansman,0.86%


### Base Model Best Actor

In [46]:
diff_class_ml(X_train_best_actor, X_test_best_actor, y_train_best_actor, y_test_best_actor)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.022,0.038,0.021
decision tree,0.0,0.021,0.0186
kNN,0.011,0.026,0.0603
logistic regression,0.013,0.026,0.0204
random forest,0.002,0.026,0.0322
xgboost,0.0,0.021,0.1021
lgbm,0.0,0.021,0.0859


In [47]:
#winner was Bohemian Rhapsody
predict_actor_2018 = oscar_predict(X_train_best_actor, y_train_best_actor, best_actor_2018_identifiable, XGBClassifier())
predict_actor_2018

Unnamed: 0,Classifier,Probability of it Winning
A Star Is Born,No,0.04638
Bohemian Rhapsody,No,0.02913
Vice,No,0.06742
At Eternity's Gate,No,0.0245
Green Book,No,0.1682


In [48]:
normalized_winner(predict_actor_2018)

Unnamed: 0,Chance of Winning
A Star Is Born,13.82%
Bohemian Rhapsody,8.68%
Vice,20.09%
At Eternity's Gate,7.30%
Green Book,50.11%


### Base Model Best Actress

In [49]:
diff_class_ml(X_train_best_actress, X_test_best_actress, y_train_best_actress, y_test_best_actress)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.036,0.021,0.018
decision tree,0.0,0.009,0.0233
kNN,0.014,0.013,0.0618
logistic regression,0.014,0.013,0.0196
random forest,0.001,0.017,0.0328
xgboost,0.0,0.017,0.1027
lgbm,0.0,0.013,0.0846


In [50]:
#winner was The Favourite
predict_actress_2018 = oscar_predict(X_train_best_actress, y_train_best_actress, best_actress_2018_identifiable, XGBClassifier())
predict_actress_2018

Unnamed: 0,Classifier,Probability of it Winning
A Star Is Born,No,0.18811
The Wife,No,0.01022
Can You Ever Forgive Me?,No,0.00812
The Favourite,Yes,0.91654
Roma,Yes,0.8183


In [51]:
normalized_winner(predict_actress_2018)

Unnamed: 0,Chance of Winning
A Star Is Born,9.69%
The Wife,0.53%
Can You Ever Forgive Me?,0.42%
The Favourite,47.21%
Roma,42.15%


### Base Model Best Adapted Screenplay 

In [52]:
diff_class_ml(X_train_best_ada_screen, X_test_best_ada_screen, y_train_best_ada_screen, y_test_best_ada_screen)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.029,0.034,0.0179
decision tree,0.0,0.034,0.0202
kNN,0.015,0.021,0.0582
logistic regression,0.015,0.017,0.0172
random forest,0.004,0.017,0.0287
xgboost,0.0,0.017,0.1039
lgbm,0.0,0.021,0.0832


In [53]:
#winner BlacKkKlansman
predict_ada_screen_2018 = oscar_predict(X_train_best_ada_screen, y_train_best_ada_screen, best_adascreen_2018_identifiable, LGBMClassifier())
predict_ada_screen_2018

Unnamed: 0,Classifier,Probability of it Winning
A Star Is Born,No,2e-05
Can You Ever Forgive Me?,No,0.0
The Ballad of Buster Scruggs,No,3e-05
If Beale Street Could Talk,No,0.0
BlacKkKlansman,No,0.00595


In [54]:
normalized_winner(predict_ada_screen_2018)

Unnamed: 0,Chance of Winning
A Star Is Born,0.33%
Can You Ever Forgive Me?,0.00%
The Ballad of Buster Scruggs,0.50%
If Beale Street Could Talk,0.00%
BlacKkKlansman,99.17%


### Base Model Best Director

In [55]:
diff_class_ml(X_train_best_director, X_test_best_director, y_train_best_director, y_test_best_director)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.029,0.038,0.0181
decision tree,0.0,0.009,0.0202
kNN,0.019,0.013,0.0613
logistic regression,0.016,0.013,0.0204
random forest,0.003,0.021,0.0321
xgboost,0.001,0.017,0.0978
lgbm,0.0,0.004,0.0873


In [56]:
#Winner Roma
predict_director_2018 = oscar_predict(X_train_best_director, y_train_best_director, best_director_2018_identifiable, LGBMClassifier())
predict_director_2018

Unnamed: 0,Classifier,Probability of it Winning
The Favourite,No,0.07885
Roma,No,0.21045
Vice,No,0.0012
Cold War,No,0.00011
BlacKkKlansman,No,0.06771


In [57]:
normalized_winner(predict_director_2018)

Unnamed: 0,Chance of Winning
The Favourite,22.01%
Roma,58.73%
Vice,0.33%
Cold War,0.03%
BlacKkKlansman,18.90%


### Base Model Best Original Screenplay

In [58]:
diff_class_ml(X_train_best_ori_screen, X_test_best_ori_screen, y_train_best_ori_screen, y_test_best_ori_screen)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.03,0.017,0.0195
decision tree,0.0,0.026,0.018
kNN,0.011,0.013,0.0621
logistic regression,0.014,0.013,0.0177
random forest,0.001,0.021,0.0283
xgboost,0.0,0.017,0.0954
lgbm,0.0,0.017,0.1017


In [59]:
#Winner Green Boook
predict_ori_screen_2018 = oscar_predict(X_train_best_ori_screen, y_train_best_ori_screen, best_oriscreen_2018_identifiable, XGBClassifier())
predict_ori_screen_2018

Unnamed: 0,Classifier,Probability of it Winning
The Favourite,No,0.30961
First Reformed,No,0.02775
Roma,Yes,0.72006
Vice,No,0.03697
Green Book,No,0.00951


In [60]:
normalized_winner(predict_ori_screen_2018)

Unnamed: 0,Chance of Winning
The Favourite,28.05%
First Reformed,2.51%
Roma,65.23%
Vice,3.35%
Green Book,0.86%


### Base Model Best Supporting Actor

In [61]:
diff_class_ml(X_train_best_supporting_actor, X_test_best_supporting_actor, y_train_best_supporting_actor, y_test_best_supporting_actor)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.038,0.03,0.0203
decision tree,0.0,0.055,0.0202
kNN,0.018,0.009,0.0605
logistic regression,0.017,0.009,0.018
random forest,0.001,0.013,0.0303
xgboost,0.003,0.009,0.1061
lgbm,0.0,0.009,0.0899


In [62]:
#Winner Green Book
predict_supporting_actor_2018 = oscar_predict(X_train_best_supporting_actor, y_train_best_supporting_actor, best_supporting_actor_2018_identifiable, LogisticRegression())
predict_supporting_actor_2018

Unnamed: 0,Classifier,Probability of it Winning
A Star Is Born,No,0.09788
Can You Ever Forgive Me?,No,0.01348
Vice,No,0.04596
Green Book,No,0.0443
BlacKkKlansman,No,0.09302


In [63]:
normalized_winner(predict_supporting_actor_2018)

Unnamed: 0,Chance of Winning
A Star Is Born,33.22%
Can You Ever Forgive Me?,4.58%
Vice,15.60%
Green Book,15.04%
BlacKkKlansman,31.57%


### Base Model Best Supporting Actress

In [64]:
diff_class_ml(X_train_best_supporting_actress, X_test_best_supporting_actress, y_train_best_supporting_actress, y_test_best_supporting_actress)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.035,0.021,0.0208
decision tree,0.0,0.038,0.0179
kNN,0.017,0.009,0.061
logistic regression,0.016,0.013,0.0174
random forest,0.005,0.009,0.0294
xgboost,0.0,0.013,0.0957
lgbm,0.0,0.013,0.077


In [65]:
#Winner If Beale Street Could Talk
predict_supporting_actress_2018 = oscar_predict(X_train_best_supporting_actress, y_train_best_supporting_actress, best_supporting_actress_2018_identifiable, XGBClassifier())
predict_supporting_actress_2018

Unnamed: 0,Classifier,Probability of it Winning
The Favourite,No,0.07143
Roma,No,0.18591
Vice,Yes,0.71801
If Beale Street Could Talk,No,0.03552


In [66]:
normalized_winner(predict_supporting_actress_2018)

Unnamed: 0,Chance of Winning
The Favourite,7.07%
Roma,18.39%
Vice,71.03%
If Beale Street Could Talk,3.51%
