## Framework

- Data cleaning and formatting 
- Exploratory data analysis
- Feature engineering and selection
- Compare several machine learning models on a performance metric
- Perform hyper-parameter tuning on the best model 
- Evaluate the best model on the testing set
- Interpret the model results
- Draw conclusions and document work

In [1]:
#load packages

#lm pacakges
from sklearn import tree 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from statsmodels.nonparametric.smoothers_lowess import lowess
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.preprocessing import normalize, scale, Normalizer, StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR, LinearSVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.dummy import DummyClassifier

#other 
import numpy as np
import pandas as pd
import pickle 
import graphviz
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import pandas_profiling
pd.set_option("display.max_colwidth", 200)

import altair as alt
import time

import autotime

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
#ignore warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)

In [3]:
df1 = pd.read_csv('data/01_oscar_data.csv')
df2 = pd.read_csv('data/02_oscar_data.csv')

In [4]:
#df1.profile_report(style={'full_width':True})

- remove all the categorical for wins and nominations so model only is concern with # of nominations and wins 
    - might have to return to this later

# Data cleaning and formatting and Exploratory data analysis

In [5]:
#remove all the categorical what the film won and identifiable information 

df1_temp = df1.drop(['year', 'movie', 'movie_id', 'release_date', 'synopsis', 'release_date.year', 'release_date.day-of-month', 'release_date.day-of-week', 'American_Cinema_Editors_nominated_categories', 'American_Cinema_Editors_won_categories', 'Art_Directors_Guild_nominated_categories', 'Art_Directors_Guild_won_categories', 'Austin_Film_Critics_Association_nominated_categories', 'Austin_Film_Critics_Association_won_categories', 'BAFTA_nominated_categories', 'BAFTA_won_categories', 'Boston_Society_of_Film_Critics_nominated_categories', 'Boston_Society_of_Film_Critics_won_categories', 'Costume_Designers_Guild_nominated_categories', 'Costume_Designers_Guild_won_categories', 'Critics_Choice_nominated_categories','Critics_Choice_won_categories', 'Denver_Film_Critics_Society_nominated_categories', 'Denver_Film_Critics_Society_won_categories', 'Directors_Guild_nominated_categories', 'Directors_Guild_won_categories', 'Golden_Globes_nominated_categories', 'Golden_Globes_won_categories', 'Hollywood_Film_nominated_categories', 'Hollywood_Film_won_categories', 'London_Critics_Circle_Film_nominated_categories', 'London_Critics_Circle_Film_won_categories', 'Los_Angeles_Film_Critics_Association_nominated_categories', 'Los_Angeles_Film_Critics_Association_won_categories', 'New_York_Film_Critics_Circle_nominated_categories', 'New_York_Film_Critics_Circle_won_categories', 'Online_Film_Critics_Society_nominated_categories', 'Online_Film_Critics_Society_won_categories', 'Online_Film_Television_Association_nominated_categories', 'Online_Film_Television_Association_won_categories', 'Oscar_nominated_categories', 'People_Choice_nominated_categories', 'People_Choice_won_categories', 'Producers_Guild_nominated_categories', 'Screen_Actors_Guild_nominated_categories', 'Screen_Actors_Guild_won_categories', 'Writers_Guild_nominated_categories', 'Writers_Guild_won_categories', 'Producers_Guild_won_categories'], axis = 1)

In [6]:
#df1_temp.profile_report(style={'full_width':True})

- Remove Hollywood_Film_won and Hollywood_Film_nominated do to high correlation 
- Can keep Online_Film_Television_Association_nominated and awards_nominations because it does not look like they are related
- give gross and popularity missing values the mediumn value
- remove the missing metascore, certificate, and release_date.month values 
- Deal with the categorical nature of genre using MultiLabelBinarizer

In [7]:
df1_temp = df1_temp.drop(['Hollywood_Film_won', 'Hollywood_Film_nominated'], axis = 1)

In [8]:
df1_temp = df1_temp.dropna(subset=['metascore', 'certificate', 'release_date.month'])

In [9]:
df1_temp['gross'].fillna((df1_temp['gross'].median()), inplace=True)
df1_temp['popularity'].fillna((df1_temp['popularity'].median()), inplace=True)

In [10]:
df1_temp['genre'] = [x.split('|') for x in df1_temp['genre']]

In [11]:
mlb = MultiLabelBinarizer()
df1_temp = df1_temp.join(pd.DataFrame(mlb.fit_transform(df1_temp.pop('genre')),
                          columns=mlb.classes_,
                          index=df1_temp.index))

In [12]:
df1_temp.profile_report(style={'full_width':True})



- Fix the histor category history 

In [13]:
df1_temp = df1_temp.reset_index()

In [14]:
df1_temp.query("Histor == 1")

Unnamed: 0,index,certificate,duration,rate,metascore,votes,gross,user_reviews,critic_reviews,popularity,awards_wins,awards_nominations,Oscar_Best_Picture_won,Oscar_Best_Picture_nominated,Oscar_Best_Director_won,Oscar_Best_Director_nominated,Oscar_Best_Actor_won,Oscar_Best_Actor_nominated,Oscar_Best_Actress_won,Oscar_Best_Actress_nominated,Oscar_Best_Supporting_Actor_won,Oscar_Best_Supporting_Actor_nominated,Oscar_Best_Supporting_Actress_won,Oscar_Best_Supporting_Actress_nominated,Oscar_Best_AdaScreen_won,Oscar_Best_AdaScreen_nominated,Oscar_Best_OriScreen_won,Oscar_Best_OriScreen_nominated,Oscar_nominated,Golden_Globes_won,Golden_Globes_nominated,BAFTA_won,BAFTA_nominated,Screen_Actors_Guild_won,Screen_Actors_Guild_nominated,Critics_Choice_won,Critics_Choice_nominated,Directors_Guild_won,Directors_Guild_nominated,Producers_Guild_won,Producers_Guild_nominated,Art_Directors_Guild_won,Art_Directors_Guild_nominated,Writers_Guild_won,Writers_Guild_nominated,Costume_Designers_Guild_won,Costume_Designers_Guild_nominated,Online_Film_Television_Association_won,Online_Film_Television_Association_nominated,Online_Film_Critics_Society_won,Online_Film_Critics_Society_nominated,People_Choice_won,People_Choice_nominated,London_Critics_Circle_Film_won,London_Critics_Circle_Film_nominated,American_Cinema_Editors_won,American_Cinema_Editors_nominated,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_nominated,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_nominated,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_nominated,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_nominated,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_nominated,release_date.month,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Histor,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
682,694,PG-13,128,7.5,89.0,64976,52066000.0,214.0,367.0,489.0,2,20,No,Yes,No,No,No,No,No,No,No,No,No,No,No,No,No,No,2,1,4,0,0,0,0,1,5,0,0,0,0,0,0,0,0,0,1,0,5,0,3,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1.0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [15]:
df1_temp.iloc[682]["History"] = 1

In [16]:
df1_temp = df1_temp.drop(["Histor", "index"], axis=1) 

- fix the certified column to move tv-ma to r and unrated to not rated

In [17]:
df1_temp = df1_temp.reset_index()

In [18]:
df1_temp.query("certificate == 'TV-MA'")

Unnamed: 0,index,certificate,duration,rate,metascore,votes,gross,user_reviews,critic_reviews,popularity,awards_wins,awards_nominations,Oscar_Best_Picture_won,Oscar_Best_Picture_nominated,Oscar_Best_Director_won,Oscar_Best_Director_nominated,Oscar_Best_Actor_won,Oscar_Best_Actor_nominated,Oscar_Best_Actress_won,Oscar_Best_Actress_nominated,Oscar_Best_Supporting_Actor_won,Oscar_Best_Supporting_Actor_nominated,Oscar_Best_Supporting_Actress_won,Oscar_Best_Supporting_Actress_nominated,Oscar_Best_AdaScreen_won,Oscar_Best_AdaScreen_nominated,Oscar_Best_OriScreen_won,Oscar_Best_OriScreen_nominated,Oscar_nominated,Golden_Globes_won,Golden_Globes_nominated,BAFTA_won,BAFTA_nominated,Screen_Actors_Guild_won,Screen_Actors_Guild_nominated,Critics_Choice_won,Critics_Choice_nominated,Directors_Guild_won,Directors_Guild_nominated,Producers_Guild_won,Producers_Guild_nominated,Art_Directors_Guild_won,Art_Directors_Guild_nominated,Writers_Guild_won,Writers_Guild_nominated,Costume_Designers_Guild_won,Costume_Designers_Guild_nominated,Online_Film_Television_Association_won,Online_Film_Television_Association_nominated,Online_Film_Critics_Society_won,Online_Film_Critics_Society_nominated,People_Choice_won,People_Choice_nominated,London_Critics_Circle_Film_won,London_Critics_Circle_Film_nominated,American_Cinema_Editors_won,American_Cinema_Editors_nominated,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_nominated,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_nominated,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_nominated,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_nominated,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_nominated,release_date.month,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
1159,1159,TV-MA,111,7.8,73.0,631,53465000.0,3.0,19.0,1068.0,0,0,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
df1_temp.loc[df1_temp['certificate'].str.contains('TV-MA', case=False), 'certificate'] = 'R'

In [20]:
df1_temp.iloc[1159]['certificate'] 

'R'

In [21]:
df1_temp.loc[df1_temp['certificate'].str.contains('Unrated', case=False), 'certificate'] = 'Not Rated'

In [22]:
df1_temp.query("certificate == 'Unrated'")

Unnamed: 0,index,certificate,duration,rate,metascore,votes,gross,user_reviews,critic_reviews,popularity,awards_wins,awards_nominations,Oscar_Best_Picture_won,Oscar_Best_Picture_nominated,Oscar_Best_Director_won,Oscar_Best_Director_nominated,Oscar_Best_Actor_won,Oscar_Best_Actor_nominated,Oscar_Best_Actress_won,Oscar_Best_Actress_nominated,Oscar_Best_Supporting_Actor_won,Oscar_Best_Supporting_Actor_nominated,Oscar_Best_Supporting_Actress_won,Oscar_Best_Supporting_Actress_nominated,Oscar_Best_AdaScreen_won,Oscar_Best_AdaScreen_nominated,Oscar_Best_OriScreen_won,Oscar_Best_OriScreen_nominated,Oscar_nominated,Golden_Globes_won,Golden_Globes_nominated,BAFTA_won,BAFTA_nominated,Screen_Actors_Guild_won,Screen_Actors_Guild_nominated,Critics_Choice_won,Critics_Choice_nominated,Directors_Guild_won,Directors_Guild_nominated,Producers_Guild_won,Producers_Guild_nominated,Art_Directors_Guild_won,Art_Directors_Guild_nominated,Writers_Guild_won,Writers_Guild_nominated,Costume_Designers_Guild_won,Costume_Designers_Guild_nominated,Online_Film_Television_Association_won,Online_Film_Television_Association_nominated,Online_Film_Critics_Society_won,Online_Film_Critics_Society_nominated,People_Choice_won,People_Choice_nominated,London_Critics_Circle_Film_won,London_Critics_Circle_Film_nominated,American_Cinema_Editors_won,American_Cinema_Editors_nominated,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_nominated,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_nominated,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_nominated,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_nominated,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_nominated,release_date.month,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western


In [23]:
df1_temp = df1_temp.dropna()

In [24]:
df_clean = df1_temp

In [107]:
#structure features 
df_clean = df_clean.drop(['index'], axis = 1)
df_clean.head(1)

Unnamed: 0,certificate,duration,rate,metascore,votes,gross,user_reviews,critic_reviews,popularity,awards_wins,awards_nominations,Oscar_Best_Picture_won,Oscar_Best_Picture_nominated,Oscar_Best_Director_won,Oscar_Best_Director_nominated,Oscar_Best_Actor_won,Oscar_Best_Actor_nominated,Oscar_Best_Actress_won,Oscar_Best_Actress_nominated,Oscar_Best_Supporting_Actor_won,Oscar_Best_Supporting_Actor_nominated,Oscar_Best_Supporting_Actress_won,Oscar_Best_Supporting_Actress_nominated,Oscar_Best_AdaScreen_won,Oscar_Best_AdaScreen_nominated,Oscar_Best_OriScreen_won,Oscar_Best_OriScreen_nominated,Oscar_nominated,Golden_Globes_won,Golden_Globes_nominated,BAFTA_won,BAFTA_nominated,Screen_Actors_Guild_won,Screen_Actors_Guild_nominated,Critics_Choice_won,Critics_Choice_nominated,Directors_Guild_won,Directors_Guild_nominated,Producers_Guild_won,Producers_Guild_nominated,Art_Directors_Guild_won,Art_Directors_Guild_nominated,Writers_Guild_won,Writers_Guild_nominated,Costume_Designers_Guild_won,Costume_Designers_Guild_nominated,Online_Film_Television_Association_won,Online_Film_Television_Association_nominated,Online_Film_Critics_Society_won,Online_Film_Critics_Society_nominated,People_Choice_won,People_Choice_nominated,London_Critics_Circle_Film_won,London_Critics_Circle_Film_nominated,American_Cinema_Editors_won,American_Cinema_Editors_nominated,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_nominated,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_nominated,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_nominated,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_nominated,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_nominated,release_date.month,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,PG-13,118,6.4,44.0,66660,47100000.0,318.0,125.0,2363.0,1,4,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,1,1,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12.0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0


In [25]:
X = df_clean.drop(['Oscar_Best_Actor_won', 'Oscar_Best_Actress_won', 'Oscar_Best_AdaScreen_won', 'Oscar_Best_Director_won', 'Oscar_Best_OriScreen_won', 'Oscar_Best_Picture_won', "Oscar_Best_Supporting_Actor_won", "Oscar_Best_Supporting_Actress_won"], axis = 1)
y_best_actor = df_clean['Oscar_Best_Actor_won']
y_best_actress = df_clean['Oscar_Best_Actress_won']
y_best_ada_screen = df_clean['Oscar_Best_AdaScreen_won']
y_best_director = df_clean['Oscar_Best_Director_won']
y_best_ori_screen = df_clean['Oscar_Best_OriScreen_won']
y_best_picture = df_clean['Oscar_Best_Picture_won']
y_best_supporting_actor = df_clean['Oscar_Best_Supporting_Actor_won']
y_best_supporting_actress = df_clean['Oscar_Best_Supporting_Actress_won']

### Split X and Y 

In [92]:
#best actor
X_train, X_test, y_train_best_actor, y_test_best_actor = train_test_split(X,
                                                                          y_best_actor,
                                                                          test_size=0.2)
#best actoress 
X_train, X_test, y_train_best_actress, y_test_best_actress = train_test_split(X,
                                                                          y_best_actress,
                                                                          test_size=0.2)

#best adaptive screen play 
X_train, X_test, y_train_best_ada_screen, y_test_best_ada_screen = train_test_split(X,
                                                                          y_best_ada_screen,
                                                                          test_size=0.2)
#best director
X_train, X_test, y_train_best_director, y_test_best_director = train_test_split(X,
                                                                          y_best_director,
                                                                          test_size=0.2)

#best orginal screen play 
X_train, X_test, y_train_best_ori_screen, y_test_best_ori_screen = train_test_split(X,
                                                                          y_best_ori_screen,
                                                                          test_size=0.2)

#best picture 
X_train, X_test, y_train_best_picture, y_test_best_picture = train_test_split(X,
                                                                          y_best_picture,
                                                                          test_size=0.2)

#best supporting actor 
X_train, X_test, y_train_best_supporting_actor, y_test_best_supporting_actor = train_test_split(X,
                                                                          y_best_supporting_actor,
                                                                          test_size=0.2)

#best suporting actress 
X_train, X_test, y_train_best_supporting_actress, y_test_best_supporting_actress = train_test_split(X,
                                                                          y_best_supporting_actress,
                                                                          test_size=0.2)


In [72]:
#split 
numeric_features = ['duration', 'rate', 'metascore', 'gross', 'user_reviews', 'critic_reviews', 'popularity']
    
    
categorical_features = ['certificate', 'Oscar_Best_Picture_nominated', 'Oscar_Best_Director_nominated', 'Oscar_Best_Actor_nominated', 'Oscar_Best_Actress_nominated', 'Oscar_Best_Supporting_Actress_nominated', 'Oscar_Best_OriScreen_nominated']

### Processing Pipeline 

In [81]:
# transform the data to standardize the values in the data 
preprocessor = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), numeric_features),
        ('ohe', OneHotEncoder(drop="first"), categorical_features)])


In [82]:
def get_scores(model, 
                X_train, y_train,
                X_test, y_test, 
                show = True
               ):
    if show: 
        print("Training error:   %.2f" % (1-model.score(X_train, y_train)))
        print("Validation error: %.2f" % (1-model.score(X_test, y_test)))
        print('\n')
    return (1-model.score(X_train, y_train)), (1-model.score(X_test, y_test))

In [75]:
def diff_class_ml(y_train, y_test):
    # Lets create an empty dictionary to store all the results
    results_dict = {}
    
    models = {
          'dummy': DummyClassifier(), 
          'decision tree': DecisionTreeClassifier(),
          'kNN': KNeighborsClassifier(),
          'logistic regression': LogisticRegression(),
          'RBF SVM' : SVC(), 
          'random forest' : RandomForestClassifier(), 
          'xgboost' : XGBClassifier(),
          'lgbm': LGBMClassifier()
         }

    for model_name, model in models.items():
        t = time.time()
        #print(model_name, ":")    
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', model)])
        clf.fit(X_train, y_train);
        tr_err, valid_err = get_scores(clf, X_train, y_train, 
                                       X_test, y_test, show = False)
        elapsed_time = time.time() - t
        results_dict[model_name] = [round(tr_err,3), round(valid_err,3), round(elapsed_time,4)]
        #print("Elapsed time: %.1f s" % elapsed_time)
    
    results_df = pd.DataFrame(results_dict).T
    results_df.columns = ["Train error", "Validation error", "Time in seconds"]
    return results_df

### Base Model Best Picture

In [93]:
diff_class_ml(y_train_best_picture, y_test_best_picture)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.031,0.026,0.021
decision tree,0.0,0.03,0.0201
kNN,0.016,0.013,0.0594
logistic regression,0.016,0.013,0.0195
RBF SVM,0.016,0.013,0.0254
random forest,0.006,0.013,0.0289
xgboost,0.013,0.013,0.1091
lgbm,0.0,0.013,0.0907


### Base Model Best Actor

In [84]:
diff_class_ml(y_train_best_actor, y_test_best_actor)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.03,0.026,0.0196
decision tree,0.0,0.034,0.0219
kNN,0.017,0.009,0.0554
logistic regression,0.017,0.009,0.0182
RBF SVM,0.017,0.009,0.025
random forest,0.005,0.009,0.0303
xgboost,0.014,0.009,0.1111
lgbm,0.0,0.009,0.0922


### Base Model Best Actress

In [85]:
diff_class_ml(y_train_best_actress, y_test_best_actress)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.034,0.026,0.0205
decision tree,0.0,0.021,0.0226
kNN,0.015,0.009,0.0573
logistic regression,0.015,0.009,0.0181
RBF SVM,0.015,0.009,0.0228
random forest,0.006,0.009,0.0318
xgboost,0.012,0.009,0.1076
lgbm,0.0,0.009,0.0969


### Base Model Best Adapted Screenplay 

In [86]:
diff_class_ml(y_train_best_ada_screen, y_test_best_ada_screen)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.031,0.03,0.021
decision tree,0.0,0.026,0.0225
kNN,0.017,0.004,0.0567
logistic regression,0.017,0.004,0.0172
RBF SVM,0.017,0.004,0.0264
random forest,0.009,0.004,0.0303
xgboost,0.014,0.004,0.1149
lgbm,0.0,0.004,0.0956


### Base Model Best Director

In [87]:
diff_class_ml(y_train_best_director, y_test_best_director)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.033,0.038,0.0219
decision tree,0.0,0.034,0.0217
kNN,0.016,0.013,0.0567
logistic regression,0.016,0.013,0.0169
RBF SVM,0.016,0.013,0.0228
random forest,0.004,0.013,0.0306
xgboost,0.007,0.013,0.1064
lgbm,0.0,0.013,0.094


### Base Model Best Original Screenplay

In [88]:
diff_class_ml(y_train_best_ori_screen, y_test_best_ori_screen)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.022,0.026,0.0199
decision tree,0.0,0.043,0.0212
kNN,0.015,0.017,0.0555
logistic regression,0.015,0.017,0.0178
RBF SVM,0.015,0.017,0.0235
random forest,0.002,0.017,0.0302
xgboost,0.014,0.017,0.1028
lgbm,0.0,0.017,0.0911


### Base Model Best Supporting Actor

In [89]:
diff_class_ml(y_train_best_supporting_actor, y_test_best_supporting_actor)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.035,0.017,0.0214
decision tree,0.0,0.051,0.022
kNN,0.015,0.013,0.0569
logistic regression,0.015,0.013,0.0176
RBF SVM,0.015,0.013,0.0245
random forest,0.005,0.013,0.03
xgboost,0.013,0.013,0.1073
lgbm,0.0,0.013,0.0915


### Base Model Best Supporting Actress

In [90]:
diff_class_ml(y_train_best_supporting_actress, y_test_best_supporting_actress)

Unnamed: 0,Train error,Validation error,Time in seconds
dummy,0.042,0.021,0.0204
decision tree,0.0,0.013,0.0237
kNN,0.016,0.004,0.068
logistic regression,0.018,0.0,0.0186
RBF SVM,0.019,0.0,0.021
random forest,0.004,0.0,0.0313
xgboost,0.0,0.009,0.1005
lgbm,0.0,0.004,0.0768


In [95]:
df1["year"].unique()

array([2001, 2000, 2005, 2002, 2003, 2004, 2006, 2009, 2007, 2013, 2008,
       2015, 2010, 2012, 2011, 2014, 2016, 2018, 2017])

In [99]:
df1_short = df1.head(8)

In [100]:
df1_short

Unnamed: 0,year,movie,movie_id,certificate,duration,genre,rate,metascore,synopsis,votes,gross,release_date,user_reviews,critic_reviews,popularity,awards_wins,awards_nominations,Oscar_Best_Picture_won,Oscar_Best_Picture_nominated,Oscar_Best_Director_won,Oscar_Best_Director_nominated,Oscar_Best_Actor_won,Oscar_Best_Actor_nominated,Oscar_Best_Actress_won,Oscar_Best_Actress_nominated,Oscar_Best_Supporting_Actor_won,Oscar_Best_Supporting_Actor_nominated,Oscar_Best_Supporting_Actress_won,Oscar_Best_Supporting_Actress_nominated,Oscar_Best_AdaScreen_won,Oscar_Best_AdaScreen_nominated,Oscar_Best_OriScreen_won,Oscar_Best_OriScreen_nominated,Oscar_nominated,Oscar_nominated_categories,Golden_Globes_won,Golden_Globes_won_categories,Golden_Globes_nominated,Golden_Globes_nominated_categories,BAFTA_won,BAFTA_won_categories,BAFTA_nominated,BAFTA_nominated_categories,Screen_Actors_Guild_won,Screen_Actors_Guild_won_categories,Screen_Actors_Guild_nominated,Screen_Actors_Guild_nominated_categories,Critics_Choice_won,Critics_Choice_won_categories,Critics_Choice_nominated,Critics_Choice_nominated_categories,Directors_Guild_won,Directors_Guild_won_categories,Directors_Guild_nominated,Directors_Guild_nominated_categories,Producers_Guild_won,Producers_Guild_won_categories,Producers_Guild_nominated,Producers_Guild_nominated_categories,Art_Directors_Guild_won,Art_Directors_Guild_won_categories,Art_Directors_Guild_nominated,Art_Directors_Guild_nominated_categories,Writers_Guild_won,Writers_Guild_won_categories,Writers_Guild_nominated,Writers_Guild_nominated_categories,Costume_Designers_Guild_won,Costume_Designers_Guild_won_categories,Costume_Designers_Guild_nominated,Costume_Designers_Guild_nominated_categories,Online_Film_Television_Association_won,Online_Film_Television_Association_won_categories,Online_Film_Television_Association_nominated,Online_Film_Television_Association_nominated_categories,Online_Film_Critics_Society_won,Online_Film_Critics_Society_won_categories,Online_Film_Critics_Society_nominated,Online_Film_Critics_Society_nominated_categories,People_Choice_won,People_Choice_won_categories,People_Choice_nominated,People_Choice_nominated_categories,London_Critics_Circle_Film_won,London_Critics_Circle_Film_won_categories,London_Critics_Circle_Film_nominated,London_Critics_Circle_Film_nominated_categories,American_Cinema_Editors_won,American_Cinema_Editors_won_categories,American_Cinema_Editors_nominated,American_Cinema_Editors_nominated_categories,Hollywood_Film_won,Hollywood_Film_won_categories,Hollywood_Film_nominated,Hollywood_Film_nominated_categories,Austin_Film_Critics_Association_won,Austin_Film_Critics_Association_won_categories,Austin_Film_Critics_Association_nominated,Austin_Film_Critics_Association_nominated_categories,Denver_Film_Critics_Society_won,Denver_Film_Critics_Society_won_categories,Denver_Film_Critics_Society_nominated,Denver_Film_Critics_Society_nominated_categories,Boston_Society_of_Film_Critics_won,Boston_Society_of_Film_Critics_won_categories,Boston_Society_of_Film_Critics_nominated,Boston_Society_of_Film_Critics_nominated_categories,New_York_Film_Critics_Circle_won,New_York_Film_Critics_Circle_won_categories,New_York_Film_Critics_Circle_nominated,New_York_Film_Critics_Circle_nominated_categories,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_won_categories,Los_Angeles_Film_Critics_Association_nominated,Los_Angeles_Film_Critics_Association_nominated_categories,release_date.year,release_date.month,release_date.day-of-month,release_date.day-of-week
0,2001,Kate & Leopold,tt0035423,PG-13,118,Comedy|Fantasy|Romance,6.4,44.0,An English Duke from 1876 is inadvertedly dragged to modern day New York where he falls for a plucky advertising executive.,66660,47100000.0,2001-12-25,318.0,125.0,2363.0,1,4,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,1,"Best Music, Original Song",1,Best Original Song - Motion Picture,2,Best Original Song - Motion Picture|Best Performance by an Actor in a Motion Picture - Comedy or Musical,0,,0,,0,,0,,0,,1,Best Song,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,1,"Best Music, Original Song",0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,2001.0,12.0,25.0,2.0
1,2000,Chicken Run,tt0120630,G,84,Animation|Adventure|Comedy,7.0,88.0,"When a cockerel apparently flies into a chicken farm, the chickens see him as an opportunity to escape their evil owners.",144475,106790000.0,2000-06-23,361.0,186.0,2859.0,5,11,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,,0,,1,Best Motion Picture - Comedy or Musical,0,,2,|Best Achievement in Special Visual Effects,0,,0,,1,Best Animated Film,1,Best Animated Film,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,1,Best Animated Picture,2,Best Animated Picture|Best Voice-Over Performance,1,Top Ten Films of the Year,1,Top Ten Films of the Year,0,,0,,0,,2,British Film of the Year|British Producer of the Year,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,1,Best Animated Film,1,Best Animated Film,1,Best Animation,1,Best Animation,2000.0,6.0,23.0,5.0
2,2005,Fantastic Four,tt0120667,PG-13,106,Action|Adventure|Family,5.7,40.0,"A group of astronauts gain superpowers after a cosmic radiation exposure and must use them to oppose the plans of their enemy, Doctor Victor Von Doom.",273203,154700000.0,2005-07-08,1008.0,278.0,1876.0,0,0,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,2005.0,7.0,8.0,5.0
3,2002,Frida,tt0120679,R,123,Biography|Drama|Romance,7.4,61.0,"A biography of artist Frida Kahlo, who channeled the pain of a crippling injury and her tempestuous marriage into her work.",63852,25780000.0,2002-11-22,272.0,126.0,2508.0,2,12,No,No,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,6,"Best Music, Original Score|Best Makeup|Best Performance by an Actress in a Leading Role|Best Art Direction-Set Decoration|Best Costume Design|Best Music, Original Song",1,Best Original Score - Motion Picture,2,Best Original Score - Motion Picture|Best Performance by an Actress in a Motion Picture - Drama,1,Best Make Up/Hair,4,Best Make Up/Hair|Best Performance by an Actress in a Leading Role|Best Performance by an Actor in a Supporting Role|Best Costume Design,0,,2,Outstanding Performance by a Female Actor in a Leading Role|Outstanding Performance by a Male Actor in a Supporting Role,0,,2,Best Supporting Actor|Best Actress,0,,0,,0,,0,,0,,0,,0,,0,,0,,1,Excellence in Period/Fantasy Film,0,,1,Best Costume Design,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,2002.0,11.0,22.0,5.0
4,2001,The Lord of the Rings: The Fellowship of the Ring,tt0120737,PG-13,178,Adventure|Drama|Fantasy,8.8,92.0,A meek Hobbit from the Shire and eight companions set out on a journey to destroy the powerful One Ring and save Middle Earth from the Dark Lord Sauron.,1286275,313840000.0,2001-12-19,5078.0,296.0,204.0,26,67,No,Yes,No,Yes,No,No,No,No,No,Yes,No,No,No,Yes,No,No,13,"Best Cinematography|Best Makeup|Best Music, Original Score|Best Effects, Visual Effects|Best Motion Picture of the Year|Best Performance by an Actor in a Supporting Role|Best Achievement in Direct...",0,,4,Best Motion Picture - Drama|Best Director - Motion Picture|Best Original Score - Motion Picture|Best Original Song - Motion Picture,5,|Best Film|Best Achievement in Special Visual Effects|Best Make Up/Hair|,14,|Best Film|Best Achievement in Special Visual Effects|Best Make Up/Hair|||Best Feature Film|Best Screenplay - Adapted|Best Performance by an Actor in a Leading Role|Best Cinematography|Best Produc...,1,Outstanding Performance by a Male Actor in a Supporting Role,2,Outstanding Performance by a Male Actor in a Supporting Role|Outstanding Performance by the Cast of a Theatrical Motion Picture,3,Favorite Film Franchise|Best Song|Best Composer,5,Favorite Film Franchise|Best Song|Best Composer|Best Director|Best Picture,0,,1,Outstanding Directorial Achievement in Motion Pictures,0,,1,Outstanding Producer of Theatrical Motion Pictures,0,,1,Period or Fantasy Film,0,,1,Best Screenplay Based on Material Previously Produced or Published,0,,0,,13,"Motion Picture|Best Picture|Best Ensemble|Best Casting|Best Director|Best Writing, Screenplay Based on Material from Another Medium|Best Music, Original Score|Best Production Design|Best Makeup an...",22,"Motion Picture|Best Picture|Best Ensemble|Best Casting|Best Director|Best Writing, Screenplay Based on Material from Another Medium|Best Music, Original Score|Best Production Design|Best Makeup an...",1,Top Ten Films of the Year,8,Top Ten Films of the Year|Best Picture|Best Director|Best Supporting Actor|Best Ensemble|Best Adapted Screenplay|Best Cinematography|Best Original Score,2,Favorite Motion Picture|Favorite Dramatic Motion Picture,3,Favorite Move Fan Following|Favorite Motion Picture|Favorite Dramatic Motion Picture,0,,0,,0,,1,Best Edited Feature Film - Dramatic,0,,0,,0,,1,Best Movie of the Decade,0,,0,,0,,1,Best Director,0,,0,,1,Best Music,2,Best Music|Best Production Design,2001.0,12.0,19.0,3.0
5,2000,Mission: Impossible II,tt0120755,PG-13,123,Action|Adventure|Thriller,6.1,59.0,"A secret agent is sent to Sydney, to find and destroy a genetically modified disease called ""Chimera"".",249988,215400000.0,2000-05-24,1431.0,234.0,2153.0,1,2,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,,0,,0,,0,,0,,0,,0,,1,Best Score,1,Best Score,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,1,Best Titles Sequence,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,2000.0,5.0,24.0,3.0
6,2002,Resident Evil,tt0120804,R,100,Action|Horror|Sci-Fi,6.7,33.0,"A special military unit fights a powerful, out-of-control supercomputer and hundreds of scientists who have mutated into flesh-eating creatures after a laboratory accident.",204545,39530000.0,2002-03-15,1145.0,225.0,355.0,0,0,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,2002.0,3.0,15.0,5.0
7,2000,X-Men,tt0120903,PG-13,104,Action|Adventure|Sci-Fi,7.4,64.0,Two mutants come to a private academy for their kind whose resident superhero team must oppose a terrorist organization with similar powers.,468351,157300000.0,2000-07-14,1406.0,289.0,655.0,0,3,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,1,Excellence in Contemporary Film,0,,2,Best Makeup and Hairstyling|Best Visual Effects,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,2000.0,7.0,14.0,5.0


x = 1
