### Imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

### Preparations

In [50]:
#Load in updated datafile until (n-1)th season
acting=pd.read_pickle('acting_dataframe.pkl')
picture=pd.read_pickle('picture_dataframe.pkl')
director=pd.read_pickle('director_dataframe.pkl')
#Data for new to separate Dataframe
acting_new=pd.read_pickle('acting_2020')
picture_new=pd.read_pickle('picture_2020')
director_new=pd.read_pickle('director_2020')

In [5]:
supporting_acting_predictors=['imdb_score','rt_audience_score', 'rt_critic_score','total_oscar_noms', 'Q1_release','Q2_release', 'Q3_release', 'Q4_release', 'best_film_nom',
 'SAG_nom_1','SAG_nom_2', 'SAG_win_1','SAG_win_2', 'BAFTA_nom','critics_choice_nom_1', 'critics_choice_nom_2','SAG_cast_win_1','SAG_cast_win_2','SAG_cast_nom_1','SAG_cast_nom_2',
'BAFTA_win','critics_choice_win_1', 'critics_choice_win_2','GG_supporting_nom','GG_supporting_win','previous_oscar_noms', 'previous_oscar_wins',
 'previous_nominee','previous_winner',  '<25','25-35', '35-45','45-55','55-65','65-75','75<',
'action','biography', 'crime', 'comedy', 'drama', 'horror', 'fantasy', 'sci-fi','mystery', 'music', 'romance', 'history', 'war',
 'thriller','adventure','family','sport','western','G','PG', 'PG-13', 'R']

acting_lead_predictors=['imdb_score', 'rt_audience_score', 'rt_critic_score','total_oscar_noms', 'Q1_release', 'Q2_release', 'Q3_release', 'Q4_release',
 'best_film_nom', 'SAG_nom_1','SAG_nom_2', 'SAG_win_1','SAG_win_2', 'BAFTA_nom', 'critics_choice_nom_1', 'critics_choice_nom_2',
 'SAG_cast_win_1','SAG_cast_win_2','SAG_cast_nom_1','SAG_cast_nom_2',
 'BAFTA_win', 'critics_choice_win_1', 'critics_choice_win_2', 'GG_comedy_lead_nom', 'GG_drama_lead_nom', 'GG_comedy_lead_win', 'GG_drama_lead_win', 'previous_oscar_noms', 'previous_oscar_wins',
 'previous_nominee','previous_winner',   '<25','25-35', '35-45','45-55','55-65','65-75','75<', 'action', 'biography', 'crime', 'comedy', 'drama',
'horror', 'fantasy', 'sci-fi', 'mystery', 'music', 'romance', 'history', 'war', 'thriller', 'adventure', 'family', 'sport', 'western', 'G', 'PG',
 'PG-13','R']

picture_predictors=['imdb_score', 'rt_audience_score', 'rt_critic_score','total_oscar_noms',
'Q1_release', 'Q2_release', 'Q3_release', 'Q4_release',
 'best_dir_nom', 'PGA_nom_1','PGA_nom_2', 'SAG_nom_1', 'SAG_nom_2', 'DGA_nom', 'BAFTA_nom', 'critics_choice_nom_1', 'critics_choice_nom_2', 'PGA_win_1', 'PGA_win_2', 'SAG_win_1', 'SAG_win_2', 'DGA_win', 'BAFTA_win',
 'critics_choice_win_1','critics_choice_win_2', 'GG_comedy_nom', 'GG_drama_nom', 'GG_comedy_win', 'GG_drama_win',
 'action', 'biography',
 'crime', 'comedy', 'drama', 'horror', 'fantasy', 'sci-fi', 'mystery', 'music', 'romance', 'history', 'war', 'thriller',
 'adventure', 'family', 'sport', 'western',
 'G', 'PG', 'PG-13', 'R']

director_predictors=['imdb_score', 'rt_audience_score', 'rt_critic_score','total_oscar_noms','Q1_release', 'Q2_release', 'Q3_release', 'Q4_release', 'best_film_nom', 'DGA_nom', 'BAFTA_nom',
 'critics_choice_nom_1', 'critics_choice_nom_2', 'DGA_win', 'BAFTA_win', 'critics_choice_win_1', 'critics_choice_win_2','gg_win','gg_nom',
'director_previous_oscar_nom', 'director_previous_oscar_win',  'action', 'biography', 'crime', 'comedy', 'drama', 'horror', 'fantasy',
 'sci-fi', 'mystery', 'music', 'romance', 'history', 'war', 'thriller', 'adventure', 'family', 'sport', 'western', 'G', 'PG',
 'PG-13','R']

In [6]:
#Category lists for indexing:
supporting=['Supporting Actor','Supporting Actress']
lead=['Actor','Actress']
directing=['Director']
bestpicture=['Picture']

In [8]:
#picture
X_p = picture[picture_predictors]
y_p = picture['winner']
#director
X_d = director[director_predictors]
y_d = director['winner']
#lead acting
X_l = acting[acting.category.isin(lead)][acting_lead_predictors]
y_l = acting[acting.category.isin(lead)]['winner']
#supporting acting
X_s = acting[acting.category.isin(supporting)][supporting_acting_predictors]
y_s = acting[acting.category.isin(supporting)]['winner']

In [51]:
#Forecast data (new season)
Xf_p = picture_new[picture_predictors]
Xf_d = director_new[director_predictors]
Xf_l = acting_new[acting_new.category.isin(lead)][acting_lead_predictors]
Xf_s= acting_new[acting_new.category.isin(supporting)][supporting_acting_predictors]


## Train & predict

#### Logistic Regression

In [53]:
def whowillwin(model,X_train,y_train,Xf,data,category):
#Makes prediction for the films stored in data. data has to be a dataframe of the right format
#category can be "supporting","lead","directing" or "bestpicture"
    model.fit(X_train,y_train)
    cat_df=data[data.category.isin(category)] 
    

    probs=model.predict_proba(Xf)[:,1]
    cat_df=cat_df.assign(probs=probs)
    y_class=[]
    for year in cat_df['ceremony year'].unique():
        for cat in cat_df['category'].unique():
            maxprob=max(cat_df[(cat_df['ceremony year']==year)&(cat_df['category']==cat)]['probs'])
            szelet=(cat_df[(cat_df['ceremony year']==year)&(cat_df['category']==cat)]['probs']==maxprob).astype(int).values
            y_class=np.concatenate((y_class,szelet))

    cat_df=cat_df.assign(classification=y_class.astype(int))
    
    return cat_df[cat_df['ceremony year']==2020][['category','film','name','ceremony year','probs','classification']]


In [54]:
logCV=LogisticRegressionCV()
winners_LR  = pd.DataFrame(columns = ['category','film','name','ceremony year','probs','classification'])
preds_LR_s=whowillwin(logCV,X_s,y_s,Xf_s,acting_new,supporting)
preds_LR_l=whowillwin(logCV,X_l,y_l,Xf_l,acting_new,lead)
preds_LR_d=whowillwin(logCV,X_d,y_d,Xf_d,director_new,directing)
preds_LR_p=whowillwin(logCV,X_p,y_p,Xf_p,picture_new,bestpicture)

for cat in [preds_LR_s,preds_LR_l,preds_LR_d,preds_LR_p]:
    winners_LR=pd.concat([winners_LR,cat[cat.classification==1]])
winners_LR


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Unnamed: 0,category,film,name,ceremony year,probs,classification
14,Supporting Actor,Once Upon a Time in Hollywood,Brad Pitt,2020,0.94611,1
16,Supporting Actress,Marriage Story,Laura Dern,2020,0.924399,1
3,Actor,Joker,Joaquin Phoenix,2020,0.951379,1
9,Actress,Judy,Renée Zellweger,2020,0.792294,1
2,Director,1917,Sam Mendes,2020,0.92901,1
6,Picture,1917,1917,2020,0.539491,1


#### Random Forest

In [55]:
rfc=RandomForestClassifier(n_estimators=250)

winners_rfc  = pd.DataFrame(columns = ['category','film','name','ceremony year','probs','classification'])
preds_rfc_s=whowillwin(rfc,X_s,y_s,Xf_s,acting_new,supporting)
preds_rfc_l=whowillwin(rfc,X_l,y_l,Xf_l,acting_new,lead)
preds_rfc_d=whowillwin(rfc,X_d,y_d,Xf_d,director_new,directing)
preds_rfc_p=whowillwin(rfc,X_p,y_p,Xf_p,picture_new,bestpicture)

for cat in [preds_rfc_s,preds_rfc_l,preds_rfc_d,preds_rfc_p]:
    winners_rfc=pd.concat([winners_rfc,cat[cat.classification==1]])
winners_rfc

Unnamed: 0,category,film,name,ceremony year,probs,classification
14,Supporting Actor,Once Upon a Time in Hollywood,Brad Pitt,2020,0.756,1
16,Supporting Actress,Marriage Story,Laura Dern,2020,0.788,1
3,Actor,Joker,Joaquin Phoenix,2020,0.704,1
9,Actress,Judy,Renée Zellweger,2020,0.572,1
2,Director,1917,Sam Mendes,2020,0.864,1
6,Picture,1917,1917,2020,0.484,1


#### Support Vector Machines

In [56]:
def whowillwin_SVM(X_train,y_train,Xf,data,category):
#Makes prediction for the films stored in data. data has to be a dataframe of the right format
#category can be "supporting","lead","directing" or "bestpicture"
    param_grid={'C':[0.1,1,10,100,100],'gamma':[1,0.1,0.01,0.001,0.0001]}
    grid=GridSearchCV(SVC(),param_grid,verbose=0)
    grid.fit(X_train,y_train) 
    C=grid.best_params_.get('C')
    gamma=grid.best_params_.get('gamma')
    clf=SVC(probability=True,gamma=gamma,C=C)
    
    clf.fit(X_train,y_train)
    cat_df=data[data.category.isin(category)] 
    probs=clf.predict_proba(Xf)[:,1]
    cat_df=cat_df.assign(probs=probs)
    y_class=[]
    for year in cat_df['ceremony year'].unique():
        for cat in cat_df['category'].unique():
            maxprob=max(cat_df[(cat_df['ceremony year']==year)&(cat_df['category']==cat)]['probs'])
            szelet=(cat_df[(cat_df['ceremony year']==year)&(cat_df['category']==cat)]['probs']==maxprob).astype(int).values
            y_class=np.concatenate((y_class,szelet))

    cat_df=cat_df.assign(classification=y_class.astype(int))
    
    return cat_df[cat_df['ceremony year']==2020][['category','film','name','ceremony year','probs','classification']]


In [57]:
winners_SVM  = pd.DataFrame(columns = ['category','film','name','ceremony year','probs','classification'])
preds_SVM_s=whowillwin_SVM(X_s,y_s,Xf_s,acting_new,supporting)
preds_SVM_l=whowillwin_SVM(X_l,y_l,Xf_l,acting_new,lead)
preds_SVM_d=whowillwin_SVM(X_d,y_d,Xf_d,director_new,directing)
preds_SVM_p=whowillwin_SVM(X_p,y_p,Xf_p,picture_new,bestpicture)

for cat in [preds_SVM_s,preds_SVM_l,preds_SVM_d,preds_SVM_p]:
    winners_SVM=pd.concat([winners_SVM,cat[cat.classification==1]])
winners_SVM

Unnamed: 0,category,film,name,ceremony year,probs,classification
14,Supporting Actor,Once Upon a Time in Hollywood,Brad Pitt,2020,0.686624,1
16,Supporting Actress,Marriage Story,Laura Dern,2020,0.796539,1
3,Actor,Joker,Joaquin Phoenix,2020,0.864938,1
9,Actress,Judy,Renée Zellweger,2020,0.649886,1
2,Director,1917,Sam Mendes,2020,0.936506,1
6,Picture,1917,1917,2020,0.451683,1


In [13]:
acting_new[acting_new['category']=="Actress"]

Unnamed: 0,award,category,film,name,winner,prod year,ceremony year,box_office,budget,country,...,critics_choice_nom_1,critics_choice_nom_2,SAG_win_1,SAG_win_2,SAG_nom_1,SAG_nom_2,SAG_cast_win_1,SAG_cast_win_2,SAG_cast_nom_1,SAG_cast_nom_2
5,Oscar,Actress,Harriet,Cynthia Erivo,,2019,2020,,,,...,0,1,1,0,0,1,1,0,1,0
6,Oscar,Actress,Marriage Story,Scarlett Johansson,,2019,2020,,,,...,0,1,1,0,0,1,1,0,1,0
7,Oscar,Actress,Little Women,Saoirse Ronan,,2019,2020,,,,...,0,1,1,0,1,0,1,0,1,0
8,Oscar,Actress,Bombshell,Charlize Theron,,2019,2020,,,,...,0,1,1,0,0,1,1,0,0,1
9,Oscar,Actress,Judy,Renée Zellweger,,2019,2020,,,,...,0,1,0,1,0,1,1,0,1,0
