## 0. Imports and Data Setup

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
import tensorflow as tf

In [2]:
# Load the full 1990-2020 movie table, indexed by IMDb title id (tconst).
movies = pd.read_csv(
    '../data/movies_1990_2020_complete.csv',
    index_col='tconst',
    low_memory=False,
)

# Hold out the 2020 releases as the prediction set (missing values replaced with 0),
# then keep every other row in `movies`.
movies_2020 = movies.loc[movies['startYear'].eq(2020)].fillna(0)
movies = movies.loc[movies['startYear'].ne(2020)]

In [3]:
# Feature columns passed to every scaler/model below; the order is kept exactly as
# originally listed because the scalers are applied to movies_2020[X_vars_all].
X_vars_all = (
    ['runtimeMinutes']
    # genre columns (presumably 0/1 indicators -- confirm against the CSV)
    + ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History',
       'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV',
       'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War']
    # studio / production-company columns
    + ['Warner Bros.', 'Universal Pictures', 'Columbia Pictures',
       'Paramount Pictures', 'Dreamworks Pictures', 'Walt Disney Pictures',
       'Miramax', 'Twentieth Century Fox', 'New Line Cinema',
       'Focus Features', 'Fox Searchlight Pictures', 'Touchstone Pictures',
       'Walt Disney Animation Studios', 'BBC Films', 'TriStar Pictures',
       'New Regency Productions', 'Fox 2000 Pictures',
       'The Weinstein Company', 'Annapurna Pictures',
       'Castle Rock Entertainment']
    # awards-history and release-related columns
    + ['oscars_post_release', 'US_open_WE', 'prev_Oscars', 'rel_USA',
       'prev_Oscar_wins_nm', 'prev_Oscar_noms_nm', 'GG']
    # release-month columns (no 'rel__apr' is listed -- presumably the dropped
    # reference level; confirm against the training notebook)
    + ['rel__aug', 'rel__dec', 'rel__feb', 'rel__jan', 'rel__jul',
       'rel__jun', 'rel__mar', 'rel__may', 'rel__nov', 'rel__oct',
       'rel__sep']
    + ['budget_USD']
)

## 1. Predicting 2021 nominations using RFC (selected based on cross-validation recall and precision)

In [4]:
# IMDb title ids (tconst) used as the ground-truth "nominated" label for 2020 releases.
# The original list contained 'tt10514222' twice; the duplicate is removed so the
# list length equals the number of distinct titles (membership tests are unaffected).
nominees_2020_tconst = [
    'tt6772802', 'tt6723592', 'tt6878306', 'tt13143964', 'tt5198068', 'tt7488208',
    'tt1070874', 'tt7146812', 'tt2948372', 'tt10618286', 'tt4566758', 'tt6048922',
    'tt10272386', 'tt10633456', 'tt10514222', 'tt11161474',
    'tt10539608', 'tt3661394', 'tt10612922', 'tt2222042', 'tt10288566', 'tt12888462',
    'tt11394298', 'tt11416746', 'tt10627584', 'tt10360862', 'tt8580274', 'tt8633462',
    'tt8923484', 'tt9214832',
    'tt9620292', 'tt9770150', 'tt9777644', 'tt8333746', 'tt8521718', 'tt9586294',
    'tt9784798', 'tt5363618', 'tt6571548', 'tt10706602', 'tt6193408',
]

In [5]:
# Load the pickled nomination classifier and its matching scaler.
# `with` closes each file handle after loading (the bare open() calls never closed them).
# NOTE: pickle.load can execute arbitrary code -- only load pickles from a trusted source.
with open('../pickles/rfc_nom.p', 'rb') as fh:
    rfc_nom = pickle.load(fh)
with open('../pickles/stan_nom.p', 'rb') as fh:
    stan_nom = pickle.load(fh)

In [6]:
# Scale the 2020 feature matrix with the scaler loaded for the nomination model.
movies_2020_Z = stan_nom.transform(movies_2020[X_vars_all])

# Two columns per title: the hard class prediction and the probability in
# column 1 of predict_proba. Stacking both into one array gives them a common dtype.
tto_pred_noms = pd.DataFrame(
    np.column_stack([
        rfc_nom.predict(movies_2020_Z),
        rfc_nom.predict_proba(movies_2020_Z)[:, 1],
    ]),
    index=movies_2020.index,
    columns=['RFC_nom', 'RFC_nom_proba'],
)

# Attach descriptive columns by index and rank by predicted probability.
tto_pred_noms = (
    tto_pred_noms
    .merge(
        movies_2020[['primaryTitle', 'genres', 'runtimeMinutes']],
        how='left',
        left_index=True,
        right_index=True,
    )
    .sort_values('RFC_nom_proba', ascending=False)
)

# Ground-truth label: 1 when the title id is in the nominee list, else 0.
tto_pred_noms['nominated'] = tto_pred_noms.index.isin(nominees_2020_tconst).astype(int)

tto_pred_noms.head(10)

Unnamed: 0_level_0,RFC_nom,RFC_nom_proba,primaryTitle,genres,runtimeMinutes,nominated
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt5363618,1.0,0.855553,Sound of Metal,"Drama,Music",120.0,1
tt6772802,1.0,0.842464,Hillbilly Elegy,Drama,116.0,1
tt6723592,1.0,0.81251,Tenet,"Action,Sci-Fi,Thriller",150.0,1
tt7146812,1.0,0.798921,Onward,"Adventure,Animation,Comedy",102.0,1
tt5198068,1.0,0.746361,Wolfwalkers,"Adventure,Animation,Family",103.0,1
tt6878306,1.0,0.701578,News of the World,"Action,Adventure,Drama",118.0,1
tt2948372,1.0,0.699115,Soul,"Adventure,Animation,Comedy",100.0,1
tt13155280,1.0,0.652947,Queen Elizabeth II: End of A Reign,"Biography,Documentary",60.0,0
tt2850386,1.0,0.64934,The Croods: A New Age,"Adventure,Animation,Comedy",95.0,0
tt5807330,1.0,0.643586,Tread,Documentary,89.0,0


#### True positives:

In [7]:
# Nominated titles that the classifier also flagged as nominees.
tto_pred_noms.loc[tto_pred_noms['nominated'].eq(1) & tto_pred_noms['RFC_nom'].eq(1)]

Unnamed: 0_level_0,RFC_nom,RFC_nom_proba,primaryTitle,genres,runtimeMinutes,nominated
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt5363618,1.0,0.855553,Sound of Metal,"Drama,Music",120.0,1
tt6772802,1.0,0.842464,Hillbilly Elegy,Drama,116.0,1
tt6723592,1.0,0.81251,Tenet,"Action,Sci-Fi,Thriller",150.0,1
tt7146812,1.0,0.798921,Onward,"Adventure,Animation,Comedy",102.0,1
tt5198068,1.0,0.746361,Wolfwalkers,"Adventure,Animation,Family",103.0,1
tt6878306,1.0,0.701578,News of the World,"Action,Adventure,Drama",118.0,1
tt2948372,1.0,0.699115,Soul,"Adventure,Animation,Comedy",100.0,1
tt7488208,1.0,0.559584,Over the Moon,"Adventure,Animation,Comedy",95.0,1
tt13143964,1.0,0.540816,Borat Subsequent Moviefilm,Comedy,95.0,1


#### False positives:

In [8]:
# Titles the classifier flagged as nominees that were not in the nominee list.
false_pos_noms = tto_pred_noms[(tto_pred_noms['nominated']==0) & (tto_pred_noms['RFC_nom'] == 1)]

# Show at most 10 of them. The fixed seed keeps the displayed sample stable across
# Restart & Run All, and capping n avoids a ValueError when fewer than 10 rows match.
false_pos_noms.sample(n=min(10, len(false_pos_noms)), random_state=42)

Unnamed: 0_level_0,RFC_nom,RFC_nom_proba,primaryTitle,genres,runtimeMinutes,nominated
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt6587640,1.0,0.51531,Trolls World Tour,"Adventure,Animation,Comedy",90.0,0
tt11433696,1.0,0.511253,Solo Brathuke So Better,Drama,122.0,0
tt12590534,1.0,0.601128,Dear Santa,"Documentary,Family",84.0,0
tt13155280,1.0,0.652947,Queen Elizabeth II: End of A Reign,"Biography,Documentary",60.0,0
tt5807330,1.0,0.643586,Tread,Documentary,89.0,0
tt12801356,1.0,0.584232,MLK/FBI,Documentary,104.0,0
tt12786756,1.0,0.523922,Freak Power: The Ballot or the Bomb,Documentary,94.0,0
tt2850386,1.0,0.64934,The Croods: A New Age,"Adventure,Animation,Comedy",95.0,0
tt10256238,1.0,0.546822,Created Equal: Clarence Thomas in His Own Words,Documentary,116.0,0
tt9893250,1.0,0.582063,I Care a Lot,"Comedy,Crime,Thriller",118.0,0


#### False negatives:

In [9]:
# Nominated titles that the classifier failed to flag.
tto_pred_noms.loc[tto_pred_noms['nominated'].eq(1) & tto_pred_noms['RFC_nom'].eq(0)]

Unnamed: 0_level_0,RFC_nom,RFC_nom_proba,primaryTitle,genres,runtimeMinutes,nominated
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt4566758,0.0,0.39978,Mulan,"Action,Adventure,Drama",115.0,1
tt8333746,0.0,0.39032,Pinocchio,"Drama,Fantasy",125.0,1
tt9214832,0.0,0.333783,Emma.,"Comedy,Drama,Romance",124.0,1
tt11161474,0.0,0.28583,Pieces of a Woman,Drama,126.0,1
tt10618286,0.0,0.284501,Mank,"Biography,Comedy,Drama",131.0,1
tt10633456,0.0,0.277813,Minari,Drama,115.0,1
tt6048922,0.0,0.275088,Greyhound,"Action,Drama,History",91.0,1
tt1070874,0.0,0.248955,The Trial of the Chicago 7,"Drama,History,Thriller",129.0,1
tt10539608,0.0,0.225611,The Midnight Sky,"Drama,Fantasy,Sci-Fi",118.0,1
tt10706602,0.0,0.211454,Collective,Documentary,109.0,1


## 2. Predicting 2021 winners using a Voting Classifier (based on precision-recall results from cross-validation)

In [10]:
# IMDb title ids (tconst) used as the ground-truth "won" label for 2020 releases.
# Titles in the trailing comments are taken from the result tables in this notebook.
winners_2020_tconst = [
    'tt10272386',  # The Father
    'tt10288566',  # Another Round
    'tt10514222',  # Ma Rainey's Black Bottom
    'tt10618286',  # Mank
    'tt10633456',  # Minari
    'tt12888462',  # My Octopus Teacher
    'tt2948372',   # Soul
    'tt5363618',   # Sound of Metal
    'tt6723592',   # Tenet
    'tt9620292',   # Promising Young Woman
    'tt9770150',   # Nomadland
    'tt9784798',   # Judas and the Black Messiah
]

In [11]:
# Load the scaler saved for the winner model; `with` closes the file handle after
# loading (the bare open() call never closed it).
# NOTE: pickle.load can execute arbitrary code -- only load pickles from a trusted source.
with open('../pickles/stan_win.p', 'rb') as fh:
    stan_win = pickle.load(fh)

# Re-scale the 2020 features with this scaler (rebinds movies_2020_Z).
movies_2020_Z = stan_win.transform(movies_2020[X_vars_all])

In [12]:
# Load the pickled voting classifier; `with` closes the file handle after loading
# (the bare open() call never closed it).
# NOTE: pickle.load can execute arbitrary code -- only load pickles from a trusted source.
with open('../pickles/vote_win.p', 'rb') as fh:
    vote_win = pickle.load(fh)

# One hard prediction per 2020 title, indexed by tconst.
tto_pred_wins = pd.DataFrame(index = movies_2020.index, columns = ['vote_win_pred'],
                             data = vote_win.predict(movies_2020_Z))
# Attach descriptive columns by index, then list predicted winners first.
# Reassigning instead of inplace=True keeps the cell a plain rebind of tto_pred_wins.
tto_pred_wins = tto_pred_wins.merge(movies_2020[['primaryTitle', 'genres', 'runtimeMinutes']], how = 'left', left_index = True, right_index = True)
tto_pred_wins = tto_pred_wins.sort_values('vote_win_pred', ascending=False)
# Ground-truth label: 1 when the title id is in the winners list, else 0.
tto_pred_wins['won'] = tto_pred_wins.index.isin(winners_2020_tconst).astype(int)

tto_pred_wins.head(10)

Unnamed: 0_level_0,vote_win_pred,primaryTitle,genres,runtimeMinutes,won
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt2850386,1,The Croods: A New Age,"Adventure,Animation,Comedy",95.0,0
tt5363618,1,Sound of Metal,"Drama,Music",120.0,1
tt13584878,1,I Am Better Because of You,"Drama,Romance",150.0,0
tt4566758,1,Mulan,"Action,Adventure,Drama",115.0,0
tt6673612,1,Dolittle,"Adventure,Comedy,Family",101.0,0
tt6772802,1,Hillbilly Elegy,Drama,116.0,0
tt6878306,1,News of the World,"Action,Adventure,Drama",118.0,0
tt2948372,1,Soul,"Adventure,Animation,Comedy",100.0,1
tt7146812,1,Onward,"Adventure,Animation,Comedy",102.0,0
tt9893250,1,I Care a Lot,"Comedy,Crime,Thriller",118.0,0


#### True positives:

In [13]:
# Winners that the voting classifier also predicted as winners.
tto_pred_wins.loc[tto_pred_wins['won'].eq(1) & tto_pred_wins['vote_win_pred'].eq(1)]

Unnamed: 0_level_0,vote_win_pred,primaryTitle,genres,runtimeMinutes,won
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt5363618,1,Sound of Metal,"Drama,Music",120.0,1
tt2948372,1,Soul,"Adventure,Animation,Comedy",100.0,1
tt6723592,1,Tenet,"Action,Sci-Fi,Thriller",150.0,1


#### False positives:

In [14]:
# Titles the voting classifier predicted as winners that were not in the winners list.
false_pos_wins = tto_pred_wins[(tto_pred_wins['won']==0) & (tto_pred_wins['vote_win_pred'] == 1)]

# Show at most 10 of them. The fixed seed keeps the displayed sample stable across
# Restart & Run All, and capping n avoids a ValueError when fewer than 10 rows match.
false_pos_wins.sample(n=min(10, len(false_pos_wins)), random_state=42)

Unnamed: 0_level_0,vote_win_pred,primaryTitle,genres,runtimeMinutes,won
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt4566758,1,Mulan,"Action,Adventure,Drama",115.0,0
tt3794354,1,Sonic the Hedgehog,"Action,Adventure,Comedy",99.0,0
tt13143964,1,Borat Subsequent Moviefilm,Comedy,95.0,0
tt9893250,1,I Care a Lot,"Comedy,Crime,Thriller",118.0,0
tt6878306,1,News of the World,"Action,Adventure,Drama",118.0,0
tt8521718,1,The United States vs. Billie Holiday,"Biography,Drama,Music",130.0,0
tt6772802,1,Hillbilly Elegy,Drama,116.0,0
tt7146812,1,Onward,"Adventure,Animation,Comedy",102.0,0
tt1634106,1,Bloodshot,"Action,Drama,Sci-Fi",109.0,0
tt2850386,1,The Croods: A New Age,"Adventure,Animation,Comedy",95.0,0


#### False negatives:

In [15]:
# Winners that the voting classifier did not predict.
tto_pred_wins.loc[tto_pred_wins['won'].eq(1) & tto_pred_wins['vote_win_pred'].eq(0)]

Unnamed: 0_level_0,vote_win_pred,primaryTitle,genres,runtimeMinutes,won
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt12888462,0,My Octopus Teacher,Documentary,85.0,1
tt9784798,0,Judas and the Black Messiah,"Biography,Drama,History",126.0,1
tt9770150,0,Nomadland,Drama,108.0,1
tt9620292,0,Promising Young Woman,"Crime,Drama,Thriller",113.0,1
tt10272386,0,The Father,Drama,97.0,1
tt10288566,0,Another Round,"Comedy,Drama",117.0,1
tt10514222,0,Ma Rainey's Black Bottom,"Drama,Music",94.0,1
tt10633456,0,Minari,Drama,115.0,1
tt10618286,0,Mank,"Biography,Comedy,Drama",131.0,1


## 3. Predicting 2021 Best Picture nominees using RFC (based on precision-recall results from cross-validation)

In [16]:
# IMDb title ids (tconst) used as the ground-truth Best Picture nominee label.
# Titles in the trailing comments are taken from the result tables in this notebook.
bp_nominees_2020_tconst = [
    'tt10633456',  # Minari
    'tt1070874',   # The Trial of the Chicago 7
    'tt10272386',  # The Father
    'tt10618286',  # Mank
    'tt9770150',   # Nomadland
    'tt9784798',   # Judas and the Black Messiah
    'tt5363618',   # Sound of Metal
    'tt9620292',   # Promising Young Woman
]

In [17]:
# Load the pickled Best Picture classifier and its matching scaler.
# `with` closes each file handle after loading (the bare open() calls never closed them).
# NOTE: pickle.load can execute arbitrary code -- only load pickles from a trusted source.
with open('../pickles/rfc_bp.p', 'rb') as fh:
    rfc_bp = pickle.load(fh)
with open('../pickles/stan_bp.p', 'rb') as fh:
    stan_bp = pickle.load(fh)

In [18]:
# Scale the 2020 feature matrix with the scaler loaded for the Best Picture model.
movies_2020_Z = stan_bp.transform(movies_2020[X_vars_all])

# NOTE: this rebinds tto_pred_noms, replacing the all-category nomination table
# built in section 1.
# Two columns per title: the hard class prediction and the probability in
# column 1 of predict_proba. Stacking both into one array gives them a common dtype.
tto_pred_noms = pd.DataFrame(
    np.column_stack([
        rfc_bp.predict(movies_2020_Z),
        rfc_bp.predict_proba(movies_2020_Z)[:, 1],
    ]),
    index=movies_2020.index,
    columns=['RFC_bp_nom', 'RFC_bp_proba'],
)

# Attach descriptive columns by index and rank by predicted probability.
tto_pred_noms = (
    tto_pred_noms
    .merge(
        movies_2020[['primaryTitle', 'genres', 'runtimeMinutes']],
        how='left',
        left_index=True,
        right_index=True,
    )
    .sort_values('RFC_bp_proba', ascending=False)
)

# Ground-truth label: 1 when the title id is in the Best Picture nominee list, else 0.
tto_pred_noms['nominated'] = tto_pred_noms.index.isin(bp_nominees_2020_tconst).astype(int)

tto_pred_noms.head(10)

Unnamed: 0_level_0,RFC_bp_nom,RFC_bp_proba,primaryTitle,genres,runtimeMinutes,nominated
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt6772802,1.0,0.669925,Hillbilly Elegy,Drama,116.0,0
tt6723592,1.0,0.55996,Tenet,"Action,Sci-Fi,Thriller",150.0,0
tt7146812,0.0,0.499911,Onward,"Adventure,Animation,Comedy",102.0,0
tt6878306,0.0,0.399946,News of the World,"Action,Adventure,Drama",118.0,0
tt2850386,0.0,0.379942,The Croods: A New Age,"Adventure,Animation,Comedy",95.0,0
tt2948372,0.0,0.369944,Soul,"Adventure,Animation,Comedy",100.0,0
tt13143964,0.0,0.33995,Borat Subsequent Moviefilm,Comedy,95.0,0
tt7504726,0.0,0.3199,The Call of the Wild,"Adventure,Drama,Family",100.0,0
tt5198068,0.0,0.28994,Wolfwalkers,"Adventure,Animation,Family",103.0,0
tt6048922,0.0,0.269846,Greyhound,"Action,Drama,History",91.0,0


#### True positives

In [19]:
# Best Picture nominees that the classifier also flagged.
tto_pred_noms.loc[tto_pred_noms['nominated'].eq(1) & tto_pred_noms['RFC_bp_nom'].eq(1)]

Unnamed: 0_level_0,RFC_bp_nom,RFC_bp_proba,primaryTitle,genres,runtimeMinutes,nominated
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


#### False positives

In [20]:
# Titles flagged by the classifier that are not in the Best Picture nominee list.
tto_pred_noms.loc[tto_pred_noms['nominated'].eq(0) & tto_pred_noms['RFC_bp_nom'].eq(1)]

Unnamed: 0_level_0,RFC_bp_nom,RFC_bp_proba,primaryTitle,genres,runtimeMinutes,nominated
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt6772802,1.0,0.669925,Hillbilly Elegy,Drama,116.0,0
tt6723592,1.0,0.55996,Tenet,"Action,Sci-Fi,Thriller",150.0,0


#### False negatives

In [21]:
# Best Picture nominees that the classifier failed to flag.
tto_pred_noms.loc[tto_pred_noms['nominated'].eq(1) & tto_pred_noms['RFC_bp_nom'].eq(0)]

Unnamed: 0_level_0,RFC_bp_nom,RFC_bp_proba,primaryTitle,genres,runtimeMinutes,nominated
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt10633456,0.0,0.199916,Minari,Drama,115.0,1
tt10272386,0.0,0.159926,The Father,Drama,97.0,1
tt5363618,0.0,0.139942,Sound of Metal,"Drama,Music",120.0,1
tt10618286,0.0,0.119964,Mank,"Biography,Comedy,Drama",131.0,1
tt1070874,0.0,0.099971,The Trial of the Chicago 7,"Drama,History,Thriller",129.0,1
tt9784798,0.0,0.029925,Judas and the Black Messiah,"Biography,Drama,History",126.0,1
tt9620292,0.0,0.009981,Promising Young Woman,"Crime,Drama,Thriller",113.0,1
tt9770150,0.0,0.0,Nomadland,Drama,108.0,1


## 4. Predicting 2021 Best Picture winner

Based on the poor results of the trained classifiers on test data and in cross-validation, I did not attempt to apply any of them to predicting the Best Picture winner.