In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

#np.random.seed(113) 

## Setup (display format, reading in data and previously trained estimator)

In [2]:
# Lift pandas' display limits so rendered DataFrames show every column and every row.
for _display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_opt, None)

In [3]:
# Custom "gold" colormaps, each ordered from the lightest to the darkest shade
# (hex values picked with help from icolorpalette.com).

# Three-shade palette.
cmap_name_ = 'golds'
colors = [
    '#dbbd5a',
    '#d2ad32',
    '#ae8e25',
]
golds = LinearSegmentedColormap.from_list(cmap_name_, colors, N=100)

# Nine-shade palette: the three shades above, framed by three lighter and three darker ones.
name = 'golds_ext'
colors_ext = (
    ['#f5edd3', '#ecddab', '#e4cd82']
    + colors
    + ['#866d1c', '#5e4c14', '#352b0b']
)
golds_ext = LinearSegmentedColormap.from_list(name, colors_ext, N=100)

In [4]:
# Title table, indexed by the `tconst` column.
titles = pd.read_csv(
    '../data/title_basics_1990_2020.csv',
    index_col='tconst',
)

# Movie detail table, also indexed by `tconst`.
movies = pd.read_csv(
    '../data/movies_1990_2020_with_detail_oscars_complete.csv',
    index_col='tconst',
    low_memory=False,
)

# The 2020 ceremony took place on 2/9/2020, so all movies made that year can be considered
# potential contenders for 2021 (release dates are susceptible to NaNs and thus less reliable
# than startYear). Everything before 2020 is kept as the historical set.
movies_2020 = movies.loc[movies['startYear'] == 2020]
movies = movies.loc[movies['startYear'] < 2020]

# Historical set plus 100,000 rows drawn with replacement from the rows where Oscars == 1
# (fixed random_state, so the draw is repeatable).
_oscar_resample = movies.loc[movies['Oscars'] == 1].sample(n=100000, replace=True, random_state=113)
movies_btstrp = pd.concat([movies, _oscar_resample], axis=0)

In [5]:
# Feature columns passed to the scaler and the classifiers. The order below is the order of the
# columns in the feature matrix, so it must not be changed.
# NOTE(review): the grouping is inferred from the column names only - confirm against the data.

_runtime_col = ['runtimeMinutes']

# Genre names.
_genre_cols = [
    'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History',
    'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV',
    'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War',
]

# Production company names.
_studio_cols = [
    'Warner Bros.', 'Universal Pictures', 'Columbia Pictures', 'Paramount Pictures',
    'Dreamworks Pictures', 'Walt Disney Pictures', 'Miramax', 'Twentieth Century Fox',
    'New Line Cinema', 'Focus Features', 'Fox Searchlight Pictures', 'Touchstone Pictures',
    'Walt Disney Animation Studios', 'BBC Films', 'TriStar Pictures', 'New Regency Productions',
    'Fox 2000 Pictures', 'The Weinstein Company', 'Annapurna Pictures', 'Castle Rock Entertainment',
]

# Release / awards-history columns.
_release_awards_cols = [
    'oscars_post_release', 'US_open_WE', 'prev_Oscars', 'rel_USA',
    'prev_Oscar_wins_nm', 'prev_Oscar_noms_nm', 'GG',
]

# Release-month columns (there is no 'rel__apr' entry).
_release_month_cols = [
    'rel__aug', 'rel__dec', 'rel__feb', 'rel__jan', 'rel__jul', 'rel__jun',
    'rel__mar', 'rel__may', 'rel__nov', 'rel__oct', 'rel__sep',
]

_budget_col = ['budget_USD']

X_vars_all = (
    _runtime_col
    + _genre_cols
    + _studio_cols
    + _release_awards_cols
    + _release_month_cols
    + _budget_col
)

## Scaling independent variables

### Approach #1: scaling data to the historical data on which the classifier was trained

In [6]:
# Standardise the 2020 candidates with a scaler fitted on a training split of the
# (bootstrapped) historical feature matrix.
X = movies_btstrp[X_vars_all]

# A fixed random_state makes the split - and therefore the fitted scaler and every prediction
# below - reproducible across kernel restarts. Without it the split depends on NumPy's global
# RNG, which is not seeded in this notebook.
# NOTE(review): 113 is the seed used elsewhere in this notebook; confirm it matches the split
# used when the pickled classifiers were trained.
X_train, X_test = train_test_split(X, random_state=113)

# Fit on the training split only; X_test is not used for fitting.
stan_t = StandardScaler()
stan_t.fit(X_train)

# Apply the historical mean / standard deviation to the 2020 movies.
movies_2020_Z = stan_t.transform(movies_2020[X_vars_all])

### Approach #2: scaling data to itself (without seeing the historical data on which the classifier was trained)

In [7]:
#stan = StandardScaler()

#movies_2020_Z = stan.fit_transform(movies_2020[X_vars_all])

## Predicting 2021 nominations

In [8]:
# Load the previously trained estimator used for the nomination predictions.
# The `with` block closes the file handle as soon as the object is loaded.
# SECURITY: pickle.load can execute arbitrary code - only load pickle files you created or trust.
with open('../pickles/rfc_nom.p', 'rb') as pickle_file:
    rfc_nom = pickle.load(pickle_file)

In [9]:
# Score every 2020 movie with the nomination estimator: the predicted label and the
# probability taken from column 1 of predict_proba.
nom_labels = rfc_nom.predict(movies_2020_Z)
nom_probas = rfc_nom.predict_proba(movies_2020_Z)[:, 1]

# One row per movie, with title metadata attached by index and rows ordered from the
# highest to the lowest predicted probability.
tto_pred_noms = (
    pd.DataFrame(
        data=np.array([nom_labels, nom_probas]).T,
        index=movies_2020.index,
        columns=['RFC_win_nom', 'RFC_win_nom_proba'],
    )
    .merge(
        titles[['primaryTitle', 'genres', 'runtimeMinutes']],
        how='left',
        left_index=True,
        right_index=True,
    )
    .sort_values('RFC_win_nom_proba', ascending=False)
)

tto_pred_noms.head(10)

Unnamed: 0_level_0,RFC_win_nom,RFC_win_nom_proba,primaryTitle,genres,runtimeMinutes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt13623860,1.0,0.76195,Laura's Choice,Documentary,106
tt13642438,1.0,0.76195,No Turning Back,Documentary,106
tt6772802,1.0,0.74,Hillbilly Elegy,Drama,116
tt7146812,1.0,0.72,Onward,"Adventure,Animation,Comedy",102
tt12239604,1.0,0.72,Paint Until Dawn,Documentary,100
tt13155280,1.0,0.631905,Queen Elizabeth II: End of A Reign,"Biography,Documentary",60
tt5198068,1.0,0.62,Wolfwalkers,"Adventure,Animation,Family",103
tt13430676,1.0,0.587819,Birth,Documentary,102
tt6723592,1.0,0.55,Tenet,"Action,Sci-Fi,Thriller",150
tt2948372,1.0,0.53,Soul,"Adventure,Animation,Comedy",100


In [10]:
# Detail view of the ten movies with the highest predicted nomination probability.
# The cutoff is the 10th-highest probability. The previous code read the 11th row
# (`iloc[10, 1]`), so `>=` always selected at least eleven titles. Movies tied with the
# 10th place are deliberately kept. `nlargest(10).min()` also works when fewer than ten
# movies are present and does not depend on the column position.
nom_top10_cutoff = tto_pred_noms['RFC_win_nom_proba'].nlargest(10).min()
tto_pred_top10_ind = tto_pred_noms[tto_pred_noms['RFC_win_nom_proba'] >= nom_top10_cutoff].index
movies_2020.loc[
    tto_pred_top10_ind,
    ['primaryTitle', 'genres', 'runtimeMinutes', 'summary', 'produced_by', 'rel_date',
     'budget_USD', 'usa_open_we', 'prev_Oscars', 'nominated_for'],
]

Unnamed: 0_level_0,primaryTitle,genres,runtimeMinutes,summary,produced_by,rel_date,budget_USD,usa_open_we,prev_Oscars,nominated_for
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tt13623860,Laura's Choice,Documentary,106.0,Three generations of women navigate a radical ...,"Factor 30 Films, Virgo Productions",12 December 2020,242408.8,,0.0,
tt13642438,No Turning Back,Documentary,106.0,,Not listed,18 December 2020,242408.8,,0.0,
tt6772802,Hillbilly Elegy,Drama,116.0,An urgent phone call pulls a Yale Law student ...,"Imagine Entertainment, Netflix",24 November 2020,242408.8,,41.0,Nominated for 1 Golden Globe.
tt7146812,Onward,"Adventure,Animation,Comedy",102.0,Two elven brothers embark on a quest to bring ...,"Walt Disney Pictures, Pixar Animation Studios",6 March 2020,242408.8,39119861.0,15.0,Nominated for 1 Golden Globe.
tt12239604,Paint Until Dawn,Documentary,100.0,ing is to painting what listening is to politi...,Lumikalai Film,8 June 2020,76444.54,,0.0,
tt13155280,Queen Elizabeth II: End of A Reign,"Biography,Documentary",60.0,Ever wonder what it would be like to be royalt...,Not listed,14 July 2020,242408.8,,0.0,
tt5198068,Wolfwalkers,"Adventure,Animation,Family",103.0,A young apprentice hunter and her father journ...,"Apple Original Films, Cartoon Saloon, Mélusin...",13 November 2020,242408.8,,5.0,Nominated for 1 Golden Globe.
tt13430676,Birth,Documentary,102.0,"Everyone is born, but how many of us understan...",Not listed,7 October 2020,242408.8,,0.0,
tt6723592,Tenet,"Action,Sci-Fi,Thriller",150.0,"Armed with only one word, Tenet, and fighting ...","Warner Bros., Syncopy",3 September 2020,205000000.0,20200000.0,16.0,Nominated for 1 Golden Globe.
tt2948372,Soul,"Adventure,Animation,Comedy",100.0,"After landing the gig of a lifetime, a New Yor...","Walt Disney Pictures, Pixar Animation Studios",25 December 2020,242408.8,,10.0,Nominated for 2 Golden Globes.


## Predicting 2021 wins (any category)

In [11]:
# Load the previously trained estimator used for the win (any category) predictions.
# The `with` block closes the file handle as soon as the object is loaded.
# SECURITY: pickle.load can execute arbitrary code - only load pickle files you created or trust.
with open('../pickles/rfc_win.p', 'rb') as pickle_file:
    rfc_win = pickle.load(pickle_file)

In [12]:
# Score every 2020 movie with the win estimator: the predicted label and the probability
# taken from column 1 of predict_proba.
win_labels = rfc_win.predict(movies_2020_Z)
win_probas = rfc_win.predict_proba(movies_2020_Z)[:, 1]

# One row per movie, with title metadata attached by index and rows ordered from the
# highest to the lowest predicted probability.
tto_pred_wins = (
    pd.DataFrame(
        data=np.array([win_labels, win_probas]).T,
        index=movies_2020.index,
        columns=['RFC_win', 'RFC_win_proba'],
    )
    .merge(
        titles[['primaryTitle', 'genres', 'runtimeMinutes']],
        how='left',
        left_index=True,
        right_index=True,
    )
    .sort_values('RFC_win_proba', ascending=False)
)

tto_pred_wins.head(20)

Unnamed: 0_level_0,RFC_win,RFC_win_proba,primaryTitle,genres,runtimeMinutes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt6772802,0.0,0.42,Hillbilly Elegy,Drama,116
tt6723592,0.0,0.28,Tenet,"Action,Sci-Fi,Thriller",150
tt6878306,0.0,0.21,News of the World,"Action,Adventure,Drama",118
tt7146812,0.0,0.12,Onward,"Adventure,Animation,Comedy",102
tt2948372,0.0,0.1,Soul,"Adventure,Animation,Comedy",100
tt6048922,0.0,0.08,Greyhound,"Action,Drama,History",91
tt9893250,0.0,0.08,I Care a Lot,"Comedy,Crime,Thriller",118
tt1070874,0.0,0.07,The Trial of the Chicago 7,"Drama,History,Thriller",129
tt13143964,0.0,0.06,Borat Subsequent Moviefilm,Comedy,95
tt12489826,0.0,0.06,WMTv17,"Biography,Comedy,Drama",120


In [13]:
# Detail view of the ten movies with the highest predicted win probability.
# The cutoff is the 10th-highest probability. The previous code read the 11th row
# (`iloc[10, 1]`), so `>=` always selected at least eleven titles. Movies tied with the
# 10th place are deliberately kept. `nlargest(10).min()` also works when fewer than ten
# movies are present and does not depend on the column position.
win_top10_cutoff = tto_pred_wins['RFC_win_proba'].nlargest(10).min()
tto_pred_top10_ind = tto_pred_wins[tto_pred_wins['RFC_win_proba'] >= win_top10_cutoff].index
movies_2020.loc[
    tto_pred_top10_ind,
    ['primaryTitle', 'genres', 'runtimeMinutes', 'summary', 'produced_by', 'rel_date',
     'budget_USD', 'usa_open_we', 'prev_Oscars', 'nominated_for'],
]

Unnamed: 0_level_0,primaryTitle,genres,runtimeMinutes,summary,produced_by,rel_date,budget_USD,usa_open_we,prev_Oscars,nominated_for
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tt6772802,Hillbilly Elegy,Drama,116.0,An urgent phone call pulls a Yale Law student ...,"Imagine Entertainment, Netflix",24 November 2020,242408.8,,41.0,Nominated for 1 Golden Globe.
tt6723592,Tenet,"Action,Sci-Fi,Thriller",150.0,"Armed with only one word, Tenet, and fighting ...","Warner Bros., Syncopy",3 September 2020,205000000.0,20200000.0,16.0,Nominated for 1 Golden Globe.
tt6878306,News of the World,"Action,Adventure,Drama",118.0,"A Civil War veteran agrees to deliver a girl, ...","Perfect World Pictures, Playtone, Pretty Pict...",25 December 2020,242408.8,2250430.0,32.0,Nominated for 2 Golden Globes.
tt7146812,Onward,"Adventure,Animation,Comedy",102.0,Two elven brothers embark on a quest to bring ...,"Walt Disney Pictures, Pixar Animation Studios",6 March 2020,242408.8,39119861.0,15.0,Nominated for 1 Golden Globe.
tt2948372,Soul,"Adventure,Animation,Comedy",100.0,"After landing the gig of a lifetime, a New Yor...","Walt Disney Pictures, Pixar Animation Studios",25 December 2020,242408.8,,10.0,Nominated for 2 Golden Globes.
tt6048922,Greyhound,"Action,Drama,History",91.0,veral months after the U.S. entry into World W...,"Sony Pictures Entertainment (SPE), Stage 6 Fi...",10 July 2020,50300000.0,,30.0,
tt9893250,I Care a Lot,"Comedy,Crime,Thriller",118.0,A crooked legal guardian who drains the saving...,"Black Bear Pictures, Crimple Beck",19 February 2021,242408.8,,8.0,Nominated for 1 Golden Globe.
tt1070874,The Trial of the Chicago 7,"Drama,History,Thriller",129.0,The story of 7 people on trial stemming from v...,"Dreamworks Pictures, Amblin Partners, Apertur...",16 October 2020,242408.8,,15.0,
tt13143964,Borat Subsequent Moviefilm,Comedy,95.0,Follow-up film to the 2006 comedy centering on...,"Amazon Studios, Four by Two Films",23 October 2020,242408.8,,29.0,Nominated for 3 Golden Globes.
tt12489826,WMTv17,"Biography,Comedy,Drama",120.0,WMTv17 is a Video Tv Program formed by a singe...,Bongo Movies,2 April 2020,9876589.0,,0.0,


## Predicting 2021 nominations in the Best Picture category

In [14]:
# Load the previously trained estimator used for the Best Picture nomination predictions.
# The `with` block closes the file handle as soon as the object is loaded.
# SECURITY: pickle.load can execute arbitrary code - only load pickle files you created or trust.
with open('../pickles/rfc_bp.p', 'rb') as pickle_file:
    rfc_bp = pickle.load(pickle_file)

In [15]:
# Score every 2020 movie with the Best Picture nomination estimator: the predicted label and
# the probability taken from column 1 of predict_proba.
bp_labels = rfc_bp.predict(movies_2020_Z)
bp_probas = rfc_bp.predict_proba(movies_2020_Z)[:, 1]

# One row per movie, with title metadata attached by index and rows ordered from the
# highest to the lowest predicted probability.
tto_bp_preds = (
    pd.DataFrame(
        data=np.array([bp_labels, bp_probas]).T,
        index=movies_2020.index,
        columns=['RFC_bp', 'RFC_bp_proba'],
    )
    .merge(
        titles[['primaryTitle', 'genres', 'runtimeMinutes']],
        how='left',
        left_index=True,
        right_index=True,
    )
    .sort_values('RFC_bp_proba', ascending=False)
)

tto_bp_preds.head(15)

Unnamed: 0_level_0,RFC_bp,RFC_bp_proba,primaryTitle,genres,runtimeMinutes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt6772802,0.0,0.43,Hillbilly Elegy,Drama,116
tt6723592,0.0,0.4,Tenet,"Action,Sci-Fi,Thriller",150
tt6878306,0.0,0.29,News of the World,"Action,Adventure,Drama",118
tt13143964,0.0,0.19,Borat Subsequent Moviefilm,Comedy,95
tt2948372,0.0,0.18,Soul,"Adventure,Animation,Comedy",100
tt10539608,0.0,0.14,The Midnight Sky,"Drama,Fantasy,Sci-Fi",118
tt10618286,0.0,0.14,Mank,"Biography,Comedy,Drama",131
tt2850386,0.0,0.13,The Croods: A New Age,"Adventure,Animation,Comedy",95
tt7146812,0.0,0.1,Onward,"Adventure,Animation,Comedy",102
tt5198068,0.0,0.1,Wolfwalkers,"Adventure,Animation,Family",103


In [16]:
# Detail view of the movies whose Best Picture nomination probability is strictly above the
# probability found in the 11th row of tto_bp_preds.
# NOTE(review): because the comparison is a strict `>`, movies tied with the 11th row are
# dropped, so fewer than ten titles can be returned - confirm this is intended.
bp_cutoff = tto_bp_preds['RFC_bp_proba'].iloc[10]
above_bp_cutoff = tto_bp_preds['RFC_bp_proba'] > bp_cutoff
tto_bp_preds_top10_ind = tto_bp_preds[above_bp_cutoff].index

movies_2020.loc[
    tto_bp_preds_top10_ind,
    ['primaryTitle', 'genres', 'runtimeMinutes', 'summary', 'produced_by', 'rel_date',
     'budget_USD', 'usa_open_we', 'prev_Oscars', 'nominated_for'],
]

Unnamed: 0_level_0,primaryTitle,genres,runtimeMinutes,summary,produced_by,rel_date,budget_USD,usa_open_we,prev_Oscars,nominated_for
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tt6772802,Hillbilly Elegy,Drama,116.0,An urgent phone call pulls a Yale Law student ...,"Imagine Entertainment, Netflix",24 November 2020,242408.8,,41.0,Nominated for 1 Golden Globe.
tt6723592,Tenet,"Action,Sci-Fi,Thriller",150.0,"Armed with only one word, Tenet, and fighting ...","Warner Bros., Syncopy",3 September 2020,205000000.0,20200000.0,16.0,Nominated for 1 Golden Globe.
tt6878306,News of the World,"Action,Adventure,Drama",118.0,"A Civil War veteran agrees to deliver a girl, ...","Perfect World Pictures, Playtone, Pretty Pict...",25 December 2020,242408.8,2250430.0,32.0,Nominated for 2 Golden Globes.
tt13143964,Borat Subsequent Moviefilm,Comedy,95.0,Follow-up film to the 2006 comedy centering on...,"Amazon Studios, Four by Two Films",23 October 2020,242408.8,,29.0,Nominated for 3 Golden Globes.
tt2948372,Soul,"Adventure,Animation,Comedy",100.0,"After landing the gig of a lifetime, a New Yor...","Walt Disney Pictures, Pixar Animation Studios",25 December 2020,242408.8,,10.0,Nominated for 2 Golden Globes.
tt10539608,The Midnight Sky,"Drama,Fantasy,Sci-Fi",118.0,"This post-apocalyptic tale follows Augustine, ...","Anonymous Content, Netflix, Smokehouse Pictures",23 December 2020,242408.8,,25.0,
tt10618286,Mank,"Biography,Comedy,Drama",131.0,1930's Hollywood is reevaluated through the ey...,Netflix,4 December 2020,242408.8,,33.0,
tt2850386,The Croods: A New Age,"Adventure,Animation,Comedy",95.0,The prehistoric family the Croods are challeng...,DreamWorks Animation,25 November 2020,242408.8,9724200.0,19.0,Nominated for 1 Golden Globe.


## Predicting 2021 winners in the Best Picture category

In [17]:
# Load the previously trained estimator used for the Best Picture winner predictions.
# The `with` block closes the file handle as soon as the object is loaded.
# SECURITY: pickle.load can execute arbitrary code - only load pickle files you created or trust.
with open('../pickles/rfc_bp_win.p', 'rb') as pickle_file:
    rfc_bp_win = pickle.load(pickle_file)

In [18]:
# Score every 2020 movie with the Best Picture winner estimator: the predicted label and the
# probability taken from column 1 of predict_proba.
bp_win_labels = rfc_bp_win.predict(movies_2020_Z)
bp_win_probas = rfc_bp_win.predict_proba(movies_2020_Z)[:, 1]

# One row per movie, with title metadata attached by index and rows ordered from the
# highest to the lowest predicted probability.
# NOTE(review): the label column is called 'RFC_bp' although it holds the output of
# rfc_bp_win - looks like a copy/paste leftover; rename only after checking downstream use.
tto_bp_win_preds = (
    pd.DataFrame(
        data=np.array([bp_win_labels, bp_win_probas]).T,
        index=movies_2020.index,
        columns=['RFC_bp', 'RFC_bp_win_proba'],
    )
    .merge(
        titles[['primaryTitle', 'genres', 'runtimeMinutes']],
        how='left',
        left_index=True,
        right_index=True,
    )
    .sort_values('RFC_bp_win_proba', ascending=False)
)

tto_bp_win_preds.head(10)

Unnamed: 0_level_0,RFC_bp,RFC_bp_win_proba,primaryTitle,genres,runtimeMinutes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt1070874,0.0,0.09,The Trial of the Chicago 7,"Drama,History,Thriller",129
tt9893250,0.0,0.06,I Care a Lot,"Comedy,Crime,Thriller",118
tt6878306,0.0,0.04,News of the World,"Action,Adventure,Drama",118
tt7510346,0.0,0.04,The Turning,"Drama,Horror,Mystery",94
tt7435316,0.0,0.02,The Glorias,"Biography,Drama,History",139
tt2420124,0.0,0.02,Wendy,"Drama,Fantasy",111
tt10243992,0.0,0.01,I'm Your Woman,"Crime,Drama",120
tt12888358,0.0,0.01,John Williams: Live in Vienna,Music,128
tt5198068,0.0,0.01,Wolfwalkers,"Adventure,Animation,Family",103
tt6723592,0.0,0.01,Tenet,"Action,Sci-Fi,Thriller",150


In [19]:
# Detail view of the ten movies with the highest predicted Best Picture win probability.
# The cutoff is the 10th-highest probability. The previous code read the 11th row
# (`iloc[10, 1]`), so `>=` always selected at least eleven titles. Movies tied with the
# 10th place are deliberately kept. `nlargest(10).min()` also works when fewer than ten
# movies are present and does not depend on the column position.
bp_win_top10_cutoff = tto_bp_win_preds['RFC_bp_win_proba'].nlargest(10).min()
tto_bp_win_preds_top10_ind = tto_bp_win_preds[tto_bp_win_preds['RFC_bp_win_proba'] >= bp_win_top10_cutoff].index
movies_2020.loc[
    tto_bp_win_preds_top10_ind,
    ['primaryTitle', 'genres', 'runtimeMinutes', 'summary', 'produced_by', 'rel_date',
     'budget_USD', 'usa_open_we', 'prev_Oscars', 'nominated_for'],
]

Unnamed: 0_level_0,primaryTitle,genres,runtimeMinutes,summary,produced_by,rel_date,budget_USD,usa_open_we,prev_Oscars,nominated_for
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tt1070874,The Trial of the Chicago 7,"Drama,History,Thriller",129.0,The story of 7 people on trial stemming from v...,"Dreamworks Pictures, Amblin Partners, Apertur...",16 October 2020,242408.8,,15.0,
tt9893250,I Care a Lot,"Comedy,Crime,Thriller",118.0,A crooked legal guardian who drains the saving...,"Black Bear Pictures, Crimple Beck",19 February 2021,242408.8,,8.0,Nominated for 1 Golden Globe.
tt6878306,News of the World,"Action,Adventure,Drama",118.0,"A Civil War veteran agrees to deliver a girl, ...","Perfect World Pictures, Playtone, Pretty Pict...",25 December 2020,242408.8,2250430.0,32.0,Nominated for 2 Golden Globes.
tt7510346,The Turning,"Drama,Horror,Mystery",94.0,A young governess is hired to look after an or...,"Dreamworks Pictures, Reliance Entertainment, ...",24 January 2020,14000000.0,6950045.0,2.0,
tt7435316,The Glorias,"Biography,Drama,History",139.0,The story of feminist icon Gloria Steinem's it...,"Artemis Rising Foundation, Page Fifty-Four Pi...",30 September 2020,242408.8,,21.0,
tt2420124,Wendy,"Drama,Fantasy",111.0,Lost on a mysterious island where aging and ti...,"Fox Searchlight Pictures, TSG Entertainment, ...",17 April 2020,6000000.0,26214.0,2.0,
tt10243992,I'm Your Woman,"Crime,Drama",120.0,"In this 1970s set crime drama, a woman is forc...","Big Indie Pictures, Original Headquarters, Sc...",11 December 2020,242408.8,,0.0,
tt12888358,John Williams: Live in Vienna,Music,128.0,Explore the live recordings of the Hollywood l...,"Bernhard Fleischer Moving Images, ServusTV, D...",2 October 2020,242408.8,,21.0,
tt5198068,Wolfwalkers,"Adventure,Animation,Family",103.0,A young apprentice hunter and her father journ...,"Apple Original Films, Cartoon Saloon, Mélusin...",13 November 2020,242408.8,,5.0,Nominated for 1 Golden Globe.
tt6723592,Tenet,"Action,Sci-Fi,Thriller",150.0,"Armed with only one word, Tenet, and fighting ...","Warner Bros., Syncopy",3 September 2020,205000000.0,20200000.0,16.0,Nominated for 1 Golden Globe.


In [20]:
# Copy the detail columns of the Best Picture winner shortlist to the system clipboard
# (excel=True is passed so the data is written in a format suited to pasting into Excel).
_shortlist_cols = [
    'primaryTitle', 'genres', 'runtimeMinutes', 'summary', 'produced_by',
    'rel_date', 'budget_USD', 'usa_open_we', 'prev_Oscars', 'nominated_for',
]
temp = movies_2020.loc[tto_bp_win_preds_top10_ind, _shortlist_cols]
temp.to_clipboard(excel=True)