In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [18]:
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(".."))))

In [20]:
config = {
    "data_dirs": {
    "Picture" : "../data/oscardata_bestpicture.csv",
    "Director" : "data/oscardata_bestdirector.csv",
    "Supporting Actor" : "data/oscardata_acting.csv",
    "Supporting Actress" : "data/oscardata_acting.csv",
    "Lead Actress" : "data/oscardata_acting.csv",
    "Lead Acting" : "data/oscardata_acting.csv",},
    "predictor_dir" : "../configs/predictor_selection.xlsx",
    "predictor_set": "model_1",
    "category_map" : {
        "Picture": ["Picture"],
        "Director": ["Director"],
        "Lead Acting": ["Actor", "Actress"],
        "Supporting Acting": ["Supporting Actor", "Supporting Actress"],
    },
}

In [70]:

class OscarPredictor():
    def __init__(
        self,
        new_season,
        model_category,
        config,
        model_type = "logit",
        split_type="use_all",
    ):

        self.new_season = new_season
        self.cfg = config


        if model_category in ["Picture", "Director", "Supporting Acting", "Lead Acting"]:
            self.model_category = model_category
            self.oscar_categories = self.cfg["category_map"][self.model_category]
        else:
            raise ValueError(f"Category: {model_category} not recognized")
        
        self.split_type = split_type
        self.model_type = model_type

        # Load which predictors to use
        variable_selection = pd.read_excel(self.cfg["predictor_dir"],
                                           sheet_name=self.cfg["predictor_set"])
        self.predictors = variable_selection["Variable"][
            variable_selection[self.model_category].fillna(0) == 1
        ].values.tolist()
    

    def load_train_data(self):
        # Load data
        df_train = pd.read_csv(self.cfg["data_dirs"][self.model_category])
        df_train = df_train[df_train["Category"].isin(self.oscar_categories)]

        return df_train

    def load_new_data(self):
        new_data_dir = self.cfg["data_dirs"][self.model_category].replace("_",f"_{self.new_season}_")
        df_new = pd.read_csv(new_data_dir)
        df_new = df_new[df_new["Category"].isin(self.oscar_categories)]

        return df_new
    
    def prepare_train_data(self, df_train):
        X = df_train[self.predictors]
        y = df_train["Winner"]

        if self.split_type == "use_all":
            X_train = X
            y_train = y
            df_train = df_train
        else : 
            raise ValueError(f"split_type : {self.split_type} not implemented")
            # TODO implement other splits

        return df_train, X_train, y_train
        
    def prepare_pred_data(self, df_new):
        X_pred = df_new[self.predictors]

        return X_pred
    
    def fit_model(self, X, y):
        if self.model_type.lower() in ["logit", "logistic regression"]:
            model = LogisticRegressionCV(max_iter=5000, solver="newton-cg")
        elif self.model_type.lower() in ["rf", "random forest"]:
            model = RandomForestClassifier(n_estimators=250)
        else:
            raise ValueError(f"Model type {self.model_type} not supported")
        
        model.fit(X, y)
        self.model = model

    def predict_new_season(self):
        if not hasattr(self, "model"):
            raise ValueError("Model needs to be trained first")
            
        df_new = self.load_new_data()
        X_pred = self.prepare_pred_data(df_new)
        df_res = self.get_predictions(X_pred, df_new)

        self.df_res_new = df_res

    def train_model(self):
        
        df_train = self.load_train_data()
        df_train, X_train, y_train = self.prepare_train_data(df_train)
        self.fit_model(X_train, y_train)
        df_res = self.get_predictions(X_train, df_train)
        
        # Evaluate results
        self.eval(df_res)
        self.df_res_train = df_res

    def get_predictions(self, X_pred, df_pred):

        df_res = df_pred.copy()
        probs = self.model.predict_proba(X_pred)[:, 1]
        df_res["Prob"] = probs

        # Classify the film with the highest probability of winning as the winner
        df_res["Classification"] = 0
        win_idx = df_res.groupby(["Category", "Year"])["Prob"].idxmax()
        df_res.loc[win_idx, "Classification"] = 1

        out_columns =["Category", "Film", "Nominee", "Year", "Winner", "Prob", "Classification"]
        df_res = df_res[out_columns]
        
        return df_res

    def eval(self, df_res):
        print('AUC', roc_auc_score(df_res['Winner'], df_res['Classification']))
        print('TPR', recall_score(df_res['Winner'], df_res['Classification']))

In [71]:
picture_predictor = OscarPredictor(model_category="Picture", new_season="2022", config=config)

In [72]:
picture_predictor.train_model()

AUC 0.8740573942186846
TPR 0.7903225806451613


In [62]:
picture_predictor.df_res_train

Unnamed: 0,Category,Film,Nominee,Year,Winner,Prob,Classification
0,Picture,The Apartment,The Apartment,1961,1,0.880885,1
1,Picture,The Alamo,The Alamo,1961,0,0.007271,0
2,Picture,Elmer Gantry,Elmer Gantry,1961,0,0.017981,0
3,Picture,Sons and Lovers,Sons and Lovers,1961,0,0.017156,0
4,Picture,The Sundowners,The Sundowners,1961,0,0.011077,0
...,...,...,...,...,...,...,...
365,Picture,King Richard,King Richard,2022,0,0.023476,0
366,Picture,Licorice Pizza,Licorice Pizza,2022,0,0.002475,0
367,Picture,Nightmare Alley,Nightmare Alley,2022,0,0.000508,0
368,Picture,The Power of the Dog,The Power of the Dog,2022,0,0.141122,0


In [56]:
res_df = picture_predictor.df_res_train
res_df.loc[(res_df.Classification==1)&(res_df.Classification!=res_df.Winner)]

Unnamed: 0,Category,Film,Nominee,Year,Winner,Prob,Classification
16,Picture,America America,America America,1964,0,0.084081,1
38,Picture,The Graduate,The Graduate,1968,0,0.874776,1
42,Picture,The Lion in Winter,The Lion in Winter,1969,0,0.805617,1
109,Picture,Reds,Reds,1982,0,0.445199,1
128,Picture,Prizzi's Honor,Prizzi's Honor,1986,0,0.173063,1
146,Picture,Born on the Fourth of July,Born on the Fourth of July,1990,0,0.293565,1
176,Picture,Apollo 13,Apollo 13,1996,0,0.802418,1
193,Picture,Saving Private Ryan,Saving Private Ryan,1999,0,0.902131,1
226,Picture,Brokeback Mountain,Brokeback Mountain,2006,0,0.749135,1
283,Picture,American Hustle,American Hustle,2014,0,0.43221,1


In [57]:
res_df.loc[(res_df.Classification==0)&(res_df.Classification!=res_df.Winner)]

Unnamed: 0,Category,Film,Nominee,Year,Winner,Prob,Classification
15,Picture,Tom Jones,Tom Jones,1964,1,0.062867,0
35,Picture,In the Heat of the Night,In the Heat of the Night,1968,1,0.244963,0
40,Picture,Oliver!,Oliver!,1969,1,0.392396,0
105,Picture,Chariots of Fire,Chariots of Fire,1982,1,0.38955,0
125,Picture,Out of Africa,Out of Africa,1986,1,0.144529,0
145,Picture,Driving Miss Daisy,Driving Miss Daisy,1990,1,0.105496,0
175,Picture,Braveheart,Braveheart,1996,1,0.015797,0
190,Picture,Shakespeare in Love,Shakespeare in Love,1999,1,0.581926,0
225,Picture,Crash,Crash,2006,1,0.736456,0
282,Picture,12 Years a Slave,12 Years a Slave,2014,1,0.332701,0


In [73]:
picture_predictor.predict_new_season()

In [74]:
picture_predictor.df_res_new

Unnamed: 0,Category,Film,Nominee,Year,Winner,Prob,Classification
0,Picture,Belfast,Belfast,2022,0,0.02989,0
1,Picture,CODA,CODA,2022,1,0.191626,1
2,Picture,Don't Look Up,Don't Look Up,2022,0,0.013505,0
3,Picture,Drive My Car,Drive My Car,2022,0,0.000569,0
4,Picture,Dune,Dune,2022,0,0.001196,0
5,Picture,King Richard,King Richard,2022,0,0.023476,0
6,Picture,Licorice Pizza,Licorice Pizza,2022,0,0.002475,0
7,Picture,Nightmare Alley,Nightmare Alley,2022,0,0.000508,0
8,Picture,The Power of the Dog,The Power of the Dog,2022,0,0.141122,0
9,Picture,West Side Story,West Side Story,2022,0,0.003981,0
