In [1]:
import numpy as np
import pandas as pd
import pandas_profiling
import pickle 
import graphviz
import matplotlib.pyplot as plt
from sklearn import tree 
from sklearn.model_selection import train_test_split 
import time
pd.set_option("display.max_colwidth", 200)

# Classifiers
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import BernoulliNB, GaussianNB 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Extra
from sklearn.preprocessing import normalize, scale, Normalizer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline

In [2]:
data = pd.read_csv('data/movie_dataset_final.csv')
data.head(3)

Unnamed: 0,Year,Movie,Oscar_winner,Oscar_nominee,Runtime (min),Certificate,Directors,Actors,Metascore,IMDb_rating,...,Golden_Bear_winner,Golden_Bear_nominee,Golden_Lion_winner,Golden_Lion_nominee,PCA_winner,PCA_nominee,NYFCC_winner,NYFCC_nominee,OFCS_winner,OFCS_nominee
0,1999,Fight Club,0,0,139,R(A),David Fincher,"['Brad Pitt', 'Edward Norton', 'Meat Loaf', 'Zach Grenier']",66,8.8,...,0,0,0,0,0,0,0,0,0,1
1,1999,The Matrix,0,0,136,PG,Lana Wachowski Lilly Wachowski,"['Keanu Reeves', 'Laurence Fishburne', 'Carrie-Anne Moss', 'Hugo Weaving']",73,8.7,...,0,0,0,0,0,0,0,0,0,0
2,1999,The Green Mile,0,1,189,R(A),Frank Darabont,"['Tom Hanks', 'Michael Clarke Duncan', 'David Morse', 'Bonnie Hunt']",61,8.6,...,0,0,0,0,0,0,0,0,0,0


In [3]:
#x = data.drop(['Certificate','Directors','Actors','Genre'], axis=1)
#y = data['Oscar_winner']

In [4]:
#X_train_best_picture, X_test_best_picture, y_train_best_picture, y_test_best_picture = train_test_split(x, y, test_size=0.25)

In [5]:
train = data[data['Year'] < 2015]
test = data[data['Year'] >= 2015]

In [6]:
movie_name = np.array(test["Movie"])
year = np.array(test["Year"])
oscar_w = np.array(test["Oscar_winner"])
oscar_n = np.array(test["Oscar_nominee"])

In [7]:
#split 
features = [ 'Runtime (min)','Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 
             'Drama', 'Family','Fantasy', 'History', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
             'Sport','Thriller', 'War', 'Western',
             'Budget','Domestic (US) gross','International gross','Worldwide gross',
             'Metascore', 'IMDb_rating', 'IMDb_votes', 'RT_rating', 'RT_review',
             'GG_drama_winner', 'GG_drama_nominee', 'GG_comedy_winner', 'GG_comedy_nominee',
             'BAFTA_winner', 'BAFTA_nominee', 'DGA_winner', 'DGA_nominee',
             'PGA_winner', 'PGA_nominee', 'CCMA_winner', 'CCMA_nominee',
             'Golden_Palm_winner', 'Golden_Palm_nominee', 'Golden_Bear_winner', 'Golden_Bear_nominee',
             'Golden_Lion_winner', 'Golden_Lion_nominee', 'PCA_winner', 'PCA_nominee',
             'NYFCC_winner', 'NYFCC_nominee', 'OFCS_winner', 'OFCS_nominee'] #53 features

In [8]:
X_train_best_picture = train[features]
y_train_best_picture = train['Oscar_winner']
X_test_best_picture = test[features]
y_test_best_picture = test['Oscar_winner']

In [9]:
# transform the data to standardize the values in the data 
preprocessor = ColumnTransformer(transformers=[('scale', StandardScaler(), features)])

In [10]:
def get_scores(model, X_train, y_train, X_test, y_test, show = True):
    """
    Returns train and validation error given a model
    train and validation X and y portions
    Parameters
    ----------
    model: sklearn classifier model
        The sklearn model
    X_train: numpy.ndarray        
        The X part of the train set
    y_train: numpy.ndarray
        The y part of the train set    
    X_valid: numpy.ndarray        
        The X part of the validation set
    y_valid: numpy.ndarray
        The y part of the validation set    
    Returns
    -------
        train_err: float
        test_err: float
            
    """ 
    
    if show: 
        print("Training error:   %.2f" % (1-model.score(X_train, y_train)))
        print("Validation error: %.2f" % (1-model.score(X_test, y_test)))
        print('\n')
    return (1-model.score(X_train, y_train)), (1-model.score(X_test, y_test))

In [11]:
def diff_class_ml(X_train, X_test, y_train, y_test):
    """
    Returns train error, validation error and time given an ensemble of models 
    Parameters
    ----------
    X_train: numpy.ndarray        
        The X part of the train set
    y_train: numpy.ndarray
        The y part of the train set    
    X_valid: numpy.ndarray        
        The X part of the validation set
    y_valid: numpy.ndarray
        The y part of the validation set    
    Returns
    -------
        pd.DataFrame
            
    """ 
    # Lets create an empty dictionary to store all the results
    results_dict = {}
    
    models = {
          'Dummy': DummyClassifier(), 
          'Decision Tree': DecisionTreeClassifier(),
          'Random Forest' : RandomForestClassifier(),
          'Extra Trees' : ExtraTreesClassifier(),
          'K-Nearest Neighbors': KNeighborsClassifier(),
          'Linear SVC' : LinearSVC(dual=False),
          'Logistic Regression': LogisticRegression(), 
          'Bagging' : BaggingClassifier(),
          'XGBoost' : XGBClassifier(),
          'AdaBoost' : AdaBoostClassifier(),
          'Light Gradient Boosting Machine (LGBM)': LGBMClassifier(),
          'Gradient Boosting' : GradientBoostingClassifier(),
          'Gaussian Naive Bayes' : GaussianNB(),
          'Bernoulli Naive Bayes' : BernoulliNB(),
          'Multi Layer Perceptron (Neural Network)' : MLPClassifier(),
          'DecisionTreeRegressor ' : DecisionTreeRegressor(),
          'Random Forest Regressor' : RandomForestRegressor(),
          'K-Nearest Neighbors Regressor' : KNeighborsRegressor()
         }

    for model_name, model in models.items():
        t = time.time()
        #print(model_name, ":")    
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', model)])
        clf.fit(X_train, y_train);
        tr_err, valid_err = get_scores(clf, X_train, y_train, X_test, y_test, show = False)
        elapsed_time = time.time() - t
        results_dict[model_name] = [round(tr_err,3), round(valid_err,3), round(elapsed_time,4)]
        #print("Elapsed time: %.1f s" % elapsed_time)
    
    results_df = pd.DataFrame(results_dict).T
    results_df.columns = ["Train error", "Validation error", "Time in seconds"]
    return results_df

In [12]:
def oscar_predict(X_train, y_train, dataframe, model):
    """
    Shows Classifier and Probability for the Oscar nomination (or other movie)
    
    Parameters
    ----------
    X_train: numpy.ndarray 
        The X training set of the selected catagory 
    y_train: numpy.ndarray
        The y training set the selected catagory 
    dataframe: numpy.ndarray
        The movies that need to be predicted if they won their category 
    model: sklearn classifier model
        The sklearn model such as LogisticRegression()
    
    Returns
    -------
        pd.DataFrame
    
    """
    # Lets create an empty dictionary to store all the results
    results_dict = {}
    
    movies = list(dataframe.index)
    movies_names = list(dataframe["Movie"])
    dataframe_X = dataframe.drop([ 'Certificate','Directors','Actors','Genre'], axis = 1)

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', model)])
    model = clf.fit(X_train, y_train);
        
    for movie, movie_name in zip(movies, movies_names):
        movie_predict = dataframe_X.loc[[movie]]
        classifier = model.predict(movie_predict)
        classifier = classifier[0]
        prob = model.predict_proba(movie_predict)
        prob = prob[0][1]
        results_dict[movie_name] = [classifier, round(prob,8)]
    
    results_df = pd.DataFrame(results_dict).T
    results_df.columns = ["Classifier", "Probability of it Winning"]
    
    return results_df


In [13]:
def normalized_winner(predict_df):
    """
    Normalized the chance of a movie winning an oscar
    
    Parameters
    ----------
    predict_df: pd.DataFram from oscar_predict()
        
    Returns
    -------
        pd.DataFrame
    
    """
    predict_df = predict_df.drop(["Classifier"], axis = 1)
    predict_df["Chance of Winning"] = predict_df["Probability of it Winning"]/(predict_df["Probability of it Winning"].sum())
    predict_df = predict_df.drop(["Probability of it Winning"], axis = 1)
    
    predict_df["Chance of Winning"] = pd.Series(["{0:.2f}%".format(val * 100) for val in predict_df ["Chance of Winning"]], 
                                                 index = predict_df.index)
    
    return predict_df

In [14]:
diff_class_ml(X_train_best_picture, X_test_best_picture, y_train_best_picture, y_test_best_picture)



Unnamed: 0,Train error,Validation error,Time in seconds
Dummy,0.023,0.016,0.011
Decision Tree,0.0,0.018,0.012
Random Forest,0.0,0.012,0.1641
Extra Trees,0.0,0.012,0.1212
K-Nearest Neighbors,0.005,0.012,0.2549
Linear SVC,0.0,0.016,0.0309
Logistic Regression,0.0,0.016,0.0249
Bagging,0.002,0.016,0.0429
XGBoost,0.001,0.014,0.1891
AdaBoost,0.0,0.014,0.1616


In [15]:
guess_2015 = data[data['Year']==2015]
guess_2016 = data[data['Year']==2016]
guess_2017 = data[data['Year']==2017]
guess_2018 = data[data['Year']==2018]
guess_2019 = data[data['Year']==2019]

In [16]:
predict_picture_2015 = oscar_predict(X_train_best_picture, y_train_best_picture, guess_2015, ExtraTreesClassifier())
predict_picture_2015.head(10)











Unnamed: 0,Classifier,Probability of it Winning
Star Wars: Episode VII - The Force Awakens,0.0,0.0
Mad Max: Fury Road,0.0,0.0
The Martian,0.0,0.0
Avengers: Age of Ultron,0.0,0.0
The Revenant,0.0,0.4
Inside Out,0.0,0.0
Jurassic World,0.0,0.0
Ant-Man,0.0,0.0
The Hateful Eight,0.0,0.0
Spotlight,0.0,0.3


In [17]:
normalized_predict_picture_2015 = normalized_winner(predict_picture_2015).sort_values("Chance of Winning",ascending=False)
normalized_predict_picture_2015.head(15)

Unnamed: 0,Chance of Winning
The Big Short,5.95%
The Revenant,47.62%
Spotlight,35.71%
Room,3.57%
Bridge of Spies,3.57%
Carol,2.38%
Sicario,1.19%
Paper Towns,0.00%
The Last Witch Hunter,0.00%
Home,0.00%


In [18]:
predict_picture_2016 = oscar_predict(X_train_best_picture, y_train_best_picture, guess_2016, ExtraTreesClassifier())
predict_picture_2016.head(10)











Unnamed: 0,Classifier,Probability of it Winning
Deadpool,0.0,0.0
Captain America: Civil War,0.0,0.0
Batman v Superman: Dawn of Justice,0.0,0.0
Suicide Squad,0.0,0.0
Doctor Strange,0.0,0.0
Arrival,0.0,0.04
Rogue One,0.0,0.0
La La Land,1.0,0.72
Zootopia,0.0,0.0
Hacksaw Ridge,0.0,0.0


In [19]:
normalized_predict_picture_2016 = normalized_winner(predict_picture_2016).sort_values("Chance of Winning",ascending=False)
normalized_predict_picture_2016.head(15)

Unnamed: 0,Chance of Winning
La La Land,87.80%
Arrival,4.88%
Moonlight,3.66%
Manchester by the Sea,2.44%
Lion,1.22%
Deadpool,0.00%
Neighbors 2: Sorority Rising,0.00%
Hunt for the Wilderpeople,0.00%
Swiss Army Man,0.00%
Gods of Egypt,0.00%


In [20]:
predict_picture_2017 = oscar_predict(X_train_best_picture, y_train_best_picture, guess_2017, ExtraTreesClassifier())
predict_picture_2017.head(10)











Unnamed: 0,Classifier,Probability of it Winning
Logan,0.0,0.0
Thor: Ragnarok,0.0,0.0
Guardians of the Galaxy Vol. 2,0.0,0.0
Star Wars: Episode VIII - The Last Jedi,0.0,0.0
Wonder Woman,0.0,0.0
Dunkirk,0.0,0.02
Spider-Man: Homecoming,0.0,0.0
Get Out,0.0,0.06
It,0.0,0.0
Blade Runner 2049,0.0,0.0


In [26]:
normalized_predict_picture_2017 = normalized_winner(predict_picture_2017).sort_values("Chance of Winning",ascending=False)
normalized_predict_picture_2017.head(10)

Unnamed: 0,Chance of Winning
The Shape of Water,66.67%
Get Out,6.67%
Dunkirk,2.22%
Lady Bird,2.22%
"Three Billboards Outside Ebbing, Missouri",17.78%
Mother!,1.11%
Call Me by Your Name,1.11%
"I, Tonya",1.11%
The Post,1.11%
Logan,0.00%


In [22]:
predict_picture_2018 = oscar_predict(X_train_best_picture, y_train_best_picture, guess_2018, ExtraTreesClassifier())
predict_picture_2018.head(10)











Unnamed: 0,Classifier,Probability of it Winning
Avengers: Infinity War,0.0,0.0
Black Panther,0.0,0.02
Deadpool 2,0.0,0.0
Bohemian Rhapsody,0.0,0.06
A Quiet Place,0.0,0.0
Ready Player One,0.0,0.0
Aquaman,0.0,0.0
Venom,0.0,0.0
Spider-Man: Into the Spider-Verse,0.0,0.0
Green Book,0.0,0.22


In [23]:
normalized_predict_picture_2018 = normalized_winner(predict_picture_2018).sort_values("Chance of Winning",ascending=False)
normalized_predict_picture_2018.head(10)

Unnamed: 0,Chance of Winning
Bohemian Rhapsody,7.59%
Roma,41.77%
Vice,3.80%
Green Book,27.85%
Black Panther,2.53%
BlacKkKlansman,2.53%
The Favourite,11.39%
First Man,1.27%
Mary Poppins Returns,1.27%
Avengers: Infinity War,0.00%


In [24]:
predict_picture_2019 = oscar_predict(X_train_best_picture, y_train_best_picture, guess_2019, ExtraTreesClassifier())
predict_picture_2019.head(10)











Unnamed: 0,Classifier,Probability of it Winning
Joker,0.0,0.06
Avengers: Endgame,0.0,0.0
Once Upon a Time in Hollywood,0.0,0.38
Captain Marvel,0.0,0.0
Parasite,0.0,0.17
Star Wars: Episode IX - The Rise of Skywalker,0.0,0.0
Spider-Man: Far from Home,0.0,0.0
The Irishman,0.0,0.07
1917,1.0,0.56
Knives Out,0.0,0.0


In [25]:
normalized_predict_picture_2019 = normalized_winner(predict_picture_2019).sort_values("Chance of Winning",ascending=False)
normalized_predict_picture_2019.head(10)

Unnamed: 0,Chance of Winning
The Irishman,5.30%
1917,42.42%
Jojo Rabbit,4.55%
Joker,4.55%
Once Upon a Time in Hollywood,28.79%
Parasite,12.88%
Marriage Story,0.76%
The Lion King,0.76%
Annabelle Comes Home,0.00%
The Lego Movie 2: The Second Part,0.00%


https://github.com/MrThomasPin/Oscar-Machine-Learning/blob/master/03_oscar-predictor-ml_2019.ipynb