In [1]:
#import libraries
import pandas as pd
from ast import literal_eval
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report, recall_score, precision_score
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

Since we will be trying a number of different models, we decided to create a function that will store all of the outcomes of interest in a dictionary. The function below does an 80/20 train/test split. We also fit all of our models using a 50/50 split, but had better accuracy when using 80/20.

In [5]:
def evaluate_model(model, predictors, response, cv=False, params=None,
                   test_size=0.2, random_state=9001, label_names=None):
    """
    evaluate_model()
    
    Fits a one-vs-rest classifier on a train/test split and collects its scores.
    
    -splits the predictors & response variables into train and test sets
     (a single shuffled split, so predictor and response rows stay aligned)
    -if specified, this function will use cross-validation to determine the optimal parameters for a given model
    -creates a dictionary of model outcomes that are of interest
    
    inputs:
        -model: a model object to be fitted
        -predictors: an array, series, or dataframe of predictor variable(s)
        -response: an array or series of the response variable
        -cv: whether or not to cross-validate the model's parameters (default=False)
        -params: if cv=True, params are required to indicate what parameters to optimize in the given model (default=None)
        -test_size: fraction of the rows held out for testing (default=0.2)
        -random_state: seed used for the train/test split (default=9001)
        -label_names: class names shown in the classification reports; when None,
         the notebook-level `target_names` list is used (default=None)
        
    outputs:
        -a results dictionary containing the following:
            -'fitted_model': the fitted OneVsRestClassifier
            -'train_yhat' / 'test_yhat': predictions on the train and test sets
            -'train_recall_score' / 'test_recall_score': weighted-average recall
            -'train_precision_score' / 'test_precision_score': weighted-average precision
            -'train_classification_report' / 'test_classification_report': text reports
    
    raises:
        -ValueError: if cv=True but no params grid was given
    
    """
    # Fail early with a clear message instead of an opaque error inside GridSearchCV.
    if cv and params is None:
        raise ValueError("params must be provided when cv=True")
    
    # Resolve the report labels before the (potentially slow) fit so that a
    # missing notebook-level `target_names` surfaces immediately.
    class_names = target_names if label_names is None else label_names
    
    # Splitting both arrays in one call guarantees they receive the same row
    # permutation and raises if their lengths differ.
    train_x, test_x, train_y, test_y = train_test_split(
        predictors, response, test_size=test_size, random_state=random_state)
    
    if cv:
        # The search is nested inside OneVsRestClassifier below, so parameters
        # are tuned separately for each label's binary classifier.
        model = GridSearchCV(model, params, scoring=make_scorer(f1_score, average='micro'))
    
    classif = OneVsRestClassifier(model)
    classif.fit(train_x, train_y)
    
    train_yhat = classif.predict(train_x)
    test_yhat = classif.predict(test_x)
    
    results = {}
    results['fitted_model'] = classif
    
    results['train_yhat'] = train_yhat
    results['test_yhat'] = test_yhat
    
    results['train_recall_score'] = recall_score(train_y, train_yhat, average='weighted')
    results['test_recall_score'] = recall_score(test_y, test_yhat, average='weighted')
    
    results['train_precision_score'] = precision_score(train_y, train_yhat, average='weighted')
    results['test_precision_score'] = precision_score(test_y, test_yhat, average='weighted')
    
    results['train_classification_report'] = classification_report(train_y, train_yhat, target_names=class_names)
    results['test_classification_report'] = classification_report(test_y, test_yhat, target_names=class_names)
    
    return results

When we created the [MultiLabel Binarizer](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html#sklearn.preprocessing.MultiLabelBinarizer) array in a previous notebook, it created 18 classes corresponding to the 18 genres found in our dataset. These classes were labeled as class 0 through class 17, sorted in numeric order of the genre id. In order to improve readability of our report, we will set the target_names to be the name of the genre, rather than the id. The target names need to be in the same order as the MultiLabel Binarizer, so we will do two steps:

    1: Sort the keys of the id_to_genre dictionary in ascending numeric order
    2: Use the sorted keys against the id_to_genre dictionary to create the ordered target_names

In [6]:
import json

# Lookup from genre id to genre name. The keys are treated as strings that
# hold integer ids by the cell that builds target_names.
# A context manager is used so the file handle is closed after reading.
with open('data/id_to_genre.json') as genre_file:
    id_to_genre = json.load(genre_file)

In [7]:
# Genre ids in ascending numeric order (the dictionary keys are strings, so
# they are converted to int before sorting).
keys = sorted(map(int, id_to_genre))

# Genre names in that same order, for use as classification-report labels.
target_names = [id_to_genre[str(genre_id)] for genre_id in keys]

Next we will load in our arrays to be used as predictor variables.

In [8]:
# Bag-of-words predictor arrays: TMDB plots, IMDB plots, and both combined.
tmdb_bow, imdb_bow, combined_bow = (
    np.load(bow_path)
    for bow_path in ('data/tmdb_bow.npy',
                     'data/imdb_bow.npy',
                     'data/combined_bow.npy')
)

Our Words2Vector arrays require 2 additional steps in order to be used as predictors:

    1: They need to be converted to an array of lists (they are currently an array of arrays, which is incompatible with the structure of the response variable, which is also an array of lists).
    2: The values need to be scaled to lie between 0 and 1, most likely because the Naive-Bayes model (MultinomialNB) does not accept negative feature values (todo - there was an error when they were negative; I will have to re-run and confirm what caused the error).

In [9]:
# Load each word-vector array and pass every 1-D slice along axis 0 through
# list(); np.apply_along_axis reassembles the returned lists into an ndarray.
# NOTE(review): the result is still an ndarray - confirm this conversion is
# what the downstream models actually need.
tmdb_w2v_mean = np.apply_along_axis(list, 0, np.load('data/tmdb_w2v_mean.npy'))
imdb_w2v_mean = np.apply_along_axis(list, 0, np.load('data/imdb_w2v_mean.npy'))
combined_w2v_mean = np.apply_along_axis(list, 0, np.load('data/combined_w2v_mean.npy'))

In [10]:
from sklearn.preprocessing import MinMaxScaler

# One scaler object is refitted for each array in turn, so every array is
# rescaled using its own per-feature minimum and maximum. After this cell
# `scale` holds the fit for combined_w2v_mean.
# NOTE(review): the scaler is fitted on the full arrays here, before
# evaluate_model() makes its train/test split - confirm that is acceptable.
scale = MinMaxScaler()

tmdb_w2v_mean = scale.fit_transform(tmdb_w2v_mean)
imdb_w2v_mean = scale.fit_transform(imdb_w2v_mean)
combined_w2v_mean = scale.fit_transform(combined_w2v_mean)

In [12]:
# Binarized label arrays. binary_tmdb is the response passed to
# evaluate_model() further down; binary_imdb is loaded alongside it.
binary_tmdb, binary_imdb = map(
    np.load, ('data/binary_tmdb.npy', 'data/binary_imdb.npy'))

We now have all of the parameters necessary to run our function. We will create models using 6 different predictors:

    1: Bag of words using the TMDB plot
    2: Bag of words using the IMDB plot
    3: Bag of words using the combined plots from both sources
    4: Words2Vectors using the TMDB plot
    5: Words2Vectors using the IMDB plot
    6: Words2Vectors using the combined plots from both sources
    

We will use these predictors to create 3 classification models to predict movie genres:

    1: Naive-Bayes, with a cross-validated smoothing parameter
    2: Stochastic Gradient Descent, with a cross-validated regularization multiplier.
    3: Support Vector machines, with a cross-validated penalty parameter of the error term.
    
    
This will result in 18 total models being created. We will store the results of each model in a dictionary, which will allow us to identify the best-performing models.    

In [13]:
# Candidate classifiers, each paired with the parameter grid that is handed to
# GridSearchCV inside evaluate_model() when cv=True.
modelDict = {'Naive-Bayes':{'model':MultinomialNB(),
                           'params':{'alpha':[0.01,0.1,1.0]}},
            
            # n_iter was removed from SGDClassifier in newer scikit-learn releases;
            # max_iter=5 with tol=None runs the same fixed 5 passes over the data.
            'SGD':{'model':SGDClassifier(loss='hinge',penalty='l2',max_iter=5,tol=None,random_state=9001),
                   'params':{'alpha':[0.01,0.1,1.0]}},
            
            'SVC':{'model':SVC(class_weight='balanced', kernel='linear'),
                   'params':{'C':[0.01,0.1,1.0]}}
           }

# Predictor arrays keyed by the name used when labelling each result.
predictorDict = {
                 'tmdb_bow':tmdb_bow,
                 'imdb_bow':imdb_bow,
                 'combined_bow':combined_bow,
                 'tmdb_w2v_mean':tmdb_w2v_mean,
                 'imdb_w2v_mean':imdb_w2v_mean,
                 'combined_w2v_mean':combined_w2v_mean
                }

In [14]:
# Fit every model on every predictor set and store each outcome under a
# "<model>-<predictor>" key. The response is always binary_tmdb.
resultsDict = {}
for model, model_spec in modelDict.items():
    for predictor, predictor_array in predictorDict.items():
        run_name = '{0}-{1}'.format(model, predictor)
        resultsDict[run_name] = evaluate_model(
            model=model_spec['model'],
            predictors=predictor_array,
            response=binary_tmdb,
            cv=True,
            params=model_spec['params']
        )
        

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

This next cell will be removed when we submit our final report. For now it temporarily stores the results of the models, so that we do not have to re-run the cell above.

In [15]:
#hacky code to temporarily store results in a json file
import json

# Only these entries are kept in the JSON file; every other entry (fitted
# model, prediction arrays) is written as null.
json_safe_keys = ['train_recall_score','test_recall_score','train_precision_score','test_precision_score',
                  'train_classification_report', 'test_classification_report']

# Build new inner dictionaries instead of editing a shallow copy:
# resultsDict.copy() shares its inner dicts with resultsDict, so blanking
# entries on the copy would also erase the fitted models and predictions
# held in resultsDict itself.
temp = {
    run_name: {
        inner_key: (value if inner_key in json_safe_keys else None)
        for inner_key, value in run_results.items()
    }
    for run_name, run_results in resultsDict.items()
}

with open('data/resultsDict.json','w') as results_file:
    json.dump(temp, results_file)

Next we will make a dataframe of the scores for the sake of readability.

In [16]:
# Names of the summary metrics to keep when tabulating the results.
scores = [
    'train_recall_score',
    'test_recall_score',
    'train_precision_score',
    'test_precision_score',
]

# One column per model/predictor run, one row per key of its results dict.
results_df = pd.DataFrame.from_dict(resultsDict)

In [17]:
# Keep only the rows whose index label is one of the summary metrics.
is_score_row = results_df.index.isin(scores)
results_df = results_df[is_score_row]

In [18]:
# Flip the table so each run is a row and each metric is a column.
results_df = results_df.T

In [19]:
# results_df was built from dictionaries that mix floats with models, arrays
# and strings, so its columns may be object dtype. Cast to float so idxmax
# compares the scores numerically and does not fail on object columns.
best_recall = results_df['test_recall_score'].astype(float).idxmax()
best_precision = results_df['test_precision_score'].astype(float).idxmax()

In [20]:
# Show the score rows for the best-recall and best-precision runs.
best_runs = [best_recall, best_precision]
results_df[results_df.index.isin(best_runs)]

Unnamed: 0,test_precision_score,test_recall_score,train_precision_score,train_recall_score
SVC-combined_bow,0.646031,0.538136,0.959229,0.993214
SVC-combined_w2v_mean,0.571619,0.713983,0.714626,0.888027


We see that Support Vector Machines give the best test scores, both in terms of precision and recall. Combining the plots from both data sources also gives the best results, which intuitively makes sense, as combining the sources produces a more descriptive plot.

Bag-of-words results in the best `test_precision_score`, although the model appears to be overfitting (its train scores are far higher than its test scores). Perhaps decreasing the penalty parameter `C` of the error term (i.e. regularizing more strongly) would result in better test scores, although our existing results are sufficient to move forward without exploring that option.

Words2Vectors results in the best `test_recall_score`, although the test_precision_score is less accurate than the results from the bag-of-words model.

Next we will look at the full results of each model by using the classification report.

In [21]:
# Train report first, then test report, for the run with the best test recall.
print('Results: ', best_recall)
for report_key in ('train_classification_report', 'test_classification_report'):
    print(resultsDict[best_recall][report_key])

Results:  SVC-combined_w2v_mean
                 precision    recall  f1-score   support

      Adventure       0.57      0.74      0.65       137
        Fantasy       0.42      0.67      0.52        82
      Animation       0.82      1.00      0.90        78
          Drama       0.82      0.88      0.85       515
         Horror       0.91      1.00      0.95        41
         Action       0.57      0.90      0.70       134
         Comedy       0.58      0.86      0.69       189
        History       0.72      1.00      0.84        55
        Western       0.79      1.00      0.88        23
       Thriller       0.55      0.79      0.65       188
          Crime       0.64      0.82      0.72       148
    Documentary       0.92      1.00      0.96        83
Science Fiction       0.77      1.00      0.87        87
        Mystery       0.88      1.00      0.94        37
          Music       0.69      0.99      0.82       119
        Romance       0.90      1.00      0.95        9

In [22]:
# Train report first, then test report, for the run with the best test precision.
print('Results: ', best_precision)
for report_key in ('train_classification_report', 'test_classification_report'):
    print(resultsDict[best_precision][report_key])

Results:  SVC-combined_bow
                 precision    recall  f1-score   support

      Adventure       0.94      1.00      0.97       137
        Fantasy       0.95      1.00      0.98        82
      Animation       0.95      1.00      0.97        78
          Drama       1.00      0.98      0.99       515
         Horror       0.98      1.00      0.99        41
         Action       0.94      1.00      0.97       134
         Comedy       0.94      1.00      0.97       189
        History       0.96      1.00      0.98        55
        Western       0.96      1.00      0.98        23
       Thriller       0.94      1.00      0.97       188
          Crime       0.95      1.00      0.98       148
    Documentary       0.98      1.00      0.99        83
Science Fiction       0.98      1.00      0.99        87
        Mystery       1.00      1.00      1.00        37
          Music       0.94      1.00      0.97       119
        Romance       0.98      1.00      0.99        91
   

todo: add analysis for the classification reports, listed above

In [23]:
# Display the full score table: one row per model/predictor run.
results_df

Unnamed: 0,test_precision_score,test_recall_score,train_precision_score,train_recall_score
Naive-Bayes-combined_bow,0.637348,0.381356,0.981919,0.929714
Naive-Bayes-combined_w2v_mean,0.449889,0.290254,0.545585,0.270965
Naive-Bayes-imdb_bow,0.552091,0.345339,0.981623,0.81047
Naive-Bayes-imdb_w2v_mean,0.240398,0.273305,0.490658,0.254484
Naive-Bayes-tmdb_bow,0.588559,0.324153,0.941385,0.689772
Naive-Bayes-tmdb_w2v_mean,0.253401,0.275424,0.45296,0.261755
SGD-combined_bow,0.173559,0.271186,0.160703,0.249636
SGD-combined_w2v_mean,0.613718,0.434322,0.682537,0.457586
SGD-imdb_bow,0.173559,0.271186,0.160703,0.249636
SGD-imdb_w2v_mean,0.404421,0.360169,0.573536,0.400873
