In [12]:
#import libraries
import pandas as pd
from ast import literal_eval
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report, recall_score, precision_score
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

Since we will be trying a number of different models, we decided to create a function that will store all of the outcomes of interest in a dictionary. The function below does an 80/20 train/test split. We also fit all of our models using a 50/50 split, but had better accuracy when using 80/20.

In [2]:
def evaluate_model(model, predictors, response, cv=False, params=None,
                   test_size=0.2, random_state=9001, label_names=None):
    """
    evaluate_model()
    
    -splits the predictors & response variables into train and test sets (in a single call, so rows stay aligned). 
    -wraps the model in a OneVsRestClassifier, fits it on the train set and predicts on both sets
    -creates a dictionary of model outcomes that are of interest
    -if specified, this function will use cross-validation to determine the optimal parameters for a given model
    
    inputs:
        -model: a model object to be fitted
        -predictors: an array, series, or dataframe of predictor variable(s)
        -response: an array or series of the response variable
        -cv: whether or not to cross-validate the model's parameters (default=False)
        -params: if cv=True, params are required to indicate what parameters to optimize in the given model (default=None)
        -test_size: share of the rows held out as the test set (default=0.2)
        -random_state: seed used for the train/test split (default=9001)
        -label_names: class names shown in the classification reports; when None, the
         notebook-level `target_names` list is used (default=None)
        
    outputs:
        -a results dictionary containing the following:
            -fitted_model: the fitted OneVsRestClassifier
            -train_yhat / test_yhat: predictions for the train and test sets
            -train_recall_score / test_recall_score: weighted-average recall
            -train_precision_score / test_precision_score: weighted-average precision
            -train_classification_report / test_classification_report: text reports
    
    raises:
        -ValueError if cv=True but no params were given
    
    """
    # Fail before any expensive work instead of deep inside the grid search.
    if cv and params is None:
        raise ValueError("params must be provided when cv=True")
    
    if label_names is None:
        label_names = target_names  # notebook-level list, loaded in a later cell
    
    results = {}
    
    # One call splits both arrays with the same shuffle, and raises if their lengths differ.
    train_x, test_x, train_y, test_y = train_test_split(
        predictors, response, test_size=test_size, random_state=random_state)
    
    if cv:
        # The grid search sits inside the one-vs-rest wrapper, so parameters are tuned per label.
        model = GridSearchCV(model, params, scoring=make_scorer(f1_score, average='micro'))
    
    classif = OneVsRestClassifier(model)
    classif.fit(train_x, train_y)
    
    train_yhat = classif.predict(train_x)
    test_yhat = classif.predict(test_x)
    
    results['fitted_model'] = classif
    
    results['train_yhat'] = train_yhat
    results['test_yhat'] = test_yhat
    
    results['train_recall_score'] = recall_score(train_y, train_yhat, average='weighted')
    results['test_recall_score'] = recall_score(test_y, test_yhat, average='weighted')
    
    results['train_precision_score'] = precision_score(train_y, train_yhat, average='weighted')
    results['test_precision_score'] = precision_score(test_y, test_yhat, average='weighted')
    
    results['train_classification_report'] = classification_report(train_y, train_yhat, target_names=label_names)
    results['test_classification_report'] = classification_report(test_y, test_yhat, target_names=label_names)
    
    return results

When we created the [MultiLabel Binarizer](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html#sklearn.preprocessing.MultiLabelBinarizer) array in a previous notebook, it created 18 classes corresponding to the 18 genres found in our dataset. These classes were labeled as class 0 through class 17, sorted in numeric order of the genre id. In order to improve readability of our report, we will set the target_names to be the name of the genre, rather than the id. The target names need to be in the same order as the MultiLabel Binarizer, so we will do two steps:

    1: Sort the keys of the id_to_genre dictionary in ascending numeric order
    2: Use the sorted keys against the id_to_genre dictionary to create the ordered target_names

In [3]:
import json

# Read the genre lookup inside a context manager so the file handle is closed right away.
with open('data/id_to_genre.json') as genre_file:
    id_to_genre = json.load(genre_file)

id_to_genre = {int(key):value for key, value in id_to_genre.items()} #convert string keys to int keys

In [6]:
# Genre names used as class labels in the classification reports.
# Read inside a context manager so the file handle is closed right away.
with open('data/target_names.json') as target_names_file:
    target_names = json.load(target_names_file)['tmdb']

target_names

['Adventure',
 'Fantasy',
 'Animation',
 'Drama',
 'Horror',
 'Action',
 'Comedy',
 'History',
 'Western',
 'Thriller',
 'Crime',
 'Science Fiction',
 'Mystery',
 'Music',
 'Romance',
 'Family',
 'War',
 'TV Movie']

Next we will load in our arrays to be used as predictor variables.

In [8]:
# Bag-of-words predictor arrays, loaded in the order tmdb, imdb, combined.
tmdb_bow, imdb_bow, combined_bow = (
    np.load(bow_path)
    for bow_path in ('data/tmdb_bow.npy', 'data/imdb_bow.npy', 'data/combined_bow.npy')
)

Our Word2Vector and Docs2Vector arrays require 2 additional steps in order to be used as predictors:

    1: They need to be converted to an array of lists (they are currently an array of arrays, which is incompatible with the structure of the response variable, which is also an array of lists).
    2: The values need to be scaled to lie between 0 and 1, because an error was raised when the predictors contained negative values (todo - re-run to confirm which model caused the error).

In [14]:
def _load_listified(npy_path):
    """Load a saved array and pass each 1-D slice along axis 0 through list()."""
    return np.apply_along_axis(list, 0, np.load(npy_path))


# Word2Vec predictors
tmdb_w2v_mean = _load_listified('data/tmdb_w2v_mean.npy')
imdb_w2v_mean = _load_listified('data/imdb_w2v_mean.npy')
combined_w2v_mean = _load_listified('data/combined_w2v_mean.npy')

# Doc2Vec predictors
tmdb_doc_vec = _load_listified('data/tmdb_doc_vec.npy')
imdb_doc_vec = _load_listified('data/imdb_doc_vec.npy')
combined_doc_vec = _load_listified('data/combined_doc_vec.npy')

In [30]:
from sklearn.preprocessing import MinMaxScaler

# One scaler object is re-fitted on each array in turn; every array is rescaled
# using its own fit, and `scale` ends up fitted on the last array.
scale = MinMaxScaler()

#word2vec scaling
tmdb_w2v_mean = scale.fit_transform(tmdb_w2v_mean)
imdb_w2v_mean = scale.fit_transform(imdb_w2v_mean)
combined_w2v_mean = scale.fit_transform(combined_w2v_mean)

#doc2vec scaling
tmdb_doc_vec = scale.fit_transform(tmdb_doc_vec)
imdb_doc_vec = scale.fit_transform(imdb_doc_vec)
combined_doc_vec = scale.fit_transform(combined_doc_vec)

Load in response variable

In [31]:
# Response variable; passed to evaluate_model() as `response` in the training loop below.
# NOTE(review): presumably the MultiLabelBinarizer genre array built in a previous notebook - confirm.
binary_tmdb = np.load('data/binary_tmdb.npy')

We now have all of the parameters necessary to run our function. We will create models using 9 different predictors:

    1: Bag of words using the TMDB plot
    2: Bag of words using the IMDB plot
    3: Bag of words using the combined plots from both sources
    4: Word2Vectors using the TMDB plot
    5: Word2Vectors using the IMDB plot
    6: Word2Vectors using the combined plots from both sources
    7: Doc2Vectors using the TMDB plot
    8: Doc2Vectors using the IMDB plot
    9: Doc2Vectors using the combined plots from both sources
    

We will use these predictors to create 3 classification models to predict movie genres:

    1: Naive-Bayes, with a cross-validated smoothing parameter
    2: Stochastic Gradient Descent, with a cross-validated regularization multiplier.
    3: Support Vector machines, with a cross-validated penalty parameter of the error term.
    
    
This will result in 27 total models being created. We will store the results of each model in a dictionary, which will allow us to identify the best-performing models.    

In [32]:
# Candidate classifiers, each paired with the parameter grid that evaluate_model()
# hands to GridSearchCV when cv=True.
modelDict = {'Naive-Bayes':{'model':MultinomialNB(),
                           'params':{'alpha':[0.01,0.1,1.0]}},
            
            # `n_iter` was deprecated and later removed from SGDClassifier. max_iter=5 with
            # tol=None keeps the same fixed 5 passes over the data (no early stopping).
            'SGD':{'model':SGDClassifier(loss='hinge',penalty='l2',max_iter=5,tol=None,random_state=9001),
                   'params':{'alpha':[0.01,0.1,1.0]}},
            
            'SVC':{'model':SVC(class_weight='balanced', kernel='linear'),
                   'params':{'C':[0.01,0.1,1.0]}}
           }

# Predictor arrays keyed by name; the name becomes part of each key in resultsDict.
predictorDict = dict([
    ('tmdb_bow', tmdb_bow),
    ('imdb_bow', imdb_bow),
    ('combined_bow', combined_bow),
    ('tmdb_w2v_mean', tmdb_w2v_mean),
    ('imdb_w2v_mean', imdb_w2v_mean),
    ('combined_w2v_mean', combined_w2v_mean),
    ('tmdb_doc_vec', tmdb_doc_vec),
    ('imdb_doc_vec', imdb_doc_vec),
    ('combined_doc_vec', combined_doc_vec),
])

In [29]:
# Name of the last predictor in predictorDict (the value the training loop's `predictor`
# variable ends on). Read from the dict so this cell also runs on a fresh kernel,
# before the loop below has defined `predictor`.
list(predictorDict)[-1]

'combined_doc_vec'

sklearn returns a warning when the function above computes weighted-average precision for labels that have no predicted samples. The warning does not affect our metrics, but it outputs repetitive information when run in a loop. The warning that we are choosing to ignore is as follows:

```UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)```

In [53]:
import warnings

# Fit every model/predictor combination; results are keyed as '<model>-<predictor>'.
resultsDict = {}
with warnings.catch_warnings(): #temporarily ignore the warnings described above
    warnings.simplefilter("ignore")
    for model, spec in modelDict.items():
        for predictor, features in predictorDict.items():
            result_key = '{0}-{1}'.format(model,predictor)
            resultsDict[result_key] = evaluate_model(
                model=spec['model'],
                predictors=features,
                response=binary_tmdb,
                cv=True,
                params=spec['params'],
            )


This next cell will be removed when we submit our final report. It is just being used now to temporarily store the results of the models to avoid having to re-run the cell above

In [54]:
import pickle #trying to use pickle to see if the model object can be retained while saving, as opposed to json which drops it

# Write inside a context manager so the file is flushed and closed before it is read back.
with open('data/resultsDict.sav','wb') as results_file:
    pickle.dump(resultsDict, results_file)

In [1]:
#start_here

import pickle

# NOTE(review): unpickling can execute arbitrary code, so only load a resultsDict.sav
# that was written by this notebook.
with open('data/resultsDict.sav','rb') as results_file:
    resultsDict = pickle.load(results_file)

Next we will make a dataframe of the scores for the sake of readability.

In [19]:
# Keys of each results dictionary that hold a numeric score.
scores = [
    'train_recall_score',
    'test_recall_score',
    'train_precision_score',
    'test_precision_score',
]

# One column per fitted model, one row per key of its results dictionary.
results_df = pd.DataFrame.from_dict(resultsDict)

In [20]:
# Keep only the rows that hold one of the four scores.
is_score_row = results_df.index.isin(scores)
results_df = results_df.loc[is_score_row]

In [91]:
# One row per model, one column per score.
# The frame was built from dictionaries that mix models, arrays, strings and floats,
# so the score columns come out as object dtype. Casting to float lets idxmax/min/max/mean
# operate on real numbers.
# NOTE: run this cell once per session - re-running it transposes the frame back.
results_df = results_df.transpose().astype(float)
results_df

Unnamed: 0,test_precision_score,test_recall_score,train_precision_score,train_recall_score
Naive-Bayes-combined_bow,0.725255,0.377953,0.982521,0.926914
Naive-Bayes-combined_doc_vec,0.166339,0.255906,0.163924,0.254321
Naive-Bayes-combined_w2v_mean,0.456782,0.267717,0.583491,0.274074
Naive-Bayes-imdb_bow,0.596821,0.340551,0.948176,0.780741
Naive-Bayes-imdb_doc_vec,0.166339,0.255906,0.163924,0.254321
Naive-Bayes-imdb_w2v_mean,0.178442,0.253937,0.605336,0.262716
Naive-Bayes-tmdb_bow,0.619958,0.301181,0.970122,0.79358
Naive-Bayes-tmdb_doc_vec,0.166339,0.255906,0.163924,0.254321
Naive-Bayes-tmdb_w2v_mean,0.316826,0.261811,0.494266,0.265185
SGD-combined_bow,0.166339,0.255906,0.163924,0.254321


A few of our models have identical scores for all outcomes, as demonstrated by the cell below.

In [33]:
# Models whose four scores all match those of at least one other model.
# duplicated(keep=False) flags every member of a repeated score combination and simply
# yields an empty frame when there are none (pd.concat over no groups would raise).
# Sorting by the scores keeps identical rows next to each other.
has_twin = results_df.duplicated(subset=scores, keep=False)
duplicate_scores = results_df.loc[has_twin].sort_values(by=scores)
duplicate_scores

Unnamed: 0,test_precision_score,test_recall_score,train_precision_score,train_recall_score
Naive-Bayes-combined_doc_vec,0.166339,0.255906,0.163924,0.254321
Naive-Bayes-imdb_doc_vec,0.166339,0.255906,0.163924,0.254321
Naive-Bayes-tmdb_doc_vec,0.166339,0.255906,0.163924,0.254321
SGD-combined_bow,0.166339,0.255906,0.163924,0.254321
SGD-combined_doc_vec,0.166339,0.255906,0.163924,0.254321
SGD-imdb_bow,0.166339,0.255906,0.163924,0.254321
SGD-imdb_doc_vec,0.166339,0.255906,0.163924,0.254321
SGD-tmdb_bow,0.166339,0.255906,0.163924,0.254321
SGD-tmdb_doc_vec,0.166339,0.255906,0.163924,0.254321
SVC-combined_doc_vec,0.186744,0.525591,0.194323,0.532346


There are 2 duplicate values occurring. One occurs in 9 models, and the other occurs in 2. We will explain these one at a time, beginning with the group of 9 models.

In [56]:
# Split the duplicated rows into groups of models that share the same four scores,
# in order of first appearance, instead of slicing at a hard-coded row position.
duplicate_groups = [list(members.index)
                    for _, members in duplicate_scores.groupby(scores, sort=False)]

# The analysis below discusses the first two groups; any further groups stay in duplicate_groups.
group_1 = duplicate_groups[0] if len(duplicate_groups) > 0 else []
group_2 = duplicate_groups[1] if len(duplicate_groups) > 1 else []

The classification report shows the issue occurring in group 1. Instead of printing out 2 reports for each of the 9 models, we will instead write code that proves that all 9 models have identical classification reports.

In [57]:
# Compare every model in group 1 against the first one; nothing is printed when all reports match.
for position, model_name in enumerate(group_1):
    baseline = resultsDict[group_1[0]]
    candidate = resultsDict[model_name]

    if baseline['train_classification_report'] != candidate['train_classification_report']:
        print('Train classification reports do not match between model 0 and model {}'.format(position))

    if baseline['test_classification_report'] != candidate['test_classification_report']:
        print('Test classification reports do not match between model 0 and model {}'.format(position))

Now that we have proven that all 9 models have identical classification reports, we can print a single set of reports to observe what is happening across all groups.

In [48]:
# Train report first, then test report, for the first model of group 1.
for report_key in ('train_classification_report', 'test_classification_report'):
    print(resultsDict[group_1[0]][report_key])

                 precision    recall  f1-score   support

      Adventure       0.00      0.00      0.00       137
        Fantasy       0.00      0.00      0.00        76
      Animation       0.00      0.00      0.00        82
          Drama       0.64      1.00      0.78       515
         Horror       0.00      0.00      0.00        41
         Action       0.00      0.00      0.00       124
         Comedy       0.00      0.00      0.00       189
        History       0.00      0.00      0.00        54
        Western       0.00      0.00      0.00        25
       Thriller       0.00      0.00      0.00       184
          Crime       0.00      0.00      0.00       143
Science Fiction       0.00      0.00      0.00        81
        Mystery       0.00      0.00      0.00        77
          Music       0.00      0.00      0.00        34
        Romance       0.00      0.00      0.00       124
         Family       0.00      0.00      0.00        92
            War       0.00    

Each of these 9 models is being overfit - for both the train and test datasets, each of these models is predicting that 100% of the movies are dramas (and predicting no other genre). This is because of the large number of drama movies in the dataset (this is demonstrated in our EDA notebook). Fine-tuning the penalization parameters may help with these models being overfit, but that exploration is outside the scope of this analysis. We will instead focus on the models that are not being overfit, after first exploring the other group of duplicates.

Again, let's make sure the classification reports are identical between the two models.

In [58]:
# Compare the two models of group 2; nothing is printed when their reports match.
first_results = resultsDict[group_2[0]]
second_results = resultsDict[group_2[1]]

if first_results['train_classification_report'] != second_results['train_classification_report']:
    print('Train classification reports do not match between model 0 and model 1')

if first_results['test_classification_report'] != second_results['test_classification_report']:
    print('Test classification reports do not match between model 0 and model 1')
    

The two reports are identical, so let's take a look.

In [59]:
# Train report first, then test report, for the first model of group 2.
group_2_results = resultsDict[group_2[0]]
for report_key in ('train_classification_report', 'test_classification_report'):
    print(group_2_results[report_key])

                 precision    recall  f1-score   support

      Adventure       0.29      0.72      0.41       137
        Fantasy       0.10      1.00      0.17        76
      Animation       0.10      1.00      0.19        82
          Drama       0.00      0.00      0.00       515
         Horror       0.05      1.00      0.10        41
         Action       0.32      0.73      0.44       124
         Comedy       0.53      0.16      0.24       189
        History       0.14      0.69      0.23        54
        Western       0.11      0.76      0.19        25
       Thriller       0.23      1.00      0.37       184
          Crime       0.18      1.00      0.30       143
Science Fiction       0.28      0.64      0.39        81
        Mystery       0.16      0.77      0.27        77
          Music       0.09      0.76      0.16        34
        Romance       0.45      0.36      0.40       124
         Family       0.24      0.59      0.34        92
            War       0.07    

In [60]:
# Display the names of the models that make up the second group of identical scores.
group_2

['SVC-combined_doc_vec', 'SVC-tmdb_doc_vec']

# Best Models

In [22]:
# Names (index labels) of the models with the highest test recall and test precision.
test_recall = results_df['test_recall_score']
test_precision = results_df['test_precision_score']

best_recall = test_recall.idxmax()
best_precision = test_precision.idxmax()

In [23]:
# Score rows of the two best models.
best_model_names = [best_recall, best_precision]
results_df.loc[results_df.index.isin(best_model_names)]

Unnamed: 0,test_precision_score,test_recall_score,train_precision_score,train_recall_score
Naive-Bayes-combined_bow,0.725255,0.377953,0.982521,0.926914
SVC-combined_w2v_mean,0.558305,0.688976,0.742788,0.893827


todo: update analysis here. support vector machines no longer has the best accuracy

We see that Support Vector Machines result in the best accuracy - both in terms of precision and recall. Combining the plots from both data sources also results in the best accuracy, which intuitively makes sense as combining the sources results in a more descriptive plot.

Bag-of-words results in the best `test_precision_score`, although the model appears to be overfitting. Perhaps increasing the penalty parameter of the error term would result in better test scores, although our existing results are sufficient enough to move forward without exploring that option.

Word2Vec results in the best `test_recall_score`, although its `test_precision_score` is lower than that of the bag-of-words model.

Next we will look at the full results of each model by using the classification report.

In [41]:
# Train and test classification reports of the model with the best test precision.
print('Results: ', best_precision)
best_precision_results = resultsDict[best_precision]
for report_key in ('train_classification_report', 'test_classification_report'):
    print(best_precision_results[report_key])

Results:  Naive-Bayes-combined_bow
                 precision    recall  f1-score   support

      Adventure       0.97      0.88      0.92       137
        Fantasy       1.00      0.72      0.84        76
      Animation       1.00      1.00      1.00        82
          Drama       0.96      0.99      0.98       515
         Horror       1.00      1.00      1.00        41
         Action       0.98      0.88      0.93       124
         Comedy       1.00      0.94      0.97       189
        History       1.00      1.00      1.00        54
        Western       1.00      1.00      1.00        25
       Thriller       0.98      0.85      0.91       184
          Crime       1.00      0.90      0.95       143
Science Fiction       0.99      0.93      0.96        81
        Mystery       0.99      0.99      0.99        77
          Music       1.00      0.82      0.90        34
        Romance       1.00      0.85      0.92       124
         Family       1.00      0.90      0.95      

In [40]:
# Train and test classification reports of the model with the best test recall.
print('Results: ', best_recall)
best_recall_results = resultsDict[best_recall]
for report_key in ('train_classification_report', 'test_classification_report'):
    print(best_recall_results[report_key])

Results:  SVC-combined_w2v_mean
                 precision    recall  f1-score   support

      Adventure       0.58      0.73      0.65       137
        Fantasy       0.38      0.68      0.49        76
      Animation       0.89      1.00      0.94        82
          Drama       0.90      0.86      0.88       515
         Horror       0.93      1.00      0.96        41
         Action       0.56      0.92      0.70       124
         Comedy       0.59      0.91      0.71       189
        History       0.90      1.00      0.95        54
        Western       0.78      1.00      0.88        25
       Thriller       0.61      0.86      0.71       184
          Crime       0.62      0.81      0.70       143
Science Fiction       0.89      1.00      0.94        81
        Mystery       0.73      1.00      0.85        77
          Music       0.92      1.00      0.96        34
        Romance       0.70      0.98      0.82       124
         Family       0.88      1.00      0.94        9

todo: add analysis for the classification reports, listed above

In [64]:
# Rows whose model name contains 'doc_vec'.
doc_vec_models = [name for name in results_df.index if 'doc_vec' in name]
results_df.loc[doc_vec_models]

Unnamed: 0,test_precision_score,test_recall_score,train_precision_score,train_recall_score
Naive-Bayes-combined_doc_vec,0.166339,0.255906,0.163924,0.254321
Naive-Bayes-imdb_doc_vec,0.166339,0.255906,0.163924,0.254321
Naive-Bayes-tmdb_doc_vec,0.166339,0.255906,0.163924,0.254321
SGD-combined_doc_vec,0.166339,0.255906,0.163924,0.254321
SGD-imdb_doc_vec,0.166339,0.255906,0.163924,0.254321
SGD-tmdb_doc_vec,0.166339,0.255906,0.163924,0.254321
SVC-combined_doc_vec,0.186744,0.525591,0.194323,0.532346
SVC-imdb_doc_vec,0.153214,0.340551,0.162445,0.37679
SVC-tmdb_doc_vec,0.186744,0.525591,0.194323,0.532346


All of the doc_vec models have very poor predictions

# Subsetting

Subsetting on the data source:

In [110]:
# Min / max / mean of the test scores for all models trained on each data source.
data_source_results = {}

for group in ['imdb', 'tmdb', 'combined']:
    subset = results_df.filter(like=group, axis=0)
    test_precision = subset['test_precision_score']
    test_recall = subset['test_recall_score']

    data_source_results[group] = {
        'min_test_precision': test_precision.min(),
        'max_test_precision': test_precision.max(),
        'mean_test_precision': test_precision.mean(),
        'min_test_recall': test_recall.min(),
        'max_test_recall': test_recall.max(),
        'mean_test_recall': test_recall.mean(),
    }

In [111]:
# Column layout shared by the summary tables: precision statistics first, then recall.
column_order = [
    'min_test_precision', 'max_test_precision', 'mean_test_precision',
    'min_test_recall', 'max_test_recall', 'mean_test_recall',
]

# One row per data source.
data_source_summary = pd.DataFrame(data_source_results).T
data_source_summary[column_order]

Unnamed: 0,min_test_precision,max_test_precision,mean_test_precision,min_test_recall,max_test_recall,mean_test_recall
combined,0.166339,0.725255,0.4021,0.255906,0.688976,0.395888
imdb,0.153214,0.607838,0.32814,0.253937,0.685039,0.375984
tmdb,0.166339,0.619958,0.345431,0.255906,0.633858,0.374016


In [113]:
# Min / max / mean of the test scores for all models trained on each predictor type.
predictor_results = {}

for group in ['bow', 'w2v', 'doc_vec']:
    subset = results_df.filter(like=group, axis=0)
    test_precision = subset['test_precision_score']
    test_recall = subset['test_recall_score']

    predictor_results[group] = {
        'min_test_precision': test_precision.min(),
        'max_test_precision': test_precision.max(),
        'mean_test_precision': test_precision.mean(),
        'min_test_recall': test_recall.min(),
        'max_test_recall': test_recall.max(),
        'mean_test_recall': test_recall.mean(),
    }

In [114]:
# One row per predictor type, columns in the shared summary order.
predictor_summary = pd.DataFrame(predictor_results).T
predictor_summary[column_order]

Unnamed: 0,min_test_precision,max_test_precision,mean_test_precision,min_test_recall,max_test_recall,mean_test_recall
bow,0.166339,0.725255,0.466946,0.255906,0.551181,0.37336
doc_vec,0.153214,0.186744,0.169415,0.255906,0.525591,0.325241
w2v,0.178442,0.558305,0.43931,0.253937,0.688976,0.447288
