In [1]:
#import libraries
import pandas as pd
from ast import literal_eval
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

Since we will be trying a number of different models, we decided to create a function that will store all of the outcomes of interest in a dictionary.

In [14]:
def evaluate_model(model, predictors, response, cv=False, params=None):
    """
    evaluate_model()
    
    -splits the predictors & response variables into train and test sets. 
    -creates a dictionary of model outcomes that are of interest
    -if specified, this function will use cross-validation to determine the optimal parameters for a given model
    
    inputs:
        -model: a model object to be fitted
        -predictors: an array, series, or dataframe of predictor variable(s)
        -response: an array or series of the response variable
        -cv: whether or not to cross-validate the model's parameters (default=False)
        -params: if cv=True, params are required to indicate what parameters to optimize in the given model (default=None)
        
    outputs:
        -a results dictionary containing the following:
            -a fitted model object
    
    """
    results = {}
    train_x, test_x = train_test_split(predictors, test_size=0.5, random_state=9001)
    train_y, test_y = train_test_split(response, test_size=0.5, random_state=9001)
    
    if cv:
        model = GridSearchCV(model, params, scoring=make_scorer(f1_score, average='micro'))
    
    classif = OneVsRestClassifier(model)
    classif.fit(train_x, train_y)
    
    train_yhat = classif.predict(train_x)
    test_yhat = classif.predict(test_x)
    
    results['fitted_model'] = classif
    
    results['train_yhat'] = train_yhat
    results['test_yhat'] = test_yhat
    
    train_y_score = classif.decision_function(train_x)
    test_y_score = classif.decision_function(test_x)
    
    results['train_average_precision'] = average_precision_score(train_y, train_y_score)
    results['test_average_precision'] = average_precision_score(test_y, test_y_score)
    
    results['train_classification_report'] = classification_report(train_y, train_yhat)
    results['test_classification_report'] = classification_report(test_y, test_yhat)
    
    return results

# Bag of Words -TMDB Data

In [3]:
#load in previously saved ndarrays
binary_tmdb = np.load('data/binary_tmdb.npy') #response
binary_imdb = np.load('data/binary_imdb.npy') #response

tmdb_bow = np.load('data/tmdb_bow.npy') #predictor
imdb_bow = np.load('data/tmdb_bow.npy') #predictor

In [23]:
tmdb_bow_svc = evaluate_model(model=SVC(class_weight='balanced'), predictors=tmdb_bow, 
                                   response=binary_tmdb, cv=True, 
                                   params={'kernel':['linear'], 'C':[0.01, 0.1, 1.0]})

  'precision', 'predicted', average, warn_for)


In [6]:
print(tmdb_bow_svc['train_classification_report'])
print(tmdb_bow_svc['test_classification_report'])

             precision    recall  f1-score   support

          0       0.91      1.00      0.95        88
          1       0.92      1.00      0.96        55
          2       0.95      1.00      0.97        52
          3       0.99      0.97      0.98       327
          4       1.00      1.00      1.00        26
          5       0.17      1.00      0.29        84
          6       0.87      1.00      0.93       121
          7       0.08      1.00      0.14        39
          8       1.00      1.00      1.00        11
          9       0.92      1.00      0.96       112
         10       0.19      1.00      0.31        93
         11       0.93      1.00      0.96        52
         12       0.93      1.00      0.96        55
         13       1.00      1.00      1.00        24
         14       0.00      0.00      0.00        77
         15       0.97      1.00      0.98        59
         16       0.95      1.00      0.97        35
         17       1.00      1.00      1.00   

  'precision', 'predicted', average, warn_for)


It appears that overfitting may be occuring. The training precision/recall is 0.76/0.93, respectively, whereas the test results are 0.42/0.51, respectively.

# Bag of Words -TMDB and IMDB Combined

In [24]:
combined_bow =  np.load('data/combined_bow.npy')

In [27]:
combined_bow_svc = evaluate_model(model=SVC(class_weight='balanced'), predictors=combined_bow, 
                                   response=binary_tmdb, cv=True, 
                                   params={'kernel':['linear'], 'C':[0.01, 0.1, 1.0]})

  'precision', 'predicted', average, warn_for)


In [29]:
print(combined_bow_svc['train_classification_report'])
print(combined_bow_svc['test_classification_report'])

             precision    recall  f1-score   support

          0       0.96      1.00      0.98        88
          1       1.00      1.00      1.00        55
          2       0.98      1.00      0.99        52
          3       1.00      0.99      1.00       327
          4       0.96      1.00      0.98        26
          5       1.00      1.00      1.00        84
          6       0.96      1.00      0.98       121
          7       0.08      1.00      0.14        39
          8       1.00      1.00      1.00        11
          9       0.97      1.00      0.98       112
         10       0.96      1.00      0.98        93
         11       0.96      1.00      0.98        52
         12       0.98      1.00      0.99        55
         13       0.05      1.00      0.09        24
         14       0.00      0.00      0.00        77
         15       1.00      1.00      1.00        59
         16       0.92      0.94      0.93        35
         17       1.00      1.00      1.00   

# Words2Vec

One additional step is required before we can use w2v as a predictor variable. The binarizer variable is an array of lists, whereas w2v is an array of arrays. To maintain consistency, we will have to turn w2v into an array of lists.

In [7]:
tmdb_w2v = np.load('data/tmdb_w2v.npy')
imdb_w2v = np.load('data/imdb_w2v.npy')

tmdb_w2v = np.apply_along_axis(lambda x: list(x), 0, tmdb_w2v)
imdb_w2v = np.apply_along_axis(lambda x: list(x), 0, tmdb_w2v)


In [15]:
tmdb_w2v_svc = evaluate_model(model=SVC(class_weight='balanced'), predictors=tmdb_w2v, 
                                   response=binary_tmdb, cv=True, 
                                   params={'kernel':['linear'], 'C':[0.01, 0.1, 1.0]})

  'precision', 'predicted', average, warn_for)


In [22]:
print(tmdb_w2v_svc['train_classification_report'])
print(tmdb_w2v_svc['test_classification_report'])

             precision    recall  f1-score   support

          0       0.59      0.92      0.72        88
          1       0.11      1.00      0.20        55
          2       0.59      0.92      0.72        52
          3       0.90      0.84      0.87       327
          4       0.52      0.88      0.66        26
          5       0.17      1.00      0.29        84
          6       0.56      0.95      0.70       121
          7       0.08      1.00      0.14        39
          8       0.92      1.00      0.96        11
          9       0.62      0.62      0.62       112
         10       0.67      0.71      0.69        93
         11       0.62      0.96      0.76        52
         12       0.11      1.00      0.20        55
         13       0.05      1.00      0.09        24
         14       0.00      0.00      0.00        77
         15       0.63      0.97      0.77        59
         16       0.66      1.00      0.80        35
         17       1.00      1.00      1.00   

# Combined w2v

In [30]:
combined_w2v_mean = np.load('data/combined_w2v_mean.npy')

combined_w2v_mean = np.apply_along_axis(lambda x: list(x), 0, combined_w2v_mean)

In [41]:
combined_w2v_mean_svc = evaluate_model(model=SVC(class_weight='balanced'), predictors=combined_w2v_mean, 
                                   response=binary_tmdb, cv=True, 
                                   params={'kernel':['linear'], 'C':[0.01, 0.1, 1.0]})

  'precision', 'predicted', average, warn_for)


In [44]:
print(combined_w2v_mean_svc['train_classification_report'])
print(combined_w2v_mean_svc['test_classification_report'])

             precision    recall  f1-score   support

          0       0.62      0.91      0.74        88
          1       0.11      1.00      0.20        55
          2       0.53      0.94      0.68        52
          3       0.89      0.87      0.88       327
          4       0.48      1.00      0.65        26
          5       0.45      0.73      0.55        84
          6       0.57      0.88      0.69       121
          7       0.08      1.00      0.14        39
          8       1.00      1.00      1.00        11
          9       0.61      0.90      0.73       112
         10       0.66      0.90      0.76        93
         11       0.59      0.96      0.73        52
         12       0.64      0.65      0.65        55
         13       0.05      1.00      0.09        24
         14       0.00      0.00      0.00        77
         15       0.63      0.97      0.77        59
         16       0.62      0.89      0.73        35
         17       1.00      1.00      1.00   

# Combined w2v matrix

In [35]:
combined_w2v_matrix = np.load('data/combined_w2v_matrix.npy')

combined_w2v_matrix = np.apply_along_axis(lambda x: list(x), 0, combined_w2v_mean)

In [40]:
combined_w2v_matrix_svc = evaluate_model(model=SVC(class_weight='balanced'), predictors=combined_w2v_matrix, 
                                   response=binary_tmdb, cv=True, 
                                   params={'kernel':['linear'], 'C':[0.01, 0.1, 1.0]})

  'precision', 'predicted', average, warn_for)


In [45]:
print(combined_w2v_matrix_svc['train_classification_report'])
print(combined_w2v_matrix_svc['test_classification_report'])

             precision    recall  f1-score   support

          0       0.62      0.91      0.74        88
          1       0.11      1.00      0.20        55
          2       0.53      0.94      0.68        52
          3       0.89      0.87      0.88       327
          4       0.48      1.00      0.65        26
          5       0.45      0.73      0.55        84
          6       0.57      0.88      0.69       121
          7       0.08      1.00      0.14        39
          8       1.00      1.00      1.00        11
          9       0.61      0.90      0.73       112
         10       0.66      0.90      0.76        93
         11       0.59      0.96      0.73        52
         12       0.64      0.65      0.65        55
         13       0.05      1.00      0.09        24
         14       0.00      0.00      0.00        77
         15       0.63      0.97      0.77        59
         16       0.62      0.89      0.73        35
         17       1.00      1.00      1.00   