In [1]:
#import libraries
import pandas as pd
from ast import literal_eval
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report, recall_score, precision_score
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

Since we will be trying a number of different models, we decided to create a function that will store all of the outcomes of interest in a dictionary.

In [61]:
def evaluate_model(model, predictors, response, cv=False, params=None,
                   test_size=0.5, random_state=9001):
    """
    evaluate_model()

    Fit a one-vs-rest classifier on a train split and collect the outcomes
    of interest for both the train and the test split.

    -splits the predictors & response variables into train and test sets
     (one joint split, so predictor and response rows stay aligned)
    -if specified, wraps the model in a grid search that picks the parameters
     with the best micro-averaged F1 score
    -creates a dictionary of model outcomes that are of interest

    inputs:
        -model: a model object to be fitted
        -predictors: an array, series, or dataframe of predictor variable(s)
        -response: an array or series of the response variable
        -cv: whether or not to cross-validate the model's parameters (default=False)
        -params: if cv=True, params are required to indicate what parameters to
                 optimize in the given model (default=None)
        -test_size: share of the rows held out for the test split (default=0.5)
        -random_state: seed used for the train/test split (default=9001)

    outputs:
        -a results dictionary containing the following:
            -'fitted_model': the fitted OneVsRestClassifier
            -'train_yhat' / 'test_yhat': predictions on each split
            -'train_recall_score' / 'test_recall_score': weighted recall
            -'train_precision_score' / 'test_precision_score': weighted precision
            -'train_classification_report' / 'test_classification_report':
             the text report produced by classification_report

    raises:
        -ValueError: if cv=True but no params were supplied

    """
    # Fail early with a clear message instead of erroring deep inside the grid search.
    if cv and params is None:
        raise ValueError("params must be provided when cv=True")

    # Splitting both arrays in one call guarantees they are shuffled identically
    # and lets scikit-learn check that their lengths agree.
    train_x, test_x, train_y, test_y = train_test_split(
        predictors, response, test_size=test_size, random_state=random_state)

    if cv:
        model = GridSearchCV(model, params, scoring=make_scorer(f1_score, average='micro'))

    classif = OneVsRestClassifier(model)
    classif.fit(train_x, train_y)

    train_yhat = classif.predict(train_x)
    test_yhat = classif.predict(test_x)

    results = {}
    results['fitted_model'] = classif

    results['train_yhat'] = train_yhat
    results['test_yhat'] = test_yhat

    results['train_recall_score'] = recall_score(train_y, train_yhat, average='weighted')
    results['test_recall_score'] = recall_score(test_y, test_yhat, average='weighted')

    results['train_precision_score'] = precision_score(train_y, train_yhat, average='weighted')
    results['test_precision_score'] = precision_score(test_y, test_yhat, average='weighted')

    results['train_classification_report'] = classification_report(train_y, train_yhat)
    results['test_classification_report'] = classification_report(test_y, test_yhat)

    return results

# Create a dictionary to try multiple models in one cell

In [20]:
# Load the three saved bag-of-words arrays in one unpacking assignment;
# they are used further down as predictor sets.
tmdb_bow, imdb_bow, combined_bow = map(
    np.load,
    ('data/tmdb_bow.npy', 'data/imdb_bow.npy', 'data/combined_bow.npy')
)

**TODO:** Incorporate this into our report. The link below explains why we need to scale the features to between 0 and 1:
https://stackoverflow.com/questions/25792012/feature-selection-using-scikit-learn

In [75]:
# Load the saved mean word2vec arrays.
tmdb_w2v_mean = np.load('data/tmdb_w2v_mean.npy')
imdb_w2v_mean = np.load('data/imdb_w2v_mean.npy')
combined_w2v_mean = np.load('data/combined_w2v_mean.npy')

# Rebuild each array from plain Python lists along axis 0.
# NOTE(review): presumably this turns an object array of per-row vectors into a
# regular 2-D numeric array -- confirm against how the .npy files were written.
tmdb_w2v_mean = np.apply_along_axis(list, 0, tmdb_w2v_mean)
imdb_w2v_mean = np.apply_along_axis(list, 0, imdb_w2v_mean)
combined_w2v_mean = np.apply_along_axis(list, 0, combined_w2v_mean)

# Scratch check of the scaler on the IMDb features. The scaler is created here
# so the cell also runs on a fresh kernel (it used to rely on a `scale` object
# left over from a later cell).
from sklearn.preprocessing import MinMaxScaler
scale = MinMaxScaler()
scale.fit(imdb_w2v_mean)

test = scale.transform(imdb_w2v_mean)

In [76]:
from sklearn.preprocessing import MinMaxScaler

# Rescale each word2vec feature matrix with a min-max scaler, refitting the
# scaler on every matrix before transforming it. After this cell `scale`
# is left fitted on the combined matrix.
scale = MinMaxScaler()
tmdb_w2v_mean = scale.fit_transform(tmdb_w2v_mean)
imdb_w2v_mean = scale.fit_transform(imdb_w2v_mean)
combined_w2v_mean = scale.fit_transform(combined_w2v_mean)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit a fresh min-max scaler on the IMDb word2vec features and keep the
# scaled copy in `test` (the original matrix is left untouched here).
scale = MinMaxScaler().fit(imdb_w2v_mean)
test = scale.transform(imdb_w2v_mean)

In [25]:
# Response arrays, loaded in one unpacking assignment.
binary_tmdb, binary_imdb = map(
    np.load,
    ('data/binary_tmdb.npy', 'data/binary_imdb.npy')
)

In [77]:
# Candidate estimators, each paired with the parameter grid that is handed to
# the grid search when the model is evaluated with cv=True.
# NOTE(review): SGDClassifier's `n_iter` argument was replaced by `max_iter` in
# later scikit-learn releases -- confirm against the installed version.
modelDict = {
    'Naive-Bayes': dict(
        model=MultinomialNB(),
        params=dict(alpha=[0.01, 0.1, 1.0]),
    ),
    'SGD': dict(
        model=SGDClassifier(loss='hinge', penalty='l2', n_iter=5, random_state=9001),
        params=dict(alpha=[0.01, 0.1, 1.0]),
    ),
    'SVC': dict(
        model=SVC(class_weight='balanced', kernel='linear'),
        params=dict(C=[0.01, 0.1, 1.0]),
    ),
}

# Predictor sets to try, keyed by the label used in the result names.
predictorDict = dict(
    tmdb_bow=tmdb_bow,
    imdb_bow=imdb_bow,
    combined_bow=combined_bow,
    tmdb_w2v_mean=tmdb_w2v_mean,
    imdb_w2v_mean=imdb_w2v_mean,
    combined_w2v_mean=combined_w2v_mean,
)

In [2]:
# Evaluate every model against every predictor set; each result is stored
# under the key "<model name>-<predictor name>".
# NOTE(review): the response is always `binary_tmdb`, including for the imdb_*
# and combined_* predictors -- confirm this is intended.
resultsDict = {}
for model, spec in modelDict.items():
    for predictor, features in predictorDict.items():
        run_name = '{0}-{1}'.format(model, predictor)
        resultsDict[run_name] = evaluate_model(
            model=spec['model'],
            predictors=features,
            response=binary_tmdb,
            cv=True,
            params=spec['params'],
        )
        

The next 4 cells will be removed when we submit

In [81]:
# hacky code: take a top-level (shallow) copy of the results so they can be
# temporarily stored in a json file
temp = dict(resultsDict)

In [87]:
#hacky code to blank out results that can't be stored in a json
# New inner dicts are built instead of assigning into the existing ones:
# `temp` is only a shallow copy of `resultsDict`, so in-place assignment would
# also wipe the fitted models and predictions held in `resultsDict`.
json_safe_keys = ('train_recall_score', 'test_recall_score',
                  'train_precision_score', 'test_precision_score',
                  'train_classification_report', 'test_classification_report')
temp = {
    key: {inner_key: (value if inner_key in json_safe_keys else None)
          for inner_key, value in inner.items()}
    for key, inner in temp.items()
}
        

In [89]:
import json

# Serialise the JSON-safe results and write them to disk in one go.
with open('data/resultsDict.json','w') as results_file:
    results_file.write(json.dumps(temp))

In [3]:
import json

# Reload the results saved to json. The context manager closes the file
# handle, which the bare open() call used to leave open.
with open('data/resultsDict.json') as results_file:
    resultsDict = json.load(results_file)

In [11]:
# Result keys that hold the scalar scores we want to compare.
scores = [
    'train_recall_score',
    'test_recall_score',
    'train_precision_score',
    'test_precision_score',
]

# One column per model/predictor run, one row per stored result key.
results_df = pd.DataFrame(data=resultsDict)

In [13]:
# Keep only the rows whose label is one of the score names.
is_score_row = results_df.index.isin(scores)
results_df = results_df[is_score_row]

In [17]:
# Flip the frame so each run is a row and each score is a column.
results_df = results_df.T

In [19]:
# Names of the runs with the highest test recall and test precision.
# The columns are cast to float first: after the json round-trip, row filter
# and transpose they may be object dtype, which idxmax rejects on some
# pandas versions.
best_recall = results_df['test_recall_score'].astype(float).idxmax()
best_precision = results_df['test_precision_score'].astype(float).idxmax()

In [23]:
# Show the score rows for the best-recall and best-precision runs.
results_df[results_df.index.isin([best_recall, best_precision])]

Unnamed: 0,test_precision_score,test_recall_score,train_precision_score,train_recall_score
Naive-Bayes-combined_bow,0.626207,0.349427,0.995362,0.884996
SVC-combined_w2v_mean,0.533918,0.684124,0.76288,0.914699


In [22]:
# Boolean mask marking which result rows are the best-recall / best-precision runs.
results_df.index.isin((best_recall, best_precision))

array([ True, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False])

In [26]:
# Print the train and then the test classification report of the
# best-precision run.
best_precision_results = resultsDict[best_precision]
print(best_precision_results['train_classification_report'])
print(best_precision_results['test_classification_report'])

             precision    recall  f1-score   support

          0       0.99      0.98      0.98        88
          1       1.00      0.91      0.95        55
          2       1.00      1.00      1.00        52
          3       0.99      1.00      0.99       327
          4       1.00      1.00      1.00        26
          5       1.00      0.99      0.99        84
          6       1.00      0.03      0.06       121
          7       0.97      0.87      0.92        39
          8       1.00      1.00      1.00        11
          9       1.00      0.94      0.97       112
         10       1.00      0.96      0.98        93
         11       1.00      1.00      1.00        52
         12       1.00      0.96      0.98        55
         13       1.00      1.00      1.00        24
         14       1.00      0.97      0.99        77
         15       1.00      0.97      0.98        59
         16       1.00      0.91      0.96        35
         17       1.00      1.00      1.00   