In [None]:
# Import dependencies, load spaCy's core model (small)
import spacy
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
nlp = spacy.load('en_core_web_sm')

In [2]:
# Read cleaned csv into dataframe
df = pd.read_csv("Resources/wiki_movie_plots_CLEANED.csv")

In [3]:
# Use spaCy to process each movie plot string.
nostopwords = []
for i in range (len(df)):
    doc = nlp(df.Plot[i])
    
    # Tokenize strings - use full text (verbatim, and lowercased) for each token; remove common stop words and punctuation
    tokens = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]
    nostopwords.append(tokens)

In [4]:
# Add column for token lists
df['no_stop_words'] = nostopwords

In [5]:
# Save df with tokens to new csv
df.to_csv("Resources/plots_tokenized.csv")

In [54]:
# Split df into 80:20 train:test split
train, test = train_test_split(df, test_size=0.20, random_state=42)

In [55]:
# text to be classified = 'Plot', labels = 'Genre'; assign for both training and testing data
train1 = train['Plot'].tolist()
train1_labels = train['Genre'].tolist()

test1 = test['Plot'].tolist()
test1_labels = test['Genre'].tolist()

In [56]:
# Dummy function to pass into 'tokenizer' parameter instead of an actual tokenizing function,
# since we already have our list of lists of tokens (nostopwords)
def do_nothing(tokens):
    return tokens

# Use TfidfVectorizer (equivalent to CountVectorizer then Tfidftransformer)
# Generates Bag of Words matrices of token counts (vectors). Use ngrams of size n=1
# Then transforms count matrices to normalized tf-idf representations
vectorizer = TfidfVectorizer(input='content', tokenizer=do_nothing, ngram_range=(1,1))

In [57]:
# Linear dimensionality reduction using truncated singular value decomposition (SVD)
# In the context of working on term count/tf-idf matrices as returned vectorizers - AKA latent semantic analysis (LSA)
# Works with scipy.sparse matrices efficiently
tSVD = TruncatedSVD()

In [58]:
# Debug step that simply prints shape of data. Does not actually fit to or transform data.
class Debug(BaseEstimator, TransformerMixin):

    def transform(self, X):
        print(X.shape)
        return X

    def fit(self, X, y=None, **fit_params):
        return self

In [59]:
# Use the Extra Trees (EXTreme RAndom forests) ensemble estimator as our classifier. 
# Should perform better than Random Forest in presence of noisy features. 
# Lower variance = lower variability of model prediction for a given data point = generalizes better on test data
classifier = ExtraTreesClassifier()

In [60]:
# Create a spaCy pipeline for processing data. 
# 1: create tf-idf vectors; 2: dimensionality reduction; 3: classify using Extra Trees
pipe = Pipeline([('vectorizer', vectorizer), 
#                 ('debug', Debug()),
                 ('tSVD', tSVD),
#                ('debug2', Debug()),
                 ('ETclassifier', classifier)
                ])

In [62]:
# Build hyperparameter grid for RandomizedSearchCV
# Randomly pick combinations of parameters to try for Truncated SVD and Extra Trees Classifier

param_grid = {
    # n_components = desired dimensionality of output data
    # n_iter = number of iterations for randomized SVD solver. default = 5
    'tSVD__n_components': [50, 150, 300, 500],
    'tSVD__n_iter': [5, 7, 10],
    
    # n_estimators = number of trees in the forest
    # max_features = max number of features considered for splitting a node
    # max_depth = max number of levels in each decision tree
    # min_samples_split = min number of data points placed in a node before the node is split
    # min_samples_leaf = min number of data points allowed in a leaf node
    # bootstrap = method for sampling data points (with or without replacement)
    'ETclassifier__n_estimators': [10, 100, 200, 500, 1000],
    'ETclassifier__max_depth': [2, 4, 6, 8, 10, None], 
    'ETclassifier__min_samples_split': [2, 5, 10],
    'ETclassifier__min_samples_leaf': [1, 2, 4],
    'ETclassifier__max_features': ['sqrt', 'log2'],
    'ETclassifier__bootstrap': [True, False],
}

# Number of combinations to sample = 50 (more increases runtime)
# cv = 3 (3-fold cross-validation)
# n_jobs = -1 (uses all available cores/processors)
search = RandomizedSearchCV(estimator=pipe, 
                            param_distributions=param_grid, 
                            n_iter=50, 
                            n_jobs=-1, 
                            cv=3,
                            verbose=3, 
                            random_state=42)

In [None]:
# Run randomizedsearchCV, try different param combos to fit to the training data
search.fit(train1, train1_labels)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [None]:
# Print best performing combination of parameters
search.best_params_

In [50]:
# (No need to transform test data - pipeline takes care of that)
# Print classification accuracy score
preds = pipe.predict(test1)
print("accuracy:", accuracy_score(test1_labels, preds))

(5381, 594)
(5381, 100)
accuracy: 0.297342501393793


In [None]:
# Use pickle to save best estimator from search to a .sav file
filename = 'movie_genre_classifier.sav'
pickle.dump(search.best_estimator_, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))

In [None]:
load_predict = load_model.predict([])

In [None]:
pipe.predict([])