In [72]:
# Import dependencies, load spaCy's core model (small)
import spacy
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
nlp = spacy.load('en_core_web_sm')

In [27]:
# Read cleaned csv into dataframe
df = pd.read_csv("Resources/wiki_movie_plots_CLEANED.csv")

In [7]:
# Use spaCy to process each movie plot string.
nostopwords = []
for i in range (len(df)):
    doc = nlp(df.Plot[i])
    
    # Tokenize strings - use full text (verbatim, and lowercased) for each token; remove common stop words and punctuation
    tokens = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]
    nostopwords.append(tokens)

In [28]:
# Add column for token lists
df['no_stop_words'] = nostopwords

In [30]:
# Save df with tokens to new csv
df.to_csv("Resources/plots_tokenized.csv")

In [31]:
# Split df into 80:20 train:test split
train, test = train_test_split(df, test_size=0.20, random_state=42)

In [41]:
# Dummy function to pass into 'tokenizer' parameter instead of an actual tokenizing function,
# since we already have our list of lists of tokens (nostopwords)
def do_nothing(tokens):
    return tokens

# Use TfidfVectorizer (equivalent to CountVectorizer then Tfidftransformer)
# Generates Bag of Words matrices of token counts (vectors). Use ngrams of size n=1
# Then transforms count matrices to normalized tf-idf representations

vectorizer = TfidfVectorizer(input='content', tokenizer=do_nothing, ngram_range=(1,1))

In [73]:
# Linear dimensionality reduction using truncated singular value decomposition (SVD)
# Contrary to principle component analysis (PCA), this estimator does not center the data 
# before computing the SVD. This means it can work with scipy.sparse matrices efficiently
svd = TruncatedSVD(n_components=200, n_iter=5, random_state=42)

In [67]:
# Debug step that prints shape. Does not actually fit to or transform data.
class Debug(BaseEstimator, TransformerMixin):

    def transform(self, X):
        print(X.shape)
        return X

    def fit(self, X, y=None, **fit_params):
        return self

In [64]:
# Use the Linear SVC support vector machine as our classifier
clf = LinearSVC()

In [74]:
# Create a spaCy pipeline for processing data. 1: create count vectors; 2: classify!
pipe = Pipeline([('vectorizer', vectorizer), 
                 ('debug', Debug()),
                 ('svd', svd),
                 ('clf', clf)
                ])

In [75]:
# text to be classified = 'Plot', labels = 'Genre'; assign for both training and testing data
train1 = train['Plot'].tolist()
labels_train1 = train['Genre'].tolist()

test1 = test['Plot'].tolist()
labels_test1 = test['Genre'].tolist()

In [76]:
# Fit our pipeline/model to the training data
pipe.fit(train1, labels_train1)

(21524, 594)


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [77]:
# No need to transform test data - pipeline takes care of that
preds = pipe.predict(test1)
print("accuracy:", accuracy_score(labels_test1, preds))

(5381, 594)
accuracy: 0.29920089202750416


In [71]:
pipe.predict([])

(1, 594)


array(['animation'], dtype='<U15')

In [None]:
random forest
extra trees
gradient booster

matt has better + truncated svd
experiment with n_components 120-200, maybe more?