In [18]:
# Import dependencies, load spaCy's core model (small)
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
# Name of the small English spaCy model; the loaded pipeline is shared by the cells below
SPACY_MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL_NAME)

In [3]:
# Load the cleaned movie-plot CSV into a DataFrame
PLOTS_CSV_PATH = "Resources/wiki_movie_plots_CLEANED.csv"
df = pd.read_csv(PLOTS_CSV_PATH)

In [4]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Release Year,Title,Genre,Wiki Page,Plot,genre_count
0,6,1903,The Great Train Robbery,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...,924
1,7,1904,The Suburbanite,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...,5637
2,13,1907,Daniel Boone,biography,https://en.wikipedia.org/wiki/Daniel_Boone_(19...,Boone's daughter befriends an Indian maiden as...,323
3,14,1907,How Brown Saw the Baseball Game,comedy,https://en.wikipedia.org/wiki/How_Brown_Saw_th...,Before heading out to a baseball game at a nea...,5637
4,15,1907,Laughing Gas,comedy,https://en.wikipedia.org/wiki/Laughing_Gas_(fi...,The plot is that of a black woman going to the...,5637


In [5]:
# Use spaCy to process each movie plot string.
# nlp.pipe streams the texts through the model in batches instead of making one
# nlp() call per row, and iterating the column by value avoids the label lookup
# df.Plot[i], which only works when the index is exactly 0..len(df)-1.
nostopwords = [
    # Keep the lemma (base word) of each token; drop common stop words and punctuation
    [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    for doc in nlp.pipe(df['Plot'])
]

# Add column to df - each row is a list of tokens
df['NoStopwords'] = nostopwords

In [11]:
# Collapse every token list into one space-separated string and store it as a new df column
nostopwords_joined = [" ".join(token_list) for token_list in df['NoStopwords']]
df['NoStopwords_joined'] = nostopwords_joined

In [30]:
# Peek at the first five token lists
df['NoStopwords'].head(n=5)

0    [film, open, bandit, break, railroad, telegrap...
1    [film, family, suburb, hope, quiet, life, thin...
2    [Boone, daughter, befriend, indian, maiden, Bo...
3    [head, baseball, game, nearby, ballpark, sport...
4    [plot, black, woman, go, dentist, toothache, g...
Name: NoStopwords, dtype: object

In [19]:
# Hold out 20% of the rows for testing (80:20 train:test); the fixed seed keeps the split repeatable
TEST_FRACTION = 0.20
SPLIT_SEED = 42
train, test = train_test_split(df, test_size=TEST_FRACTION, random_state=SPLIT_SEED)

In [32]:
# Dummy function to pass into 'tokenizer' parameter instead of an actual tokenizing function,
# since we already have our list of lists of tokens (nostopwords)
def do_nothing(tokens):
    return tokens

# Use CountVectorizer to generate Bag of Words matrix of token counts (vectors). Use ngrams of size n=1
# Use the Linear SVC support vector machine as our classifier
vectorizer = CountVectorizer(input='content', tokenizer=do_nothing, ngram_range=(1,1))
clf = LinearSVC()

In [33]:
# Create a spaCy pipeline for processing data. 1: create count vectors; 2: classify!
pipe = Pipeline([('vectorizer', vectorizer), ('clf', clf)])

In [34]:
# text to be classified = 'Plot', labels = 'Genre'; assign for both training and testing data
train1 = train['Plot'].tolist()
labelsTrain1 = train['Genre'].tolist()

test1 = test['Plot'].tolist()
labelsTest1 = test['Genre'].tolist()

In [35]:
# Train the vectorizer + classifier pipeline on the training documents and their genre labels
pipe.fit(X=train1, y=labelsTrain1)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [36]:
# Predict genres for the held-out documents and report the fraction predicted correctly
preds = pipe.predict(test1)
test_accuracy = accuracy_score(y_true=labelsTest1, y_pred=preds)
print("accuracy:", test_accuracy)

accuracy: 0.0496190299200892


In [None]:
# TODO: accuracy is very low - probably need fewer genres (there are ~18?)