sources:

https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a

https://spandan-madan.github.io/DeepLearningProject/

# Initial Data Prep

In [113]:
import pandas as pd
from ast import literal_eval
import numpy as np
import requests
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.corpus import stopwords

In [89]:
df = pd.read_csv('data/movie_df.csv', encoding='utf8', converters={'genre_ids':literal_eval})
df = df[df['genre_ids'].str.len() != 0]
df.head()

Unnamed: 0,genre_ids,id,overview,popularity,release_date,title,vote_average,vote_count,imdb_id
0,"[18, 80]",278,Framed in the 1940s for the double murder of h...,28.527767,1994-09-23,The Shawshank Redemption,8.5,9773,tt0111161
1,"[18, 80]",238,"Spanning the years 1945 to 1955, a chronicle o...",36.965452,1972-03-14,The Godfather,8.5,7394,tt0068646
2,"[18, 36, 10752]",424,The true story of how businessman Oskar Schind...,19.945455,1993-11-29,Schindler's List,8.4,5518,tt0108052
3,"[18, 80]",240,In the continuing saga of the Corleone crime f...,30.191804,1974-12-20,The Godfather: Part II,8.4,4249,tt0071562
4,"[18, 9648]",452522,Standalone version of the series pilot with an...,5.969249,1989-12-31,Twin Peaks,8.4,123,tt0278784


In [90]:
len(df)

1000

In [91]:
df.tail()

Unnamed: 0,genre_ids,id,overview,popularity,release_date,title,vote_average,vote_count,imdb_id
996,"[27, 9648, 53]",2654,With the disappearance of hack horror writer S...,10.361558,1994-12-12,In the Mouth of Madness,7.2,405,tt0113409
997,"[27, 9648, 878]",837,A sleazy cable-TV programmer begins to see his...,7.721037,1983-02-04,Videodrome,7.2,600,tt0086541
998,[18],205601,BELLE is inspired by the true story of Dido El...,12.82226,2013-09-08,Belle,7.2,285,tt2404181
999,"[35, 9648]",10440,A middle-aged couple suspects foul play when t...,6.992721,1993-08-18,Manhattan Murder Mystery,7.2,223,tt0107507
1000,[18],317557,A young girl overcomes her disadvantaged upbri...,5.595859,2016-09-23,Queen of Katwe,7.2,138,tt4341582


In [92]:
df.head()

Unnamed: 0,genre_ids,id,overview,popularity,release_date,title,vote_average,vote_count,imdb_id
0,"[18, 80]",278,Framed in the 1940s for the double murder of h...,28.527767,1994-09-23,The Shawshank Redemption,8.5,9773,tt0111161
1,"[18, 80]",238,"Spanning the years 1945 to 1955, a chronicle o...",36.965452,1972-03-14,The Godfather,8.5,7394,tt0068646
2,"[18, 36, 10752]",424,The true story of how businessman Oskar Schind...,19.945455,1993-11-29,Schindler's List,8.4,5518,tt0108052
3,"[18, 80]",240,In the continuing saga of the Corleone crime f...,30.191804,1974-12-20,The Godfather: Part II,8.4,4249,tt0071562
4,"[18, 9648]",452522,Standalone version of the series pilot with an...,5.969249,1989-12-31,Twin Peaks,8.4,123,tt0278784


#### Vectorize genres based on TMDB genres

In [93]:
#request list of TMDB genres
key = open('key.txt','r').read()
payload = '{}'
url = "https://api.themoviedb.org/3/genre/movie/list?api_key={0}&language=en-US&page={1}".format(key, str(1))
response = requests.request("GET", url, data=payload).json()

genreDict = dict(zip([i['id'] for i in response['genres']],
                     [i['name'] for i in response['genres']]))
genreDict
genres = list(genreDict.keys())

In [94]:
genreDict

{12: 'Adventure',
 14: 'Fantasy',
 16: 'Animation',
 18: 'Drama',
 27: 'Horror',
 28: 'Action',
 35: 'Comedy',
 36: 'History',
 37: 'Western',
 53: 'Thriller',
 80: 'Crime',
 99: 'Documentary',
 878: 'Science Fiction',
 9648: 'Mystery',
 10402: 'Music',
 10749: 'Romance',
 10751: 'Family',
 10752: 'War',
 10770: 'TV Movie'}

In [95]:
#use multilabelbinarizer on genre column and add new column 'binary_genre' of genre vectors
mlb=MultiLabelBinarizer()

df['binary_genre'] = pd.Series(list(mlb.fit_transform(df['genre_ids'])))
df.tail()

Unnamed: 0,genre_ids,id,overview,popularity,release_date,title,vote_average,vote_count,imdb_id,binary_genre
996,"[27, 9648, 53]",2654,With the disappearance of hack horror writer S...,10.361558,1994-12-12,In the Mouth of Madness,7.2,405,tt0113409,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ..."
997,"[27, 9648, 878]",837,A sleazy cable-TV programmer begins to see his...,7.721037,1983-02-04,Videodrome,7.2,600,tt0086541,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
998,[18],205601,BELLE is inspired by the true story of Dido El...,12.82226,2013-09-08,Belle,7.2,285,tt2404181,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ..."
999,"[35, 9648]",10440,A middle-aged couple suspects foul play when t...,6.992721,1993-08-18,Manhattan Murder Mystery,7.2,223,tt0107507,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1000,[18],317557,A young girl overcomes her disadvantaged upbri...,5.595859,2016-09-23,Queen of Katwe,7.2,138,tt4341582,


In [97]:
df.iloc[997]['binary_genre']

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [60]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.5, random_state=9001)

# Natural Language Processing

We will be using a few sklearn functions to assist in this portion

**CountVectorizer**

This creates a Document-Term matrix with the dimensions [n_samples, n_features]

**TfidfTransformer**

TF stands for *Term Frequency*. It is the count of each word divided by the toal words in the document. TFIDF stands for *Term Frequency Times Inverse Document Frequency*. This function reduces the weights of common words, such as (this, is, an, etc.)

**Pipeline**
This allows us to make multiple manipulations to our data in a single line of code. It makes our code more concise.



#### Clean up overviews
remove punctuation

In [98]:
df.iloc[0].overview

'Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.'

In [108]:

clean_overviews = []

stopwords = set(stopwords.words('english'))

for i in range(len(df)):
    movie = df.iloc[i]
    overview = movie['overview']
    overview = overview.replace(',','')
    overview = overview.replace('.','')
    overview = overview.replace("'", '')
    overview = overview.replace('"', '')
    for word in overview:
        if word not in stopwords:
            out.append(word)
    clean_overviews.append(overview)

df['clean_overviews'] = pd.Series(clean_overviews)

In [111]:
df.head()

Unnamed: 0,genre_ids,id,overview,popularity,release_date,title,vote_average,vote_count,imdb_id,binary_genre,clean_overviews
0,"[18, 80]",278,Framed in the 1940s for the double murder of h...,28.527767,1994-09-23,The Shawshank Redemption,8.5,9773,tt0111161,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",Framed in the 1940s for the double murder of h...
1,"[18, 80]",238,"Spanning the years 1945 to 1955, a chronicle o...",36.965452,1972-03-14,The Godfather,8.5,7394,tt0068646,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",Spanning the years 1945 to 1955 a chronicle of...
2,"[18, 36, 10752]",424,The true story of how businessman Oskar Schind...,19.945455,1993-11-29,Schindler's List,8.4,5518,tt0108052,"[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",The true story of how businessman Oskar Schind...
3,"[18, 80]",240,In the continuing saga of the Corleone crime f...,30.191804,1974-12-20,The Godfather: Part II,8.4,4249,tt0071562,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",In the continuing saga of the Corleone crime f...
4,"[18, 9648]",452522,Standalone version of the series pilot with an...,5.969249,1989-12-31,Twin Peaks,8.4,123,tt0278784,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",Standalone version of the series pilot with an...


In [101]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

**Naive Bayes Classifier**

In [105]:
from sklearn.naive_bayes import MultinomialNB


text_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultinomialNB()),])

text_clf = text_clf.fit(train['overview'], train['first_genre'])
nb_train_predict = text_clf.predict(train['overview'])
nb_test_predict = text_clf.predict(test['overview'])

nb_train_accuracy = np.mean(nb_train_predict == train['first_genre'])
nb_test_accuracy = np.mean(nb_test_predict == test['first_genre'])

print("Train Accuracy: {0} \nTest Accuracy: {1}".format(nb_train_accuracy,nb_test_accuracy))

Train Accuracy: 0.334 
Test Accuracy: 0.372


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


**Support Vector Machines (SVM)**

In [87]:
from sklearn.linear_model import SGDClassifier

In [109]:
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, n_iter=5, random_state=42)),
])
_ = text_clf_svm.fit(train['overview'], train['first_genre'])

svm_train_predict = text_clf_svm.predict(train['overview'])
svm_test_predict = text_clf_svm.predict(test['overview'])

svm_train_accuracy = np.mean(svm_train_predict == train['first_genre'])
svm_test_accuracy = np.mean(svm_test_predict == test['first_genre'])

print("Train Accuracy: {0} \nTest Accuracy: {1}".format(svm_train_accuracy,svm_test_accuracy))

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Train Accuracy: 1.0 
Test Accuracy: 0.364


It looks like extreme over-fitting is occuring                                   

**Grid Search Cross Validation**

In [94]:
from sklearn.model_selection import GridSearchCV

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3),
 }

In [110]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train['overview'], train['first_genre'])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [113]:
gs_clf.best_score_


0.338

In [116]:
gs_clf.best_params_

{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

**Removing Stop Words - Naive Bayes**

In [119]:
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultinomialNB()),])

text_clf = text_clf.fit(train['overview'], train['first_genre'])
nb_train_predict = text_clf.predict(train['overview'])
nb_test_predict = text_clf.predict(test['overview'])

nb_train_accuracy = np.mean(nb_train_predict == train['first_genre'])
nb_test_accuracy = np.mean(nb_test_predict == test['first_genre'])

print("Train Accuracy: {0} \nTest Accuracy: {1}".format(nb_train_accuracy,nb_test_accuracy))

Train Accuracy: 0.376 
Test Accuracy: 0.372


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


**Removing Stop Words - Support Vector Machines**

In [114]:
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')),
                      ('tfidf', TfidfTransformer()),
                      ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, n_iter=5, random_state=42)),
])
_ = text_clf_svm.fit(train['overview'], train['first_genre'])

svm_train_predict = text_clf_svm.predict(train['overview'])
svm_test_predict = text_clf_svm.predict(test['overview'])

svm_train_accuracy = np.mean(svm_train_predict == train['first_genre'])
svm_test_accuracy = np.mean(svm_test_predict == test['first_genre'])

print("Train Accuracy: {0} \nTest Accuracy: {1}".format(svm_train_accuracy,svm_test_accuracy))

Train Accuracy: 1.0 
Test Accuracy: 0.352


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
