In [1]:
import pandas as pd
from ast import literal_eval
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [127]:
df = pd.read_csv('data/movie_df_with_imdb.csv', encoding='utf8', converters={'tmdb_genres':literal_eval,
                                                                            'imdb_genres':literal_eval,
                                                                            'binary_tmdb':literal_eval,
                                                                            'binary_imdb':literal_eval
                                                                            })

In [13]:
df.head()

Unnamed: 0,tmdb_id,imdb_id,tmdb_genres,imdb_genres,binary_tmdb,binary_imdb,tmdb_plot,imdb_plot,popularity,release_date,title,vote_average,vote_count
0,278,tt0111161,"[18, 80]","[80, 18]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",Framed in the 1940s for the double murder of h...,Chronicles the experiences of a formerly succe...,28.527767,1994-09-23,The Shawshank Redemption,8.5,9773
1,238,tt0068646,"[18, 80]","[80, 18]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","Spanning the years 1945 to 1955, a chronicle o...",When the aging head of a famous crime family d...,36.965452,1972-03-14,The Godfather,8.5,7394
2,424,tt0108052,"[18, 36, 10752]","[18, 36]","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",The true story of how businessman Oskar Schind...,Oskar Schindler is a vainglorious and greedy G...,19.945455,1993-11-29,Schindler's List,8.4,5518
3,240,tt0071562,"[18, 80]","[80, 18]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",In the continuing saga of the Corleone crime f...,The continuing saga of the Corleone crime fami...,30.191804,1974-12-20,The Godfather: Part II,8.4,4249
4,452522,tt0278784,"[18, 9648]","[80, 18, 9648, 53]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, ...",Standalone version of the series pilot with an...,"When beautiful, young Laura Palmer is found br...",5.969249,1989-12-31,Twin Peaks,8.4,123


# Accuracy Score

Since many of the movies have multiple genres, we want our accuracy score to reflect partial correctness. For example, if a movie has the genres "Romance" and "Comedy", and our model predicts "Comedy", we would consider the accuracy to be 0.5.

In [121]:
y_pred = df.loc[9,'binary_imdb'] #genre = 18
y_true = df.loc[0,'binary_imdb'] #genre = [18,80]

y_pred_2 = df.loc[4,'binary_imdb'] #genre = [80, 18, 9648, 53]

In [128]:
print('Prediction: [18], Actual: [18,80]')
print(classification_report(y_pred, y_true,target_names=['18','80']))
print('\n\n')
print('Prediction: [80, 18, 9648, 53], Actual: [18,80]')
print(classification_report(y_pred_2, y_true,target_names=['18','80']))

Prediction: [18], Actual: [18,80]
             precision    recall  f1-score   support

         18       1.00      0.94      0.97        17
         80       0.50      1.00      0.67         1

avg / total       0.97      0.94      0.95        18




Prediction: [80, 18, 9648, 53], Actual: [18,80]
             precision    recall  f1-score   support

         18       0.88      1.00      0.93        14
         80       1.00      0.50      0.67         4

avg / total       0.90      0.89      0.87        18



# Bag of Words

todo - is returning the array sufficient or should we return the entire CountVectorizer object?

In [144]:
#conda install gensim
from gensim import models

#pip install stop_words
from stop_words import get_stop_words
en_stop = get_stop_words('en')

In [147]:
def bag_of_words(series, removeStopWords=False):
    if removeStopWords:
        vec=CountVectorizer(stop_words=en_stop)
    else:
        vec = CountVectorizer()
    vec.fit(series)
    vec=vec.transform(series)
    return vec.toarray()

In [148]:
bag_of_words(df['tmdb_plot']).shape

(1000, 8873)

In [149]:
bag_of_words(df['imdb_plot']).shape

(1000, 14137)

In [150]:
bag_of_words(df['tmdb_plot'],removeStopWords=True).shape

(1000, 8761)

In [151]:
bag_of_words(df['imdb_plot'],removeStopWords=True).shape

(1000, 14018)

# Word2Vec

Download google's model here, be sure to .gitignore this file:
https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

In [108]:
model = models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [157]:
def apply_words2Vec(doc, removeStopWords=False):
    doc=doc.split(' ')
    
    if removeStopWords:
        words=list(filter(lambda x: x not in en_stop, doc))
        
    words=list(filter(lambda x: x in model.vocab, doc))
    return model[words]

In [159]:
df['tmdb_plot'].apply(lambda x: apply_words2Vec(x)).shape

(1000,)

In [160]:
df['imdb_plot'].apply(lambda x: apply_words2Vec(x)).shape

(1000,)

In [161]:
df['tmdb_plot'].apply(lambda x: apply_words2Vec(x, removeStopWords=True)).shape

(1000,)

In [162]:
df['imdb_plot'].apply(lambda x: apply_words2Vec(x, removeStopWords=True)).shape

(1000,)