# This notebook applies a word2vec transformation to our TMDB and IMDB movie plots

In [71]:
#import libraries
import pandas as pd
from ast import literal_eval
from gensim import models
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
import numpy as np


In [88]:
#load the movie table; the two genre columns are run through literal_eval
#so that their stringified lists come back as real Python lists
movies = pd.read_csv(
    'Data/movie_df_with_imdb.csv',
    encoding='utf8',
    converters=dict(tmdb_genres=literal_eval, imdb_genres=literal_eval),
)
#preview the first rows
movies.head()

Unnamed: 0,tmdb_id,imdb_id,tmdb_genres,imdb_genres,binary_tmdb,binary_imdb,tmdb_plot,imdb_plot,popularity,release_date,title,vote_average,vote_count
0,278,tt0111161,"[18, 80]","[80, 18]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",Framed in the 1940s for the double murder of h...,Chronicles the experiences of a formerly succe...,28.527767,1994-09-23,The Shawshank Redemption,8.5,9773
1,238,tt0068646,"[18, 80]","[80, 18]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","Spanning the years 1945 to 1955, a chronicle o...",When the aging head of a famous crime family d...,36.965452,1972-03-14,The Godfather,8.5,7394
2,424,tt0108052,"[18, 36, 10752]","[18, 36]","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",The true story of how businessman Oskar Schind...,Oskar Schindler is a vainglorious and greedy G...,19.945455,1993-11-29,Schindler's List,8.4,5518
3,240,tt0071562,"[18, 80]","[80, 18]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",In the continuing saga of the Corleone crime f...,The continuing saga of the Corleone crime fami...,30.191804,1974-12-20,The Godfather: Part II,8.4,4249
4,452522,tt0278784,"[18, 9648]","[80, 18, 9648, 53]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, ...",Standalone version of the series pilot with an...,"When beautiful, young Laura Palmer is found br...",5.969249,1989-12-31,Twin Peaks,8.4,123


#### Clean plots

In [89]:
#define tokenizer: the pattern \w+ keeps runs of word characters, so punctuation and whitespace are dropped
tokenizer = RegexpTokenizer(r'\w+')
#set stop words list (the 'en' list from the stop_words package)
english_stop = get_stop_words('en')
#show how many stop words were loaded
len(english_stop)

174

In [90]:
#function to clean plots
def clean_plot(plot, tokenize=None, stop_words=None):
    """Lower-case a plot summary, split it into tokens and drop stop words.

    Parameters
    ----------
    plot : str
        Raw plot text. A value without a ``lower`` method (for example the
        float NaN pandas uses for a missing CSV field, or None) is treated
        as an empty plot.
    tokenize : callable, optional
        Function mapping a string to a list of tokens. Defaults to the
        notebook-level ``tokenizer.tokenize``.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``english_stop``.

    Returns
    -------
    list of str
        The remaining lower-cased tokens, in their original order.
    """
    #missing plots are not strings and have no .lower(); return no tokens
    #instead of raising AttributeError halfway through the dataframe
    try:
        lowered = plot.lower()
    except AttributeError:
        return []
    if tokenize is None:
        tokenize = tokenizer.tokenize
    #a set makes each membership test O(1) instead of scanning the whole list
    excluded = frozenset(english_stop if stop_words is None else stop_words)
    return [word for word in tokenize(lowered) if word not in excluded]
    

In [91]:
#check first movie's clean plot: print the raw TMDB plot, then its cleaned token list
print(movies.tmdb_plot[0])
print(clean_plot(movies.tmdb_plot[0]))

Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope
['framed', '1940s', 'double', 'murder', 'wife', 'lover', 'upstanding', 'banker', 'andy', 'dufresne', 'begins', 'new', 'life', 'shawshank', 'prison', 'puts', 'accounting', 'skills', 'work', 'amoral', 'warden', 'long', 'stretch', 'prison', 'dufresne', 'comes', 'admired', 'inmates', 'including', 'older', 'prisoner', 'named', 'red', 'integrity', 'unquenchable', 'sense', 'hope']


In [92]:
#add a cleaned-token column for each plot source (TMDB and IMDB)
movies['tmdb_clean_plot'] = movies['tmdb_plot'].apply(clean_plot)
movies['imdb_clean_plot'] = movies['imdb_plot'].apply(clean_plot)

In [93]:
#check outputs: cleaned TMDB token lists for rows 1 through 14
movies.tmdb_clean_plot[1:15]

1     [spanning, years, 1945, 1955, chronicle, ficti...
2     [true, story, businessman, oskar, schindler, s...
3     [continuing, saga, corleone, crime, family, yo...
4     [standalone, version, series, pilot, alternate...
5     [direction, ruthless, instructor, talented, yo...
6     [creator, popular, video, game, system, dies, ...
7     [burger, loving, hit, man, philosophical, part...
8     [orbiting, quiet, backwater, planet, massed, f...
9     [ticking, time, bomb, insomniac, slippery, soa...
10    [former, prohibition, era, jewish, gangster, r...
11    [supernatural, tale, set, death, row, southern...
12    [larcenous, real, estate, clerk, marion, crane...
13    [serving, time, insanity, state, mental, hospi...
14    [man, low, iq, accomplished, great, things, li...
Name: tmdb_clean_plot, dtype: object

#### Apply word2vec Transformation to Plots

In [94]:
#Load the pretrained google news word2vec model (stored in binary word2vec format)
#NOTE(review): this path uses lower-case 'data/' while the movie CSV is read from 'Data/';
#on a case-sensitive filesystem both resolve only if both directories exist -- confirm the directory name
model = models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

In [95]:
#vectorize plots

def average_word_vectors(tokens, keyed_vectors):
    """Average the embeddings of the tokens that ``keyed_vectors`` knows.

    Parameters
    ----------
    tokens : iterable of str
        Cleaned plot tokens.
    keyed_vectors : mapping-like
        Word-vector lookup supporting ``word in kv``, ``kv[word]`` and a
        ``vector_size`` attribute (here the loaded word2vec model).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``keyed_vectors.vector_size``. Tokens missing from
        the vocabulary are skipped. If no token is in the vocabulary, a zero
        vector is returned; averaging an empty list would instead give a
        scalar NaN plus a RuntimeWarning, leaving that row with a different shape.
    """
    #explicit membership test instead of a bare except, so only
    #out-of-vocabulary words are skipped and real errors still surface
    vecs = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if not vecs:
        #float32 is assumed to match the loaded vectors -- TODO confirm if the model is loaded with another datatype
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vecs, axis=0)

#collect one averaged plot representation per movie
w2v_tmdb_plot_vectors = [average_word_vectors(plot, model) for plot in movies.tmdb_clean_plot]
w2v_imdb_plot_vectors = [average_word_vectors(plot, model) for plot in movies.imdb_clean_plot]

#add column to movies db
movies['tmdb_w2v_plot'] = w2v_tmdb_plot_vectors
movies['imdb_w2v_plot'] = w2v_imdb_plot_vectors

In [96]:
#check shapes of first movie vectors (TMDB first, then IMDB); each should be a 1-D array of length 300
w2v_tmdb_plot_vectors[0].shape, w2v_imdb_plot_vectors[0].shape

((300,), (300,))

In [97]:
movies.head()

Unnamed: 0,tmdb_id,imdb_id,tmdb_genres,imdb_genres,binary_tmdb,binary_imdb,tmdb_plot,imdb_plot,popularity,release_date,title,vote_average,vote_count,tmdb_clean_plot,imdb_clean_plot,tmdb_w2v_plot,imdb_w2v_plot
0,278,tt0111161,"[18, 80]","[80, 18]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",Framed in the 1940s for the double murder of h...,Chronicles the experiences of a formerly succe...,28.527767,1994-09-23,The Shawshank Redemption,8.5,9773,"[framed, 1940s, double, murder, wife, lover, u...","[chronicles, experiences, formerly, successful...","[0.014165705, 0.035729147, 0.03556685, 0.06695...","[0.004663568, 0.090185866, -0.012476068, 0.054..."
1,238,tt0068646,"[18, 80]","[80, 18]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","Spanning the years 1945 to 1955, a chronicle o...",When the aging head of a famous crime family d...,36.965452,1972-03-14,The Godfather,8.5,7394,"[spanning, years, 1945, 1955, chronicle, ficti...","[aging, head, famous, crime, family, decides, ...","[-0.016820837, 0.059669778, -0.0068189832, 0.0...","[-0.013326309, 0.081348196, 0.035764806, 0.067..."
2,424,tt0108052,"[18, 36, 10752]","[18, 36]","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",The true story of how businessman Oskar Schind...,Oskar Schindler is a vainglorious and greedy G...,19.945455,1993-11-29,Schindler's List,8.4,5518,"[true, story, businessman, oskar, schindler, s...","[oskar, schindler, vainglorious, greedy, germa...","[0.075890675, 0.02254813, 0.064304896, 0.11778...","[0.053381152, 0.10281134, 0.01086032, 0.044059..."
3,240,tt0071562,"[18, 80]","[80, 18]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",In the continuing saga of the Corleone crime f...,The continuing saga of the Corleone crime fami...,30.191804,1974-12-20,The Godfather: Part II,8.4,4249,"[continuing, saga, corleone, crime, family, yo...","[continuing, saga, corleone, crime, family, te...","[-0.057908002, 0.07111673, -0.06586771, 0.1026...","[-0.05151922, 0.07896285, -0.040689208, 0.0987..."
4,452522,tt0278784,"[18, 9648]","[80, 18, 9648, 53]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, ...",Standalone version of the series pilot with an...,"When beautiful, young Laura Palmer is found br...",5.969249,1989-12-31,Twin Peaks,8.4,123,"[standalone, version, series, pilot, alternate...","[beautiful, young, laura, palmer, found, bruta...","[-0.05888228, -0.0534557, -0.065662734, 0.0573...","[0.002655877, 0.10140949, 0.0001303355, 0.0537..."
