In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sentence_transformers import util

In [None]:
# Set to True to run the optional download/install steps guarded by `if INSTALL:`.
INSTALL = False

# Data Load

In [None]:
def load_tmdb_movies(path):
    """Read the TMDB movies CSV at ``path`` and decode its structured columns.

    ``release_date`` is parsed and reduced to its calendar date, and the
    JSON-encoded columns (genres, keywords, production countries/companies,
    spoken languages) are decoded into Python objects.

    Returns the resulting DataFrame.
    """
    def _as_date(stamp):
        # Drop the time-of-day component of a parsed timestamp.
        return stamp.date()

    frame = pd.read_csv(path)

    parsed_dates = pd.to_datetime(frame['release_date'])
    frame['release_date'] = parsed_dates.apply(_as_date)

    for name in ('genres', 'keywords', 'production_countries',
                 'production_companies', 'spoken_languages'):
        decoded = frame[name].apply(json.loads)
        frame[name] = decoded

    return frame


def load_tmdb_credits(path):
    """Read the TMDB credits CSV at ``path`` and decode its JSON columns.

    The ``cast`` and ``crew`` columns are stored as JSON text; each cell is
    replaced by the Python object it decodes to.

    Returns the resulting DataFrame.
    """
    credits_frame = pd.read_csv(path)
    for name in ('cast', 'crew'):
        decoded = credits_frame[name].apply(json.loads)
        credits_frame[name] = decoded
    return credits_frame

In [None]:
# Load the movie metadata; load_tmdb_movies also decodes the JSON-encoded columns.
movies = load_tmdb_movies("./upload/tmdb_5000_movies.csv")
# Cast/crew data is not used in the visible part of this notebook, so this load stays disabled.
# credits = load_tmdb_credits("./upload/tmdb_5000_credits.csv")

In [None]:
# Column dtypes and non-null counts of the loaded frame.
movies.info()

In [None]:
# Look at the first record to see what each column holds.
movies.loc[0]

In [None]:
# Flatten the parsed keyword/genre lists (dicts with a 'name' key) into
# space-separated, lower-cased strings.
movies['keywords_txt'] = movies['keywords'].apply(
    lambda items: ' '.join(item['name'].lower() for item in items))
movies['genres_txt'] = movies['genres'].apply(
    lambda items: ' '.join(item['name'].lower() for item in items))

# Replace missing overviews with '' before lower-casing. Otherwise the later
# `.astype('str')` calls turn NaN into the literal word "nan", which would be
# counted as vocabulary and embedded as if it were real text.
movies['overview'] = movies['overview'].fillna('').str.lower()

In [None]:
# Distinct whitespace-separated tokens found in each text factor.
keyword_corpus = {token for text in movies['keywords_txt'] for token in text.split()}
genres_corpus = {token for text in movies['genres_txt'] for token in text.split()}
overview_corpus = {token for text in movies['overview'].astype('str') for token in text.split()}

In [None]:
# Result frame: starts with id/title and uses two-level column labels so each
# model can later add its own ('<model>', '<field>') columns.
embed = movies[['id', 'title']].copy()
embed.columns = pd.MultiIndex.from_arrays([['id', 'title'], ['', '']])

# Explore Input Factors

In [None]:
# Flattened, lower-cased keyword string of the first movie.
movies.loc[0, 'keywords_txt']

In [None]:
# The parsed keyword list that string was built from (items carry a 'name' key).
movies.loc[0, 'keywords']

In [None]:
# Parsed genre list of the same movie.
movies.loc[0, 'genres']

In [None]:
# Lower-cased overview text of the same movie.
movies.loc[0, 'overview']

In [None]:
# Summary table: number of distinct whitespace-separated tokens per text factor.
wi = pd.DataFrame({
    'Factor': ['Keywords', 'Genres', 'Overviews'],
    'Num of Unique Words': [len(keyword_corpus), len(genres_corpus), len(overview_corpus)],
})

# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis='index')
# (available since pandas 1.4) is its replacement.
display(wi.style.hide(axis='index'))

In [None]:
# Token count per movie for each text factor (whitespace-split length).
movies = movies.assign(**{
    length_col: movies[text_col].str.split().str.len()
    for length_col, text_col in (
        ('keywords_len', 'keywords_txt'),
        ('genres_len', 'genres_txt'),
        ('overview_len', 'overview'),
    )
})

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(12, 4))

# One panel per text factor: (length column, number of bins, panel title).
hist_specs = [
    ('keywords_len', 50, 'Keywords'),
    ('genres_len', 6, 'Genres'),
    ('overview_len', 30, 'Overview'),
]
for axis, (column, n_bins, title) in zip(ax, hist_specs):
    movies.hist(column=column, bins=n_bins, ax=axis)
    # DataFrame.hist titles the panel with the column name; override it and
    # label the axes so the figure can be read on its own.
    axis.set_title(title)
    axis.set_xlabel('Words per movie')
    axis.set_ylabel('Number of movies')

fig.tight_layout()

# Create our search terms

In [None]:
# Free-text query that is embedded and compared against the movie text fields.
user_input = 'alien sci-fi space horror'

# Word2Vec

In [None]:
if INSTALL:
    !pip install gensim

import gensim.downloader as api
from gensim.models.keyedvectors import KeyedVectors

In [None]:
# Pretrained Word2Vec vectors, fetched by name through gensim's downloader API.
wv = api.load('word2vec-google-news-300')

In [None]:
def emb_line(txt, mod):
    """Embed a whitespace-separated string as the mean of its word vectors.

    ``mod`` must expose ``vector_size``, support ``word in mod`` and return a
    vector for ``mod[word]``. Words missing from ``mod`` contribute an
    all-zero vector to the average. An empty or whitespace-only ``txt``
    yields the all-zero vector.

    Returns a float32 array of length ``mod.vector_size``.
    """
    zero_vec = np.zeros(mod.vector_size).astype('float32')

    if not txt:
        return zero_vec

    vectors = []
    for token in txt.split():
        vectors.append(mod[token] if token in mod else zero_vec)

    stacked = np.array(vectors)
    if stacked.size == 0:
        # txt held only whitespace, so there was nothing to average.
        return zero_vec
    return stacked.mean(axis=0).astype('float32')

In [None]:
%%time
# Mean Word2Vec vector of every movie's keywords, genres and overview.
for emb_col, text_col in (('keywords_emb', 'keywords_txt'),
                          ('genres_emb', 'genres_txt'),
                          ('overview_emb', 'overview')):
    embed[('Word2Vec', emb_col)] = movies[text_col].astype('str').apply(emb_line, mod=wv)

In [None]:
# Embed the query the same way as the movie fields: mean of its per-word Word2Vec vectors.
wv_inp = emb_line(user_input, mod=wv)

In [None]:
# Cosine similarity between the query vector and every movie embedding.
# Each embedding column stores one 1-D array per row. Stacking them into a
# single dense 2-D matrix first lets cos_sim build its tensor in one step
# instead of converting an object Series element by element.
for sim_col, emb_col in (('keyword_sim', 'keywords_emb'),
                         ('genres_sim', 'genres_emb'),
                         ('overview_sim', 'overview_emb')):
    emb_matrix = np.stack(embed[('Word2Vec', emb_col)].tolist())
    embed[('Word2Vec', sim_col)] = util.cos_sim(wv_inp, emb_matrix).numpy().squeeze()

In [None]:
# Corpus words that the Word2Vec model has a vector for.
wv_kw = {word for word in keyword_corpus if word in wv}
wv_ge = {word for word in genres_corpus if word in wv}
wv_ov = {word for word in overview_corpus if word in wv}

In [None]:
# Vocabulary coverage per factor: absolute count and share of unique words.
wi['In Word2Vec'] = [len(found) for found in (wv_kw, wv_ge, wv_ov)]
wi['Word2Vec%'] = wi['In Word2Vec'] / wi['Num of Unique Words']

In [None]:
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide(axis='index').
wi.style.format({'Word2Vec%': '{:.1%}'}).hide(axis='index')

In [None]:
# Ten movies with the highest Word2Vec keyword similarity to the query.
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide(axis='index').
(
    embed.loc[:, [('title', ''),
                  ('Word2Vec', 'keyword_sim'),
                  ('Word2Vec', 'genres_sim'),
                  ('Word2Vec', 'overview_sim')]]
    .nlargest(10, ('Word2Vec', 'keyword_sim'))
    .style.background_gradient(vmin=-1.0, vmax=1.0)
    .hide(axis='index')
)

# GloVe

In [None]:
if INSTALL:
    # Pure-Python download so the cell does not depend on the Windows-only
    # `curl.exe`, followed by extraction into ./glove/, which is where the
    # loading cell expects glove.6B.50d.txt.
    import urllib.request
    import zipfile

    urllib.request.urlretrieve('https://nlp.stanford.edu/data/glove.6B.zip', 'glove.6B.zip')
    with zipfile.ZipFile('glove.6B.zip') as archive:
        archive.extractall('glove')

In [None]:
# Load the 50-dimensional GloVe vectors from a plain-text file under ./glove/.
# no_header=True is passed because the file is read as text without a
# word2vec-style header line (NOTE(review): confirm against the gensim version in use).
glove_model = KeyedVectors.load_word2vec_format("glove/glove.6B.50d.txt", binary=False, no_header=True)

In [None]:
%%time
# Mean GloVe vector of every movie's keywords, genres and overview.
for emb_col, text_col in (('keywords_emb', 'keywords_txt'),
                          ('genres_emb', 'genres_txt'),
                          ('overview_emb', 'overview')):
    embed[('Glove', emb_col)] = movies[text_col].astype('str').apply(emb_line, mod=glove_model)

In [None]:
# Embed the query as the mean of its per-word GloVe vectors.
glove_inp = emb_line(user_input, mod=glove_model)

In [None]:
# Cosine similarity between the query vector and every movie embedding.
# Each embedding column stores one 1-D array per row. Stacking them into a
# single dense 2-D matrix first lets cos_sim build its tensor in one step
# instead of converting an object Series element by element.
for sim_col, emb_col in (('keyword_sim', 'keywords_emb'),
                         ('genres_sim', 'genres_emb'),
                         ('overview_sim', 'overview_emb')):
    emb_matrix = np.stack(embed[('Glove', emb_col)].tolist())
    embed[('Glove', sim_col)] = util.cos_sim(glove_inp, emb_matrix).numpy().squeeze()

In [None]:
# Corpus words that the GloVe model has a vector for.
gl_kw = {word for word in keyword_corpus if word in glove_model}
gl_ge = {word for word in genres_corpus if word in glove_model}
gl_ov = {word for word in overview_corpus if word in glove_model}

In [None]:
# Vocabulary coverage per factor: absolute count and share of unique words.
wi['In Glove'] = [len(found) for found in (gl_kw, gl_ge, gl_ov)]
wi['Glove%'] = wi['In Glove'] / wi['Num of Unique Words']

In [None]:
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide(axis='index').
wi.style.format({'Word2Vec%': '{:.1%}', 'Glove%': '{:.1%}'}).hide(axis='index')

In [None]:
# Ten movies with the highest GloVe keyword similarity, shown next to the
# Word2Vec scores for comparison.
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide(axis='index').
(
    embed.loc[:, [('title', ''),
                  ('Word2Vec', 'keyword_sim'),
                  ('Word2Vec', 'genres_sim'),
                  ('Word2Vec', 'overview_sim'),
                  ('Glove', 'keyword_sim'),
                  ('Glove', 'genres_sim'),
                  ('Glove', 'overview_sim')]]
    .nlargest(10, ('Glove', 'keyword_sim'))
    .style.background_gradient(vmin=-1.0, vmax=1.0)
    .hide(axis='index')
)

# Sentence Transformer (BERT)

In [None]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence-embedding model by name.
sentTrans = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
%%time
# Sentence embeddings for every movie's keywords, genres and overview.
# encode() is given a plain list of strings rather than a pandas Series, so the
# result does not depend on how the library indexes into a Series.
# list(...) splits the returned batch into one vector per row.
for emb_col, text_col in (('keywords_emb', 'keywords_txt'),
                          ('genres_emb', 'genres_txt'),
                          ('overview_emb', 'overview')):
    sentences = movies[text_col].astype('str').tolist()
    embed[('sentTrans', emb_col)] = list(sentTrans.encode(sentences))

In [None]:
# Embed the whole query string as one sentence.
st_inp = sentTrans.encode(user_input)

In [None]:
# Cosine similarity between the query embedding and every movie embedding.
# The embedding columns hold one vector per row. Stacking them into a single
# dense matrix first lets cos_sim build its tensor in one step instead of
# converting an object Series element by element.
for sim_col, emb_col in (('keyword_sim', 'keywords_emb'),
                         ('genres_sim', 'genres_emb'),
                         ('overview_sim', 'overview_emb')):
    emb_matrix = np.stack(embed[('sentTrans', emb_col)].tolist())
    embed[('sentTrans', sim_col)] = util.cos_sim(st_inp, emb_matrix).numpy().squeeze()

In [None]:
# Coverage for the sentence transformer is entered as 100% rather than
# measured: every unique word is counted as covered.
# NOTE(review): presumably justified by the model's tokenizer handling any word - confirm.
wi['In SentenceTransformer'] = wi['Num of Unique Words']
wi['SentenceTransformer%'] = 1.0

In [None]:
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide(axis='index').
wi.style.format({
    'Word2Vec%': '{:.1%}',
    'Glove%': '{:.1%}',
    'SentenceTransformer%': '{:.1%}',
}).hide(axis='index')

In [None]:
# Ten movies with the highest sentence-transformer keyword similarity, shown
# next to the Word2Vec and GloVe scores for comparison.
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide(axis='index').
(
    embed.loc[:, [('title', ''),
                  ('Word2Vec', 'keyword_sim'),
                  ('Word2Vec', 'genres_sim'),
                  ('Word2Vec', 'overview_sim'),
                  ('Glove', 'keyword_sim'),
                  ('Glove', 'genres_sim'),
                  ('Glove', 'overview_sim'),
                  ('sentTrans', 'keyword_sim'),
                  ('sentTrans', 'genres_sim'),
                  ('sentTrans', 'overview_sim')]]
    .nlargest(10, ('sentTrans', 'keyword_sim'))
    .style.background_gradient(vmin=-1.0, vmax=1.0)
    .hide(axis='index')
)

# InferSent (Facebook)

In [None]:
if INSTALL:
    !git clone https://github.com/facebookresearch/InferSent
    !copy InferSent\models.py .

In [None]:
import nltk
import torch
# Fetch nltk's 'punkt' tokenizer data.
# NOTE(review): presumably required by InferSent's tokenize=True path - confirm.
nltk.download('punkt')

In [None]:
from models import InferSent

# Model version; it selects the checkpoint file and is passed to the encoder.
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V

# Encoder hyper-parameters handed to the InferSent constructor.
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=V,
)

infersent = InferSent(params_model)
# torch.load unpickles the checkpoint, so only load files from a trusted source.
infersent.load_state_dict(torch.load(MODEL_PATH))

In [None]:
# Word-vector file the InferSent encoder is pointed at (a local fastText .vec file).
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

In [None]:
# Build the encoder vocabulary from the union of all corpus words seen above.
# The disabled line below is an alternative call (K=400000) kept for reference.
# infersent.build_vocab_k_words(K=400000)
infersent.build_vocab(keyword_corpus | genres_corpus | overview_corpus)

In [None]:
%%time
# InferSent embeddings for every movie's keywords, genres and overview.
# list(...) splits the returned batch into one vector per row.
for emb_col, text_col in (('keywords_emb', 'keywords_txt'),
                          ('genres_emb', 'genres_txt'),
                          ('overview_emb', 'overview')):
    sentences = movies[text_col].astype('str').values
    embed[('InferSent', emb_col)] = list(infersent.encode(sentences, tokenize=True))

In [None]:
# Encode the shared query (user_input) rather than a hard-coded copy of it, so
# changing the query updates the InferSent results like every other model.
in_inp = infersent.encode([user_input], tokenize=True)
in_inp.shape

In [None]:
# Cosine similarity between the query embedding and every movie embedding.
# The embedding columns hold one vector per row. Stacking them into a single
# dense matrix first lets cos_sim build its tensor in one step instead of
# converting an object Series element by element.
for sim_col, emb_col in (('keyword_sim', 'keywords_emb'),
                         ('genres_sim', 'genres_emb'),
                         ('overview_sim', 'overview_emb')):
    emb_matrix = np.stack(embed[('InferSent', emb_col)].tolist())
    embed[('InferSent', sim_col)] = util.cos_sim(in_inp, emb_matrix).numpy().squeeze()

In [None]:
%%time
# Corpus words present in the vocabulary InferSent built (infersent.word_vec).
in_kw = {word for word in keyword_corpus if word in infersent.word_vec}
in_ge = {word for word in genres_corpus if word in infersent.word_vec}
in_ov = {word for word in overview_corpus if word in infersent.word_vec}

In [None]:
# Vocabulary coverage per factor: absolute count and share of unique words.
wi['In InferSent'] = [len(found) for found in (in_kw, in_ge, in_ov)]
wi['InferSent%'] = wi['In InferSent'] / wi['Num of Unique Words']

In [None]:
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide(axis='index').
wi.style.format({
    'Word2Vec%': '{:.1%}',
    'Glove%': '{:.1%}',
    'SentenceTransformer%': '{:.1%}',
    'InferSent%': '{:.1%}',
}).hide(axis='index')

In [None]:
# Ten movies with the highest InferSent keyword similarity, shown next to the
# scores of the other three models for comparison.
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide(axis='index').
(
    embed.loc[:, [('title', ''),
                  ('Word2Vec', 'keyword_sim'),
                  ('Word2Vec', 'genres_sim'),
                  ('Word2Vec', 'overview_sim'),
                  ('Glove', 'keyword_sim'),
                  ('Glove', 'genres_sim'),
                  ('Glove', 'overview_sim'),
                  ('sentTrans', 'keyword_sim'),
                  ('sentTrans', 'genres_sim'),
                  ('sentTrans', 'overview_sim'),
                  ('InferSent', 'keyword_sim'),
                  ('InferSent', 'genres_sim'),
                  ('InferSent', 'overview_sim')]]
    .nlargest(10, ('InferSent', 'keyword_sim'))
    .style.background_gradient(vmin=-1.0, vmax=1.0)
    .hide(axis='index')
)

In [None]:
import sys

In [None]:
# Snapshot the current namespace and report the 20 names whose objects have
# the largest sys.getsizeof() value.
objs = list(locals().items())
myvars = pd.DataFrame(
    [(name, sys.getsizeof(value)) for name, value in objs],
    columns=['var', 'size'],
)

myvars.sort_values('size', ascending=False).head(20)