In [None]:
import json
import pandas as pd
import numpy as np

from sentence_transformers import util

In [None]:
# Guards the one-off install/download cells below; set to True to run them.
INSTALL = False

# Data Load

In [None]:
def load_tmdb_movies(path):
    """Read the TMDB movies CSV at ``path`` and decode its structured columns.

    ``release_date`` is parsed as a datetime and reduced to its date part,
    and each JSON-encoded column is replaced by the decoded Python object.

    Returns the resulting DataFrame.
    """
    movies_df = pd.read_csv(path)

    parsed_dates = pd.to_datetime(movies_df['release_date'])
    movies_df['release_date'] = parsed_dates.apply(lambda stamp: stamp.date())

    # These columns hold JSON text in the CSV.
    for name in ('genres', 'keywords', 'production_countries',
                 'production_companies', 'spoken_languages'):
        movies_df[name] = movies_df[name].apply(json.loads)

    return movies_df


def load_tmdb_credits(path):
    """Read the TMDB credits CSV at ``path`` and decode its JSON columns.

    The ``cast`` and ``crew`` columns hold JSON text in the CSV; each cell is
    replaced by the decoded Python object.

    Returns the resulting DataFrame.
    """
    credits_df = pd.read_csv(path)
    credits_df['cast'] = credits_df['cast'].apply(json.loads)
    credits_df['crew'] = credits_df['crew'].apply(json.loads)
    return credits_df

In [None]:
# Locations of the two TMDB 5000 tables, relative to the notebook.
MOVIES_CSV = "./upload/tmdb_5000_movies.csv"
CREDITS_CSV = "./upload/tmdb_5000_credits.csv"

movies = load_tmdb_movies(MOVIES_CSV)
credits = load_tmdb_credits(CREDITS_CSV)

In [None]:
# Summary of the movies table (columns, dtypes, non-null counts).
movies.info()

In [None]:
# Summary of the credits table (columns, dtypes, non-null counts).
credits.info()

In [None]:
# Movies that have no overview text.
has_no_overview = movies['overview'].isna()
movies.loc[has_no_overview]

In [None]:
# Show the row labelled 0 as a sample record.
movies.loc[0]

In [None]:
def _names_as_text(entries):
    """Join the lower-cased ``name`` of every dict in ``entries`` with spaces."""
    return ' '.join([entry['name'].lower() for entry in entries])


# Flatten the decoded keyword/genre lists into plain lower-case text.
movies['keywords_txt'] = movies['keywords'].apply(_names_as_text)
movies['genres_txt'] = movies['genres'].apply(_names_as_text)
# Overviews are already text; only normalise their case.
movies['overview'] = movies['overview'].str.lower()

In [None]:
# Vocabulary (set of unique whitespace-separated tokens) of each text field.
keyword_corpus = set(' '.join(movies['keywords_txt'].values).split())
genres_corpus = set(' '.join(movies['genres_txt'].values).split())
# Drop missing overviews first: casting NaN to str would add a spurious
# 'nan' token to the vocabulary and skew the coverage statistics.
overview_corpus = set(' '.join(movies['overview'].dropna().astype('str').values).split())

In [None]:
# Result table: one row per movie, with two-level column labels so that each
# model can later add its own group of columns.
embed = movies[['id', 'title']].copy()
embed.columns = pd.MultiIndex.from_tuples([(name, '') for name in ('id', 'title')])

In [None]:
# Free-text query that the embedding sections below compare every movie against.
user_input = 'alien sci-fi space horror'

# Word2Vec

In [None]:
if INSTALL:
    !pip install gensim

import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [None]:
def word2vecEmb(kws):
    """Average the Word2Vec vectors of the whitespace-separated words in ``kws``.

    Words missing from ``wv`` contribute a zero vector to the average. An
    empty input (or one with no words at all) yields a zero vector.

    Returns a float32 array of length 300.
    """
    words = kws.split() if kws else []
    if not words:
        return np.zeros(300).astype('float32')

    vectors = np.array([wv[word] if word in wv else np.zeros(300) for word in words])
    return vectors.mean(axis=0).astype('float32')

In [None]:
%%time
embed[('Word2Vec', 'keywords_emb')] = movies['keywords_txt'].astype('str').apply(word2vecEmb)
embed[('Word2Vec', 'genres_emb')] = movies['genres_txt'].astype('str').apply(word2vecEmb)
embed[('Word2Vec', 'overview_emb')] = movies['overview'].astype('str').apply(word2vecEmb)

In [None]:
# Query embedding: mean of the vectors of the query words known to Word2Vec.
query_vectors = [wv[kw] for kw in user_input.split() if kw in wv]
if query_vectors:
    wv_inp = np.array(query_vectors).mean(axis=0)
else:
    # No query word is in the vocabulary: use a zero vector rather than the
    # NaN vector (and RuntimeWarning) that the mean of an empty array gives.
    wv_inp = np.zeros(wv.vector_size, dtype='float32')

In [None]:
# Cosine similarity between the query and each movie, per text field.
for sim_name, emb_name in (('keyword_sim', 'keywords_emb'),
                           ('genres_sim', 'genres_emb'),
                           ('overview_sim', 'overview_emb')):
    scores = util.cos_sim(wv_inp, embed[('Word2Vec', emb_name)])
    embed[('Word2Vec', sim_name)] = scores.numpy().squeeze()

In [None]:
# Words of each corpus that Word2Vec has a vector for.
wv_kw = {wd for wd in keyword_corpus if wd in wv}
wv_ge = {wd for wd in genres_corpus if wd in wv}
wv_ov = {wd for wd in overview_corpus if wd in wv}

In [None]:
# Vocabulary coverage table: corpus size vs. number of words known to the model.
corpus_sizes = [len(keyword_corpus), len(genres_corpus), len(overview_corpus)]
w2v_hits = [len(wv_kw), len(wv_ge), len(wv_ov)]

wi = pd.DataFrame({
    'Factor': ['Keywords', 'Genres', 'Overviews'],
    'Total': corpus_sizes,
    'In Word2Vec' : w2v_hits,
    'Word2Vec%' : [hits / total * 100 for hits, total in zip(w2v_hits, corpus_sizes)],
})

In [None]:
# Styler does not use the `display.float_format` option, so the percent format
# is applied through Styler.format on the percentage columns only.
pct_cols = [col for col in wi.columns if col.endswith('%')]
wi_styled = wi.style.format('{:,.1f}%', subset=pct_cols)
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide when available.
wi_styled = wi_styled.hide(axis='index') if hasattr(wi_styled, 'hide') else wi_styled.hide_index()
display(wi_styled)

In [None]:
# Ten movies whose keywords are most similar to the query under Word2Vec.
w2v_cols = [('title', '')] + [('Word2Vec', sim)
                              for sim in ('keyword_sim', 'genres_sim', 'overview_sim')]
w2v_top = embed.loc[:, w2v_cols].nlargest(10, ('Word2Vec', 'keyword_sim'))
# Fixed colour range so shades are comparable across tables (cosine is in [-1, 1]).
w2v_table = w2v_top.style.background_gradient(vmin=-1.0, vmax=1.0)
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide when available.
w2v_table.hide(axis='index') if hasattr(w2v_table, 'hide') else w2v_table.hide_index()

In [None]:
# Drop the reference to the Word2Vec vectors; the cells below do not use them.
del wv

# GloVe

In [None]:
if INSTALL:
    import urllib.request
    import zipfile

    # Pure-Python download: works on every OS, unlike the Windows-only `curl.exe`.
    urllib.request.urlretrieve("https://nlp.stanford.edu/data/glove.6B.zip", "glove.6B.zip")
    # Unpack into ./glove, which is where the loading cell reads the vectors from.
    with zipfile.ZipFile("glove.6B.zip") as glove_zip:
        glove_zip.extractall("glove")

In [None]:
import csv
# Each line of the file is a token followed by its space-separated vector
# components; the result is a frame indexed by token with columns v0..v49.
glove = pd.read_csv(
    "glove/glove.6B.50d.txt", encoding="utf8", sep=" ", quoting=csv.QUOTE_NONE, header=None,
    # Without this, tokens that look like NA markers (e.g. "nan", "null") are
    # parsed as missing values and end up as NaN index labels.
    na_filter=False,
)
glove.columns = ['word'] + ['v' + str(x) for x in range(50)]
glove = glove.set_index('word')

In [None]:
%%time
kw_ix = ('Glove', 'keywords_emb')
ge_ix = ('Glove', 'genres_emb')
ov_ix = ('Glove', 'overview_emb')

kw = movies['keywords_txt'].str.split(expand=True).stack().reset_index()
kw.columns = ['idx', 'cnt', 'word']
kw = kw.merge(glove, how='left', left_on='word', right_index=True).groupby('idx').mean().drop(columns='cnt')

ge = movies['genres_txt'].str.split(expand=True).stack().reset_index()
ge.columns = ['idx', 'cnt', 'word']
ge = ge.merge(glove, how='left', left_on='word', right_index=True).groupby('idx').mean().drop(columns='cnt')

ov = movies['overview'].str.split(expand=True).stack().reset_index()
ov.columns = ['idx', 'cnt', 'word']
ov = ov.merge(glove, how='left', left_on='word', right_index=True).groupby('idx').mean().drop(columns='cnt')

embed[kw_ix] = kw.apply(lambda row: np.array(row.values, dtype='float32'), axis=1)
embed[ge_ix] = ge.apply(lambda row: np.array(row.values, dtype='float32'), axis=1)
embed[ov_ix] = ov.apply(lambda row: np.array(row.values, dtype='float32'), axis=1)

In [None]:
# Give every movie without a usable GloVe embedding a zero vector, so the
# similarity step below always receives vectors of the same length.
glove_dim = glove.shape[1]
for col_ix in (kw_ix, ge_ix, ov_ix):
    # Unusable = missing entry, or a vector containing NaN (every word of the
    # text was unknown to GloVe). A plain isnull() on the column misses the latter.
    unusable = embed[col_ix].apply(lambda vec: bool(np.any(pd.isnull(vec))))
    if unusable.any():
        embed.loc[unusable, col_ix] = pd.Series(
            [np.zeros(glove_dim, dtype='float32') for _ in range(int(unusable.sum()))],
            index=embed.index[unusable],
            dtype=object,
        )

In [None]:
# Query embedding: mean GloVe vector of the query words GloVe knows.
# Unknown words are skipped, because glove.loc[...] raises KeyError when any
# requested label is missing.
glove_query_words = [word for word in user_input.split() if word in glove.index]
if glove_query_words:
    glove_inp = glove.loc[glove_query_words].mean().values.astype('float32')
else:
    # No query word is known: fall back to a zero vector instead of NaNs.
    glove_inp = np.zeros(glove.shape[1], dtype='float32')

In [None]:
# Cosine similarity between the query and each movie, per text field.
for sim_name, emb_name in (('keyword_sim', 'keywords_emb'),
                           ('genres_sim', 'genres_emb'),
                           ('overview_sim', 'overview_emb')):
    scores = util.cos_sim(glove_inp, embed[('Glove', emb_name)])
    embed[('Glove', sim_name)] = scores.numpy().squeeze()

In [None]:
# Words of each corpus that GloVe has a vector for.
gl_kw = {wd for wd in keyword_corpus if wd in glove.index}
gl_ge = {wd for wd in genres_corpus if wd in glove.index}
gl_ov = {wd for wd in overview_corpus if wd in glove.index}

In [None]:
# Add GloVe coverage (count and percentage) to the vocabulary table.
glove_hits = [len(gl_kw), len(gl_ge), len(gl_ov)]
glove_totals = [len(keyword_corpus), len(genres_corpus), len(overview_corpus)]
wi['In Glove'] = glove_hits
wi['Glove%'] = [hits / total * 100 for hits, total in zip(glove_hits, glove_totals)]

In [None]:
# Styler does not use the `display.float_format` option, so the percent format
# is applied through Styler.format on the percentage columns only.
pct_cols = [col for col in wi.columns if col.endswith('%')]
wi_styled = wi.style.format('{:,.1f}%', subset=pct_cols)
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide when available.
wi_styled = wi_styled.hide(axis='index') if hasattr(wi_styled, 'hide') else wi_styled.hide_index()
display(wi_styled)

In [None]:
# Ten movies whose keywords are most similar to the query under GloVe, with
# the Word2Vec scores alongside for comparison.
glove_cols = [('title', '')] + [(model, sim)
                                for model in ('Word2Vec', 'Glove')
                                for sim in ('keyword_sim', 'genres_sim', 'overview_sim')]
glove_top = embed.loc[:, glove_cols].nlargest(10, ('Glove', 'keyword_sim'))
# Fixed colour range so shades are comparable across tables (cosine is in [-1, 1]).
glove_table = glove_top.style.background_gradient(vmin=-1.0, vmax=1.0)
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide when available.
glove_table.hide(axis='index') if hasattr(glove_table, 'hide') else glove_table.hide_index()

# Sentence Transformer (BERT)

In [None]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence-embedding model identified by this name.
sentTrans = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
%%time
embed[('sentTrans', 'keywords_emb')] = list(sentTrans.encode(movies['keywords_txt'].astype('str')))
embed[('sentTrans', 'genres_emb')] = list(sentTrans.encode(movies['genres_txt'].astype('str')))
embed[('sentTrans', 'overview_emb')] = list(sentTrans.encode(movies['overview'].astype('str')))

In [None]:
# Embed the query with the same model used for the movie texts.
st_inp = sentTrans.encode(user_input)

In [None]:
# Cosine similarity between the query and each movie, per text field.
for sim_name, emb_name in (('keyword_sim', 'keywords_emb'),
                           ('genres_sim', 'genres_emb'),
                           ('overview_sim', 'overview_emb')):
    scores = util.cos_sim(st_inp, embed[('sentTrans', emb_name)])
    embed[('sentTrans', sim_name)] = scores.numpy().squeeze()

In [None]:
# Ten movies whose keywords are most similar to the query under the sentence
# transformer, with the earlier models' scores alongside for comparison.
st_cols = [('title', '')] + [(model, sim)
                             for model in ('Word2Vec', 'Glove', 'sentTrans')
                             for sim in ('keyword_sim', 'genres_sim', 'overview_sim')]
st_top = embed.loc[:, st_cols].nlargest(10, ('sentTrans', 'keyword_sim'))
# Fixed colour range so shades are comparable across tables (cosine is in [-1, 1]).
st_table = st_top.style.background_gradient(vmin=-1.0, vmax=1.0)
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide when available.
st_table.hide(axis='index') if hasattr(st_table, 'hide') else st_table.hide_index()

In [None]:
# Drop the reference to the sentence-transformer model; the cells below do not use it.
del sentTrans

# InferSent (Facebook)

In [None]:
if INSTALL:
    !git clone https://github.com/facebookresearch/InferSent
    !copy InferSent\models.py .

In [None]:
import nltk
import torch
# Fetch the NLTK 'punkt' tokenizer data (presumably needed by the
# tokenize=True calls to InferSent below — TODO confirm).
nltk.download('punkt')

In [None]:
from models import InferSent

# Model version; it selects the weights file and is passed on to the encoder.
V = 2
MODEL_PATH = f'encoder/infersent{V}.pkl'

# Encoder hyper-parameters handed to the InferSent constructor.
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=V,
)

infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

In [None]:
# Word-vector file that the InferSent encoder is pointed at.
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

In [None]:
# Build the encoder vocabulary from the union of the three corpora.
# Disabled alternative: infersent.build_vocab_k_words(K=400000)
infersent.build_vocab(keyword_corpus | genres_corpus | overview_corpus)

In [None]:
%%time
embed[('InferSent', 'keywords_emb')] = list(infersent.encode(movies.loc[:, 'keywords_txt'].astype('str').values, tokenize=True))
embed[('InferSent', 'genres_emb')] = list(infersent.encode(movies.loc[:, 'genres_txt'].astype('str').values, tokenize=True))
embed[('InferSent', 'overview_emb')] = list(infersent.encode(movies.loc[:, 'overview'].astype('str').values, tokenize=True))

In [None]:
# Encode the shared query (not a hard-coded copy of it) so this section stays
# in sync with the other models when `user_input` changes.
inp = infersent.encode([user_input], tokenize=True)
inp.shape

In [None]:
# Cosine similarity between the query and each movie, per text field.
for sim_name, emb_name in (('keyword_sim', 'keywords_emb'),
                           ('genres_sim', 'genres_emb'),
                           ('overview_sim', 'overview_emb')):
    scores = util.cos_sim(inp, embed[('InferSent', emb_name)])
    embed[('InferSent', sim_name)] = scores.numpy().squeeze()

In [None]:
%%time
in_kw = set([wd for wd in keyword_corpus if wd in infersent.word_vec])
in_ge = set([wd for wd in genres_corpus if wd in infersent.word_vec])
in_ov = set([wd for wd in overview_corpus if wd in infersent.word_vec])

In [None]:
# Add InferSent coverage (count and percentage) to the vocabulary table.
infersent_hits = [len(in_kw), len(in_ge), len(in_ov)]
infersent_totals = [len(keyword_corpus), len(genres_corpus), len(overview_corpus)]
wi['In InferSent'] = infersent_hits
wi['InferSent%'] = [hits / total * 100 for hits, total in zip(infersent_hits, infersent_totals)]

In [None]:
# Styler does not use the `display.float_format` option, so the percent format
# is applied through Styler.format on the percentage columns only.
pct_cols = [col for col in wi.columns if col.endswith('%')]
wi_styled = wi.style.format('{:,.1f}%', subset=pct_cols)
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide when available.
wi_styled = wi_styled.hide(axis='index') if hasattr(wi_styled, 'hide') else wi_styled.hide_index()
display(wi_styled)

In [None]:
# Ten movies whose keywords are most similar to the query under InferSent,
# with the scores of all four models side by side.
all_cols = [('title', '')] + [(model, sim)
                              for model in ('Word2Vec', 'Glove', 'sentTrans', 'InferSent')
                              for sim in ('keyword_sim', 'genres_sim', 'overview_sim')]
infersent_top = embed.loc[:, all_cols].nlargest(10, ('InferSent', 'keyword_sim'))
# Fixed colour range so shades are comparable across tables (cosine is in [-1, 1]).
infersent_table = infersent_top.style.background_gradient(vmin=-1.0, vmax=1.0)
# Styler.hide_index() was removed in pandas 2.0; use Styler.hide when available.
infersent_table.hide(axis='index') if hasattr(infersent_table, 'hide') else infersent_table.hide_index()

In [None]:
# Drop the reference to the InferSent encoder now that its embeddings are stored in `embed`.
del infersent