In [None]:
import json
import pandas as pd
import numpy as np

from sentence_transformers import util

# Data Load

In [None]:
def load_tmdb_movies(path):
    """Read the TMDB movies CSV at ``path`` into a DataFrame.

    ``release_date`` is parsed with ``pd.to_datetime`` and reduced to plain
    calendar dates. The JSON-encoded columns (genres, keywords, production
    countries, production companies, spoken languages) are decoded with
    ``json.loads`` so each cell holds Python objects instead of raw text.
    """
    movies_df = pd.read_csv(path)

    parsed_dates = pd.to_datetime(movies_df['release_date'])
    movies_df['release_date'] = parsed_dates.apply(lambda stamp: stamp.date())

    for name in ('genres', 'keywords', 'production_countries',
                 'production_companies', 'spoken_languages'):
        movies_df[name] = movies_df[name].map(json.loads)

    return movies_df


def load_tmdb_credits(path):
    """Read the TMDB credits CSV at ``path`` into a DataFrame.

    The ``cast`` and ``crew`` columns hold JSON text in the file; both are
    decoded with ``json.loads`` so each cell holds Python objects.
    """
    credits_df = pd.read_csv(path)

    for name in ('cast', 'crew'):
        credits_df[name] = credits_df[name].map(json.loads)

    return credits_df

In [None]:
# Load both TMDB tables from the local ./upload directory. The loaders decode
# the JSON-encoded columns, so `movies` and `credits` hold Python objects there.
movies = load_tmdb_movies("./upload/tmdb_5000_movies.csv")
credits = load_tmdb_credits("./upload/tmdb_5000_credits.csv")

In [None]:
# Column dtypes and non-null counts for the movies table.
movies.info()

In [None]:
# Column dtypes and non-null counts for the credits table.
credits.info()

In [None]:
# Rows whose `overview` text is missing.
movies.loc[movies['overview'].isna()]

In [None]:
# Show every field of the row labelled 0 as a sample record.
movies.loc[0]

In [None]:
# Flatten the decoded keyword / genre records into one space-separated string
# of their `name` fields, so they can be fed to the text-embedding models.
movies['keywords_txt'] = movies['keywords'].map(
    lambda entries: ' '.join(entry['name'] for entry in entries)
)
movies['genres_txt'] = movies['genres'].map(
    lambda entries: ' '.join(entry['name'] for entry in entries)
)

In [None]:
# `embed` collects, per movie, the embeddings and similarity scores of every
# model. Columns are two-level (model, field); `id` and `title` get an empty
# second level so they sit alongside the per-model columns.
embed = movies[['id', 'title']].copy()
embed.columns = pd.MultiIndex.from_product([['id', 'title'], ['']])

# Word2Vec

In [None]:
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader API. `wv` is read as a notebook-level global by the Word2Vec cells
# (membership tests `kw in wv` and lookups `wv[kw]`).
wv = api.load('word2vec-google-news-300')

In [None]:
def word2vecEmb(kws, model=None):
    """Embed ``kws`` as the mean Word2Vec vector of its in-vocabulary tokens.

    Parameters
    ----------
    kws : str
        Text to embed; it is split on whitespace. Falsy input (``''``,
        ``None``) yields the zero vector.
    model : mapping-like, optional
        Object supporting ``token in model`` and ``model[token]``. Defaults to
        the notebook-level ``wv``. Its ``vector_size`` attribute, when
        present, sets the output dimension; otherwise 300 is assumed.

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` vector. All zeros when no token of ``kws`` is in the
        vocabulary.

    Notes
    -----
    Out-of-vocabulary tokens are skipped rather than averaged in as zero
    vectors, matching how the query vector is built in this notebook. This
    changes only the length of the mean, not its direction, so cosine
    similarities are unaffected.
    """
    vectors = wv if model is None else model
    dim = getattr(vectors, 'vector_size', 300)

    known = [vectors[tok] for tok in kws.split() if tok in vectors] if kws else []
    if not known:
        return np.zeros(dim, dtype='float32')
    return np.mean(known, axis=0).astype('float32')

# Word2Vec embedding of each text field, one vector per movie.
# `overview` has missing rows (see the null check earlier in the notebook);
# astype('str') alone would turn NaN into the literal token "nan" and embed it
# as if it were a real word, so missing text is replaced by '' first.
embed[('Word2Vec', 'keywords_emb')] = movies['keywords_txt'].astype('str').apply(word2vecEmb)
embed[('Word2Vec', 'genres_emb')] = movies['genres_txt'].astype('str').apply(word2vecEmb)
embed[('Word2Vec', 'overview_emb')] = movies['overview'].fillna('').astype('str').apply(word2vecEmb)

In [None]:
# Query vector: mean of the Word2Vec vectors of the query's in-vocabulary tokens.
inp = np.mean(
    [wv[token] for token in 'Alien sci-fi space horror'.split() if token in wv],
    axis=0,
)
inp.shape

In [None]:
# Cosine similarity between the query vector `inp` and every movie embedding.
# Each embedding column is an object Series of 1-D arrays; stacking it into one
# (n_movies, dim) array lets cos_sim convert it in a single step instead of
# walking the Series element by element, and does not rely on a 0..N-1 index.
for emb_col, sim_col in (('keywords_emb', 'keyword_sim'),
                         ('genres_emb', 'genres_sim'),
                         ('overview_emb', 'overview_sim')):
    doc_matrix = np.stack(embed[('Word2Vec', emb_col)].to_numpy())
    embed[('Word2Vec', sim_col)] = util.cos_sim(inp, doc_matrix).numpy().squeeze()

In [None]:
# Ten movies with the highest Word2Vec keyword similarity, colour-graded over
# the full cosine range [-1, 1].
w2v_ranking = (
    embed.loc[:, [('title', ''),
                  ('Word2Vec', 'keyword_sim'),
                  ('Word2Vec', 'genres_sim'),
                  ('Word2Vec', 'overview_sim')]]
    .nlargest(10, ('Word2Vec', 'keyword_sim'))
    .style.background_gradient(vmin=-1.0, vmax=1.0)
)
# Styler.hide_index() no longer exists in pandas 2.x; Styler.hide(axis='index')
# replaces it. Fall back to the old call on pandas versions that predate hide().
w2v_ranking.hide(axis='index') if hasattr(w2v_ranking, 'hide') else w2v_ranking.hide_index()

# Sentence Transformer (BERT)

In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the 'paraphrase-MiniLM-L6-v2' sentence-transformers model.
# `sentTrans.encode` is used for both the movie texts and the query string.
sentTrans = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
%%time
embed[('sentTrans', 'keywords_emb')] = list(sentTrans.encode(movies['keywords_txt'].astype('str')))
embed[('sentTrans', 'genres_emb')] = list(sentTrans.encode(movies['genres_txt'].astype('str')))
embed[('sentTrans', 'overview_emb')] = list(sentTrans.encode(movies['overview'].astype('str')))

In [None]:
# Query vector for the sentence-transformer section. Note that `inp` is rebound
# in every model section, so this cell must run before the sentTrans similarity cell.
inp = sentTrans.encode('Alien sci-fi space horror')

In [None]:
# Cosine similarity between the query vector `inp` and every movie embedding.
# Each embedding column is an object Series of 1-D arrays; stacking it into one
# (n_movies, dim) array lets cos_sim convert it in a single step instead of
# walking the Series element by element, and does not rely on a 0..N-1 index.
for emb_col, sim_col in (('keywords_emb', 'keyword_sim'),
                         ('genres_emb', 'genres_sim'),
                         ('overview_emb', 'overview_sim')):
    doc_matrix = np.stack(embed[('sentTrans', emb_col)].to_numpy())
    embed[('sentTrans', sim_col)] = util.cos_sim(inp, doc_matrix).numpy().squeeze()

In [None]:
# Ten movies with the highest sentence-transformer keyword similarity, shown
# next to the Word2Vec scores and colour-graded over the cosine range [-1, 1].
senttrans_ranking = (
    embed.loc[:, [('title', ''),
                  ('Word2Vec', 'keyword_sim'),
                  ('Word2Vec', 'genres_sim'),
                  ('Word2Vec', 'overview_sim'),
                  ('sentTrans', 'keyword_sim'),
                  ('sentTrans', 'genres_sim'),
                  ('sentTrans', 'overview_sim')]]
    .nlargest(10, ('sentTrans', 'keyword_sim'))
    .style.background_gradient(vmin=-1.0, vmax=1.0)
)
# Styler.hide_index() no longer exists in pandas 2.x; Styler.hide(axis='index')
# replaces it. Fall back to the old call on pandas versions that predate hide().
senttrans_ranking.hide(axis='index') if hasattr(senttrans_ranking, 'hide') else senttrans_ranking.hide_index()

# InferSent (Facebook)

In [None]:
!git clone https://github.com/facebookresearch/InferSent
!copy InferSent\models.py .

In [None]:
import nltk
import torch
# Fetch NLTK's 'punkt' tokenizer data. Presumably required by the
# `infersent.encode(..., tokenize=True)` calls in this notebook — TODO confirm.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead; verify
# against the installed version.
nltk.download('punkt')

In [None]:
from models import InferSent
# InferSent version; it selects the checkpoint file and is passed to the model.
V = 2
MODEL_PATH = 'encoder/infersent{}.pkl'.format(V)

# Hyper-parameters handed to the InferSent constructor.
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=V,
)

infersent = InferSent(params_model)
# Restore the pretrained weights from the local checkpoint.
infersent.load_state_dict(torch.load(MODEL_PATH))

In [None]:
# Point InferSent at the local word-vector file it should read its vocabulary from.
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

In [None]:
# Build the encoder vocabulary with K=100000 words from the vector file set above.
# NOTE(review): presumably the first K entries of the file — confirm in InferSent's models.py.
infersent.build_vocab_k_words(K=100000)

In [None]:
%%time
embed[('InferSent', 'keywords_emb')] = list(infersent.encode(movies.loc[:, 'keywords_txt'].astype('str').values, tokenize=True))
embed[('InferSent', 'genres_emb')] = list(infersent.encode(movies.loc[:, 'genres_txt'].astype('str').values, tokenize=True))
embed[('InferSent', 'overview_emb')] = list(infersent.encode(movies.loc[:, 'overview'].astype('str').values, tokenize=True))

In [None]:
# Query vector for the InferSent section. The query is passed as a one-element
# list; the shape is displayed to check the result. Note that `inp` is rebound
# in every model section, so this cell must run before the InferSent similarity cell.
inp = infersent.encode(['Alien sci-fi space horror'], tokenize=True)
inp.shape

In [None]:
# Cosine similarity between the query vector `inp` and every movie embedding.
# Each embedding column is an object Series of 1-D arrays; stacking it into one
# (n_movies, dim) array lets cos_sim convert it in a single step instead of
# walking the Series element by element, and does not rely on a 0..N-1 index.
for emb_col, sim_col in (('keywords_emb', 'keyword_sim'),
                         ('genres_emb', 'genres_sim'),
                         ('overview_emb', 'overview_sim')):
    doc_matrix = np.stack(embed[('InferSent', emb_col)].to_numpy())
    embed[('InferSent', sim_col)] = util.cos_sim(inp, doc_matrix).numpy().squeeze()

In [None]:
# Ten movies with the highest InferSent keyword similarity, shown next to the
# scores of the other two models and colour-graded over the cosine range [-1, 1].
infersent_ranking = (
    embed.loc[:, [('title', ''),
                  ('Word2Vec', 'keyword_sim'),
                  ('Word2Vec', 'genres_sim'),
                  ('Word2Vec', 'overview_sim'),
                  ('sentTrans', 'keyword_sim'),
                  ('sentTrans', 'genres_sim'),
                  ('sentTrans', 'overview_sim'),
                  ('InferSent', 'keyword_sim'),
                  ('InferSent', 'genres_sim'),
                  ('InferSent', 'overview_sim')]]
    .nlargest(10, ('InferSent', 'keyword_sim'))
    .style.background_gradient(vmin=-1.0, vmax=1.0)
)
# Styler.hide_index() no longer exists in pandas 2.x; Styler.hide(axis='index')
# replaces it. Fall back to the old call on pandas versions that predate hide().
infersent_ranking.hide(axis='index') if hasattr(infersent_ranking, 'hide') else infersent_ranking.hide_index()