In [1]:
!pip install tensorflow-recommenders
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import requests
from wordcloud import WordCloud, STOPWORDS
import json
import string
import re
from typing import Dict, Text
from datetime import datetime
import warnings; warnings.simplefilter('ignore')

In [2]:
ratings_df = pd.read_csv('ratings_small.csv')
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')
movies = pd.read_csv('movies_metadata.csv').drop(['belongs_to_collection', 'homepage', 'imdb_id', 'poster_path', 'status', 'title', 'video'], axis=1).drop([19730, 29503, 35587])
movies['id'] = movies['id'].astype('int64')
df = pd.merge(movies, keywords, on='id')
df = pd.merge(df, credits, on='id')
df['original_language'] = df['original_language'].fillna('')
df['runtime'] = df['runtime'].fillna(0)
df['tagline'] = df['tagline'].fillna('')
df.dropna(inplace=True)
df['genres'] = df['genres'].apply(lambda x: literal_eval(x))
df['genres'] = df['genres'].apply(lambda x: ', '.join([i['name'] for i in x]))
df['production_companies'] = df['production_companies'].apply(lambda x: literal_eval(x))
df['production_companies'] = df['production_companies'].apply(lambda x: ', '.join([i['name'] for i in x]))
df['production_countries'] = df['production_countries'].apply(lambda x: literal_eval(x))
df['production_countries'] = df['production_countries'].apply(lambda x: ', '.join([i['name'] for i in x]))
df['crew'] = df['crew'].apply(lambda x: literal_eval(x))
df['crew'] = df['crew'].apply(lambda x: ', '.join([i['name'] for i in x]))
df['spoken_languages'] = df['spoken_languages'].apply(lambda x: literal_eval(x))
df['spoken_languages'] = df['spoken_languages'].apply(lambda x: ', '.join([i['name'] for i in x]))
df['keywords'] = df['keywords'].apply(lambda x: literal_eval(x))
df['keywords'] = df['keywords'].apply(lambda x: ', '.join([i['name'] for i in x]))
df['characters'] = df['cast'].apply(lambda x: literal_eval(x))
df['characters'] = df['characters'].apply(lambda x: ', '.join([i['character'] for i in x]))
df['actors'] = df['cast'].apply(lambda x: literal_eval(x))
df['actors'] = df['actors'].apply(lambda x: ', '.join([i['name'] for i in x]))
df.drop('cast', axis=1, inplace=True)
df = df[~df['original_title'].duplicated()]
df = df.reset_index(drop=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
ratings_df = ratings_df.assign(date=pd.to_datetime(ratings_df['timestamp'], unit='s'))
ratings_df.drop('timestamp', axis=1, inplace=True)
ratings_df = ratings_df.merge(df[['id', 'original_title', 'genres', 'overview']], left_on='movieId', right_on='id', how='left')
ratings_df = ratings_df[~ratings_df['id'].isna()].drop('id', axis=1).reset_index(drop=True)
movies_df = df[['id', 'original_title']]
movies_df.rename(columns={'id':'movieId'}, inplace=True)

display(credits.head(2))
display(keywords.head(2))
display(movies.head(2))
display(ratings_df.head(2))

ratings_df['userId'] = ratings_df['userId'].astype(str)
ratings = tf.data.Dataset.from_tensor_slices(dict(ratings_df[['userId', 'original_title', 'rating']]))
movies = tf.data.Dataset.from_tensor_slices(dict(movies_df[['original_title']]))
ratings = ratings.map(lambda x: {"original_title": x["original_title"],"userId": x["userId"],"rating": float(x["rating"])})
movies = movies.map(lambda x: x["original_title"])
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)
train = ratings.take(35_000)
test = ratings.skip(35_000).take(8_188)
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000).map(lambda x: x["userId"])
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."


Unnamed: 0,adult,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,tagline,vote_average,vote_count
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",,7.7,5415.0
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Roll the dice and unleash the excitement!,6.9,2413.0


Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
0,1,1371,2.5,2009-12-14 02:52:15,Rocky III,Drama,"Now the world champion, Rocky Balboa is living..."
1,1,1405,1.0,2009-12-14 02:53:23,Greed,"Drama, History",Greed is the classic 1924 silent film by Erich...


In [22]:
class TensorFlowMovieModel(tfrs.models.Model):
  def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
    super().__init__()
    embedding_dimension = 64
    self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)])
    
    self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)])
    
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(1)])
    
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()])
    
    self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.batch(128).map(self.movie_model)))
    
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight
  
  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    user_ids = features["userId"]
    movie_titles = features["original_title"]
    user_embeddings = self.user_model(user_ids)
    movie_embeddings = self.movie_model(movie_titles)
    embeddings = tf.concat([user_embeddings, movie_embeddings], axis=1)
    rating_predictions = self.rating_model(embeddings)
    return (user_embeddings, movie_embeddings, rating_predictions)

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    ratings = features["rating"]
    user_embeddings, movie_embeddings, rating_predictions = self(features)
    rating_loss = tf.reduce_mean(tf.square(ratings - rating_predictions))
    retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)
    return self.rating_weight * rating_loss + self.retrieval_weight * retrieval_loss

    
tensorFlowModel = TensorFlowMovieModel(rating_weight=1.0, retrieval_weight=1.0)
tensorFlowModel.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
cached_train = train.shuffle(100_000).batch(1_000).cache()
cached_test = test.batch(1_000).cache()
tensorFlowModel.fit(cached_train, epochs=3)

tensorFlowModelMetrics = tensorFlowModel.evaluate(cached_test, return_dict=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [24]:
print("Accuracy: " + str(tensorFlowModelMetrics['factorized_top_k/top_100_categorical_accuracy']))
print("Ranking RMSE: " + str(tensorFlowModelMetrics['root_mean_squared_error']))

Accuracy: 0.08622374385595322
Ranking RMSE: 0.0


In [25]:
def getPredictedMovie(user, top_n=10):
    index = tfrs.layers.factorized_top_k.BruteForce(tensorFlowModel.user_model)
    index.index_from_dataset(
      tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(tensorFlowModel.movie_model))))
    _, titles = index(tf.constant([str(user)]))
    for i, title in enumerate(titles[0, :top_n].numpy()):
        print('{}. {}'.format(i+1, title.decode("utf-8")))

In [26]:
getPredictedMovie(123)

1. The Greatest Story Ever Told
2. Un long dimanche de fiançailles
3. Dog Day Afternoon
4. The Party at Kitty and Stud's
5. El otro lado de la cama
6. Un éléphant ça trompe énormément
7. Furankenshutain no Kaijū: Sanda tai Gaira
8. L.A. Story
9. The Brasher Doubloon
10. Pitch Black


In [27]:
index = tfrs.layers.factorized_top_k.BruteForce(tensorFlowModel.user_model)
index.index_from_dataset(tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(tensorFlowModel.movie_model))))
_, titles = index(tf.constant(['123']))
recommended_titles = [title.decode('utf-8') for title in titles[0, :5].numpy()]
prediction_df = ratings_df[ratings_df['original_title'].isin(recommended_titles)]
prediction_df.drop_duplicates(subset=['original_title'], inplace=True)
prediction_df.reset_index(drop=True, inplace=True)
prediction_df.index = np.arange(1, len(prediction_df) + 1)
prediction_df

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
1,3,2841,4.0,2011-02-28 02:55:33,Un long dimanche de fiançailles,Drama,"In 1919, Mathilde was 19 years old. Two years ..."
2,15,2428,1.0,2002-09-30 00:13:02,The Greatest Story Ever Told,"Drama, History",All-star epic retelling of Christ's life.
3,15,2722,2.0,2002-09-30 00:12:12,El otro lado de la cama,"Romance, Drama","When Paula leaves her mate Pedro, he misses he..."
4,19,968,5.0,1997-02-06 01:29:42,Dog Day Afternoon,"Crime, Drama, Thriller",A man robs a bank to pay for his lover's opera...
5,73,4255,0.5,2009-10-15 07:34:42,The Party at Kitty and Stud's,"Drama, Action, Comedy",Kitty and Stud are lovers. They enjoy a robust...


In [28]:
ratings_df[ratings_df['userId'] == '123'].head(6)

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
8053,123,233,4.0,2001-07-01 20:57:06,The Wanderers,Drama,The streets of the Bronx are owned by 60’s you...
8054,123,288,5.0,2001-07-01 19:32:47,High Noon,Western,High Noon is about a recently freed leader of ...
8055,123,407,5.0,2001-07-01 20:57:57,Kurz und schmerzlos,"Drama, Thriller",Three friends get caught in a life of major cr...
8056,123,968,3.0,2001-07-01 20:59:01,Dog Day Afternoon,"Crime, Drama, Thriller",A man robs a bank to pay for his lover's opera...
8057,123,1968,4.0,2001-07-01 19:30:36,Fools Rush In,"Drama, Comedy, Romance",Alex Whitman (Matthew Perry) is a designer fro...
8058,123,1976,4.0,2001-07-01 19:31:51,Jezebel,"Drama, Romance","In 1850s Louisiana, the willfulness of a tempe..."
