In [2]:
import pandas as pd

# Load Data

In [3]:
# Read the three source tables; movie ids and timestamps in the ratings file are parsed as integers.
ratings_dtypes = {'movie': int, 'timestamp': int}

movies = pd.read_csv('/content/movies.csv', encoding='latin-1')
users = pd.read_csv('/content/users.csv')
ratings = pd.read_csv('/content/ratings.csv', dtype=ratings_dtypes)

In [4]:
movies.head(2)

Unnamed: 0,movie,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy


In [5]:
ratings.head(2)

Unnamed: 0,user,movie,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109


In [6]:
users.head(2)

Unnamed: 0,user,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072


## Extract Year from movie title

In [7]:
# Split "Name (YYYY)" into the bare name (group 1) and the four-digit year (group 2).
title_and_year = movies['title'].str.extract(r'(.*?)\s\((\d{4})\)')
movies[['title', 'year']] = title_and_year

In [8]:
movies.head(2)

Unnamed: 0,movie,title,genres,year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995


# Split different genres

In [9]:
def _split_genres(raw):
    """Turn a '|'-separated genre string into a list; missing values become an empty list."""
    if pd.isnull(raw):
        return []
    return str(raw).split('|')

movies['genres'] = movies['genres'].apply(_split_genres)

In [10]:
movies.head(2)

Unnamed: 0,movie,title,genres,year
0,1,Toy Story,"[Animation, Children's, Comedy]",1995
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995


# Drop unnecessary features

In [11]:
# The rating timestamp is not used as a feature.
ratings.drop(columns='timestamp', inplace=True)

In [12]:
# Zip code and occupation are not used as features.
users.drop(columns=['zip', 'occupation'], inplace=True)

# Merge all tables

In [13]:
# Attach the user attributes to every rating row, then the movie attributes.
ratings_with_users = ratings.join(users.set_index('user'), on='user', how='inner', rsuffix='2')
all_data = ratings_with_users.join(movies.set_index('movie'), on='movie', how='inner', rsuffix='2')

print(f'Samples count: {len(all_data.index)}')
all_data.head()

Samples count: 1000209


Unnamed: 0,user,movie,rating,gender,age,title,genres,year
0,1,1193,5,F,1,One Flew Over the Cuckoo's Nest,[Drama],1975
120,2,1193,5,M,56,One Flew Over the Cuckoo's Nest,[Drama],1975
1339,12,1193,4,M,25,One Flew Over the Cuckoo's Nest,[Drama],1975
1518,15,1193,4,M,25,One Flew Over the Cuckoo's Nest,[Drama],1975
1747,17,1193,5,M,50,One Flew Over the Cuckoo's Nest,[Drama],1975


In [14]:
all_data['age'].nunique()

7

# One Hot Encoding for genres

In [15]:
from sklearn.preprocessing import MultiLabelBinarizer

# Each row of all_data['genres'] is a list of genre names.
genre_lists = all_data['genres']

# Learn the genre vocabulary and produce one 0/1 indicator column per genre.
mlb = MultiLabelBinarizer()
multi_hot_encoded = mlb.fit_transform(genre_lists)

In [16]:
# Create a new DataFrame with the multi-hot encoded vectors.
# The rows of `multi_hot_encoded` are in the positional order of `all_data`, whose index
# is the (non-monotonic) original ratings index after the joins. Reusing that index is
# required: pd.concat(axis=1) aligns rows by label, so a default 0..n-1 index would pair
# each rating with another row's genre vector.
encoded_df = pd.DataFrame(multi_hot_encoded, columns=mlb.classes_, index=all_data.index)

In [17]:
# Append the genre indicator columns to the merged ratings table (rows are matched by index label).
preprocessed_df = pd.concat([all_data, encoded_df], axis='columns')

In [18]:
# The list-valued genres column is now redundant with the indicator columns; year is not used.
preprocessed_df.drop(columns=['genres', 'year'], inplace=True)

In [19]:
preprocessed_df.head(1)

Unnamed: 0,user,movie,rating,gender,age,title,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1193,5,F,1,One Flew Over the Cuckoo's Nest,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Use the pretrained glove-twitter-25 model to transform each title into a 25-dimensional vector

In [20]:
import numpy as np
import re
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
import gensim.downloader as api

In [21]:
# Download the GloVe model
# Loads the "glove-twitter-25" vectors through gensim's downloader API
# (presumably fetched on first use and cached locally afterwards — confirm in the gensim docs).
glove_model = api.load("glove-twitter-25")

In [22]:
# Function to clean and tokenize the title
def clean_and_tokenize(title):
    """Normalise a movie title into a list of lower-case word tokens.

    Every character that is not an ASCII letter, a digit or whitespace is removed
    (so "Cuckoo's" becomes "cuckoos"), the text is lower-cased and split on whitespace.

    Parameters
    ----------
    title : str
        Raw movie title. Any non-string value (None, NaN) is treated as a missing title.

    Returns
    -------
    list of str
        The tokens, or an empty list when the title is missing or has no usable characters.
    """
    if not isinstance(title, str):
        # Titles that do not match the "Name (YYYY)" extraction pattern come back as NaN;
        # return no tokens instead of raising TypeError inside re.sub.
        return []
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', title)  # Remove non-alphanumeric characters
    return stripped.lower().split()

title = "One Flew Over the Cuckoo's Nest"
tokens = clean_and_tokenize(title)
print(tokens)

['one', 'flew', 'over', 'the', 'cuckoos', 'nest']


In [23]:
# Function to get the sum of embeddings for a list of words
def get_embedding_sum(tokens):
    """Add up the `glove_model` vectors of all tokens the model contains.

    Tokens that are not in `glove_model` are skipped. The accumulator is a
    length-25 zero vector, which is also the result when no token is known.
    """
    total = np.zeros(25)
    known_vectors = (glove_model[word] for word in tokens if word in glove_model)
    for vector in known_vectors:
        total += vector
    return total

print(get_embedding_sum(tokens))

[ -0.191339     0.04330699   2.46325295  -0.18650998  -5.14390003
  -0.61849102   3.19921003  -4.94781996  -1.65898596   2.01279497
   2.01219501   4.02643998 -20.57420015   2.19508894   0.21093699
   3.79346988   4.47298001   1.42998942  -0.42103002  -0.24349003
  -3.02258502   0.160147     4.05436006  -4.68331991   0.31828001]


In [24]:
# Tokenise every title once, keep the tokens, then sum the token embeddings per title.
title_tokens = preprocessed_df['title'].apply(clean_and_tokenize)
preprocessed_df['cleaned_tokens'] = title_tokens
preprocessed_df['embedding_sum'] = title_tokens.apply(get_embedding_sum)

In [25]:
preprocessed_df.head(1)

Unnamed: 0,user,movie,rating,gender,age,title,Action,Adventure,Animation,Children's,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,cleaned_tokens,embedding_sum
0,1,1193,5,F,1,One Flew Over the Cuckoo's Nest,0,0,0,0,...,0,0,0,0,0,0,0,0,"[one, flew, over, the, cuckoos, nest]","[-0.19133899826556444, 0.0433069933205843, 2.4..."


In [26]:
# One column per component of the 25-dimensional summed title embedding.
embedding_columns = [f'embedding_{i+1}' for i in range(25)]

# `.tolist()` discards the row labels, so the frame must be given preprocessed_df's index
# explicitly. pd.concat(axis=1) aligns rows by label; with a default 0..n-1 index each
# row would receive the title embedding of a different row.
embedding_df = pd.DataFrame(
    preprocessed_df['embedding_sum'].tolist(),
    columns=embedding_columns,
    index=preprocessed_df.index,
)

In [27]:
# Append the title-embedding columns (rows are matched by index label).
final_df = pd.concat([preprocessed_df, embedding_df], axis='columns')

In [28]:
# The raw title text is now represented by the embedding columns.
final_df.drop(columns=['title'], inplace=True)

In [29]:
final_df.head(1)

Unnamed: 0,user,movie,rating,gender,age,Action,Adventure,Animation,Children's,Comedy,...,embedding_16,embedding_17,embedding_18,embedding_19,embedding_20,embedding_21,embedding_22,embedding_23,embedding_24,embedding_25
0,1,1193,5,F,1,0,0,0,0,0,...,3.79347,4.47298,1.429989,-0.42103,-0.24349,-3.022585,0.160147,4.05436,-4.68332,0.31828


# Split train and test data

In [30]:
# Reset index
# Replace the existing row labels with a fresh 0..n-1 index (the old labels are discarded
# via drop=True); the modulo-based train/test split below selects rows by these labels.
final_df.reset_index(drop=True, inplace=True)

In [31]:
# Every 20th row (index divisible by 20) is held out for testing; the rest is training data.
is_test_row = final_df.index % 20 == 0
test_indices = final_df.index[is_test_row]
train_indices = final_df.index[~is_test_row]

In [32]:
# Materialise the train and test frames from the index labels computed above.
train_data, test_data = final_df.loc[train_indices], final_df.loc[test_indices]

# Baselines
We compute the RMSE of the following baselines, each representing a naive approach to predicting ratings using basic statistics alone. Note that each baseline is scored on the training split.

## Random guessing
Predict ratings randomly using uniform sampling of the five possible ratings.

In [None]:
from sklearn.metrics import mean_squared_error

# Seed the global NumPy RNG so the sampled baselines are reproducible.
np.random.seed(42)

# One uniformly drawn rating in {1, ..., 5} per training row (the upper bound 6 is exclusive).
random_ratings = np.random.randint(low=1, high=6, size=len(train_data))

mse = mean_squared_error(train_data['rating'], random_ratings)
rmse = np.sqrt(mse)

print("\nRMSE of Random Guessing:", rmse)


RMSE of Random Guessing: 1.8927228186609004


## Weighted sampling
Predict ratings randomly, giving each rating value a weight based on the number of its appearances in training data.

In [None]:
# Count how often each rating value occurs in the training data.
rating_frequency = train_data['rating'].value_counts()

# Turn the counts into sampling probabilities.
total_ratings = len(train_data)
weights = rating_frequency / total_ratings

# Draw one rating per training row according to the empirical rating distribution.
random_ratings_weighted = np.random.choice(weights.index, size=total_ratings, p=weights)

mse = mean_squared_error(train_data['rating'], random_ratings_weighted)
rmse = np.sqrt(mse)

print("\nRMSE of Weighted Sampling:", rmse)


RMSE of Weighted Sampling: 1.5787861666414635


## Majority Class

In [None]:
# The most frequent rating value in the training data.
rating_counts = train_data['rating'].value_counts()
majority_class = rating_counts.idxmax()

# Predict that single value for every training row.
majority_predictions = np.full(len(train_data), majority_class)

# RMSE of the constant prediction.
mse_majority = mean_squared_error(train_data['rating'], majority_predictions)
rmse_majority = np.sqrt(mse_majority)

print("\nRMSE of Majority Class Prediction:", rmse_majority)


RMSE of Majority Class Prediction: 1.1927920782845602


## Mean value
Predict the mean rating over the training data (this probably won’t be an integer).

In [None]:
# Average rating over the training data, predicted for every training row.
train_ratings = train_data['rating']
mean_value = train_ratings.mean()
predictions = np.full(len(train_ratings), mean_value)

# RMSE of the constant prediction.
mse_mean = mean_squared_error(train_ratings, predictions)
rmse_mean = np.sqrt(mse_mean)

print("\nRMSE of Mean Value Prediction:", rmse_mean)


RMSE of Mean Value Prediction: 1.1170133846611283


# Use Tensorflow Recommenders

In [33]:
! pip install -q tensorflow-recommenders

In [34]:
from collections import Counter, OrderedDict
# Users ordered by number of ratings, most active first (ties keep first-seen order).
du = OrderedDict(Counter(all_data['user']).most_common())
lu = list(du.values())

# Share of all ratings that comes from the 2500 most active users.
sum(lu[:2500]) / sum(lu)

0.7983711404316498

In [35]:
# Movies ordered by number of ratings, most rated first (ties keep first-seen order).
dm = OrderedDict(Counter(all_data['movie']).most_common())
lm = list(dm.values())

# Share of all ratings that belongs to the 1200 most rated movies.
sum(lm[:1200]) / sum(lm)

0.8047208133500099

In [36]:
# Lookup vocabularies: ids of the 2500 most active users and the 1200 most rated movies, as strings.
all_users = [str(int(user_id)) for user_id in list(du)[:2500]]
all_movies = [str(int(movie_id)) for movie_id in list(dm)[:1200]]

In [None]:
import tensorflow as tf
import tensorflow_recommenders as tfrs

def convert_to_dataset(df):
    """Build a tf.data.Dataset whose elements are dicts with one entry per column of `df`."""
    columns = {name: series.to_numpy() for name, series in df.items()}
    return tf.data.Dataset.from_tensor_slices(columns)

id_and_rating = ['user', 'movie', 'rating']
train_dataset = convert_to_dataset(train_data[id_and_rating])
test_dataset = convert_to_dataset(test_data[id_and_rating])

In [None]:
class RatingPredictionModel(tfrs.models.Model):
    """Two-tower rating regressor driven only by user and movie ids.

    Each tower looks its string id up in a fixed vocabulary (`all_users` /
    `all_movies`), embeds it in 25 dimensions and projects it to 20 units.
    The tower outputs are multiplied element-wise and reduced to a single
    predicted rating. Training uses a TFRS Ranking task with MSE loss and
    an RMSE metric.
    """

    def __init__(self):
        super().__init__()
        layers = tf.keras.layers

        # User tower
        self.user_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='user_input')
        user_lookup = layers.StringLookup(vocabulary=all_users, name='user_string_lookup')
        self.user_sl = user_lookup(self.user_input)
        # One extra embedding row beyond the vocabulary size.
        user_embedding = layers.Embedding(len(all_users)+1, 25, name='user_emb')
        self.user_emb = tf.squeeze(user_embedding(self.user_sl), axis=1)
        self.user_dense = layers.Dense(20, activation='relu', name='user_dense')(self.user_emb)

        # Movie tower
        self.movie_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='movie_input')
        movie_lookup = layers.StringLookup(vocabulary=all_movies, name='movie_string_lookup')
        self.movie_sl = movie_lookup(self.movie_input)
        movie_embedding = layers.Embedding(len(all_movies)+1, 25, name='movie_emb')
        self.movie_emb = tf.squeeze(movie_embedding(self.movie_sl), axis=1)
        self.movie_dense = layers.Dense(20, activation='relu', name='movie_dense')(self.movie_emb)

        # Merging towers
        tower_outputs = [self.user_dense, self.movie_dense]
        self.towers_multiplied = layers.Multiply(name='towers_multiplied')(tower_outputs)
        self.towers_dense = layers.Dense(10, activation='relu', name='towers_dense')(self.towers_multiplied)
        self.output_node = layers.Dense(1, name='output_node')(self.towers_dense)

        # Model definition
        model_inputs = {'user': self.user_input, 'movie': self.movie_input}
        self.model = tf.keras.Model(inputs=model_inputs, outputs=self.output_node)

        self.task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )

    def call(self, features):
        """Predict ratings for a batch; the user and movie ids are converted to strings first."""
        user_ids = tf.strings.as_string(features["user"])
        movie_ids = tf.strings.as_string(features["movie"])
        return self.model({'user': user_ids, 'movie': movie_ids})

    def compute_loss(self, features, **kwargs):
        """Return the ranking-task loss of the predictions against `features["rating"]`."""
        predictions = self(features)
        return self.task(labels=features["rating"], predictions=predictions)

In [None]:
model = RatingPredictionModel()

learning_rate = 2e-3
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer)

cached_train = train_dataset.shuffle(15_000).batch(10_000).cache()

# Stop once the training loss has not improved for 5 epochs and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=5,
    restore_best_weights=True,
)
model.fit(cached_train, epochs=1000, callbacks=[early_stopping])

In [None]:
# Score the trained model on the held-out test rows.
cached_test = test_dataset.batch(batch_size=5000).cache()
model.evaluate(cached_test, return_dict=True)



{'root_mean_squared_error': 0.9757083654403687,
 'loss': 1.8328609466552734,
 'regularization_loss': 0,
 'total_loss': 1.8328609466552734}

In [None]:
model.model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 movie_input (InputLayer)       [(None, 1)]          0           []                               
                                                                                                  
 user_string_lookup (StringLook  (None, 1)           0           ['user_input[0][0]']             
 up)                                                                                              
                                                                                                  
 movie_string_lookup (StringLoo  (None, 1)           0           ['movie_input[0][0]']        

# A better TF Recommender

In [39]:
import tensorflow as tf
import tensorflow_recommenders as tfrs

In [42]:
# Columns the model below does not consume.
unused_columns = ['age', 'gender', 'cleaned_tokens', 'embedding_sum']

# Reassign instead of drop(inplace=True): the frames were sliced out of final_df, and with
# errors='ignore' the cell can be re-run without a KeyError once the columns are gone.
train_data = train_data.drop(columns=unused_columns, errors='ignore')
test_data = test_data.drop(columns=unused_columns, errors='ignore')

In [58]:
# Distinct genre names; each one is also the name of an indicator column in train_data/test_data.
all_genres = all_data['genres'].explode().dropna().unique()

def convert_to_dataset(df):
    """Build a tf.data.Dataset of feature dicts with packed genre and title vectors.

    NOTE: this redefinition replaces the three-column converter defined earlier in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain one column per name in `all_genres` and the columns
        `embedding_1` ... `embedding_25`; all other columns are passed through unchanged.

    Returns
    -------
    tf.data.Dataset
        One element per row. The genre columns are packed into `genres`
        (shape `(len(all_genres),)`) and the embedding columns into `title_emb`
        (shape `(25,)`). The individual source columns are removed so that no
        feature is stored twice.
    """
    title_columns = [f'embedding_{i+1}' for i in range(25)]
    d = {k: v.to_numpy() for k, v in df.items()}

    # Stack the per-row scalars column-wise -> arrays of shape (n_rows, n_features).
    d['genres'] = np.stack([d[genre] for genre in all_genres], axis=1)
    d['title_emb'] = np.stack([d[column] for column in title_columns], axis=1)

    # Drop the columns that are now represented by the packed arrays.
    for column in [*all_genres, *title_columns]:
        d.pop(column)
    return tf.data.Dataset.from_tensor_slices(d)

train_dataset = convert_to_dataset(train_data)
test_dataset = convert_to_dataset(test_data)

In [66]:
class RatingPredictionModel(tfrs.models.Model):
    """Two-tower rating regressor using ids, title embeddings and genre indicators.

    User tower: id lookup -> embedding -> Dense(100) -> Dense(50).
    Movie tower: id embedding, a softmax-activated Dense(25) over the 25-dimensional
    title vector, and the genre indicator vector, concatenated -> Dense(100) -> Dense(50).
    The tower outputs are multiplied element-wise and reduced through Dense(40) and
    Dense(20) to a single predicted rating. Training uses a TFRS Ranking task with
    MSE loss and an RMSE metric.

    Expected feature keys (see `call` / `compute_loss`): "user", "movie",
    "title_emb", "genres" and, for training, "rating".
    """

    def __init__(self):
        super().__init__()
        layers = tf.keras.layers

        tower_last_layer_size = 50
        large_embedding_size = 25

        # User tower

        self.user_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='user_input')
        self.user_sl = layers.StringLookup(vocabulary=all_users, name='user_string_lookup')(self.user_input)
        # One extra embedding row beyond the vocabulary size.
        user_embedding = layers.Embedding(len(all_users)+1, large_embedding_size, name='user_emb')
        self.user_emb = tf.squeeze(user_embedding(self.user_sl), axis=1)

        # Single-input concatenate kept so that further user features can be merged in here.
        self.user_merged = layers.concatenate([self.user_emb], axis=-1, name='user_merged')

        self.user_dense = layers.Dense(100, activation='relu', name='user_dense')(self.user_merged)
        self.user_last_layer = layers.Dense(tower_last_layer_size, activation='relu', name='user_last_layer')(self.user_dense)

        # Movie tower

        self.movie_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='movie_input')
        self.movie_sl = layers.StringLookup(vocabulary=all_movies, name='movie_string_lookup')(self.movie_input)
        movie_embedding = layers.Embedding(len(all_movies)+1, large_embedding_size, name='movie_emb')
        self.movie_emb = tf.squeeze(movie_embedding(self.movie_sl), axis=1)

        # 25-dimensional title vector (fed from the "title_emb" feature in `call`).
        self.title_input = tf.keras.Input(shape=(25,), name='title_input')
        self.title_dense = layers.Dense(25, activation='softmax', name='title_softmax')(self.title_input)

        # One indicator per genre in `all_genres`.
        self.genres_input = tf.keras.Input(shape=(len(all_genres),), name='genres_input')

        self.movie_merged = layers.concatenate(
            [self.movie_emb, self.title_dense, self.genres_input],
            axis=-1,
            name='movie_merged',
        )
        self.movie_dense = layers.Dense(100, activation='relu', name='movie_dense')(self.movie_merged)
        self.movie_last_layer = layers.Dense(tower_last_layer_size, activation='relu', name='movie_last_layer')(self.movie_dense)

        # Combining towers

        self.towers_multiplied = layers.Multiply(name='towers_multiplied')([self.user_last_layer, self.movie_last_layer])
        self.towers_dense1 = layers.Dense(40, activation='relu', name='towers_dense1')(self.towers_multiplied)
        self.towers_dense2 = layers.Dense(20, activation='relu', name='towers_dense2')(self.towers_dense1)
        self.output_node = layers.Dense(1, name='output_node')(self.towers_dense2)

        # Model definition

        self.model = tf.keras.Model(
            inputs={
                'user': self.user_input,
                'movie': self.movie_input,
                'title': self.title_input,
                'genres': self.genres_input,
            },
            outputs=self.output_node,
        )

        self.task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )

    def call(self, features):
        """Predict ratings for a batch of feature dicts; ids are converted to strings first."""
        return self.model({
            'user': tf.strings.as_string(features["user"]),
            'movie': tf.strings.as_string(features["movie"]),
            'title': features["title_emb"],
            'genres': features["genres"],
        })

    def compute_loss(self, features, **kwargs):
        """Return the ranking-task loss of the predictions against `features["rating"]`."""
        return self.task(labels=features["rating"], predictions=self(features))

In [None]:
model = RatingPredictionModel()
learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(2e-3, decay_steps=4000, decay_rate=0.95)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))

# Hold out every 10th training example for validation. Without validation data Keras
# never reports `val_loss`, so an EarlyStopping callback that monitors it cannot stop
# training or restore the best weights. The test set is deliberately not used here.
VALIDATION_EVERY = 10
indexed_train = train_dataset.enumerate()
validation_split = (
    indexed_train
    .filter(lambda idx, _features: idx % VALIDATION_EVERY == 0)
    .map(lambda _idx, features: features)
)
fit_split = (
    indexed_train
    .filter(lambda idx, _features: idx % VALIDATION_EVERY != 0)
    .map(lambda _idx, features: features)
)

cached_train = fit_split.shuffle(15_000).batch(10_000).cache()
cached_val = validation_split.batch(10_000).cache()

# Define the EarlyStopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    restore_best_weights=True,
    patience=5,
    mode='min'  # Specify mode as 'min' to look for decreasing validation loss
)

model.fit(cached_train, validation_data=cached_val, epochs=25, callbacks=[early_stopping])

In [99]:
# Take the 48th test example (position 47) and give every feature a leading batch dimension of 1.
sample = next(test_dataset.skip(47).take(1).as_numpy_iterator())
x = {name: np.asarray([value]) for name, value in sample.items()}

predicted_rating = model.predict(x)[0][0]
print(f"Prediction for user {x['user'][0]} and movie {x['movie'][0]}: {predicted_rating:.2f} (rating: {x['rating'][0]})")

Prediction for user 3413 and movie 1193: 4.48 (rating: 5)


In [101]:
model.model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 movie_input (InputLayer)       [(None, 1)]          0           []                               
                                                                                                  
 user_string_lookup (StringLook  (None, 1)           0           ['user_input[0][0]']             
 up)                                                                                              
                                                                                                  
 movie_string_lookup (StringLoo  (None, 1)           0           ['movie_input[0][0]']      