In [1]:
import os
import random
import joblib             # pip install joblib (optional, used to save encoders)
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

Using mps device


### Load raw datasets and prepare as dataframes

In [3]:
#data_source = 'small'
data_source = 'full'

if data_source == 'small':
    root = "../data/raw/ml-latest-small/"
else:
    root = "../data/raw/ml-latest/"


In [4]:
ratings_df = pd.read_csv(f"{root}ratings.csv")
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [5]:
movies_df = pd.read_csv(f"{root}movies.csv")
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
tags_df = pd.read_csv(f"{root}tags.csv")
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,10,260,good vs evil,1430666558
1,10,260,Harrison Ford,1430666505
2,10,260,sci-fi,1430666538
3,14,1221,Al Pacino,1311600756
4,14,1221,mafia,1311600746


In [7]:
tags_df = tags_df[['movieId', 'tag']]

In [8]:
tags_df["tag"] = (
    tags_df["tag"]
    .astype(str)
    .str.lower()
    .str.strip()
)


In [9]:
tags_df = tags_df.dropna(subset=["tag"])
tags_df = tags_df.drop_duplicates()


In [10]:
len(set(tags_df['tag']))

143264

In [11]:
movie_tags = (
    tags_df.groupby("movieId")["tag"]
    .unique()
    .reset_index()
)

movie_tags["tags"] = movie_tags["tag"].apply(lambda x: " ".join(x))
movie_tags = movie_tags.drop(columns=["tag"])


In [12]:
movie_tags

Unnamed: 0,movieId,tags
0,1,animation friendship toys disney pixar cgi cla...
1,2,animals based on a book fantasy magic board ga...
2,3,sequel moldy old old age old men wedding old p...
3,4,characters chick flick girl movie revenge clv ...
4,5,family pregnancy wedding 4th wall aging baby d...
...,...,...
53447,288765,post-apocalyptic survival tw suicide apocalyps...
53448,288779,don camillo series
53449,288849,addiction animation short film
53450,288937,anime


### Prepare datasets for pytorch use

In [13]:
# User encoding:
user_encoder = preprocessing.LabelEncoder()

ratings_df["user_idx"] = user_encoder.fit_transform(ratings_df["userId"])



In [14]:
import psutil
process = psutil.Process()
print(f"Memory used: {process.memory_info().rss / 1024**3:.2f} GB")


Memory used: 0.59 GB


In [15]:
# Movie encoding:
movie_encoder = preprocessing.LabelEncoder()

movies_df["movie_idx"] = movie_encoder.fit_transform(movies_df["movieId"]) # Start with the more comprehensive list (must include all possible) 
ratings_df["movie_idx"] = movie_encoder.transform(ratings_df["movieId"]) # We're just using an existing encoder, so use transform
movie_tags["movie_idx"] = movie_encoder.transform(movie_tags["movieId"]) # We're just using an existing encoder, so use transform


In [16]:
joblib.dump(user_encoder, "../artifacts/label_encoders/user_encoder.joblib")
joblib.dump(movie_encoder, "../artifacts/label_encoders/movie_encoder.joblib")

['../artifacts/label_encoders/movie_encoder.joblib']

In [17]:
num_users = ratings_df["user_idx"].nunique()
num_movies = ratings_df["movie_idx"].nunique()
num_ratings = len(ratings_df)
num_tags = len(set(tags_df['tag']))

print(f"Number of users: {num_users}")
print(f"Number of movies: {num_movies}")
print(f"Number of ratings: {num_ratings}")
print(f"Number of unique tags: {num_tags}")




Number of users: 330975
Number of movies: 83239
Number of ratings: 33832162
Number of unique tags: 143264


In [18]:
# Merge tags data with movies
movies_with_tags = movies_df.merge(movie_tags, on="movie_idx", how="left")
movies_with_tags["tags"] = movies_with_tags["tags"].fillna("")


In [19]:
'''
movie genres are pipe delimited, so I want to split these into dimensions
'''
# Split genres into lists
movies_with_tags["genres"] = movies_with_tags["genres"].apply(lambda x: x.split("|"))

# Get all unique genres
all_genres = sorted({g for sublist in movies_with_tags["genres"] for g in sublist})

# One-hot encode genres
for g in all_genres:
    movies_with_tags[g] = movies_with_tags["genres"].apply(lambda x: int(g in x))



In [20]:
'''
movie tags are more varied than genres - there are 1475 unique tags in this sample data.
Rather than one-hot encoding, we want to process this using TF-IDF.
This selects the top N words (to simplify) based on how often they occur, and how consistently they occur.
'''
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1000, stop_words='english')  # use top N tag words
tfidf_matrix = tfidf.fit_transform(movies_with_tags["tags"])
tfidf_features = pd.DataFrame(
    tfidf_matrix.toarray(), 
    columns=[f"tag_{t}" for t in tfidf.get_feature_names_out()]
)


In [21]:
tfidf_features.columns

Index(['tag_01', 'tag_06', 'tag_10', 'tag_100', 'tag_11', 'tag_12', 'tag_13',
       'tag_1940s', 'tag_1950s', 'tag_1960s',
       ...
       'tag_wound', 'tag_writer', 'tag_writing', 'tag_written', 'tag_year',
       'tag_york', 'tag_young', 'tag_younger', 'tag_youth', 'tag_zombie'],
      dtype='object', length=1000)

In [22]:
movies_features = pd.concat(
    [movies_with_tags.set_index("movie_idx")[all_genres].reset_index(drop=True),
     tfidf_features.reset_index(drop=True)],
    axis=1
)


In [23]:
num_movie_features = movies_features.shape[1]

In [24]:
movies_features.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,tag_wound,tag_writer,tag_writing,tag_written,tag_year,tag_york,tag_young,tag_younger,tag_youth,tag_zombie
0,0,0,1,1,1,1,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,1,0,1,0,0,0,0,1,...,0.0,0.0,0.0,0.034604,0.0,0.0,0.034178,0.0,0.0,0.0
2,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,1,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Calculate sparsity
sparsity = (num_ratings / (num_users * num_movies))
print(f"Sparsity: {sparsity:.4f} ({sparsity*100:.2f}%)")

Sparsity: 0.0012 (0.12%)


In [27]:
class MovieLensDataset(Dataset):
    def __init__(self, users, movies, ratings, movie_features):
        self.users = np.array(users)
        self.movies = np.array(movies)
        self.ratings = np.array(ratings, dtype=np.float32)
        self.movie_features = movie_features  # keep as DataFrame, don’t convert to tensor yet, as it will be LARGE

    def __len__(self):
        return len(self.users)

    def __getitem__(self, i):
        '''
        These will be loaded in batches, so easier to handle large tensors
        '''
        user = torch.tensor(self.users[i], dtype=torch.long)
        movie = torch.tensor(self.movies[i], dtype=torch.long)
        rating = torch.tensor(self.ratings[i], dtype=torch.float)

        # convert movie features to tensor for this sample only
        movie_feat = torch.tensor(self.movie_features.loc[movie.item()].values, dtype=torch.float32)

        return {
            "users": user,
            "movies": movie,
            "ratings": rating,
            "movie_features": movie_feat,
        }


In [28]:
train_df, val_df = train_test_split(ratings_df, test_size=0.2, stratify=ratings_df.rating.values)

In [29]:
BATCH_SIZE = 16

train_dataset = MovieLensDataset(
    users=train_df["user_idx"].values,
    movies=train_df["movie_idx"].values,
    ratings=train_df["rating"].values,
    movie_features=movies_features
)
val_dataset = MovieLensDataset(
    users=val_df["user_idx"].values,
    movies=val_df["movie_idx"].values,
    ratings=val_df["rating"].values,
    movie_features=movies_features
)


train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)


In [31]:
# 7️⃣ Update model to accept movie features
class RecommendationSystemModel(nn.Module):
    def __init__(self, num_users, num_movies, num_movie_features,
                 embedding_size=64, hidden_dim=128, dropout_rate=0.5):
        super(RecommendationSystemModel, self).__init__()

        # self.num_users = num_users
        # self.num_movies = num_movies
        # self.num_movie_features = num_movie_features
        # self.embedding_size = embedding_size
        # self.hidden_dim = hidden_dim



        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.movie_embedding = nn.Embedding(num_movies, embedding_size)

         # Dense projection for movie features (genres)
        self.genre_fc = nn.Linear(num_movie_features, embedding_size)
        
        
        self.fc1 = nn.Linear(3 * embedding_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, 1)

        
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, users, movies, movie_features):
        u = self.user_embedding(users)
        m = self.movie_embedding(movies)
        
        genre_embedded = self.relu(self.genre_fc(movie_features))
        x = torch.cat([u, m, genre_embedded], dim=1)

        
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        out = self.fc3(x)
        return out

### Instantiate model

In [32]:
# Instantiate model and send to device (CPU/GPU)
model = RecommendationSystemModel(
    num_users=num_users,
    num_movies=num_movies,
    num_movie_features=num_movie_features,
    embedding_size = 64,
    hidden_dim = 128,
    dropout_rate = 0.1
).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=3)


In [None]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for batch in train_loader:
        users = batch["users"].to(device)
        movies = batch["movies"].to(device)
        ratings = batch["ratings"].to(device)
        movie_features_batch = batch.get("movie_features")
        if movie_features_batch is not None:
            movie_features_batch = movie_features_batch.to(device)

        optimizer.zero_grad()
        preds = model(users, movies, movie_features_batch)
        loss = criterion(preds.squeeze(), ratings)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * users.size(0)

    train_loss = running_loss / len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            users = batch["users"].to(device)
            movies = batch["movies"].to(device)
            ratings = batch["ratings"].to(device)
            movie_features_batch = batch.get("movie_features")
            if movie_features_batch is not None:
                movie_features_batch = movie_features_batch.to(device)

            preds = model(users, movies, movie_features_batch)
            loss = criterion(preds.squeeze(), ratings)
            val_loss += loss.item() * users.size(0)

    val_loss = val_loss / len(val_loader.dataset)
    scheduler.step(val_loss)

    print(f"Epoch [{epoch+1}/{num_epochs}] | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")