# Data Preprocessing

## SVD

![Fig 1. Movie Genres](imgs/sparse_matrix.png)

As the data exploration has shown, the user-rating and movie-genre matrices are very sparse, so storing them in dense form would waste a lot of RAM. Instead of keeping the full matrices, we factorize the rating matrix (in the spirit of SVD) into two small, dense matrices of user and movie embeddings.

In [1]:
# All required imports are here

import torch
import numpy as np
import pandas as pd
import warnings

from torch.autograd import Variable
from tqdm import tqdm
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

In [2]:
# Silence FutureWarning messages so they do not clutter the cell outputs.
# Only the FutureWarning category is ignored; other warnings are still shown.
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
class MatrixFactorization(torch.nn.Module):

    '''

    Matrix-factorization model for collaborative filtering.

    Every user and every item gets a learnable embedding of size ``n_factors``.
    The predicted interaction for a (user, item) pair is the dot product of
    the two embeddings.

    Parameters:
        n_users:   number of rows in the user embedding table
        n_items:   number of rows in the item embedding table
        n_factors: length of each embedding vector (default 20)

    '''

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()

        # User embeddings (users to their features)
        self.user_factors = torch.nn.Embedding(n_users, n_factors)

        # Movie embeddings (movies to their features)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)

        # Start from small non-negative weights drawn from U(0, 0.05)
        torch.nn.init.uniform_(self.user_factors.weight, 0, 0.05)
        torch.nn.init.uniform_(self.item_factors.weight, 0, 0.05)

    def forward(self, data):
        '''
        Score a batch of (user index, movie index) pairs.

        data is indexed as data[:, 0] (user indices) and data[:, 1]
        (movie indices). Returns a 1-D tensor with one score per row.
        '''
        users, movies = data[:, 0], data[:, 1]
        return (self.user_factors(users) * self.item_factors(movies)).sum(1)

    def predict(self, user, movie):
        '''
        Score one or several (user, movie) index pairs.

        user and movie may be plain ints, sequences of ints or integer
        tensors of equal length. They are packed into the two-column layout
        that forward() expects. Returns a 1-D tensor with one score per pair
        (length 1 for scalar inputs).
        '''
        # forward() takes a single (N, 2) tensor, so build it here instead of
        # passing the two arguments through separately.
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        movies = torch.as_tensor(movie, dtype=torch.long, device=device).reshape(-1)
        return self.forward(torch.stack((users, movies), dim=1))

In [4]:
# Data loader

class Loader(Dataset):

    '''

    PyTorch Dataset wrapping a ratings DataFrame.

    The frame must contain the columns "user_id", "movie_id" and "rating".
    Raw ids are re-encoded as contiguous indices (0..n-1) in order of first
    appearance, so they can be used directly as embedding row numbers.

    The id <-> index mappings are built from the frame passed in, so two
    Loader instances created from different frames do not share indices.

    Attributes:
        ratings:      copy of the input frame with ids replaced by indices
        userid2idx:   raw user id  -> index
        movieid2idx:  raw movie id -> index
        idx2userid:   index -> raw user id
        idx2movieid:  index -> raw movie id
        x:            tensor of [user index, movie index] rows
        y:            tensor of ratings, aligned with x

    '''

    def __init__(self, ratings_df):
        self.ratings = ratings_df.copy()

        # All distinct raw ids, in order of first appearance
        unique_users = ratings_df["user_id"].unique()
        unique_movies = ratings_df["movie_id"].unique()

        # Raw id -> contiguous index
        self.userid2idx = {raw_id: idx for idx, raw_id in enumerate(unique_users)}
        self.movieid2idx = {raw_id: idx for idx, raw_id in enumerate(unique_movies)}

        # Contiguous index -> raw id
        self.idx2userid = {idx: raw_id for raw_id, idx in self.userid2idx.items()}
        self.idx2movieid = {idx: raw_id for raw_id, idx in self.movieid2idx.items()}

        # Replace the raw ids with their indices in the working copy
        self.ratings["movie_id"] = ratings_df["movie_id"].map(self.movieid2idx)
        self.ratings["user_id"] = ratings_df["user_id"].map(self.userid2idx)

        # Select the feature columns by name so that column 0 is always the
        # user index and column 1 the movie index, regardless of the column
        # order (or extra columns) of the input frame.
        self.x = torch.tensor(self.ratings[["user_id", "movie_id"]].values)
        self.y = torch.tensor(self.ratings["rating"].values)

    # Return item by its index
    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    # Return len of ratings
    def __len__(self):
        return len(self.ratings)
     

# Alrighty, training time!

In [5]:
# Read the MovieLens 100k files into DataFrames

# Folder that holds the dataset files
dataset_root_path = "../data/interim/ml-100k"

# Column names for u.item: movie metadata followed by one flag column per genre
item_columns = [
    "movie_id", "movie_title", "release_date", "video_release_date", "IMDb_URL",
    "unknown", "Action", "Adventure", "Animation", "Childrens",
    "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
    "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller",
    "War", "Western",
]

# Column names for u.data: one row per (user, movie) rating
rating_columns = ["user_id", "movie_id", "rating", "timestamp"]

# Movie information (pipe-separated, cp1252-encoded)
u_item_df = pd.read_csv(
    dataset_root_path + "/u.item",
    sep="|",
    names=item_columns,
    encoding="cp1252",
)

# How each user rated the movies they watched (tab-separated)
u_data_df = pd.read_csv(
    dataset_root_path + "/u.data",
    sep="\t",
    names=rating_columns,
)

In [6]:
# Show the last rows of the movie table as a quick check of the parsed columns
u_item_df.tail()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1681,1682,Scream of Stone (Schrei aus Stein) (1991),08-Mar-1996,,http://us.imdb.com/M/title-exact?Schrei%20aus%...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Build a lookup from the real movie id to its title. The keys are taken
# from the "movie_id" column itself (not from the 0-based row position),
# so they match the ids used in the ratings data.
movie_titles = dict(zip(u_item_df["movie_id"], u_item_df["movie_title"]))

In [8]:
# Preview the first ten (movie id, title) pairs of the lookup
list(movie_titles.items())[:10]

[(1, 'Toy Story (1995)'),
 (2, 'GoldenEye (1995)'),
 (3, 'Four Rooms (1995)'),
 (4, 'Get Shorty (1995)'),
 (5, 'Copycat (1995)'),
 (6, 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)'),
 (7, 'Twelve Monkeys (1995)'),
 (8, 'Babe (1995)'),
 (9, 'Dead Man Walking (1995)'),
 (10, 'Richard III (1995)')]

In [9]:
class TrainingConfig:
    '''

    Container for the training settings used by the cells below:
    embedding size, batch size, number of epochs and the torch device.

    '''

    # Length of each user / movie embedding vector
    n_factors = 8

    # Number of rating rows per optimisation step
    batch_size = 128

    # Number of full passes over the training loader
    num_epochs = 128

    # Use CUDA when it is available, otherwise fall back to the CPU.
    # (On Apple silicon, torch.device("mps") can be substituted by hand.)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


config = TrainingConfig()

In [10]:
# Hold out 20% of the ratings for testing (fixed seed for reproducibility)
train_data, test_data = train_test_split(
    u_data_df,
    test_size=0.2,
    random_state=42,
)

# NOTE(review): each Loader builds its own id -> index mapping from the frame
# it receives, so the indices produced by train_set and test_set are not
# guaranteed to refer to the same users/movies. Confirm before evaluating the
# trained model on test_loader.
train_set = Loader(train_data)
test_set = Loader(test_data)

# Batched, shuffled iterators over both splits
train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=config.batch_size, shuffle=True)

In [11]:
# Sizes of the embedding tables: number of distinct movies and users.
# NOTE(review): the movie count comes from u_item_df (all known movies),
# while the indices fed to the model come from the training frame only, so
# some embedding rows may never be touched by training.
num_unique_movies = len(pd.unique(u_item_df["movie_id"]))
num_unique_users = len(pd.unique(u_data_df["user_id"]))

In [12]:
# Build the model with one embedding row per user and per movie
model = MatrixFactorization(
    n_users=num_unique_users,
    n_items=num_unique_movies,
    n_factors=config.n_factors,
)
print(model)

# Dump the freshly initialised trainable parameters
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the configured device before creating the optimizer
model = model.to(config.device)

# Mean squared error between predicted and true ratings
loss_fn = torch.nn.MSELoss()

# Adam optimizer over all model parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

MatrixFactorization(
  (user_factors): Embedding(943, 8)
  (item_factors): Embedding(1682, 8)
)
user_factors.weight tensor([[0.0097, 0.0357, 0.0344,  ..., 0.0273, 0.0399, 0.0004],
        [0.0177, 0.0316, 0.0452,  ..., 0.0409, 0.0219, 0.0284],
        [0.0137, 0.0331, 0.0010,  ..., 0.0212, 0.0199, 0.0410],
        ...,
        [0.0188, 0.0443, 0.0257,  ..., 0.0497, 0.0361, 0.0229],
        [0.0424, 0.0274, 0.0378,  ..., 0.0461, 0.0134, 0.0172],
        [0.0307, 0.0500, 0.0127,  ..., 0.0254, 0.0348, 0.0248]])
item_factors.weight tensor([[0.0144, 0.0250, 0.0271,  ..., 0.0485, 0.0326, 0.0013],
        [0.0018, 0.0428, 0.0124,  ..., 0.0251, 0.0243, 0.0365],
        [0.0015, 0.0025, 0.0471,  ..., 0.0342, 0.0207, 0.0091],
        ...,
        [0.0325, 0.0008, 0.0116,  ..., 0.0130, 0.0260, 0.0454],
        [0.0219, 0.0225, 0.0450,  ..., 0.0329, 0.0373, 0.0384],
        [0.0259, 0.0096, 0.0337,  ..., 0.0075, 0.0499, 0.0078]])


In [13]:
# Train the model: one pass over train_loader per epoch

for it in tqdm(range(config.num_epochs)):
    # Per-batch loss values collected during this epoch
    losses = []

    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(config.device)
        # MSELoss needs float targets; the ratings tensor is cast here
        batch_targets = batch_targets.to(config.device).type(torch.float32)

        optimizer.zero_grad()
        predictions = model(batch_inputs).squeeze()
        loss = loss_fn(predictions, batch_targets)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()

    # Report the mean batch loss every tenth epoch
    if it % 10 == 0:
        mean_loss = sum(losses) / len(losses)
        print("iter #{}".format(it), "Loss:", mean_loss)

  0%|          | 0/128 [00:00<?, ?it/s]

  1%|          | 1/128 [00:00<00:45,  2.79it/s]

iter #0 Loss: 11.584645626831055


  9%|▊         | 11/128 [00:04<00:43,  2.67it/s]

iter #10 Loss: 0.8774093044281006


 16%|█▋        | 21/128 [00:07<00:37,  2.83it/s]

iter #20 Loss: 0.8574844573974609


 24%|██▍       | 31/128 [00:11<00:32,  3.00it/s]

iter #30 Loss: 0.8535347853660583


 32%|███▏      | 41/128 [00:14<00:30,  2.89it/s]

iter #40 Loss: 0.846245800113678


 40%|███▉      | 51/128 [00:17<00:25,  2.98it/s]

iter #50 Loss: 0.8121151856422424


 48%|████▊     | 61/128 [00:21<00:23,  2.90it/s]

iter #60 Loss: 0.7336387606620789


 55%|█████▌    | 71/128 [00:24<00:19,  2.91it/s]

iter #70 Loss: 0.6601696525096893


 63%|██████▎   | 81/128 [00:28<00:15,  3.05it/s]

iter #80 Loss: 0.6141407647132874


 71%|███████   | 91/128 [00:31<00:12,  2.93it/s]

iter #90 Loss: 0.5882021836280823


 79%|███████▉  | 101/128 [00:35<00:09,  2.86it/s]

iter #100 Loss: 0.5717136323928833


 87%|████████▋ | 111/128 [00:38<00:05,  2.94it/s]

iter #110 Loss: 0.5602877286911011


 95%|█████████▍| 121/128 [00:42<00:02,  2.94it/s]

iter #120 Loss: 0.5515826260089874


100%|██████████| 128/128 [00:44<00:00,  2.87it/s]


In [14]:
# Print the trained parameters and keep references to the embedding weights:
# the first trainable parameter goes to `uw`, any later one to `iw`.
c = 0   # becomes 1 once the first trainable parameter has been stored
uw = 0  # user embeddings
iw = 0  # movie embeddings

for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c != 0:
        iw = param.data
        continue
    uw = param.data
    c += 1

user_factors.weight tensor([[ 0.8476,  0.8253,  0.6640,  ...,  0.8427,  0.8989,  0.7390],
        [ 0.2304,  0.6788,  0.4748,  ...,  0.7793,  0.6262,  1.0967],
        [-0.0579,  0.4145,  0.1376,  ...,  0.7945,  1.0740,  1.3434],
        ...,
        [ 1.0189,  1.0269,  0.9430,  ...,  0.9768,  1.0965,  0.3477],
        [ 1.2472,  0.8574,  0.9157,  ...,  0.3859, -0.4035,  1.2180],
        [ 1.4851,  0.6886,  0.8468,  ...,  0.9567,  0.2881,  0.1356]])
item_factors.weight tensor([[ 1.7173e+00,  6.6317e-01,  1.9755e-02,  ...,  2.8416e-01,
          4.0860e-01,  7.8730e-01],
        [ 1.3372e-01,  7.7562e-01,  8.1220e-01,  ...,  4.7229e-01,
          1.3030e+00,  4.9817e-01],
        [-1.5361e-01,  6.0543e-01,  1.0321e+00,  ...,  5.8205e-01,
          1.6133e+00, -1.6326e-01],
        ...,
        [ 3.2501e-02,  7.8330e-04,  1.1577e-02,  ...,  1.3008e-02,
          2.5963e-02,  4.5415e-02],
        [ 2.1864e-02,  2.2469e-02,  4.4998e-02,  ...,  3.2919e-02,
          3.7335e-02,  3.8382e-02]

In [15]:
# Copy the movie embedding table to the CPU and convert it to a NumPy array
item_weights = model.item_factors.weight.data
trained_movie_embeddings = item_weights.cpu().numpy()

# One row per movie embedding
print(trained_movie_embeddings.shape[0])

1682


In [16]:
# Group the movie embeddings into 10 clusters (fixed seed for repeatability)
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(trained_movie_embeddings)

In [17]:
# Print the ten most-rated movies of the first few clusters, to inspect
# whether movies in the same cluster look alike.

num_clusters_to_check = 5

# Number of ratings per movie, computed once up front instead of filtering
# the whole ratings frame again for every single movie.
rating_counts = u_data_df.groupby("movie_id")["user_id"].count()

for cluster in range(num_clusters_to_check):
    print("Cluster #{}".format(cluster))
    movs = []
    for movie_idx in np.where(kmeans.labels_ == cluster)[0]:
        # Embedding rows without an entry in the training mapping (or without
        # a known title) are skipped; any other error is allowed to surface.
        try:
            movie_id = train_set.idx2movieid[movie_idx]
            title = movie_titles[movie_id]
        except KeyError:
            continue
        movs.append((title, rating_counts.get(movie_id, 0)))
    # Most-rated first; sorted() is stable, so ties keep their original order
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 Crow: City of Angels, The (1996)
	 Bio-Dome (1996)
	 Barb Wire (1996)
	 Lawnmower Man 2: Beyond Cyberspace (1996)
	 Children of the Corn: The Gathering (1996)
	 Big Bully (1996)
	 Mr. Magoo (1997)
	 Mighty Morphin Power Rangers: The Movie (1995)
	 House Party 3 (1994)
	 Meet Wally Sparks (1997)
Cluster #1
	 Toy Story (1995)
	 Raiders of the Lost Ark (1981)
	 Godfather, The (1972)
	 Silence of the Lambs, The (1991)
	 Jerry Maguire (1996)
	 Empire Strikes Back, The (1980)
	 Back to the Future (1985)
	 Titanic (1997)
	 Mission: Impossible (1996)
	 Fugitive, The (1993)
Cluster #2
	 Contact (1997)
	 Rock, The (1996)
	 Star Trek: First Contact (1996)
	 Saint, The (1997)
	 Conspiracy Theory (1997)
	 Mr. Holland's Opus (1995)
	 Twister (1996)
	 Truth About Cats & Dogs, The (1996)
	 Ransom (1996)
	 Game, The (1997)
Cluster #3
	 Liar Liar (1997)
	 Air Force One (1997)
	 Independence Day (ID4) (1996)
	 Dante's Peak (1997)
	 Top Gun (1986)
	 Volcano (1997)
	 Murder at 1600 (1997)
	 Am

In [18]:
# Persist the trained model to disk.
# torch.save is given the module object itself (not just its state_dict).

model_path = "../models/supermodel.pth"
torch.save(model, model_path)