We are going to start with a simple implementation of Matrix Factorization without any kind baselines using PyTorch.

In [1]:
import os
import random
import matplotlib.pyplot as plt
import scienceplots
import numpy as np
import pandas as pd
import torch

from surprise import Dataset, Reader

def set_df_to_index_based(df):
    """
    Transforms the Movielens Index into Python indices for PyTorch embeddings computation
    """
    df = df.astype({"uid": "int64", "iid": "int64"})
    df["uid"] = df["uid"] - 1
    df["iid"] = df["iid"] - 1
    return df

# Define a random state so that the train test splits can be reproducible
RANDOM_STATE = 128

# Add this path to use SciencePlots (it uses latex)
os.environ["PATH"] += os.pathsep + "/Library/TeX/texbin"

plt.rcParams["figure.figsize"] = (100,100)
plt.style.use(['science', 'notebook'])

# Problem Specific constants
N_USERS = 943
N_ITEMS = 1682
N_FACTORS = 100 

### Generate train and test sets

In [15]:
# path to dataset file
file_path = "../ml-100k/u.data"
reader = Reader(line_format="user item rating timestamp", sep="\t")
data = Dataset.load_from_file(file_path, reader=reader)

# Get raw ratings from `Dataset` object
raw_ratings = data.raw_ratings

# Shuffle the raw ratings and select train and test splits (75 % / 25 %)
random.Random(RANDOM_STATE).shuffle(raw_ratings)
threshold = int(0.75 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

threshold_val = int(0.20 * len(train_raw_ratings))
val_raw_ratings = train_raw_ratings[:threshold_val]
train_raw_ratings = train_raw_ratings[threshold_val:]

In [16]:
# Transform surprise based objects to pandas dataframes
df_train = pd.DataFrame.from_records(train_raw_ratings, columns=["uid", "iid", "rating", "timestamp"])
df_val = pd.DataFrame.from_records(val_raw_ratings, columns=["uid", "iid", "rating", "timestamp"])
df_test = pd.DataFrame.from_records(test_raw_ratings, columns=["uid", "iid", "rating", "timestamp"])

In [17]:
df_train.head()

Unnamed: 0,uid,iid,rating,timestamp
0,497,53,3.0,879362178
1,123,165,5.0,879872672
2,873,348,3.0,891392577
3,790,249,3.0,884461849
4,372,100,3.0,876869388


In [18]:
df_train = set_df_to_index_based(df_train)

In [19]:
df_train.head()

Unnamed: 0,uid,iid,rating,timestamp
0,496,52,3.0,879362178
1,122,164,5.0,879872672
2,872,347,3.0,891392577
3,789,248,3.0,884461849
4,371,99,3.0,876869388


In [23]:
df_val.head()

Unnamed: 0,uid,iid,rating,timestamp
0,194,423,3.0,879548121
1,104,333,2.0,888442305
2,41,174,4.0,890687264
3,489,751,5.0,891362773
4,593,288,4.0,877728878


In [24]:
df_val = set_df_to_index_based(df_val)

In [20]:
df_test.head()

Unnamed: 0,uid,iid,rating,timestamp
0,5,24,4.0,879198229
1,716,501,5.0,879796215
2,590,476,3.0,879439345
3,601,98,3.0,876348526
4,896,1672,2.0,887159554


In [21]:
df_test = set_df_to_index_based(df_test)

In [22]:
df_test.head()

Unnamed: 0,uid,iid,rating,timestamp
0,4,23,4.0,879198229
1,715,500,5.0,879796215
2,589,475,3.0,879439345
3,600,97,3.0,876348526
4,895,1671,2.0,887159554


In [25]:
df_train.shape

(60000, 4)

In [26]:
df_val.shape

(15000, 4)

In [27]:
df_test.shape

(25000, 4)

In [28]:
df_train.to_csv("../data/df_train.csv", header=True, index=False)
df_val.to_csv("../data/df_val.csv", header=True, index=False)
df_test.to_csv("../data/df_test.csv", header=True, index=False)

### Import generated train and test sets

In [29]:
df_train = pd.read_csv("../data/df_train.csv")
df_val = pd.read_csv("../data/df_val.csv")
df_test = pd.read_csv("../data/df_test.csv")

In [30]:
df_train.head()

Unnamed: 0,uid,iid,rating,timestamp
0,496,52,3.0,879362178
1,122,164,5.0,879872672
2,872,347,3.0,891392577
3,789,248,3.0,884461849
4,371,99,3.0,876869388


In [34]:
train_arr = df_train.values
val_arr = df_val.values
test_arr = df_test.values

In [35]:
train_arr[:3]

array([[4.96000000e+02, 5.20000000e+01, 3.00000000e+00, 8.79362178e+08],
       [1.22000000e+02, 1.64000000e+02, 5.00000000e+00, 8.79872672e+08],
       [8.72000000e+02, 3.47000000e+02, 3.00000000e+00, 8.91392577e+08]])

### MF without baselines

In [36]:
class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=100):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
    
    def forward(self, user, item):
        return torch.dot(self.user_factors(user)[0], self.item_factors(item)[0])

In [37]:
model = MatrixFactorization(N_USERS, N_ITEMS, n_factors=N_FACTORS)

In [38]:
loss_func = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.002, weight_decay=1e-5)

In [40]:
train_loss = []
val_loss = []

for e in range(20):
    train_losses = []
    val_losses = []
    
    for interaction in train_arr:
        # Set gradients to zero
        optimizer.zero_grad()

        uid = torch.LongTensor([interaction[0]])
        iid = torch.LongTensor([interaction[1]])
        rating = torch.FloatTensor([interaction[2]])

        rating_hat = model(uid, iid)
        
        loss = loss_func(rating_hat, rating)

        # Backpropagate
        loss.backward()

        # Update the parameters
        optimizer.step()
        train_losses.append(float(loss))
    
    for interaction in val_arr:
        
        uid = torch.LongTensor([interaction[0]])
        iid = torch.LongTensor([interaction[1]])
        rating = torch.FloatTensor([interaction[2]])

        rating_hat = model(uid, iid)
        
        loss = loss_func(rating_hat, rating)
        val_losses.append(float(loss))
    
    train_loss.append(np.mean(train_losses))
    val_loss.append(np.mean(val_losses))
    print(f"Epoch {e + 1} | Training loss: {train_loss[-1]} | Validation loss: {val_loss[-1]}")
    

  uid = torch.LongTensor([interaction[0]])
  iid = torch.LongTensor([interaction[1]])
  uid = torch.LongTensor([interaction[0]])
  iid = torch.LongTensor([interaction[1]])


Epoch 1 | Training loss: 3.972188458793133 | Validation loss: 34.314484101028846
Epoch 2 | Training loss: 2.398287098287923 | Validation loss: 32.47558789188046
Epoch 3 | Training loss: 1.6549674052904328 | Validation loss: 31.264149099869005
Epoch 4 | Training loss: 1.2340123988197893 | Validation loss: 30.40162498673394
Epoch 5 | Training loss: 0.970989415047435 | Validation loss: 29.754060780232592
Epoch 6 | Training loss: 0.7942208567933868 | Validation loss: 29.248790428473605
Epoch 7 | Training loss: 0.6686392661061095 | Validation loss: 28.842937024121024
Epoch 8 | Training loss: 0.5754886085985726 | Validation loss: 28.509516144748886
Epoch 9 | Training loss: 0.5039863726529644 | Validation loss: 28.230657813648296
Epoch 10 | Training loss: 0.447558192449192 | Validation loss: 27.99402981247156
Epoch 11 | Training loss: 0.40199672648828383 | Validation loss: 27.790826766528166
Epoch 12 | Training loss: 0.36449868007167924 | Validation loss: 27.614580550391995
Epoch 13 | Trainin