We are going to start with a simple implementation of Matrix Factorization without any kind baselines using PyTorch.

In [4]:
import os
import random
import matplotlib.pyplot as plt
import scienceplots
import numpy as np
import pandas as pd
import torch

from surprise import Dataset, Reader

def set_df_to_index_based(df):
    """
    Transforms the Movielens Index into Python indices for PyTorch embeddings computation
    """
    df = df.astype({"uid": "int64", "iid": "int64"})
    df["uid"] = df["uid"] - 1
    df["iid"] = df["iid"] - 1
    return df

# Define a random state so that the train test splits can be reproducible
RANDOM_STATE = 128

# Add this path to use SciencePlots (it uses latex)
os.environ["PATH"] += os.pathsep + "/Library/TeX/texbin"

plt.rcParams["figure.figsize"] = (100,100)
plt.style.use(['science', 'notebook'])

# Problem Specific constants
N_USERS = 943
N_ITEMS = 1682
N_FACTORS = 100 

### Generate train and test sets

In [3]:
# path to dataset file
file_path = "../ml-100k/u.data"
reader = Reader(line_format="user item rating timestamp", sep="\t")
data = Dataset.load_from_file(file_path, reader=reader)

# Get raw ratings from `Dataset` object
raw_ratings = data.raw_ratings

# Shuffle the raw ratings and select train and test splits (75 % / 25 %)
random.Random(RANDOM_STATE).shuffle(raw_ratings)
threshold = int(0.75 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

In [4]:
# Transform surprise based objects to pandas dataframes
df_train = pd.DataFrame.from_records(train_raw_ratings, columns=["uid", "iid", "rating", "timestamp"])
df_test = pd.DataFrame.from_records(test_raw_ratings, columns=["uid", "iid", "rating", "timestamp"])

In [5]:
df_train.head()

Unnamed: 0,uid,iid,rating,timestamp
0,194,423,3.0,879548121
1,104,333,2.0,888442305
2,41,174,4.0,890687264
3,489,751,5.0,891362773
4,593,288,4.0,877728878


In [6]:
df_train = set_df_to_index_based(df_train)

In [7]:
df_train.head()

Unnamed: 0,uid,iid,rating,timestamp
0,193,422,3.0,879548121
1,103,332,2.0,888442305
2,40,173,4.0,890687264
3,488,750,5.0,891362773
4,592,287,4.0,877728878


In [8]:
df_test.head()

Unnamed: 0,uid,iid,rating,timestamp
0,5,24,4.0,879198229
1,716,501,5.0,879796215
2,590,476,3.0,879439345
3,601,98,3.0,876348526
4,896,1672,2.0,887159554


In [9]:
df_test = set_df_to_index_based(df_test)

In [10]:
df_test.head()

Unnamed: 0,uid,iid,rating,timestamp
0,4,23,4.0,879198229
1,715,500,5.0,879796215
2,589,475,3.0,879439345
3,600,97,3.0,876348526
4,895,1671,2.0,887159554


In [11]:
df_train.to_csv("../data/df_train.csv", header=True, index=False)
df_test.to_csv("../data/df_test.csv", header=True, index=False)

### Import generated train and test sets

In [5]:
df_train = pd.read_csv("../data/df_train.csv")
df_test = pd.read_csv("../data/df_test.csv")

In [6]:
df_train.head()

Unnamed: 0,uid,iid,rating,timestamp
0,193,422,3.0,879548121
1,103,332,2.0,888442305
2,40,173,4.0,890687264
3,488,750,5.0,891362773
4,592,287,4.0,877728878


In [7]:
df_train.dtypes

uid            int64
iid            int64
rating       float64
timestamp      int64
dtype: object

In [8]:
df_test.head()

Unnamed: 0,uid,iid,rating,timestamp
0,4,23,4.0,879198229
1,715,500,5.0,879796215
2,589,475,3.0,879439345
3,600,97,3.0,876348526
4,895,1671,2.0,887159554


In [9]:
df_test.dtypes

uid            int64
iid            int64
rating       float64
timestamp      int64
dtype: object

In [10]:
train_arr = df_train.values
test_arr = df_test.values

In [11]:
train_arr[:3]

array([[1.93000000e+02, 4.22000000e+02, 3.00000000e+00, 8.79548121e+08],
       [1.03000000e+02, 3.32000000e+02, 2.00000000e+00, 8.88442305e+08],
       [4.00000000e+01, 1.73000000e+02, 4.00000000e+00, 8.90687264e+08]])

### MF without baselines

In [12]:
class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=100):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
    
    def forward(self, user, item):
        return torch.dot(self.user_factors(user)[0], self.item_factors(item)[0])

In [13]:
model = MatrixFactorization(N_USERS, N_ITEMS, n_factors=N_FACTORS)

In [14]:
loss_func = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [16]:
for e in range(20):
    losses = []
    for interaction in train_arr:
        # Set gradients to zero
        optimizer.zero_grad()

        uid = torch.LongTensor([interaction[0]])
        iid = torch.LongTensor([interaction[1]])
        rating = torch.FloatTensor([interaction[2]])

        rating_hat = model(uid, iid)
        
        loss = loss_func(rating_hat, rating)

        # Backpropagate
        loss.backward()

        # Update the parameters
        optimizer.step()
        losses.append(float(loss))
    print(float(rating_hat), float(rating))
    print(np.mean(losses))

  uid = torch.LongTensor([interaction[0]])
  iid = torch.LongTensor([interaction[1]])


-1.3763679265975952 3.0
18.678148506933
-0.5110791921615601 3.0
9.97163856405116
0.0740247368812561 3.0
6.243807148733473
0.4678950905799866 3.0
4.340074148541971
0.7513111233711243 3.0
3.2362368476229526
0.9718310236930847 3.0
2.53661331903631
1.1541986465454102 3.0
2.063295223851057
1.310887336730957 3.0
1.7267331345520889
1.4483671188354492 3.0
1.477839172643969
1.5702874660491943 3.0
1.2878527026050977
1.6790037155151367 3.0
1.1389981501069297
1.7762291431427002 3.0
1.0197961978025207
1.8633568286895752 3.0
0.9225561234628455
1.941561222076416 3.0
0.8419619365242246
2.01187801361084 3.0
0.7742388595543321
2.0752124786376953 3.0
0.7166431872082812
2.1323513984680176 3.0
0.6671392323364135
2.1839938163757324 3.0
0.6241889512010361
2.230754852294922 3.0
0.5866113728424904
2.27315616607666 3.0
0.5534864734516612
