In [1]:
!wget https://files.grouplens.org/datasets/movielens/ml-25m.zip

--2023-12-11 05:29:29--  https://files.grouplens.org/datasets/movielens/ml-25m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 261978986 (250M) [application/zip]
Saving to: ‘ml-25m.zip’


2023-12-11 05:29:32 (86.6 MB/s) - ‘ml-25m.zip’ saved [261978986/261978986]



In [2]:
!pip install patool
import patoolib
patoolib.extract_archive('/content/ml-25m.zip')

Collecting patool
  Downloading patool-2.0.0-py2.py3-none-any.whl (93 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m71.7/93.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.7/93.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: patool
Successfully installed patool-2.0.0


INFO patool: Extracting /content/ml-25m.zip ...
INFO:patool:Extracting /content/ml-25m.zip ...
INFO patool: running /usr/bin/7z x -o./Unpack_cqx8kfkz -- /content/ml-25m.zip
INFO:patool:running /usr/bin/7z x -o./Unpack_cqx8kfkz -- /content/ml-25m.zip
INFO patool:     with input=''
INFO:patool:    with input=''
INFO patool: ... /content/ml-25m.zip extracted to `ml-25m'.
INFO:patool:... /content/ml-25m.zip extracted to `ml-25m'.


'ml-25m'

In [3]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader,Dataset
import numpy as np



In [56]:
ratings = pd.read_csv('/content/ml-25m/ratings.csv')

In [57]:
ratings.drop(axis=1 , labels = 'timestamp',inplace = True)
ratings = ratings.iloc[:50000,:]


In [58]:
class CustomTabularDataset(Dataset):
    def __init__(self, data):
        self.labels = data.iloc[:,-1]
        self.data = data.iloc[:,:-1]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, indx):
        data = torch.tensor(self.data.iloc[indx])
        label = torch.tensor(self.labels.iloc[indx],dtype=torch.float64)

        return data, label

In [59]:
Training , Test = train_test_split(ratings,test_size=0.2)

Training_dataset = CustomTabularDataset(Training)
Test_dataset = CustomTabularDataset(Test)

train_d = DataLoader(Training_dataset, batch_size = 256, drop_last=False, shuffle = True)
test_d = DataLoader(Test_dataset, batch_size = 256, drop_last=False, shuffle = True)

In [60]:
n_users = ratings.userId.nunique()
n_movies = ratings.movieId.nunique()

user_dict = pd.unique(ratings.userId)
movie_dict = pd.unique(ratings.movieId)

In [61]:
def create_params(size):
    return torch.nn.Parameter(torch.zeros(*size).normal_(0, 0.01))

In [62]:
class DotProductBias(torch.nn.Module):
    def __init__(self, n_users, n_movies, n_factors, movie_code=np.array([]) , user_code=np.array([])):
        super(DotProductBias, self).__init__()

        self.user_factors = create_params([n_users, n_factors])
        self.user_bias = create_params([n_users])
        self.movie_factors = create_params([n_movies, n_factors])
        self.movie_bias = create_params([n_movies])

        self.double()


        self.movie_code = movie_code
        self.user_code = user_code

    def forward(self, x):
        index_user_list = [int(np.where(self.user_code == np.array(a[0]))[0]) for a in x]
        index_movie_list = [int(np.where(self.movie_code == np.array(a[1]))[0]) for a in x]


        users = self.user_factors[index_user_list]
        movies = self.movie_factors[index_movie_list]


        res = (users*movies).sum(dim=1)
        res += self.user_bias[index_user_list] + self.movie_bias[index_movie_list]

        return torch.special.expit(res) * 5.5

In [83]:
model = DotProductBias(n_users, n_movies, 10,movie_code = movie_dict,user_code = user_dict)

In [84]:
for param in model.parameters():
  print(type(param), param.size())

<class 'torch.nn.parameter.Parameter'> torch.Size([406, 10])
<class 'torch.nn.parameter.Parameter'> torch.Size([406])
<class 'torch.nn.parameter.Parameter'> torch.Size([6489, 10])
<class 'torch.nn.parameter.Parameter'> torch.Size([6489])


In [85]:
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [86]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [87]:
def test(dataloader, model, loss_fn):
    size = float(len(dataloader.dataset))
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()

    test_loss /= num_batches


    print(f"Test Error: \n Avg loss: {test_loss:>8f} \n")

In [88]:
epochs = 20
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_d, model, loss_fn, optimizer)
    test(test_d, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 453.222779  [  256/40000]
loss: 420.735058  [25856/40000]
Test Error: 
 Avg loss: 388.005614 

Epoch 2
-------------------------------
loss: 416.626887  [  256/40000]
loss: 348.598168  [25856/40000]
Test Error: 
 Avg loss: 306.266594 

Epoch 3
-------------------------------
loss: 260.534174  [  256/40000]
loss: 255.193151  [25856/40000]
Test Error: 
 Avg loss: 247.681232 

Epoch 4
-------------------------------
loss: 210.801596  [  256/40000]
loss: 214.317632  [25856/40000]
Test Error: 
 Avg loss: 223.102042 

Epoch 5
-------------------------------
loss: 210.833920  [  256/40000]
loss: 204.171844  [25856/40000]
Test Error: 
 Avg loss: 211.885550 

Epoch 6
-------------------------------
loss: 147.705291  [  256/40000]
loss: 159.645735  [25856/40000]
Test Error: 
 Avg loss: 206.011366 

Epoch 7
-------------------------------
loss: 181.120640  [  256/40000]
loss: 173.262252  [25856/40000]
Test Error: 
 Avg loss: 202.308113 

Epoch 8
-----

In [89]:
data = ratings.head(40).drop(axis = 1 , labels ='rating')
tt = CustomTabularDataset(data)
predicts = pd.Series([float(a) for a in model(tt)])

In [90]:
show = ratings.head(40)
show['predict'] = predicts
show

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  show['predict'] = predicts


Unnamed: 0,userId,movieId,rating,predict
0,1,296,5.0,5.155449
1,1,306,3.5,4.823556
2,1,307,5.0,4.858306
3,1,665,5.0,4.926375
4,1,899,3.5,4.437815
5,1,1088,4.0,2.980213
6,1,1175,3.5,4.076943
7,1,1217,3.5,4.488943
8,1,1237,5.0,4.853171
9,1,1250,4.0,4.425707
