<a href="https://colab.research.google.com/github/Ma-Sheikhani/collab-matris-Movielens/blob/main/colab_matris%2C_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the MovieLens 25M archive (ml-25m.zip) from GroupLens via a shell escape.
!wget https://files.grouplens.org/datasets/movielens/ml-25m.zip

--2023-12-12 11:45:27--  https://files.grouplens.org/datasets/movielens/ml-25m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 261978986 (250M) [application/zip]
Saving to: ‘ml-25m.zip’


2023-12-12 11:45:32 (56.8 MB/s) - ‘ml-25m.zip’ saved [261978986/261978986]



In [None]:
!pip install patool
import patoolib
patoolib.extract_archive('/content/ml-25m.zip')

Collecting patool
  Downloading patool-2.0.0-py2.py3-none-any.whl (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.7/93.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: patool
Successfully installed patool-2.0.0


INFO patool: Extracting /content/ml-25m.zip ...
INFO:patool:Extracting /content/ml-25m.zip ...
INFO patool: running /usr/bin/7z x -o./Unpack_7tc0hl3l -- /content/ml-25m.zip
INFO:patool:running /usr/bin/7z x -o./Unpack_7tc0hl3l -- /content/ml-25m.zip
INFO patool:     with input=''
INFO:patool:    with input=''
INFO patool: ... /content/ml-25m.zip extracted to `ml-25m'.
INFO:patool:... /content/ml-25m.zip extracted to `ml-25m'.


'ml-25m'

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader,Dataset
import numpy as np



In [None]:
# Load the ratings table from the extracted archive into a DataFrame.
ratings = pd.read_csv(
    filepath_or_buffer='/content/ml-25m/ratings.csv',
)

In [None]:
# Keep the first 50,000 rows and discard the unused timestamp column.
# Written as one non-mutating expression (slice first, then drop with
# errors='ignore') so the cell is idempotent: an in-place drop raises KeyError
# when the cell is re-run after the column is already gone.
ratings = ratings.iloc[:50000, :].drop(columns='timestamp', errors='ignore')


In [None]:
class CustomTabularDataset(Dataset):
    """Map-style dataset over a DataFrame whose last column is the target.

    Every column except the last is treated as a feature; the last column is
    the label. Items are returned as ``(features, label)`` where ``features``
    is a 1-D tensor holding one row and ``label`` is a 0-D ``float64`` tensor.

    Args:
        data: pandas DataFrame with at least one feature column followed by
            the label column. All columns must be numeric.
    """

    def __init__(self, data):
        # The pandas views stay available under their original attribute names.
        self.labels = data.iloc[:, -1]
        self.data = data.iloc[:, :-1]

        # Convert once up front. Indexing a DataFrame with .iloc and building
        # a tensor for every single item is very slow when a DataLoader calls
        # __getitem__ tens of thousands of times per epoch.
        # torch.tensor() copies, so the tensors never alias pandas memory.
        self._features = torch.tensor(self.data.to_numpy())
        self._targets = torch.tensor(self.labels.to_numpy(), dtype=torch.float64)

    def __len__(self):
        """Return the number of rows."""
        return len(self.labels)

    def __getitem__(self, indx):
        """Return ``(features, label)`` for row ``indx``.

        Raises:
            IndexError: if ``indx`` is out of range (this also lets plain
                ``for item in dataset`` iteration terminate).
        """
        return self._features[indx], self._targets[indx]

In [None]:
# Hold out 20% of the ratings for evaluation. A fixed random_state makes the
# split itself reproducible across kernel restarts.
Training, Test = train_test_split(ratings, test_size=0.2, random_state=42)

Training_dataset = CustomTabularDataset(Training)
Test_dataset = CustomTabularDataset(Test)

# Only the training batches need shuffling; the evaluation loss is a sum over
# all batches, so its order is irrelevant and shuffling it is wasted work.
train_d = DataLoader(Training_dataset, batch_size=256, drop_last=False, shuffle=True)
test_d = DataLoader(Test_dataset, batch_size=256, drop_last=False, shuffle=False)

In [None]:
def create_params(size):
    """Return a trainable Parameter of shape ``size`` drawn from N(0, 0.01).

    Args:
        size: iterable of ints giving the tensor dimensions.
    """
    weights = torch.zeros(*size)
    weights.normal_(0, 0.01)  # in-place fill: mean 0, std 0.01
    return torch.nn.Parameter(weights)

In [None]:
class DotProductBias(torch.nn.Module):
    """Matrix-factorisation rating model with per-user and per-movie biases.

    The prediction for a (user, movie) pair is the dot product of their
    factor vectors plus both biases, passed through a sigmoid and scaled by
    5.5, so outputs lie in the open interval (0, 5.5).

    Args:
        n_users: number of rows in the user factor/bias tables.
        n_movies: number of rows in the movie factor/bias tables.
        n_factors: length of each factor vector.
        movie_code: sequence of raw movie ids; position ``i`` maps to row ``i``.
        user_code: sequence of raw user ids; position ``i`` maps to row ``i``.

    The id -> row lookup tables are built once at construction time from
    ``user_code`` / ``movie_code``.
    """

    def __init__(self, n_users, n_movies, n_factors, movie_code, user_code):
        super(DotProductBias, self).__init__()

        self.user_factors = create_params([n_users, n_factors])
        self.user_bias = create_params([n_users])
        self.movie_factors = create_params([n_movies, n_factors])
        self.movie_bias = create_params([n_movies])

        # Store every parameter as float64.
        self.double()

        self.movie_code = movie_code
        self.user_code = user_code

        # Hash lookups replace a full np.where scan of the code array per row,
        # and avoid int() on a 1-element array (deprecated since NumPy 1.25).
        self._user_index = {int(code): row for row, code in enumerate(user_code)}
        self._movie_index = {int(code): row for row, code in enumerate(movie_code)}

    @staticmethod
    def _rows_for(ids, index, kind):
        """Translate raw ids to table rows, naming the offending id on failure."""
        try:
            return [index[raw_id] for raw_id in ids]
        except KeyError as err:
            raise KeyError(f"unknown {kind} id {err.args[0]!r}") from None

    def forward(self, x):
        """Predict ratings for a batch of (user id, movie id) pairs.

        Args:
            x: iterable of rows where ``row[0]`` is the user id and ``row[1]``
               the movie id; typically an integer tensor of shape (batch, 2).
               Each entry must be convertible with ``int()``.

        Returns:
            1-D float64 tensor with one prediction per row.

        Raises:
            KeyError: if an id is not present in the code arrays given at
                construction.
        """
        user_ids, movie_ids = [], []
        for row in x:
            user_ids.append(int(row[0]))
            movie_ids.append(int(row[1]))

        index_user_list = self._rows_for(user_ids, self._user_index, "user")
        index_movie_list = self._rows_for(movie_ids, self._movie_index, "movie")

        users = self.user_factors[index_user_list]
        movies = self.movie_factors[index_movie_list]

        res = (users * movies).sum(dim=1)
        res = res + (self.user_bias[index_user_list] + self.movie_bias[index_movie_list])

        return torch.special.expit(res) * 5.5

In [None]:
# Distinct raw ids in order of first appearance; an id's position in these
# arrays is the row it is mapped to when passed to the model as a code array.
user_dict = pd.unique(ratings['userId'])
movie_dict = pd.unique(ratings['movieId'])

# Table sizes for the model: one row per distinct user / movie.
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()

In [None]:
# Build the model with 10 factors per user/movie.
model = DotProductBias(
    n_users=n_users,
    n_movies=n_movies,
    n_factors=10,
    movie_code=movie_dict,
    user_code=user_dict,
)

In [None]:
# List the type and shape of every learnable tensor registered on the model.
for weight in model.parameters():
    print(type(weight), weight.size())

<class 'torch.nn.parameter.Parameter'> torch.Size([406, 10])
<class 'torch.nn.parameter.Parameter'> torch.Size([406])
<class 'torch.nn.parameter.Parameter'> torch.Size([6489, 10])
<class 'torch.nn.parameter.Parameter'> torch.Size([6489])


In [None]:
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
# Squared error summed (not averaged) over each batch.
loss_fn = torch.nn.MSELoss(reduction='sum')

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one training epoch over ``dataloader``.

    Args:
        dataloader: DataLoader yielding ``(X, y)`` batches; its ``dataset``
            must support ``len()`` (used for the progress printout).
        model: module mapping a batch ``X`` to predictions.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: optimizer bound to ``model``'s parameters.

    Prints the batch loss and number of samples seen every 100 batches.
    """
    # Previously `size` was never defined here, so the progress print raised
    # NameError on a fresh kernel unless a global happened to exist.
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation. Gradients are cleared *before* backward so that any
        # stale gradients left on the parameters cannot leak into this step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

In [None]:
def test(dataloader, model, loss_fn):
    size = float(len(dataloader.dataset))
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()

    test_loss /= num_batches


    print(f"Test Error: \n Avg loss: {test_loss:>8f} \n")

In [None]:
# Alternate one training pass and one evaluation pass for a fixed number of epochs.
epochs = 20
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(dataloader=train_d, model=model, loss_fn=loss_fn, optimizer=optimizer)
    test(dataloader=test_d, model=model, loss_fn=loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 494.909381  [  256/40000]
loss: 448.698235  [25856/40000]
Test Error: 
 Avg loss: 385.005409 

Epoch 2
-------------------------------
loss: 424.383682  [  256/40000]
loss: 333.649951  [25856/40000]
Test Error: 
 Avg loss: 304.979796 

Epoch 3
-------------------------------
loss: 325.750266  [  256/40000]
loss: 234.945362  [25856/40000]
Test Error: 
 Avg loss: 245.934938 

Epoch 4
-------------------------------
loss: 240.672258  [  256/40000]
loss: 188.456969  [25856/40000]
Test Error: 
 Avg loss: 222.865779 

Epoch 5
-------------------------------
loss: 201.677911  [  256/40000]
loss: 183.653652  [25856/40000]
Test Error: 
 Avg loss: 213.117817 

Epoch 6
-------------------------------
loss: 183.137046  [  256/40000]
loss: 127.731001  [25856/40000]
Test Error: 
 Avg loss: 207.639847 

Epoch 7
-------------------------------
loss: 166.826911  [  256/40000]
loss: 145.952593  [25856/40000]
Test Error: 
 Avg loss: 204.284928 

Epoch 8
-----

In [None]:
# Score the first 40 (userId, movieId) pairs with the model.
# The ids are passed as a plain (40, 2) tensor, the same row layout the
# training batches use, instead of routing them through CustomTabularDataset,
# which would treat movieId as a float "label" and only works by accident.
data = ratings.head(40).drop(columns='rating')
tt = torch.tensor(data[['userId', 'movieId']].to_numpy())
model.eval()
with torch.no_grad():  # inference only; no autograd graph needed
    # Reuse the frame's index so the predictions align row-for-row with `ratings`.
    predicts = pd.Series(model(tt).tolist(), index=data.index)

In [None]:
# assign() returns a new frame, so the prediction column is added without
# writing into a slice of `ratings` (which triggers SettingWithCopyWarning).
show = ratings.head(40).assign(predict=predicts)
show

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  show['predict'] = predicts


Unnamed: 0,userId,movieId,rating,predict
0,1,296,5.0,4.575167
1,1,306,3.5,4.661409
2,1,307,5.0,4.763078
3,1,665,5.0,4.282281
4,1,899,3.5,4.056901
5,1,1088,4.0,3.577064
6,1,1175,3.5,4.32446
7,1,1217,3.5,3.940863
8,1,1237,5.0,4.520153
9,1,1250,4.0,4.124517
