# `Recommendation System using AutoEncoder`

## Libaries  
Using pytorch

In [123]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

## `1) Data Preprocessing`

### Importing the dataset

#### Movies

In [124]:
movies = pd.read_csv("ml-1m/movies.dat",
                     sep="::",
                     header=None,
                     engine="python",
                     encoding="latin-1" # encoding for special characters
                     )

In [125]:
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [126]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       3883 non-null   int64 
 1   1       3883 non-null   object
 2   2       3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


#### Users

In [127]:
users = pd.read_csv("ml-1m/users.dat",
                     sep="::",
                     header=None,
                     engine="python",
                     encoding="latin-1"
                     )

In [128]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [129]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       6040 non-null   int64 
 1   1       6040 non-null   object
 2   2       6040 non-null   int64 
 3   3       6040 non-null   int64 
 4   4       6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


#### Ratings

In [130]:
ratings = pd.read_csv("ml-1m/ratings.dat",
                     sep="::",
                     header=None,
                     engine="python",
                     encoding="latin-1"
                     )

In [131]:
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [132]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   0       1000209 non-null  int64
 1   1       1000209 non-null  int64
 2   2       1000209 non-null  int64
 3   3       1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


### Prepare the training and test sets

#### Training set

In [133]:
train_set = pd.read_csv("ml-100k/u1.base",
                        delimiter="\t"
                        )

In [134]:
train_set.head()

Unnamed: 0,1,1.1,5,874965758
0,1,2,3,876893171
1,1,3,4,878542960
2,1,4,3,876893119
3,1,5,3,889751712
4,1,7,4,875071561


In [135]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79999 entries, 0 to 79998
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   1          79999 non-null  int64
 1   1.1        79999 non-null  int64
 2   5          79999 non-null  int64
 3   874965758  79999 non-null  int64
dtypes: int64(4)
memory usage: 2.4 MB


In [136]:
train_set = np.array(train_set, dtype="int")

In [137]:
train_set

array([[        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       [        1,         4,         3, 876893119],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]])

#### Test set

In [138]:
test_set = pd.read_csv("ml-100k/u1.test", delimiter="\t")

In [139]:
test_set.head()

Unnamed: 0,1,6,5,887431973
0,1,10,3,875693118
1,1,12,5,878542960
2,1,14,5,874965706
3,1,17,3,875073198
4,1,20,4,887431883


In [140]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   1          19999 non-null  int64
 1   6          19999 non-null  int64
 2   5          19999 non-null  int64
 3   887431973  19999 non-null  int64
dtypes: int64(4)
memory usage: 625.1 KB


In [141]:
test_set = np.array(test_set, dtype="int")

In [142]:
test_set

array([[        1,        10,         3, 875693118],
       [        1,        12,         5, 878542960],
       [        1,        14,         5, 874965706],
       ...,
       [      459,       934,         3, 879563639],
       [      460,        10,         3, 882912371],
       [      462,       682,         5, 886365231]])

### Getting the number of users and movies

In [143]:
no_users = int(max(max(train_set[:, 0]), max(test_set[:, 0])))
no_movies = int(max(max(train_set[:, 1]), max(test_set[:, 1])))

### Converting the data into an array with users in rows and movies in columns

In [144]:
def convert(data):
    new_data = []
    for id_users in range(1, no_users + 1):
        id_movies = data[:, 1][data[:, 0] == id_users] # all movies rated by user
        id_ratings = data[:, 2][data[:, 0] == id_users] # all ratings given by user
        ratings = np.zeros(no_movies)
        ratings[id_movies - 1] = id_ratings # ratings for each movie
        new_data.append(list(ratings))
    
    return new_data

train_set = convert(train_set)
test_set = convert(test_set)

### Converting the data into `Torch tensors`

In [145]:
train_set = torch.FloatTensor(train_set)
test_set = torch.FloatTensor(test_set)

In [146]:
train_set

tensor([[0., 3., 4.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 5., 0.,  ..., 0., 0., 0.]])

In [147]:
test_set

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

## `2) Building the architecture of the Neural Network`

In [148]:
# Stacked AutoEncoder
class SAE(nn.Module):

    def __init__(self, ):
        super(SAE, self).__init__()  # inherit from nn.Module
        self.fc1 = nn.Linear(no_movies, 20)  # first hidden layer
        self.fc2 = nn.Linear(20, 10)  # second hidden layer
        self.fc3 = nn.Linear(10, 20)  # third hidden layer
        self.fc4 = nn.Linear(20, no_movies)  # fourth hidden layer
        self.activation = nn.Sigmoid()  # activation function

    # forward propagation
    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)
        
        return x

### Creating SAE object

In [149]:
sae = SAE()
criterion = nn.MSELoss() # loss function
optimizer = optim.RMSprop(sae.parameters(), # parameters to optimize
                          lr=0.01, # learning rate
                          weight_decay=0.5 # regularization
                          )

## `3) Training the SAE`

In [150]:
no_epoch = 50

for epoch in range(1, no_epoch + 1):
    
    train_loss = 0
    s = 0.
    
    for id_user in range(no_users):
        
        input = Variable(train_set[id_user]).unsqueeze(0)
        target = input.clone()
        if torch.sum(target.data > 0) > 0:
            output = sae(input)
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = no_movies / \
                float(torch.sum(target.data > 0) + 1e-10)
            loss.backward()
            
            train_loss += np.sqrt(loss.data * mean_corrector)

            s += 1.
            optimizer.step()
    
    print(f"epoch: {epoch}, loss: {train_loss/s}")

epoch: 1, loss: 1.7717725038528442
epoch: 2, loss: 1.0968667268753052
epoch: 3, loss: 1.0533068180084229
epoch: 4, loss: 1.0383820533752441
epoch: 5, loss: 1.0310888290405273
epoch: 6, loss: 1.0265321731567383
epoch: 7, loss: 1.023843765258789
epoch: 8, loss: 1.0219769477844238
epoch: 9, loss: 1.0206589698791504
epoch: 10, loss: 1.0198068618774414
epoch: 11, loss: 1.0187984704971313
epoch: 12, loss: 1.018229603767395
epoch: 13, loss: 1.0180617570877075
epoch: 14, loss: 1.0175716876983643
epoch: 15, loss: 1.0175323486328125
epoch: 16, loss: 1.0169435739517212
epoch: 17, loss: 1.016739010810852
epoch: 18, loss: 1.0162692070007324
epoch: 19, loss: 1.0168176889419556
epoch: 20, loss: 1.0161631107330322
epoch: 21, loss: 1.0162333250045776
epoch: 22, loss: 1.015907883644104
epoch: 23, loss: 1.0161758661270142
epoch: 24, loss: 1.015860915184021
epoch: 25, loss: 1.0156302452087402
epoch: 26, loss: 1.0156985521316528
epoch: 27, loss: 1.0150457620620728
epoch: 28, loss: 1.0150691270828247
epoch:

## `4) Testing the SAE`

In [152]:
test_loss = 0
s = 0.
for id_user in range(no_users):
    
    input = Variable(train_set[id_user]).unsqueeze(0)
    target = Variable(test_set[id_user]).unsqueeze(0)
    
    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        target.require_grad = False
        output[target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = no_movies / \
            float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.data * mean_corrector)
        s += 1.

print(f"test loss: {test_loss/s}")


test loss: 0.9911261200904846
