# `Recommendation System using Boltzmann Machine`

## Libaries  
Using pytorch

In [342]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

## `1) Data Preprocessing`

### Importing the dataset

#### Movies

In [343]:
movies = pd.read_csv("ml-1m/movies.dat",
                     sep="::",
                     header=None,
                     engine="python",
                     encoding="latin-1" # encoding for special characters
                     )

In [344]:
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [345]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       3883 non-null   int64 
 1   1       3883 non-null   object
 2   2       3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


#### Users

In [346]:
users = pd.read_csv("ml-1m/users.dat",
                     sep="::",
                     header=None,
                     engine="python",
                     encoding="latin-1"
                     )

In [347]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [348]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       6040 non-null   int64 
 1   1       6040 non-null   object
 2   2       6040 non-null   int64 
 3   3       6040 non-null   int64 
 4   4       6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


#### Ratings

In [349]:
ratings = pd.read_csv("ml-1m/ratings.dat",
                     sep="::",
                     header=None,
                     engine="python",
                     encoding="latin-1"
                     )

In [350]:
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [351]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   0       1000209 non-null  int64
 1   1       1000209 non-null  int64
 2   2       1000209 non-null  int64
 3   3       1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


### Prepare the training and test sets

#### Training set

In [352]:
train_set = pd.read_csv("ml-100k/u1.base",
                        delimiter="\t"
                        )

In [353]:
train_set.head()

Unnamed: 0,1,1.1,5,874965758
0,1,2,3,876893171
1,1,3,4,878542960
2,1,4,3,876893119
3,1,5,3,889751712
4,1,7,4,875071561


In [354]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79999 entries, 0 to 79998
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   1          79999 non-null  int64
 1   1.1        79999 non-null  int64
 2   5          79999 non-null  int64
 3   874965758  79999 non-null  int64
dtypes: int64(4)
memory usage: 2.4 MB


In [355]:
train_set = np.array(train_set, dtype="int")

In [356]:
train_set

array([[        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       [        1,         4,         3, 876893119],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]])

#### Test set

In [357]:
test_set = pd.read_csv("ml-100k/u1.test", delimiter="\t")

In [358]:
test_set.head()

Unnamed: 0,1,6,5,887431973
0,1,10,3,875693118
1,1,12,5,878542960
2,1,14,5,874965706
3,1,17,3,875073198
4,1,20,4,887431883


In [359]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   1          19999 non-null  int64
 1   6          19999 non-null  int64
 2   5          19999 non-null  int64
 3   887431973  19999 non-null  int64
dtypes: int64(4)
memory usage: 625.1 KB


In [360]:
test_set = np.array(test_set, dtype="int")

In [361]:
test_set

array([[        1,        10,         3, 875693118],
       [        1,        12,         5, 878542960],
       [        1,        14,         5, 874965706],
       ...,
       [      459,       934,         3, 879563639],
       [      460,        10,         3, 882912371],
       [      462,       682,         5, 886365231]])

### Getting the number of users and movies

In [362]:
no_users = int(max(max(train_set[:, 0]), max(test_set[:, 0])))
no_movies = int(max(max(train_set[:, 1]), max(test_set[:, 1])))

### Converting the data into an array with users in rows and movies in columns

In [363]:
def convert(data):
    new_data = []
    for id_users in range(1, no_users + 1):
        id_movies = data[:, 1][data[:, 0] == id_users] # all movies rated by user
        id_ratings = data[:, 2][data[:, 0] == id_users] # all ratings given by user
        ratings = np.zeros(no_movies)
        ratings[id_movies - 1] = id_ratings # ratings for each movie
        new_data.append(list(ratings))
    
    return new_data

train_set = convert(train_set)
test_set = convert(test_set)

### Converting the data into `Torch tensors`

In [364]:
train_set = torch.FloatTensor(train_set)
test_set = torch.FloatTensor(test_set)

In [365]:
train_set

tensor([[0., 3., 4.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 5., 0.,  ..., 0., 0., 0.]])

In [366]:
test_set

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

### Converting the ratings into `binary ratings`: 1 (Liked) or 0 (Not Liked)

In [367]:
train_set[train_set == 0] = -1 # unrated movies
train_set[train_set == 1] = 0
train_set[train_set == 2] = 0
train_set[train_set >= 3] = 1

test_set[train_set == 0] = -1
test_set[train_set == 1] = 0
test_set[train_set == 2] = 0
test_set[train_set >= 3] = 1

## `2) Creating the architecture of the Neural Network`

In [368]:
class RBM:
    # nv = number of visible nodes
    # nh = number of hidden nodes
    def __init__(self, nv, nh):
        self.W = torch.randn(nh, nv) # Matrix of weights
        self.a = torch.randn(1, nh)  # Bias for hidden nodes
        self.b = torch.randn(1, nv)  # Bias for visible nodes

    # Sample hidden nodes given visible nodes
    def sample_h(self, x):  # x = input vector of visible nodoes
        # wx is a vector of probabilities of activation of hidden nodes
        # Matrix multiplication of x and W
        wx = torch.mm(x, self.W.t()) # .t() = transpose
        
        # Add bias to each row of wx
        activation = wx + self.a.expand_as(wx)
        
        # Probability of activation of hidden nodes
        p_h_given_v = torch.sigmoid(activation)
        
        # bernoulli returns 1 with probability p_h_given_v and 0 otherwise
        return p_h_given_v, torch.bernoulli(p_h_given_v)
    
    # Sample visible nodes given hidden nodes
    def sample_v(self, y): # y = input vector of hidden nodes
        wy = torch.mm(y, self.W)
        
        activation = wy + self.b.expand_as(wy)
        
        # Probability of activation of visible nodes
        p_v_given_h = torch.sigmoid(activation)
        
        return p_v_given_h, torch.bernoulli(p_v_given_h)
    
    def train(self, v0, vk, ph0, phk):
        # v0 = input vector of visible nodes
        # vk = visible nodes after k sampling
        # ph0 = probability of activation of hidden nodes given v0
        # phk = probability of activation of hidden nodes given vk
        self.W += (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)).t()
        
        self.b += torch.sum((v0 - vk), 0)
        
        self.a += torch.sum((ph0 - phk), 0)

### Creating RBM object

In [369]:
nv = len(train_set[0])  # no. of visible nodes = no. of movies
nh = 100  # no. of hidden nodes = no. of features
batch_size = 100 # no. of users in each batch
rbm = RBM(nv, nh)

## `3) Training the RBM`

In [370]:
no_epoch = 10
for epoch in range(1, no_epoch+1):

    train_loss = 0  # loss for each epoch
    s = 0. # no. of users who rated at least one movie

    for id_user in range(0, no_users-batch_size, batch_size):

        # visible nodes after k sampling
        vk = train_set[id_user:id_user+batch_size]
        # input vector of visible nodes
        v0 = train_set[id_user:id_user+batch_size]
        # probability of activation of hidden nodes given v0
        ph0, _ = rbm.sample_h(v0)

        for k in range(10):  # k = no. of sampling
            _, hk = rbm.sample_h(vk)  # hk = hidden nodes after k sampling
            _, vk = rbm.sample_v(hk)  # vk = visible nodes after k sampling

            vk[v0 < 0] = v0[v0 < 0]  # keep unrated movies as they are

        # Probability of activation of hidden nodes given vk
        phk, _ = rbm.sample_h(vk)
        
        # Update weights
        rbm.train(v0, vk, ph0, phk)
        
        # Calculate loss
        train_loss += torch.mean(torch.abs(v0[v0 >= 0] - vk[v0 >= 0]))
        
        # Count no. of users
        s += 1.
    
    print(f"epoch: {epoch}, loss: {train_loss/s}")

epoch: 1, loss: 0.336189866065979
epoch: 2, loss: 0.24541375041007996
epoch: 3, loss: 0.2512347996234894
epoch: 4, loss: 0.247942715883255
epoch: 5, loss: 0.2464195340871811
epoch: 6, loss: 0.24945691227912903
epoch: 7, loss: 0.2500212490558624
epoch: 8, loss: 0.24569903314113617
epoch: 9, loss: 0.2481144219636917
epoch: 10, loss: 0.2471057027578354


## `4) Testing the RBM`

In [371]:
test_loss = 0
s = 0.

for id_user in range(no_users):

    v = train_set[id_user:id_user+1]
    vt = test_set[id_user:id_user+1]

    if len(vt[vt >= 0]) > 0:

        _, h = rbm.sample_h(v)
        _, v = rbm.sample_v(h)

        test_loss += torch.mean(torch.abs(vt[vt >= 0] - v[vt >= 0]))
        s += 1.

print(f"test loss: {test_loss/s}")

test loss: 0.7112565636634827
