In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/boltzmann-machines/Boltzmann_Machines/Boltzmann_Machine.ipynb
/kaggle/input/boltzmann-machines/Boltzmann_Machines/boltzmann_machine.py
/kaggle/input/boltzmann-machines/Boltzmann_Machines/AItRBM-proof.pdf
/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-1m/Train_Test_Set_Creation.R
/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-1m/test_set.csv
/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-1m/ratings.csv
/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-1m/users.dat
/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-1m/ratings.dat
/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-1m/README
/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-1m/.Rhistory
/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-1m/training_set.csv
/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-1m/movies.dat
/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-100k/u.occupation
/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-100k/u1.base
/

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [3]:
#Importing the dataset
# We are using separator as :: and not a comma as some movies might have commas in their names itself.
#Our movie file doesn't contain headers, so instead of the default header='infer', which infers
#the headers from the column names, we put it to 'none.'
#engine=python for efficiency.
#Some movies contain names which can't be decoded by the default 'UTF-8' encoder so we are using
#'latin-1' instead.
movies=pd.read_csv('/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-1m/movies.dat',
                  sep ='::',
                  header= None,
                  engine='python',
                  encoding='latin-1')
users=pd.read_csv('/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-1m/users.dat',
                  sep ='::',
                  header= None,
                  engine='python',
                  encoding='latin-1')
ratings= pd.read_csv('/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-1m/ratings.dat',
                  sep ='::',
                  header= None,
                  engine='python',
                  encoding='latin-1')

In [4]:
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
# We are just taking u1,u2 focusing on autoencoders, instead of all the 5 u1-u5 train test splits
# which are there for cross validation.
# The dataset has `tab` as the delimiter, so we use `tab`. By default it is `comma`
# We use `tab` with delimiter and `::` with separator as sep.

training_set=pd.read_csv('/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-100k/u1.base',
                        delimiter='\t')
test_set=pd.read_csv('/kaggle/input/boltzmann-machines/Boltzmann_Machines/ml-100k/u1.test',
                    delimiter='\t')

In [6]:
training_set=np.array(training_set,dtype='int')
test_set=np.array(test_set,dtype='int')

In [7]:
# We will create two matrices mat[u][i] where u: user, i:movie, m[u][i]:rating of movie i given by user u.
#If user u didn't rate movie i, we will put a 0 in that cell, for both the training and test sets.

# We are finding the max in both training and test set to get the last user and last movie,
# as the data might be random and maximum might be present in any of the two.
nb_users= int(max(max(training_set[:,0]),max(test_set[:,0]))) #max. of first column in the training
              # and test sets. The first column contains the user ids.
nb_movies= int(max(max(training_set[:,1]),max(test_set[:,1]))) #max. of second column in the training
              # and test sets. The second column contains the movie names.

In [8]:
nb_users,nb_movies

(943, 1682)

In [9]:
#All movie_ids are stored in  data[:,1].
#User ids are stored in data[:,0]
#Ratings are stored in data[:,2]
def convert(data):
    new_data=[]
    for id_users in range(1,nb_users+1):
        id_movies=data[:,1][data[:,0]==id_users] #all the movies watched by the ith(id_user) user and rated.
        id_ratings=data[:,2][data[:,0]==id_users] #all the ratings given by the ith(id_user) user
        #Still we are not getting a '0'for when the user didnot rate the movie.
        #Read Below:
        ratings=np.zeros(nb_movies)
        ratings[id_movies-1]= id_ratings #ratings for each of the 1682 movies given by user=id_user(i th user in loop)
        new_data.append(list(ratings)) #list of lists, each list containing ratings given by all the different users.
    return new_data
training_set=convert(training_set)
test_set=convert(test_set)

There are 1682 movies and 982 users, so we have to create a list of 1682 users with '0'.
For movies, the user rated we have to replace '0' by the user rating, for that movie.

Also, indexes in python start by zero and our movie ids start by 1. Therefore ratings[id_movies-1]

In [10]:
# training_set_array = np.array(training_set)
# print(training_set_array.shape)

In [11]:
#If we uncomment the above command we will get (943,1682) as output after converting the list into
# a numpy nd-array to know its dimensions.

# as we have converted the nd-array training_set into list of lists.

#Therefore we can see each of the 943 users have a list of 1682 movies with their ratings from 0-5 each
# in floating point decimals format.

#We had to do it in the above format because training_set is a list of lists and not a numpy nd
#array or pandas data frame, so we had to get it into a nd-array format.

# We can also get the dimensions of the list of lists by doing the command below:


In [12]:
len(training_set),len(training_set[0])

(943, 1682)

In [13]:
#Converting the data into torch tensors
training_set=  torch.FloatTensor(training_set)
test_set=  torch.FloatTensor(test_set)#This expects a list of lists, which it converts into a torch tensor.

In [14]:
#Converting the ratings into binary ratings 1 (Liked) or Not Liked(0)
#RBM will predict in 0-1 binary format whether the user liked a movie or not, therefore we need to convert
#these ratings as well into 0-1  format, otherwise things will be inconsistent for the RBM.

# Where training_set=0, means user hasn't watched the movie and we replace it by `-1`

training_set[training_set==0]=-1;
training_set[training_set<=2]=0;
training_set[training_set>=3]=1;

test_set[test_set==0]=-1;
test_set[test_set<=2]=0;
test_set[test_set>=3]=1;


In [21]:
# An RBM is a probabilistic graphical model, so we will build one.

#Creating the architecture of the Neural Network:
        
    #Input-vector(1-user), #vk-visible nodes obtained after k-samplings
    #ph0:vector of probabilities that at the first iteration, the hidden nodes=1, given the values
    #of v0, phk->probabilites of hidden nodes after k-sampling, given the values of visible nodes(vk).
    
    
class RBM():
  def __init__(self, nv, nh):
    self.W = torch.randn(nh, nv)
    self.a = torch.randn(1, nh)
    self.b = torch.randn(1, nv) #h-hidden node, v-visible node.
  def sample_h(self, x): #x-visible node
    wx = torch.mm(x, self.W.t())
    activation = wx + self.a.expand_as(wx)
    p_h_given_v = torch.sigmoid(activation)
    return p_h_given_v, torch.bernoulli(p_h_given_v)
  def sample_v(self, y): #y represents hidden nodes.
    wy = torch.mm(y, self.W)
    activation = wy + self.b.expand_as(wy)
    p_v_given_h = torch.sigmoid(activation)
    return p_v_given_h, torch.bernoulli(p_v_given_h)
  def train(self, v0, vk, ph0, phk):
    self.W += (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)).t()
    self.b += torch.sum((v0 - vk), 0)
    self.a += torch.sum((ph0 - phk), 0)
nv = len(training_set[0])
nh = 100
batch_size = 100
rbm = RBM(nv, nh)
        

torch.randn(nh,nv) initializes a tensor of size (nh,nv),
according to a normal distribution, and besides this normal distribution has
a mean of zero and variance of 1. Hence, this initializes all the weights for the probabilities.
(P of the visible nodes according to the hidden nodes).

The biases a(for hidden nodes) and b(for visible nodes), should have 2-dimensions because that's what pytorch accepts.
First dimension corresponding to the batch(fake dimension), and second dimension corresponding to the bias.

Pytorch only accepts a 2-D tensor

that's why we create self.a= torch.randn(1,nh)-> 1 represents the batch and nh corresponding to the bias(normalized in 0-1).

def sample_h(self, x):
        Calculating probability of h given v,
        probability that the hidden neuron=1 given the values of the visible neurons,
        that is actually our input vector of observations with all the ratings
        
        This probability is nothing else than the sigmoid function.
        applied to W(vector of weights)*x(vector of visible neurons)+the bias(a),
        because a corresponds to the bias of the hidden nodes.
        
        torch.mm -->multiplies two matrices
 
We take the transpose of self.w to make it mathematically correct.
self.a is a 2-D matrix that contains the input vector in batches(mini batch), although the batch size is 1.
We want to make sure that we apply this bias to each line of the mini batch, that is to each line of 
this dimension an to do that we add a new dimension for the batch that we are adding, and this function
is known by expand_as

The activation function will be a probability that the hidden node will be activated according to the value of the visible node.

p_h_given_v= probability that the hidden node is activated given the value of the visible node.


We are making a Bernouille RBM because we are predicting a binary outcome whether the users like (yes/no) for a movie.

And that is what we are returning, some Bernoulli samples of that distribution, of that probabilities
of h given v.


ph given v is a vector of 100 elements, each of these elements corresponds to each of the 100
hidden nodes and each of the elements is the probability that the hidden node is activated.


return p_h_given_v, torch.bernoulli(p_h_given_v)
This will return all the probabilities of the hidden neurons, given the values of the visible nodes.
That is the ratings and it will return also that sampling of the hidden neurons.

The function above is the first function we need for Gibb's sampling.

In function:
def sample_h(self, x):
    We return the probabilities that each of the visible nodes=1,given the values of the hidden nodes.
    given whether the hidden nodes are activated or not.
    
#### This is because we are recreating the Gibbs' Sampling where we recreate the Input vectors,
and also for the logarithmic function to minimise the gradient at the original input vector.

#### We must not take the transpose in the third function. Why?
Since W is the weight matrix of p_v_given_h and we are computing p_v_given_h in the second third function.
We were calculating p_h_given_v in the second function that's why we had to take the transpose of W.

#### Function 4:
We have to minimise the energy or maximize the log-likelihood, and to do that we need to compute the gradient.
Since the direct computation of gradients is heavy, we are going to try to approximate it.
We do that by Contrastive Divergence in Gibbs' Sampling.
#### Watch lecture no.110 to better understand function:4 and read the research paper as well. 

In [22]:
#nv: no. of visible nodes= each movie
nv= len(training_set[0])
nh=100 #Random number
batch_size=100 #Random number
rbm=RBM(nv,nh)

In [25]:
#Training the RBM
# we have 9000 users with binary(0,1) classification of whether or not the user liked the movie, hence we will reach
# convergence pretty fast in around 10 epochs.
nb_epoch=10
for epoch in range(1,nb_epoch+1):
    train_loss=0 #initial loss=0 before starting training.
    s=0. #o=counter to normalize train_loss(divide train_loss by the counter 0.= float)
    for id_user in range(0,nb_users-batch_size,batch_size): #taking input in batch size=100
        vk=training_set[id_user:id_user+batch_size] #visible nodes to be changed after gibbs' sampling iterations.
        v0=training_set[id_user:id_user+batch_size] #visible nodes at the start,i.e., the original ratings
        ph0,_= rbm.sample_h(v0) #probabilities that hidden node equal one given the visible nodes, also taking only first output returned by the function.
        for k in range(10):# training for 10 times a/c to gibbs' sampling.
            _,hk=rbm.sample_h(vk) #changing hidden nodes after sampling visible nodes.
            _,vk=rbm.sample_v(hk)#changing visible nodes after sampling hidden nodes.
            vk[v0<0] = v0[v0<0]#making sure that where movie ratings was -1( not given by user) persists even after samplings.
        phk,_= rbm.sample_h(vk) #sample_h applied on last iterated sample of visible nodes.
        rbm.train(v0,vk,ph0,phk)# Now weights,biases are going to be updated towards direction of maximum likelihood and the train
        #function does not return anything it just updates the weights and biases.
        train_loss += torch.mean(torch.abs(v0[v0 >= 0] - vk[v0 >= 0]))#vk:prediction, v0:original, v0[v0>=0]:ratings that exist/given by user.
        s+=1. #updating counter by 1(in float)
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/s)) #normalizing the training loss

epoch: 1 loss: tensor(0.0708)
epoch: 2 loss: tensor(0.0712)
epoch: 3 loss: tensor(0.0716)
epoch: 4 loss: tensor(0.0704)
epoch: 5 loss: tensor(0.0711)
epoch: 6 loss: tensor(0.0724)
epoch: 7 loss: tensor(0.0712)
epoch: 8 loss: tensor(0.0712)
epoch: 9 loss: tensor(0.0711)
epoch: 10 loss: tensor(0.0715)


#### ((1-0.07)/1) % of ratings correctly predicted.

In [27]:
#Testing the RBM
#No training required, hence no epochs necessary.
test_loss = 0
s = 0.
for id_user in range(nb_users):
    v = training_set[id_user : id_user+1] #id user+1 for each user, v  is input on which we will make prediction. READ BELOW:
    vt = test_set[id_user : id_user+1]  #target was mentioned by vt.
    if len(vt[vt>=0])>0:
        _,h = rbm.sample_h(v)
        _,v = rbm.sample_v(h)
        test_loss += torch.mean(torch.abs(vt[vt >= 0] - v[vt >= 0]))
        s += 1.
print('test_loss: '+str(test_loss/s))

test_loss: tensor(0.0670)


vk = training_set[id_user : id_user+1] The training set will be used to activate the hidden neurons to get the outputs.
Right now the training set contains the ratings of the training set and it doesn't contain the answers of the test set.
But by using the inputs of the training set, we will activate the neurons of our RBM to predict the ratings of the movies
that were not rated yet, and that is the ratings of the test set.

To get our predictions of the test set ratings, do we need to apply the k-step contrastive divergence or more precisely,
do we need k steps of the random walk(i.e.10 steps of the random walk) or 1 step of the random walk?
-> We need to make one step because the principle of the random walk, this is not the random walk even, because in the random walk the probabilities are the same. Here, even if its a Markov chain, the probabilities are not the same so its not a random walk rather its a blind walk.

Read on: MCMCM, Markov Chain Monte Carlo's blind walk techniques.

Above, we were trained on 10 steps, so we will be much better at staying on an imaginary straight line(imagine being blindfolded).

So, our prediction will be one round trip of Gibbs' Sampling, one iteration, one step of the blind walk.

Hence, we can see the test loss is 0.0670
So for new observations, we managed to predict if the user will like the movie or not by 93% ((1-0.067)/1j)