In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import variable

First, we will use a RBM (restricted boltzmann machine) to recommend movies to users.

In [2]:
# Load the data

movies = pd.read_csv('/home/gui/Downloads/udemy_deep_learning AZ/P16-Boltzmann-Machines/Boltzmann_Machines/ml-1m/movies.dat',
                sep='::', header=None, engine='python', encoding = 'latin-1')

users = pd.read_csv('/home/gui/Downloads/udemy_deep_learning AZ/P16-Boltzmann-Machines/Boltzmann_Machines/ml-1m/users.dat',
                sep='::', header=None, engine='python', encoding = 'latin-1')

ratings = pd.read_csv('/home/gui/Downloads/udemy_deep_learning AZ/P16-Boltzmann-Machines/Boltzmann_Machines/ml-1m/ratings.dat',
                sep='::', header=None, engine='python', encoding = 'latin-1')

In [3]:
movies.head() # Columns: MovieID, Title, Genres

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
users.head() # Columns: UserID, Sex, Age, JobID, Zip Code

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
ratings.head() # Columns: UserId, MovieID, Rating, Time Stamps

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
movies.shape, users.shape, ratings.shape

((3883, 3), (6040, 5), (1000209, 4))

In [7]:
# Training and test data are subsets of the ratings data
# Training and test contain the same users rating different movies in each set

ratings_train = pd.read_csv('/home/gui/Downloads/udemy_deep_learning AZ/P16-Boltzmann-Machines/Boltzmann_Machines/ml-100k/u1.base', delimiter='\t')
ratings_train = np.array(ratings_train, dtype='int')

ratings_test = pd.read_csv('/home/gui/Downloads/udemy_deep_learning AZ/P16-Boltzmann-Machines/Boltzmann_Machines/ml-100k/u1.test', delimiter='\t')
ratings_test = np.array(ratings_test, dtype='int')

In [8]:
ratings_train.shape, ratings_test.shape # 80% / 20% split

((79999, 4), (19999, 4))

In [9]:
# Take the number of users and movies in order to build rating matrices indexed by users and movies
# That's because in our RBM we will use user ratings as observations and movies as features
# See that IDs are sequential, so their max is the number of users/movies considered

n_users = int(max(max(ratings_train[:,0]),max(ratings_test[:,0])))
n_movies = int(max(max(ratings_train[:,1]),max(ratings_test[:,1])))

n_users, n_movies

(943, 1682)

In [10]:
# Now, format the train/test data in order to match pytorch input specifications
# First, we want a "matrix" that is a list of lists, each row is a list containing the ratings of a user for each movie
# If the user didn't rate a movie, the rating is 0 (ratings are integers from 1 to 5)

def FormatData(data):
    FData = []
    for userID in range(1, n_users+1): # Loop for each user; userID starts at 1
        movieID = data[:,1][data[:,0]==userID] # Take movieID for each movie rated by the user
        user_ratings = data[:,2][data[:,0]==userID] # Now take the ratings
        Fratings = np.zeros(n_movies) # Initialize new ratings with zeros to fill with rated movies
        Fratings[movieID-1] = user_ratings # Fill with user ratings; movieID starts at 1 (broadcasting only on indexes given by movieID)
        FData.append(list(Fratings))
    return FData

In [11]:
Fratings_train = FormatData(ratings_train)
len(Fratings_train), len(Fratings_train[0])

(943, 1682)

In [12]:
Fratings_test = FormatData(ratings_test)
len(Fratings_test), len(Fratings_test[0])

(943, 1682)

In [13]:
# Now, finalize data transformation by turning this matrix into a pytorch tensor
# A tensor is just a multidimensional matrix that could be implemented with numpy
# We use pytorch for computational efficiency (numpy is not optimized for tensors)

train = torch.FloatTensor(Fratings_train) # There will be used by the RBM
test = torch.FloatTensor(Fratings_test)

AE_train = torch.FloatTensor(Fratings_train) # Take train and test for the auto encoder
AE_test = torch.FloatTensor(Fratings_test)

train

tensor([[0., 3., 4.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 5., 0.,  ..., 0., 0., 0.]])

In [14]:
# RBM

# With the boltzmann machine, we want to know whether a user will like or not a new movie
# To make our input match, we make 0 rating = -1; 1, 2, and 3 rating = 0 (not liked); 4 and 5 rating = 1 (liked)

train[train==0] = -1
train[train==1] = 0
train[train==2] = 0
train[train==3] = 1
train[train>=4] = 1

test[test==0] = -1
test[test==1] = 0
test[test==2] = 0
test[test==3] = 1
test[test>=4] = 1

In [15]:
train[0].shape, train[:100].shape # dim(n_movies), dim(batch_size, n_movies); movies = features

(torch.Size([1682]), torch.Size([100, 1682]))

In [16]:
# Create a RBM class

# What the RBM will do? Users in train and test sets are the same, but the train set says the user watched some movies
# and the test set says the user watched some all different movies. So, the RBM takes the train set and learns the
# weights so as to function as a posterior density sampler that correctly correlates similar movies, based on how users
# liked movies together. Then, a new data point inputs with some watched movies and receives a sample given by the
# trained RBM with movies they will like and movies they will not like, based on the submitted ratings. For testing,
# we will take an user from the training set and input it to the RMB. We receive the recommendations for the remainning
# movies and then compare with the ratings that the same user gives in the test set. If the RBM is good, its
# recommendations will match the new movies rated by that user in the test set. For a prediction, you just input your
# ratings and receive the recommendation sampled by the RBM, saying that you'll like some of the other movies and won't
# like the rest

class RBM():
    def __init__(self, nv, nh): # Number of hidden and visible nodes
        self.W = torch.randn(nh, nv) # RMB's connection weights between hidden and visible nodes, starts randomly
        self.a = torch.randn(1, nh) # Bias for prob. of hidden nodes iven visible, 1d form is made compatible to 2d
        self.b = torch.randn(1, nv) # Bias for prob. of visible nodes given hidden
    
    # In the RBM, each activation function is a conditional probability that is given by a sigmoid function
    # The weights are adjusted by minimizing a log likelihood function
    # The log-likelihood gradients are approximated by contrastive divergence, that uses k-fold Gibbs sampling to
    # sample from visible nodes to update hidden samples and sample from hidden nodes to update visible samples
    # At the end of k Gibbs iterations, weights are updated with gradients approximated by the contrastive divergence
    
    def sample_h(self, x): # x will be a tensor of visible nodes of dim(batch_size, nv)
        xw = torch.mm(x, self.W.t()) # (batch_size, nv) x (nv, nh) = (batch_size, nh)
        activation = xw + self.a.expand_as(xw) # we expand a from (1, nh) to (batch_size, nh)
        ph_given_v = torch.sigmoid(activation)
        return ph_given_v, torch.bernoulli(ph_given_v) # h sample and ph, of dim (batch_size, nh)
    
    def sample_v(self, y): # the same as sample_h, exchanging v and h
        yw = torch.mm(y, self.W)
        activation = yw + self.b.expand_as(yw)
        pv_given_h = torch.sigmoid(activation)
        return pv_given_h, torch.bernoulli(pv_given_h)
    
    def train(self, vo, pho, vk, phk): # train by contrastive divergence, using input data against k-fold Gibbs samples
        self.W += (torch.mm(vo.t(),pho) - torch.mm(vk.t(),phk)).t()
        self.a += torch.sum((pho-phk),0) # torch.sum aggregates tensors, dim(x) is compatible with dim(1, x)
        self.b += torch.sum((vo-vk),0)
        
    def predict(self, new_data):
        _, h = self.sample_h(new_data)
        _, v = self.sample_v(h)
        return v

In [17]:
# initialize RMB

nv = len(train[0]) # number of visible nodes, number of features
nh = 100 # number of hidden nodes
batch_size = 100 # how many users go into each training step

rbm = RBM(nv,nh)

nv, nh

(1682, 100)

In [18]:
# Training

n_epochs = 10 # number of training steps

for epoch in range(1, n_epochs + 1):
    loss = 0 # loss function to assess error between epochs; measures difference between k-th sample and the input data
    s = 0. # counts the number of epochs for loss normalization
    
    for userID in range(0, n_users - batch_size, batch_size): # initialize batch for training, 
        vk = train[userID:userID+batch_size]
        vo = train[userID:userID+batch_size]
        
        for k in range(10): # k-fold Gibbs sampling, hidden nodes update visible nodes's density and vice versa
            _, hk = rbm.sample_h(vk)
            _, vk = rbm.sample_v(hk)
            vk[vo<0] = vo[vo<0] # reset unwatched movies having meaningless update
        
        pho, _ = rbm.sample_h(vo) # take hidden density values
        phk, _ = rbm.sample_h(vk)
        
        rbm.train(vo, pho, vk, phk) # train to update weights based on obtained vk and phk
        
        loss += torch.mean(torch.abs(vo[vo>=0]-vk[vo>=0])) # loss func is mean absolute difference (it's 0 or 1 anyway)
        s += 1.
    print('epoch: ', str(epoch), ' loss: ', str(loss/s))

epoch:  1  loss:  tensor(0.3419)
epoch:  2  loss:  tensor(0.2461)
epoch:  3  loss:  tensor(0.2481)
epoch:  4  loss:  tensor(0.2506)
epoch:  5  loss:  tensor(0.2474)
epoch:  6  loss:  tensor(0.2452)
epoch:  7  loss:  tensor(0.2488)
epoch:  8  loss:  tensor(0.2473)
epoch:  9  loss:  tensor(0.2448)
epoch:  10  loss:  tensor(0.2477)


In [19]:
train[2:3].shape, train[2].shape # see the difference in dimensions, we will use this below... this is because the first
                                 # is still a slice of train and the second is an element of it

(torch.Size([1, 1682]), torch.Size([1682]))

In [20]:
# Testing

test_loss = 0
s = 0.

for userID in range(n_users):
    v = train[userID:userID+1] # Take the user's training ratings
    vt = test[userID:userID+1] # Take the user's test ratings
    
    if len(vt[vt>=0]) > 0: # To ensure that we really have that user rating movies in the training set
        _,h = rbm.sample_h(v) # Sample hidden nodes that will give the posterior for recommendations
        _,v = rbm.sample_v(h) # Generate recommendations
        test_loss += torch.mean(torch.abs(vt[vt>=0] - v[vt>=0])) # Loss compares recommendations with test ratings
        s += 1.

print('test loss: '+str(test_loss/s)) # Test loss is good if kept close to train loss (meaning no overfitting)

test loss: tensor(0.2512)


In [21]:
# A prediction for user 21

pred = rbm.predict(test[20:21])
print('Test user 21 watched', str(len(test[20:21][test[20:21]>=0])), 'movies')
print('And liked', len(test[20:21][test[20:21]==0]) , 'of them')
print('Number of recommended movies:', str(len(pred[pred==1])))
print('Number of not recommended movies:', str(n_movies - len(pred[pred==1])))

Test user 21 watched 84 movies
And liked 37 of them
Number of recommended movies: 1160
Number of not recommended movies: 522


In [22]:
# We can see here that the user receives recommendations for already watched movies, and some of them are wrong
test[20:21][test[20:21]>0] == pred[test[20:21]>0]

tensor([ True,  True,  True,  True,  True,  True, False,  True, False,  True,
         True,  True, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True, False,  True,  True,  True,  True, False,  True,
         True,  True, False, False,  True, False,  True, False,  True,  True,
         True,  True, False,  True,  True,  True,  True])

In [23]:
# Now, we build an AutoEncoder to give non-binary recommendations, the actual predicted ratings, from 1 to 5

# Autoencoders are neural networks for unsupervised learning. It is a method of data compression / dimensionality
# reduction. Take an input layer and an output layer of the same size. Now, add a smaller hidden layer. The algorithm
# tries to learn the identity function, making the output the same as the input. If the input data is random, without
# structure, this will be hard. If it's structured, the network will be able to reproduce the input by learning the
# structure with the hidden layer. Then, the hidden layer will be a smaller representation of the input data, with
# reduced features / dimensions. We can also make the hidden layer bigger an add a sparcity constraint to make the
# neurons fire way less on average; this makes it possible for data structure to be learned more efficiently at the cost
# of dimensionality reduction. This is because we can associate in a more

In [24]:
# Stacked AutoEncoder class
# This architecture is a kind of deep autoencoder, with 3 hidden layers, of 20, 10, and 20 nodes

class SAE(nn.Module):
    def __init__(self, ):
        super(SAE, self).__init__() # Necessary for pytorch nn classes
        self.fc1 = nn.Linear(n_movies, 20) # This function builds the architecture
        self.fc2 = nn.Linear(20,10)
        self.fc3 = nn.Linear(10,20)
        self.fc4 = nn.Linear(20, n_movies)
        self.activation = nn.Sigmoid()
    def forward(self, x): # This function passes the data through the network
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)
        return x

In [25]:
# Initialize model

sae = SAE()

loss = nn.MSELoss() # Mean Square Error loss function
optimi = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5) # We choose RMSprop as optimization algorithm

In [26]:
# Training

n_epochs = 200

for epoch in range(1, n_epochs+1):
    train_loss = 0
    s = 0.
    
    for userID in range(n_users):
        D = AE_train[None, userID] # Take data
        T = D.clone() # Take target, here is the same, AE compares the output with the input itself
        
        if torch.sum(T.data > 0) > 0: # User has to have rated a movie
            output = sae(D)
            T.require_grad = False 
            output[T==0] = 0 # Reset non-rated movies
            L = loss(output,T)
            mean_corrector = n_movies/float(torch.sum(T.data > 0) + 1e-10) # Leave out non watched movies
            L.backward() # Compute loss
            train_loss += np.sqrt(L.item()*mean_corrector)
            s += 1.
            optimi.step() # Compute the optimization
    print('epoch :', epoch, ' loss :', train_loss/s)

epoch : 1  loss : 1.7715149109148893
epoch : 2  loss : 1.096687728815192
epoch : 3  loss : 1.0534122318199772
epoch : 4  loss : 1.0383853239892509
epoch : 5  loss : 1.030818054192946
epoch : 6  loss : 1.0265354084312588
epoch : 7  loss : 1.0238244704641435
epoch : 8  loss : 1.0220064819749113
epoch : 9  loss : 1.020216112204642
epoch : 10  loss : 1.0197007850983588
epoch : 11  loss : 1.0185181032649502
epoch : 12  loss : 1.0184571359518915
epoch : 13  loss : 1.0179434414347033
epoch : 14  loss : 1.017452886892715
epoch : 15  loss : 1.017495548274439
epoch : 16  loss : 1.0169047779410412
epoch : 17  loss : 1.0165627230521688
epoch : 18  loss : 1.016262887091639
epoch : 19  loss : 1.0161386072587186
epoch : 20  loss : 1.0160162400660269
epoch : 21  loss : 1.0159327388513368
epoch : 22  loss : 1.0161472004241539
epoch : 23  loss : 1.0157173420009935
epoch : 24  loss : 1.0158304517535013
epoch : 25  loss : 1.015597894480981
epoch : 26  loss : 1.0153955775904588
epoch : 27  loss : 1.0152723

In [32]:
# Test

test_loss = 0
s = 0.

for userID in range(n_users):
    D = AE_train[None, userID] # Take data
    T = AE_test[None, userID] # Take target, here it compares the output with the test set
    
    if torch.sum(T.data > 0) > 0:
        output = sae(D)
        T.require_grad = False
        output[T==0] = 0
        L = loss(output,T)
        mean_corrector = n_movies/float(torch.sum(T.data > 0) + 1e-10)
        L.backward()
        test_loss += np.sqrt(L.item()*mean_corrector)
        s += 1.
print('test loss :', test_loss/s)

test loss : 0.9712291417439184


This means that, on average, our autoencoder is off by slightly less than 1 star in reccommending actual ratings, from 1 to 5 stars.