In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 16 16:56:49 2018

@author: NVDL
"""
###Part 1 - Importing 
#Importing the libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [2]:
#Importing the dataset
movies = pd.read_csv('/Users/NVDL/Code/Practice_/Data/Math/Ranking/Movies/ml-1m/movies.dat',
                     sep='::',
                     header= None, 
                     engine = 'python', 
                     encoding ='latin-1')

users = pd.read_csv('/Users/NVDL/Code/Practice_/Data/Math/Ranking/Movies/ml-1m/users.dat',
                     sep='::',
                     header= None, 
                     engine = 'python', 
                     encoding ='latin-1')

ratings = pd.read_csv('/Users/NVDL/Code/Practice_/Data/Math/Ranking/Movies/ml-1m/ratings.dat',
                     sep='::',
                     header= None, 
                     engine = 'python', 
                     encoding ='latin-1')

In [3]:
### Part 2 - Preprocessing train/test set
#Preparing the training set 
training_set = pd.read_csv('/Users/NVDL/Code/Practice_/Data/Math/Ranking/Movies/ml-100k/u1.base',
                           delimiter = '\t') #80% of total set 
#Convert df training_set to array
training_set = np.array(training_set, dtype = 'int')

#Preparing the the test set
test_set = pd.read_csv('/Users/NVDL/Code/Practice_/Data/Math/Ranking/Movies/ml-100k/u1.test',
                           delimiter = '\t') #20% of total set 
#Convert df test_set to array
test_set = np.array(test_set, dtype = 'int')




In [4]:
#Getting the number of users and movies
num_users = int(max(max(training_set[:,0]), max(test_set[:,0]))) #first column, maximum user id
num_movies = int(max(max(training_set[:,1]), max(test_set[:,1]))) #first column, no. of movies


In [5]:
#Converting data into array with users in lines 
#and movies in columns (943 list of users with list of 1682 movies usersrating)
def convert(data):
    new_data = []
    for id_users in range(1, num_users + 1): #1, up to [shift data 1+] num_users 
        id_movies = data[:,1][data[:,0] == id_users] #movie of users
        id_ratings = data[:,2][data[:,0] == id_users] #movie rating of users
        ratings = np.zeros(num_movies)
        ratings[id_movies - 1] = id_ratings 
        new_data.append(list(ratings))
    return new_data 

In [6]:
training_set

array([[        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       [        1,         4,         3, 876893119],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]])

In [7]:
test_set

array([[        1,        10,         3, 875693118],
       [        1,        12,         5, 878542960],
       [        1,        14,         5, 874965706],
       ...,
       [      459,       934,         3, 879563639],
       [      460,        10,         3, 882912371],
       [      462,       682,         5, 886365231]])

In [8]:
#Convert training set to return list of customers with lists of rating per movie
training_set = convert(training_set)
test_set = convert(test_set)

In [9]:
#Converting the dataset into Torch tensors, multidimensional tensors
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)    

In [10]:
## Part 3 - Creating architecture of Neural Network
class SAE(nn.Module):
    def __init__(self, ): #Self = object of SAE
        super(SAE, self).__init__() #Super calls every inheritance modules from SAE
        self.fc1 = nn.Linear(num_movies, 20) #Input vector and first encoded vector of 20 elements for 1st hidden layers
        """
        If input is related to one of the encoded vectors in the hidden layer,
        it will activate that node.
        """
        self.fc2 = nn.Linear(20, 10) #Input from previous output, and new input to second hidden layer
        
        """
        Let's reconstruct our model from the hidden layers
        """
        
        self.fc3 = nn.Linear(10, 20)
        self.fc4 = nn.Linear(20, num_movies)
        
        """
        Let's create activation function
        """
        self.activation =nn.Sigmoid() #Sigmoid activation for the full network

    
    def forward(self, x): #Rating of movies per user
        """
        #x = activation of nodes in the input layer fc1 that converts 20 input
        nodes to 10 output nodes in the hidden layer, and uses activation function
        to encode input vector x. 
        """
        x = self.activation(self.fc1(x)) #Input to 20 elements
        x = self.activation(self.fc2(x)) #Input 20 elements to 10 elements
        
        #Reconstruct
        x = self.activation(self.fc3(x)) #Input 10 elements to 20 elements
        x = self.fc4(x) #Vector of predicted ratings
        
        return x 
    


In [11]:
sae = SAE() 
criterion = nn.MSELoss() #Object of sae 
optimizer = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5) #all features in our architecture


In [12]:
#Part 4 - Training & Testing SAE Model
epochs = 200 
for epoch in range(1,epochs+1): #Upperbound excluded in python
    #Initialise values that keeps information over epochs
    training_loss = 0 #at beginning of training
    #exclude variables without ratings, only users with > 1 ratings
    s= 0. #RMS is a float
    #let's get input vector of features per user
    for id_user in range(num_users):
        """
        We need to add another dimension corresponding to the batches, else 
        Pytorch, Keras etc. won't take the var in. 
        """
        input = Variable(training_set[id_user]).unsqueeze(0) #Correspond with index of user in the loop
        #Clone input
        target = input.clone() #A clone of ratings per user
        #If sum  observation > 0, thenn users rated at least 1 rating, 
        if torch.sum(target.data > 0) > 0:
            output = sae(input) #Applying object with real ratings to class SAE
            target.require_grad = False #we don't compute gradient with respect to target
            #Only include non-zero values for target
            output[target ==0] = 0 #Will not be included in rmsprop optimiser, for large datasets
            #Calculate loss 
            loss = criterion(output, target) #Output, vs  real non-zero ratings for movie per user
            """
            Average of error by movies that have non-zero ratings. 
            """
            mean_corrector = num_movies/float(torch.sum(target.data > 0) + 1e-10) 
            #Decrease loss
            loss.backward() #Which direction weights are updated
            #
            training_loss += np.sqrt(loss.item()*mean_corrector) 
            #Increment S by 1 per rated movie by user
            s += 1.
            #Optimizer to update the weights
            optimizer.step()  #Amount by which weights are updated
    #Print epoch,loss after every epoch
    print ('epoch: '+str(epoch)+ 'loss: '+str(training_loss/s)) #Loss of < 1 = optimal
 

epoch: 1loss: 1.770976260699006
epoch: 2loss: 1.0966909778060805
epoch: 3loss: 1.0536440186600466
epoch: 4loss: 1.0383528789871033
epoch: 5loss: 1.0310357578805416
epoch: 6loss: 1.0265327117832463
epoch: 7loss: 1.0240013224652358
epoch: 8loss: 1.0218246776954192
epoch: 9loss: 1.0208632993150688
epoch: 10loss: 1.019559880811075
epoch: 11loss: 1.0188678294418307
epoch: 12loss: 1.0182290285752373
epoch: 13loss: 1.0180218666973364
epoch: 14loss: 1.017370061074304
epoch: 15loss: 1.0170723034538427
epoch: 16loss: 1.0168677461742364
epoch: 17loss: 1.0168903566913232
epoch: 18loss: 1.0161682702519792
epoch: 19loss: 1.0164777885690595
epoch: 20loss: 1.0162646870058751
epoch: 21loss: 1.0161942278498992
epoch: 22loss: 1.0157987575012628
epoch: 23loss: 1.0160014347697466
epoch: 24loss: 1.0157687362230634
epoch: 25loss: 1.0158070378226538
epoch: 26loss: 1.01547029865176
epoch: 27loss: 1.0154456728306465
epoch: 28loss: 1.0149542365302469
epoch: 29loss: 1.0128178719735959
epoch: 30loss: 1.01114202723

In [13]:
test_loss = 0 #at beginning of testing
s= 0. #RMS is a float
#let's get input vector of features per user
for id_user in range(num_users):
    """
    We need to add another dimension corresponding to the batches, else 
    Pytorch, Keras etc. won't take the var in. 
    """
    input = Variable(training_set[id_user]).unsqueeze(0) #Use training_set to compare loss with real examples
    #Test set with real ratings
    target = Variable(test_set[id_user]).unsqueeze(0)
    if torch.sum(target.data > 0) > 0:#If sum  observation > 0, then users rated at least 1 rating 
        output = sae(input) #Applying object with real ratings to class SAE
        target.require_grad = False #we don't compute gradient with respect to target
        #Only include non-zero values for target
        output[target ==0] = 0 #Will not be included in rmsprop optimiser, for large datasets,thus no zeros included
        #Calculate loss 
        loss = criterion(output, target) #Output, vs  real non-zero ratings for movie per user
        """
        Average of error by movies that have non-zero ratings. Adjusting factor
        testing those with users|rating  > 1. Or rated at least n movie > 1. 
        """
        mean_corrector = num_movies/float(torch.sum(target.data > 0) + 1e-10) 
        #Loss of new data
        test_loss += np.sqrt(loss.item()*mean_corrector) 
        #Increment S by 1 per rated movie by user
        s += 1.
        
#Print epoch loss after every epoch
print ('test loss: '+str(test_loss/s)) #Loss of < 1 = optimal

test loss: 0.9496335469083546
