In [None]:
# One-time setup: download and extract the MovieLens 1M dataset (uncomment to run).
# !wget -O ./data/moviedataset.zip http://files.grouplens.org/datasets/movielens/ml-1m.zip
# !unzip -o ./data/moviedataset.zip -d ./data
# Dataset source: https://grouplens.org/datasets/movielens/

In [None]:
from tensorflow import  keras
from keras.datasets.mnist import load_data
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neural_network import BernoulliRBM
import pandas as pd
import numpy as np


In [None]:
# MovieID::Title::Genres
# The '::' delimiter is longer than one character, so only pandas' Python
# parsing engine can handle it; requesting it explicitly avoids the
# ParserWarning raised when the default C engine has to fall back.
# NOTE(review): the MovieLens 1M movies.dat is understood to be Latin-1
# encoded (accented characters in titles); without an explicit encoding,
# recent pandas versions fail with UnicodeDecodeError. Confirm against the file.
df_movies = pd.read_csv(
    'movies.dat',
    sep='::',
    header=None,
    engine='python',
    encoding='latin-1',
)
df_movies.head(2)

  


Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy


In [None]:
# UserID::MovieID::Rating::Timestamp
# The multi-character '::' delimiter is only supported by the Python parsing
# engine; selecting it explicitly avoids the ParserWarning that pandas emits
# when it silently falls back from the C engine.
df_ratings = pd.read_csv('ratings.dat', sep='::', header=None, engine='python')
df_ratings.head(2)

  


Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109


In [None]:
# UserID::Gender::Age::Occupation::Zip-code
# The multi-character '::' delimiter requires the Python parsing engine;
# selecting it explicitly avoids the C-engine fallback ParserWarning.
# Column 4 (Zip-code) is an identifier, not a quantity, so it is read as text
# to keep any leading zeros intact.
df_users = pd.read_csv(
    'users.dat',
    sep='::',
    header=None,
    engine='python',
    dtype={4: str},
)
df_users.head(2)

  


Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072


In [None]:
# Give the positional columns (0, 1, 2) the names documented for movies.dat.
# MovieID stays a regular column; it is not promoted to the index.
movie_column_names = ['MovieID', 'Title', 'Genres']
df_movies.columns = movie_column_names
df_movies.head(2)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy


In [None]:
# Give the positional columns (0..3) the names documented for ratings.dat.
rating_column_names = ['UserID', 'MovieID', 'Rating', 'Timestamp']
df_ratings.columns = rating_column_names
df_ratings.head(2)

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109


In [None]:
# A Restricted Boltzmann Machine (RBM) has two layers of neurons: a visible
# (input) layer and a hidden layer. The hidden layer learns features from the
# information fed through the input layer. In our model the input layer
# contains X neurons, where X is the number of movies in our dataset. Each
# neuron holds a normalized rating between 0 and 1: 0 means the user has not
# watched the movie, and the closer the value is to 1, the more the user likes
# the movie that neuron represents. These values are extracted from the
# ratings dataset and normalized.

# After passing in the input, we train the RBM on it so that the hidden layer
# learns its features. These features are then used to reconstruct the input;
# in our case the reconstruction predicts ratings for movies the user has not
# watched, which is exactly what we can use to recommend movies!

In [None]:
# Number of rows (movies) in the movies table.
df_movies.shape[0]

3883

In [None]:
# Reshape the long ratings table into a user x movie matrix: one row per
# UserID, one column per MovieID, each cell holding that user's rating
# (NaN where the user has no rating for the movie).
ratings_by_user_and_movie = df_ratings.set_index(['UserID', 'MovieID'])['Rating']
user_rating_df = ratings_by_user_and_movie.unstack('MovieID')
user_rating_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,3913,3914,3915,3916,3917,3918,3919,3920,3921,3922,3923,3924,3925,3926,3927,3928,3929,3930,3931,3932,3933,3934,3935,3936,3937,3938,3939,3940,3941,3942,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,,,,,,2.0,,,,,,,,,,3.0,,,,,,,,1.0,,,,,5.0,,,4.0,,4.0,,3.0,,,3.0,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Missing ratings become 0, then every rating is divided by 5.0
# (presumably the maximum rating) so the values fed to the RBM lie in [0, 1].
filled_user_rating_df = user_rating_df.fillna(0)
norm_user_rating_df = filled_user_rating_df.div(5.0)

# Plain NumPy matrix (users x movies) used as the training input.
trX = norm_user_rating_df.values
trX[:5]

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Layer sizes for the RBM: a fixed number of hidden units, and one visible
# unit per movie column of the user x movie rating matrix.
hiddenUnits = 20
visibleUnits = user_rating_df.shape[1]
visibleUnits

3706

In [None]:
# n_components is hiddenUnits
rbm = BernoulliRBM(n_components=hiddenUnits, learning_rate=0.01, n_iter=15, verbose=1)
rbm.fit(trX)

[BernoulliRBM] Iteration 1, pseudo-likelihood = -832.99, time = 2.96s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -421.10, time = 3.11s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -423.05, time = 3.08s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -329.31, time = 3.08s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -333.99, time = 3.10s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -335.33, time = 3.08s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -310.74, time = 3.15s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -340.91, time = 3.13s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -315.56, time = 3.13s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -325.28, time = 3.05s
[BernoulliRBM] Iteration 11, pseudo-likelihood = -275.58, time = 3.06s
[BernoulliRBM] Iteration 12, pseudo-likelihood = -325.60, time = 3.07s
[BernoulliRBM] Iteration 13, pseudo-likelihood = -304.71, time = 3.07s
[BernoulliRBM] Iteration 14, pseudo-likelihood = -302.93, time = 3.07s
[BernoulliRBM] 

BernoulliRBM(batch_size=10, learning_rate=0.01, n_components=20, n_iter=15,
             random_state=None, verbose=1)

In [None]:
mock_user_id = 215
# Select the input user by UserID label rather than by `mock_user_id - 1`:
# positional arithmetic is only right when UserIDs are contiguous and start
# at 1. The rows of trX follow the index of norm_user_rating_df, so the
# label's position in that index is the matching row of trX. An unknown
# UserID raises KeyError instead of silently picking another user.
user_row = norm_user_rating_df.index.get_loc(mock_user_id)
# Reshape to a single-sample 2-D array of shape (1, n_movies).
inputUser = trX[user_row].reshape(1, -1)
inputUser[0:5]

array([[0.8, 0. , 0. , ..., 0. , 0. , 0. ]])

In [None]:
# For a description of the BernoulliRBM methods, see:
# https://www.gabormelli.com/RKB/sklearn.neural_network.BernoulliRBM

In [None]:
# Run the selected user's rating vector through the trained RBM's Gibbs
# sampling routine to obtain a reconstructed visible layer.
# NOTE(review): per the scikit-learn docs, gibbs() performs a single sampling
# step and returns sampled values, not probabilities - confirm that this is
# the intended recommendation score.
result = rbm.gibbs(inputUser)
# One reconstructed value per visible unit (movie column).
result.shape[1]

3706

3706