#Download the Movie100klens Dataset

In [1]:
!git clone http://github.com/Manojkumar404/project.git

Cloning into 'project'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 6 (delta 0), reused 6 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), done.


#Import Required Libraries

In [2]:
import os
import csv
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
from torch import nn, optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

DATA_DIR = './project/'
OUTPUT_DIR = './'

class Config:
    device='cpu'
    epochs=40
    seed=17
    train_bs=8
    valid_bs=8
    embedding_dim=20
    lr=1e-2
    num_workers=None       
    verbose_step=100
    
def torch_seed_everything(seed_value=777):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True

config=Config()
torch_seed_everything(config.seed)

# load full data

In [3]:
df = pd.read_csv(os.path.join(DATA_DIR, 'u.data'), sep='\t', header=None)
df.columns = ['user_id', 'item_id', 'rating', 'timestamp']
#df = df.sort_values('timestamp').reset_index(drop=True)
n_user1 = df.user_id.nunique()
n_item1 = df.item_id.nunique()
print(df)

       user_id  item_id  rating  timestamp
0          196      242       3  881250949
1          186      302       3  891717742
2           22      377       1  878887116
3          244       51       2  880606923
4          166      346       1  886397596
...        ...      ...     ...        ...
99995      880      476       3  880175444
99996      716      204       5  879795543
99997      276     1090       1  874795795
99998       13      225       2  882399156
99999       12      203       3  879959583

[100000 rows x 4 columns]


#Data clustering 


In [4]:
csvfile = open('project/u.data', 'r').readlines()
filename = 1
for i in range(len(csvfile)):
     if i % 10000 == 0:
         open(str(filename) + '.data', 'w+').writelines(csvfile[i:i+10000])
         filename += 1

In [5]:
df1 = pd.read_csv(os.path.join('1.data'), sep='\t', header=None)
df1.columns = ['user_id', 'item_id', 'rating', 'timestamp']
n_user = df.user_id.nunique()
n_item = df.item_id.nunique()
df1

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
9995,74,150,3,888333458
9996,178,195,4,882826944
9997,321,527,3,879439763
9998,337,742,5,875184353


In [6]:
df2 = pd.read_csv(os.path.join( '2.data'), sep='\t', header=None)
df2.columns = ['user_id', 'item_id', 'rating', 'timestamp']
#df = df.sort_values('timestamp').reset_index(drop=True)
n_user2 = df2.user_id.nunique()
n_item2 = df2.item_id.nunique()
df2

Unnamed: 0,user_id,item_id,rating,timestamp
0,56,189,4,892683248
1,325,403,2,891479102
2,336,845,1,877758035
3,13,802,2,882398254
4,64,202,4,889738993
...,...,...,...,...
9995,21,320,3,874951658
9996,1,170,5,876892856
9997,436,66,5,887770457
9998,385,1021,5,879441572


In [7]:
df3 = pd.read_csv(os.path.join('3.data'), sep='\t', header=None)
df3.columns = ['user_id', 'item_id', 'rating', 'timestamp']
#df = df.sort_values('timestamp').reset_index(drop=True)
n_user3 = df3.user_id.nunique()
n_item3 = df3.item_id.nunique()
df3

Unnamed: 0,user_id,item_id,rating,timestamp
0,391,222,2,877399864
1,462,895,4,886365297
2,268,288,4,875306477
3,21,706,2,874951695
4,207,527,4,877879172
...,...,...,...,...
9995,392,114,4,891038401
9996,8,174,5,879362183
9997,244,655,5,880605766
9998,535,174,4,879617747


In [8]:
df4 = pd.read_csv(os.path.join('3.data'), sep='\t', header=None)
df4.columns = ['user_id', 'item_id', 'rating', 'timestamp']
#df = df.sort_values('timestamp').reset_index(drop=True)
n_user4 = df4.user_id.nunique()
n_item4 = df4.item_id.nunique()
df4

Unnamed: 0,user_id,item_id,rating,timestamp
0,391,222,2,877399864
1,462,895,4,886365297
2,268,288,4,875306477
3,21,706,2,874951695
4,207,527,4,877879172
...,...,...,...,...
9995,392,114,4,891038401
9996,8,174,5,879362183
9997,244,655,5,880605766
9998,535,174,4,879617747


#Matrix Factorization

In [9]:
class MF:

    def __init__(self,lmbda = 0.01, learning_rate=0.001,max_iteration=10000,rank=10,verbose=True,gap=None):

        self.lmbda = lmbda
        self.lr = learning_rate
        self.max_iteration = max_iteration
        self.rank = rank
        self.verb = verbose
        self.gap = gap
        self.U = None
        self.V = None
        self.gap = (max_iteration / 10) if gap is None else gap
        
    def mse(self,truth, pred):
        """Returns the mse of nonzero errors"""
        pred = pred[truth.nonzero()].flatten()
        truth = truth[truth.nonzero()].flatten()
        return mean_squared_error(truth, pred)
    def mae(self,truth, pred):
        """Returns the mse of nonzero errors"""
        pred = pred[truth.nonzero()].flatten()
        truth = truth[truth.nonzero()].flatten()
        return mean_absolute_error(truth, pred)

    def graph(self,testset=False):
        """
        Training and test graph with other meta data.
        """
        fig, ax = plt.subplots(facecolor='white',figsize=(10,5))
        train = [w[0] for w in self.history]
        test = [w[1] for w in self.history]
        x = list(range(0,self.max_iteration+2,int(self.gap)))
        ax.plot(x,train,color='red',label='Train MSE')
        if testset==True:
            ax.plot(x,test,color='green',label='Test MSE')
        ax.legend()
        ax.grid(True)
        ax.set_xlabel("Iteration")
        ax.set_ylabel("MSE")
        caption = f'lmbda: {lmb} lr: {self.lr} iteration: {self.max_iteration}'
        plt.title(caption)
        plt.show()

    def predict(self,query_embedding,type='neighbour',name='Aladdin',measure='cosine'):

        
        u = query_embedding
        V = self.V
        if measure == 'cosine':
            V = V / np.linalg.norm(V, axis=1, keepdims=True)
            u = u / np.linalg.norm(u)
        sim_vector = u.dot(V.T)
        return sim_vector

    
    def SGD(self,A,rated_rows,rated_cols,A_test=None):
     
        self.history= []
        for itr in range(self.max_iteration):
            # choosing an observed user,item combination
            u = np.random.choice(rated_rows)
            i = np.random.choice(rated_cols)
            #forward pass
            error = A[u,i] - np.dot(self.U[u], self.V[i])  # check this line alone
    #         cost = error**2 + lmbda * (np.linalg.norm(self.U[u])**2 + np.linalg.norm(self.V[i])**2)        
            # backward pass
            tmp = self.U[u]
            self.U[u] = self.U[u] + self.lr * (error * self.V[i] - self.lmbda * self.U[u])
            self.V[i] = self.V[i] + self.lr * (error * tmp - self.lmbda * self.V[i])
            
            if (itr % self.gap) == 0 or itr == self.max_iteration - 1:
                A_hat = np.dot(self.U,self.V.T)
                train_mse = self.mse(A,A_hat)
                test_mse = -1
                if isinstance(A_test,np.ndarray):
                    test_mse = self.mse(A_test,A_hat)
                self.history.append((train_mse,test_mse))
                if self.verb==True:
                    print("iteration %d, TrainMSE: %.2f TestMSE: %.2f"%
                          (itr,train_mse,test_mse))
                train_mae = self.mae(A,A_hat)
                test_mae = -1
                if isinstance(A_test,np.ndarray):
                    test_mae = self.mse(A_test,A_hat)
                self.history.append((train_mae,test_mae))
                if self.verb==True:
                    print("iteration %d, RMSE: %.2f  RMAE: %.2f"% (itr,train_mse,train_mae))
    
    def fit(self,A,A_test=None):
       
        rated_rows,rated_cols = A.nonzero()
        n_user = A.shape[0]
        n_item = A.shape[1]
        if self.U is None:
            self.U = np.random.rand(n_user,self.rank)
            self.V = np.random.rand(n_item,self.rank)
        # used in verbose mode
        self.SGD(A,rated_rows,rated_cols,A_test)

#Data Splitting

In [10]:
def split(data):
    """
    Splits 100k data between train and test set, and builts corresponding rating matrix, A for each set
    """
    # n_user = data.user_id.nunique()
    # n_item = data.item_id.nunique()
    train,test = train_test_split(data,test_size=0.3)
    # print(train.shape)
    # print(n_user.shape)
    def fun1(x):
        A_train[x[0],x[1]] = x[2]
        return x
    def fun2(x):
        A_test[x[0],x[1]] = x[2]
        return x
    A_train = np.zeros((n_user,n_item))
    A_test = np.zeros((n_user,n_item))
    train.apply(fun1, axis=1)
    test.apply(fun2,axis=1)
    return A_train, A_test

In [11]:
A_train,A_test = split(df1)
A_train2,A_test2 = split(df2)
A_train3,A_test3 = split(df3)
A_train4,A_test4 = split(df4)

# Gaussian Noise Integration

In [12]:
epsilon = 1

delta = 10e-5
sigma = np.sqrt(2 * np.log(1.25 / delta)) * 1 / epsilon
vals_gauss =np.random.normal(0,sigma,(943,1682))

A_train_noise = A_train+vals_gauss
A_test_noise = A_test+vals_gauss
A_train_noise2 = A_train2+vals_gauss
A_test_noise2 = A_test2+vals_gauss
A_train_noise3 = A_train3+vals_gauss
A_test_noise3 = A_test3+vals_gauss
A_train_noise4 = A_train4+vals_gauss
A_test_noise4 = A_test4+vals_gauss
print(A_train_noise)
print(A_test_noise)
print(A_train_noise2)
print(A_test_noise2)
print(A_train_noise3)
print(A_test_noise3)
print(A_train_noise4)
print(A_test_noise4)


[[ -4.86428999   4.46053943   3.15288465 ...  -6.62652113  -1.95274114
    2.33180541]
 [ -3.93889563  -5.63024471   6.28188206 ...   3.35266893   9.60115829
   -0.85717285]
 [ -4.9449349    0.66944911  -1.25879307 ...   3.27149888 -10.4852328
   -4.6998701 ]
 ...
 [ -2.59638922  -0.82190885   2.97301008 ...   4.32310057   1.24313722
   14.80027921]
 [  3.21660635   7.61570237   3.28495428 ...   2.13863357   2.28779659
    2.00918283]
 [ -5.2152718    8.62392449   5.15544842 ...   2.84942624   2.69560667
    0.21481734]]
[[ -4.86428999   4.46053943   3.15288465 ...  -6.62652113  -1.95274114
    2.33180541]
 [ -3.93889563  -5.63024471   6.28188206 ...   3.35266893   9.60115829
   -0.85717285]
 [ -4.9449349    0.66944911  -1.25879307 ...   3.27149888 -10.4852328
   -4.6998701 ]
 ...
 [ -2.59638922  -0.82190885   2.97301008 ...   4.32310057   1.24313722
   14.80027921]
 [  3.21660635   7.61570237   3.28495428 ...   2.13863357   2.28779659
    2.00918283]
 [ -5.2152718    8.62392449   5.15

#Cluster 1 Training with Gaussian noise





In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from IPython import display
import matplotlib.pyplot as plt

lmb = 0.1
lr = 0.001
mx_itr = 21600
gap = mx_itr / 10
view = True

model = MF(lmb,learning_rate=lr,max_iteration=mx_itr,rank=30,verbose=True)
model.fit(A_train_noise,A_test_noise)
# model.graph(testset=True)


iteration 0, TrainMSE: 76.90 TestMSE: 77.00
iteration 0, RMSE: 76.90  RMAE: 7.70
iteration 2160, TrainMSE: 73.85 TestMSE: 73.95
iteration 2160, RMSE: 73.85  RMAE: 7.51
iteration 4320, TrainMSE: 71.09 TestMSE: 71.18
iteration 4320, RMSE: 71.09  RMAE: 7.35
iteration 6480, TrainMSE: 68.43 TestMSE: 68.52
iteration 6480, RMSE: 68.43  RMAE: 7.18
iteration 8640, TrainMSE: 65.95 TestMSE: 66.04
iteration 8640, RMSE: 65.95  RMAE: 7.03
iteration 10800, TrainMSE: 63.72 TestMSE: 63.80
iteration 10800, RMSE: 63.72  RMAE: 6.88
iteration 12960, TrainMSE: 61.61 TestMSE: 61.70
iteration 12960, RMSE: 61.61  RMAE: 6.75
iteration 15120, TrainMSE: 59.61 TestMSE: 59.69
iteration 15120, RMSE: 59.61  RMAE: 6.61
iteration 17280, TrainMSE: 57.76 TestMSE: 57.84
iteration 17280, RMSE: 57.76  RMAE: 6.49
iteration 19440, TrainMSE: 56.06 TestMSE: 56.14
iteration 19440, RMSE: 56.06  RMAE: 6.38
iteration 21599, TrainMSE: 54.44 TestMSE: 54.51
iteration 21599, RMSE: 54.44  RMAE: 6.27


#Cluster 1 Training without Gaussian noise

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython import display
import matplotlib.pyplot as plt

lmb = 0.1
lr = 0.001
mx_itr = 21600
gap = mx_itr / 10
view = True

model1 = MF(lmb,learning_rate=lr,max_iteration=mx_itr,rank=30,verbose=True)
model1.fit(A_train,A_test)

iteration 0, TrainMSE: 18.27 TestMSE: 18.53
iteration 0, RMSE: 18.27  RMAE: 3.95
iteration 2160, TrainMSE: 12.72 TestMSE: 13.11
iteration 2160, RMSE: 12.72  RMAE: 3.19
iteration 4320, TrainMSE: 9.18 TestMSE: 9.63
iteration 4320, RMSE: 9.18  RMAE: 2.60
iteration 6480, TrainMSE: 7.00 TestMSE: 7.50
iteration 6480, RMSE: 7.00  RMAE: 2.18
iteration 8640, TrainMSE: 5.60 TestMSE: 6.06
iteration 8640, RMSE: 5.60  RMAE: 1.89
iteration 10800, TrainMSE: 4.67 TestMSE: 5.11
iteration 10800, RMSE: 4.67  RMAE: 1.70
iteration 12960, TrainMSE: 4.09 TestMSE: 4.49
iteration 12960, RMSE: 4.09  RMAE: 1.58
iteration 15120, TrainMSE: 3.75 TestMSE: 4.12
iteration 15120, RMSE: 3.75  RMAE: 1.52
iteration 17280, TrainMSE: 3.59 TestMSE: 3.92
iteration 17280, RMSE: 3.59  RMAE: 1.50
iteration 19440, TrainMSE: 3.54 TestMSE: 3.85
iteration 19440, RMSE: 3.54  RMAE: 1.50
iteration 21599, TrainMSE: 3.56 TestMSE: 3.83
iteration 21599, RMSE: 3.56  RMAE: 1.52


#Cluster 2 Training with Gaussian noise



In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython import display
import matplotlib.pyplot as plt

lmb = 0.1
lr = 0.001
mx_itr = 21600
gap = mx_itr / 10
view = True

model2 = MF(lmb,learning_rate=lr,max_iteration=mx_itr,rank=30,verbose=True)
model2.fit(A_train_noise2,A_test_noise2)


iteration 0, TrainMSE: 76.81 TestMSE: 76.91
iteration 0, RMSE: 76.81  RMAE: 7.69
iteration 2160, TrainMSE: 73.82 TestMSE: 73.91
iteration 2160, RMSE: 73.82  RMAE: 7.51
iteration 4320, TrainMSE: 70.98 TestMSE: 71.07
iteration 4320, RMSE: 70.98  RMAE: 7.34
iteration 6480, TrainMSE: 68.33 TestMSE: 68.42
iteration 6480, RMSE: 68.33  RMAE: 7.17
iteration 8640, TrainMSE: 65.87 TestMSE: 65.96
iteration 8640, RMSE: 65.87  RMAE: 7.02
iteration 10800, TrainMSE: 63.68 TestMSE: 63.76
iteration 10800, RMSE: 63.68  RMAE: 6.88
iteration 12960, TrainMSE: 61.56 TestMSE: 61.64
iteration 12960, RMSE: 61.56  RMAE: 6.74
iteration 15120, TrainMSE: 59.61 TestMSE: 59.68
iteration 15120, RMSE: 59.61  RMAE: 6.61
iteration 17280, TrainMSE: 57.78 TestMSE: 57.86
iteration 17280, RMSE: 57.78  RMAE: 6.49
iteration 19440, TrainMSE: 56.03 TestMSE: 56.10
iteration 19440, RMSE: 56.03  RMAE: 6.37
iteration 21599, TrainMSE: 54.46 TestMSE: 54.53
iteration 21599, RMSE: 54.46  RMAE: 6.27


#Cluster 2 Training without Gaussian noise

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython import display
import matplotlib.pyplot as plt

lmb = 0.1
lr = 0.001
mx_itr = 21600
gap = mx_itr / 10
view = True

model3 = MF(lmb,learning_rate=lr,max_iteration=mx_itr,rank=30,verbose=True)
model3.fit(A_train2,A_test2)

iteration 0, TrainMSE: 19.16 TestMSE: 19.82
iteration 0, RMSE: 19.16  RMAE: 4.04
iteration 2160, TrainMSE: 13.94 TestMSE: 14.75
iteration 2160, RMSE: 13.94  RMAE: 3.37
iteration 4320, TrainMSE: 10.40 TestMSE: 11.27
iteration 4320, RMSE: 10.40  RMAE: 2.82
iteration 6480, TrainMSE: 8.02 TestMSE: 8.86
iteration 6480, RMSE: 8.02  RMAE: 2.39
iteration 8640, TrainMSE: 6.38 TestMSE: 7.18
iteration 8640, RMSE: 6.38  RMAE: 2.07
iteration 10800, TrainMSE: 5.24 TestMSE: 5.96
iteration 10800, RMSE: 5.24  RMAE: 1.83
iteration 12960, TrainMSE: 4.47 TestMSE: 5.14
iteration 12960, RMSE: 4.47  RMAE: 1.67
iteration 15120, TrainMSE: 3.95 TestMSE: 4.55
iteration 15120, RMSE: 3.95  RMAE: 1.56
iteration 17280, TrainMSE: 3.60 TestMSE: 4.16
iteration 17280, RMSE: 3.60  RMAE: 1.49
iteration 19440, TrainMSE: 3.40 TestMSE: 3.89
iteration 19440, RMSE: 3.40  RMAE: 1.46
iteration 21599, TrainMSE: 3.30 TestMSE: 3.73
iteration 21599, RMSE: 3.30  RMAE: 1.44


#Cluster 3 Training with Gaussian noise



In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython import display
import matplotlib.pyplot as plt

lmb = 0.1
lr = 0.001
mx_itr = 21600
gap = mx_itr / 10
view = True

model4 = MF(lmb,learning_rate=lr,max_iteration=mx_itr,rank=30,verbose=True)
model4.fit(A_train_noise3,A_test_noise3)

iteration 0, TrainMSE: 76.72 TestMSE: 76.82
iteration 0, RMSE: 76.72  RMAE: 7.69
iteration 2160, TrainMSE: 73.70 TestMSE: 73.79
iteration 2160, RMSE: 73.70  RMAE: 7.51
iteration 4320, TrainMSE: 70.90 TestMSE: 70.99
iteration 4320, RMSE: 70.90  RMAE: 7.33
iteration 6480, TrainMSE: 68.33 TestMSE: 68.42
iteration 6480, RMSE: 68.33  RMAE: 7.17
iteration 8640, TrainMSE: 65.92 TestMSE: 66.01
iteration 8640, RMSE: 65.92  RMAE: 7.02
iteration 10800, TrainMSE: 63.69 TestMSE: 63.78
iteration 10800, RMSE: 63.69  RMAE: 6.88
iteration 12960, TrainMSE: 61.57 TestMSE: 61.65
iteration 12960, RMSE: 61.57  RMAE: 6.74
iteration 15120, TrainMSE: 59.63 TestMSE: 59.70
iteration 15120, RMSE: 59.63  RMAE: 6.62
iteration 17280, TrainMSE: 57.82 TestMSE: 57.90
iteration 17280, RMSE: 57.82  RMAE: 6.50
iteration 19440, TrainMSE: 56.16 TestMSE: 56.23
iteration 19440, RMSE: 56.16  RMAE: 6.38
iteration 21599, TrainMSE: 54.52 TestMSE: 54.59
iteration 21599, RMSE: 54.52  RMAE: 6.27


#Cluster 3 Training without Gaussian noise

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython import display
import matplotlib.pyplot as plt

lmb = 0.1
lr = 0.001
mx_itr = 21600
gap = mx_itr / 10
view = True

model5 = MF(lmb,learning_rate=lr,max_iteration=mx_itr,rank=30,verbose=True)
model5.fit(A_train3,A_test3)


iteration 0, TrainMSE: 18.36 TestMSE: 18.23
iteration 0, RMSE: 18.36  RMAE: 3.97
iteration 2160, TrainMSE: 13.76 TestMSE: 13.71
iteration 2160, RMSE: 13.76  RMAE: 3.36
iteration 4320, TrainMSE: 10.57 TestMSE: 10.63
iteration 4320, RMSE: 10.57  RMAE: 2.85
iteration 6480, TrainMSE: 8.34 TestMSE: 8.47
iteration 6480, RMSE: 8.34  RMAE: 2.46
iteration 8640, TrainMSE: 6.78 TestMSE: 6.99
iteration 8640, RMSE: 6.78  RMAE: 2.15
iteration 10800, TrainMSE: 5.65 TestMSE: 5.91
iteration 10800, RMSE: 5.65  RMAE: 1.92
iteration 12960, TrainMSE: 4.82 TestMSE: 5.11
iteration 12960, RMSE: 4.82  RMAE: 1.74
iteration 15120, TrainMSE: 4.22 TestMSE: 4.54
iteration 15120, RMSE: 4.22  RMAE: 1.61
iteration 17280, TrainMSE: 3.82 TestMSE: 4.14
iteration 17280, RMSE: 3.82  RMAE: 1.52
iteration 19440, TrainMSE: 3.53 TestMSE: 3.86
iteration 19440, RMSE: 3.53  RMAE: 1.46
iteration 21599, TrainMSE: 3.36 TestMSE: 3.69
iteration 21599, RMSE: 3.36  RMAE: 1.43


#Cluster 4 Training with Gaussian noise


In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython import display
import matplotlib.pyplot as plt

lmb = 0.1
lr = 0.001
mx_itr = 21600
gap = mx_itr / 10
view = True

model6 = MF(lmb,learning_rate=lr,max_iteration=mx_itr,rank=30,verbose=True)
model6.fit(A_train_noise4,A_test_noise4)


iteration 0, TrainMSE: 76.56 TestMSE: 76.66
iteration 0, RMSE: 76.56  RMAE: 7.68
iteration 2160, TrainMSE: 73.50 TestMSE: 73.59
iteration 2160, RMSE: 73.50  RMAE: 7.49
iteration 4320, TrainMSE: 70.66 TestMSE: 70.76
iteration 4320, RMSE: 70.66  RMAE: 7.32
iteration 6480, TrainMSE: 68.09 TestMSE: 68.17
iteration 6480, RMSE: 68.09  RMAE: 7.16
iteration 8640, TrainMSE: 65.69 TestMSE: 65.77
iteration 8640, RMSE: 65.69  RMAE: 7.01
iteration 10800, TrainMSE: 63.43 TestMSE: 63.51
iteration 10800, RMSE: 63.43  RMAE: 6.86
iteration 12960, TrainMSE: 61.37 TestMSE: 61.45
iteration 12960, RMSE: 61.37  RMAE: 6.73
iteration 15120, TrainMSE: 59.41 TestMSE: 59.49
iteration 15120, RMSE: 59.41  RMAE: 6.60
iteration 17280, TrainMSE: 57.61 TestMSE: 57.68
iteration 17280, RMSE: 57.61  RMAE: 6.48
iteration 19440, TrainMSE: 55.91 TestMSE: 55.98
iteration 19440, RMSE: 55.91  RMAE: 6.37
iteration 21599, TrainMSE: 54.30 TestMSE: 54.37
iteration 21599, RMSE: 54.30  RMAE: 6.26


#Cluster 4 Training without Gaussian noise

In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython import display
import matplotlib.pyplot as plt

lmb = 0.1
lr = 0.001
mx_itr = 21600
gap = mx_itr / 10
view = True

model7 = MF(lmb,learning_rate=lr,max_iteration=mx_itr,rank=30,verbose=True)
model7.fit(A_train4,A_test4)


iteration 0, TrainMSE: 18.90 TestMSE: 19.14
iteration 0, RMSE: 18.90  RMAE: 4.02
iteration 2160, TrainMSE: 14.13 TestMSE: 14.56
iteration 2160, RMSE: 14.13  RMAE: 3.40
iteration 4320, TrainMSE: 10.71 TestMSE: 11.27
iteration 4320, RMSE: 10.71  RMAE: 2.87
iteration 6480, TrainMSE: 8.40 TestMSE: 8.97
iteration 6480, RMSE: 8.40  RMAE: 2.47
iteration 8640, TrainMSE: 6.79 TestMSE: 7.38
iteration 8640, RMSE: 6.79  RMAE: 2.16
iteration 10800, TrainMSE: 5.66 TestMSE: 6.22
iteration 10800, RMSE: 5.66  RMAE: 1.92
iteration 12960, TrainMSE: 4.85 TestMSE: 5.38
iteration 12960, RMSE: 4.85  RMAE: 1.75
iteration 15120, TrainMSE: 4.27 TestMSE: 4.77
iteration 15120, RMSE: 4.27  RMAE: 1.63
iteration 17280, TrainMSE: 3.88 TestMSE: 4.35
iteration 17280, RMSE: 3.88  RMAE: 1.55
iteration 19440, TrainMSE: 3.62 TestMSE: 4.04
iteration 19440, RMSE: 3.62  RMAE: 1.49
iteration 21599, TrainMSE: 3.46 TestMSE: 3.83
iteration 21599, RMSE: 3.46  RMAE: 1.46


# Recommender System Inference

In [34]:
import json
import requests
import numpy as np


def store_score_matrix(model,name='Aladdin'):
    # might return multiple movies
    movieids =  item[item['movie_title'].str.contains(name)].index.values
    if len(movieids) == 0:
        print('No movie found by that name. Remember, searching is case-sensitive')
        return
    print('Found ',len(movieids),'searching by: ',item.loc[movieids[0],'movie_title'])
    query = model.V[movieids[0]] # a single movie embedding
    sim_vector = model.predict(query)
    out_arr = np.array_str(sim_vector, precision = 6, suppress_small = True)
    # print("Score matrix is ",out_arr)
    createurl = "http://80.209.228.87:5001/data_encrypt_api"
    headers = {'content-Type': 'application/json'}
    payload = {
        "block_no": "null",
        "block_hash":{"data":out_arr}

    }
    response = requests.post(createurl,data=json.dumps(payload),headers=headers)
    print(response.status_code)
    print(response.json())
    # item['similarity'] = sim_vector
    # top5 = item.sort_values(['similarity'],axis=0,inplace=False,ascending=False)[
    #     ['itemid','movie_title','genre','similarity','IMDb_URL']
    # ].head()
    # display.display(top5)

def retrive_score_matrix(sim_vector):

    item['similarity'] = sim_vector
    top5 = item.sort_values(['similarity'],axis=0,inplace=False,ascending=False)[
        ['itemid','movie_title','genre','similarity','IMDb_URL']
    ].head()
#     top5 = top5.append(item.loc[movieids[0],['itemid','movie_title','genre','similarity']])
    display.display(top5)



def get_movie_suggestion(model,name='Aladdin'):
    # might return multiple movies
    movieids =  item[item['movie_title'].str.contains(name)].index.values
    if len(movieids) == 0:
        print('No movie found by that name. Remember, searching is case-sensitive')
        return
    print('Found ',len(movieids),'searching by: ',item.loc[movieids[0],'movie_title'])
    query = model.V[movieids[0]] # a single movie embedding
    sim_vector = model.predict(query)
    print("Score matrix is ",sim_vector)
    item['similarity'] = sim_vector
    top5 = item.sort_values(['similarity'],axis=0,inplace=False,ascending=False)[
        ['itemid','movie_title','genre','similarity','IMDb_URL']
    ].head()
#     top5 = top5.append(item.loc[movieids[0],['itemid','movie_title','genre','similarity']])
    display.display(top5)

#Secured Scorematrix Storage

In [35]:
# data
itemcol = [w.strip().replace(" ","_") for w in "itemid | movie_title | release_date | video release date |IMDb URL | unknown | Action | Adventure | Animation |Children's | Comedy | Crime | Documentary | Drama | Fantasy |Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |Thriller | War | Western |".split('|')]
itemcol.remove('')
item = pd.read_csv("project/u.item",delimiter='|',
                   names=itemcol,encoding='latin_1')

item['itemid'] = item['itemid'].apply(lambda x: x - 1)
# Genre aggregate
def genre(series):
    genres = series.index[6:-2]
    
    text = []
    for i in genres:
        if series[i] == 1:
            text.append(i)
    return ", ".join(text)
item['genre'] = item.apply(genre,axis=1)
store_score_matrix(model7,name='Devil in a Blue Dress') ## here you can change model name and check for each clusters.Eg: model1 or model2 or model 3

Found  1 searching by:  Devil in a Blue Dress (1995)
200
{'block_no': '13', 'block_hash': 'U2FsdGVkX18Fte7WDj4/NpV+lnJyh2Md+9w0BP42WF3ghd3LmYQeA5ZDpntaQar5LUy7c2+p8kRyPDf4uFtCGBgzHZ2zKs3DjpLfivsNE/xaHAwIKZIs2+ZZnaxdT+ce67MuSAA6uA8/Zx3bVulmXNwZJuTeaf/Aq8eBIxzG/nw+ns/y2LvovweahV9S6hVbrcfWtLLB7/47zhImkh+9bfJrdlxrTFXtj5+JQ5LAQYcNDJcMKX0vp8xZqQkmKQ9EsaO9zY1OkvkNW64uj29labQZEmTvZQv3lNptC8idgEJmLgZ/djU4qAO02IPrKLpHxCHlD3Ts1EXkWKQbcHogEQ==244210e48437b6556980a70249a99369934a352429034cef9d7bd253b3bf2c01', '_id': 'cknr5dwtv000301mq4detbpnw', '__v': 0}


#Secure Score Matrix Retrive and Prediction

In [36]:
createurl = "http://80.209.228.87:5001/data_decrypt_api"
headers = {'content-Type': 'application/json'}

response = requests.get(createurl,headers=headers)
print(response.status_code)
a = response.json()
retrive_score_matrix(a['data'])


200


Unnamed: 0,itemid,movie_title,genre,similarity,IMDb_URL
0,0,Toy Story (1995),"Animation, Children's, Comedy",[0.682075 0.662934 0.529242 ... 0.680333 0.646...,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1104,1104,Firestorm (1998),"Action, Adventure, Thriller",[0.682075 0.662934 0.529242 ... 0.680333 0.646...,http://us.imdb.com/M/title-exact?imdb-title-12...
1128,1128,Chungking Express (1994),"Drama, Mystery, Romance",[0.682075 0.662934 0.529242 ... 0.680333 0.646...,http://us.imdb.com/M/title-exact?Chongqing%20S...
1127,1127,Heidi Fleiss: Hollywood Madam (1995),Documentary,[0.682075 0.662934 0.529242 ... 0.680333 0.646...,http://us.imdb.com/M/title-exact?Heidi%20Fleis...
1126,1126,"Truman Show, The (1998)",Drama,[0.682075 0.662934 0.529242 ... 0.680333 0.646...,"http://us.imdb.com/Title?Truman+Show,+The+(1998)"
