In [None]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from random import normalvariate
from math import sqrt
import time
import pickle

In [None]:
class Utility:
    '''
    Helpers for preparing the futsal x user rating matrix.
    '''
    @staticmethod
    def train_test_split(pivoted_ratings, count=9, test_size=3):
        '''
        Split a rating matrix into a training and a testing dataframe.

        For every user (column) that has at least ``count`` non-zero ratings,
        ``test_size`` of those ratings are picked at random, copied into the
        testing frame and set to 0 in the training frame, so that a held-out
        rating never appears in both frames.

        Parameters
        ----------
        pivoted_ratings : pandas.DataFrame
            Ratings with one row per futsal and one column per user;
            0 means "not rated". The input frame is not modified.
        count : int
            Minimum number of ratings a user needs to contribute test data.
        test_size : int
            Number of ratings held out per qualifying user.

        Returns
        -------
        (trainingData, testingData)
            ``trainingData`` is a copy of the input with held-out ratings zeroed.
            ``testingData`` has the same shape but a default integer
            index/columns, and is 0 everywhere except at held-out positions.
        '''
        ratings = pivoted_ratings.to_numpy()
        testingData = pd.DataFrame(np.zeros(pivoted_ratings.shape))
        trainingData = pivoted_ratings.copy()
        # A user can never give up more ratings than they have.
        min_ratings = max(count, test_size)
        for uid in range(ratings.shape[1]):
            rated = ratings[:, uid].nonzero()[0]
            if test_size > 0 and len(rated) >= min_ratings:
                held_out = np.random.choice(rated, size=test_size, replace=False)
                # Positional indexing: every sampled rating moves, with its own value.
                testingData.iloc[held_out, uid] = ratings[held_out, uid]
                trainingData.iloc[held_out, uid] = 0.0
        return trainingData, testingData

In [None]:
class SVD:
    '''
    Singular value decomposition computed one component at a time with the
    power method and deflation.
    '''

    def random_unit_vector(self, size):
        '''Return a list of ``size`` Gaussian samples scaled to unit Euclidean norm.'''
        normalVector = [normalvariate(0, 1) for _ in range(size)]
        vectorNorm = sqrt(sum(value * value for value in normalVector))
        return [value / vectorNorm for value in normalVector]

    def eigenvector_power_method(self, data, epsilon=1e-10):
        '''
        Return a unit eigenvector of ``data.T @ data`` found by power iteration.

        Iteration stops when two successive iterates are parallel to within
        ``epsilon`` (``|<new, old>| > 1 - epsilon``). If the current iterate
        is mapped to the zero vector (for example when ``data`` is all zeros),
        it is an eigenvector for eigenvalue 0 and is returned as is, instead
        of dividing by zero and looping on NaNs forever.

        Raises
        ------
        ValueError
            If the iteration produces non-finite values (NaN/inf in ``data``).
        '''
        data = np.asarray(data, dtype=float)
        cols = data.shape[1]
        new_eigenvector = np.asarray(self.random_unit_vector(cols), dtype=float)
        covariance_matrix = np.dot(data.T, data)

        while True:
            old_eigenvector = new_eigenvector
            product = np.dot(covariance_matrix, old_eigenvector)
            magnitude = norm(product)
            if not np.isfinite(magnitude):
                raise ValueError("power method diverged: input contains NaN or infinite values")
            if magnitude == 0:
                # old_eigenvector lies in the null space: nothing left to normalise.
                return old_eigenvector
            new_eigenvector = product / magnitude
            if abs(np.dot(new_eigenvector, old_eigenvector)) > 1 - epsilon:
                return new_eigenvector

    def svd(self, trainData, epsilon=1e-10, k=None):
        '''
        Compute singular triples of ``trainData``.

        Parameters
        ----------
        trainData : array-like of shape (rows, cols)
            Matrix to decompose (a DataFrame is accepted); it is not modified.
        epsilon : float
            Convergence tolerance forwarded to ``eigenvector_power_method``.
        k : int or None
            Number of components to compute. ``None`` computes ``cols``
            components.

        Returns
        -------
        (S, U, VT)
            ``S`` has shape (k,), ``U`` has shape (rows, k) with one left
            vector per column, and ``VT`` has shape (k, cols) with one right
            vector per row, in the order the components were extracted.
            Components beyond the numerical rank have a singular value of
            (about) 0.
        '''
        # Float copy: deflation happens in place and must not touch the caller's data.
        residual = np.array(trainData, dtype=float)
        rows, cols = residual.shape
        n_components = cols if k is None else max(0, min(int(k), cols))
        basis = []

        for _ in range(n_components):
            v = self.eigenvector_power_method(residual, epsilon=epsilon)
            # Use the deflated matrix, not the original one: past the rank the
            # residual is only round-off noise and must yield sigma ~ 0.
            combined_us = np.dot(residual, v)
            sigma = norm(combined_us)
            u = combined_us / sigma if sigma != 0 else np.zeros(rows)

            basis.append((sigma, u, v))
            # Remove the component just found so the next pass finds the following one.
            residual -= sigma * np.outer(u, v)

        if not basis:
            return np.zeros(0), np.zeros((rows, 0)), np.zeros((0, cols))

        S, U, VT = [np.array(x) for x in zip(*basis)]

        return S, U.T, VT

In [None]:
class SGD:
    '''
    Biased matrix-factorisation recommender trained with stochastic gradient descent.

    A rating of futsal ``fid`` by user ``uid`` is modelled as
    ``overall_mean + bias_user[uid] + bias_futsal[fid] + Q[fid] . P[uid]``,
    where ``P`` holds one feature row per user and ``Q`` one feature row per
    futsal. Rating matrices are laid out futsals x users and 0 means
    "not rated".
    '''

    def __init__(self, learning_rate=0.01, regularization_rate=0.01, num_features=7, max_epoch = 500):
        '''Store the hyper-parameters; no training happens here.'''
        self.learning_rate = learning_rate
        self.regularization_rate = regularization_rate
        self.num_features = num_features
        self.max_epoch = max_epoch

    def train_new_user(self, newData):
        '''
        Fit a bias and a feature vector for a user that was not in the
        training data, keeping the futsal parameters fixed, and return the
        predicted rating for every futsal.

        Parameters
        ----------
        newData : 1-D array-like
            One entry per futsal (same order as the rows used in ``fit``);
            0 means "not rated".

        Returns
        -------
        numpy.ndarray
            Predicted rating per futsal. With no ratings at all the user bias
            is 0 and the feature vector stays at its random initial value.
        '''
        newData = np.asarray(newData, dtype=float)
        futsalIds = newData.nonzero()[0]
        if len(futsalIds) == 0:
            # Nothing to average: avoid a NaN bias from the mean of an empty slice.
            new_user_bias = 0.0
        else:
            new_user_bias = np.mean(newData[futsalIds]) - self.overall_mean
        # Size the vector from the fitted futsal factors rather than a hard-coded 7.
        num_features = np.shape(self.Q)[1]
        nuser_feature_matrix = np.array([normalvariate(0, 1) for _ in range(num_features)])
        for epoch in range(self.max_epoch):
            for fid in futsalIds:
                futsal_features = self.Q[fid]
                predicted = self.overall_mean + new_user_bias + self.bias_futsal[fid] + np.dot(futsal_features, nuser_feature_matrix)
                error = newData[fid] - predicted
                new_user_bias = new_user_bias + self.learning_rate * (error - self.regularization_rate * new_user_bias)
                nuser_feature_matrix = nuser_feature_matrix + self.learning_rate * (error * futsal_features - self.regularization_rate * nuser_feature_matrix)
        futsal_bias = np.asarray(self.bias_futsal)
        return futsal_bias + new_user_bias + self.overall_mean + np.dot(self.Q, nuser_feature_matrix)

    def initialize(self, trainData):
        '''
        Reset biases, the global mean, the list of observed (futsal, user)
        positions and the error histories from ``trainData`` (a DataFrame,
        futsals x users). Biases start as N(0, 1) samples. Returns ``self``.
        '''
        values = trainData.values
        self.num_users = trainData.shape[1]
        self.num_futsals = trainData.shape[0]
        self.bias_user = [normalvariate(0, 1) for _ in range(self.num_users)]
        self.bias_futsal = [normalvariate(0, 1) for _ in range(self.num_futsals)]
        # Snapshots, not aliases: fit() updates bias_user / bias_futsal in place.
        self.initial_user_bias = list(self.bias_user)
        self.initial_futsal_bias = list(self.bias_futsal)
        self.fids, self.uids = values.nonzero()
        self.overall_mean = np.mean(values[self.fids, self.uids])
        self.errorTrain = []
        self.errorTest = []
        return self

    def get_initial_biases(self):
        '''Return the (user, futsal) biases as they were before training.'''
        return self.initial_user_bias, self.initial_futsal_bias

    def prediction(self, uid, fid):
        '''Return the predicted rating of futsal ``fid`` by user ``uid``.'''
        return self.overall_mean + self.bias_user[uid] + self.bias_futsal[fid] + np.dot(self.Q[fid].T, self.P[uid])

    def actual_prediction_matrix(self,trainData):
        '''
        Return the full futsals x users matrix of predicted ratings.

        ``trainData`` is accepted for backward compatibility; the shape is
        derived from the bias vectors, which ``initialize`` sized from the
        training data.
        '''
        futsal_bias = np.asarray(self.bias_futsal)[:, np.newaxis]
        user_bias = np.asarray(self.bias_user)[np.newaxis, :]
        return futsal_bias + user_bias + self.overall_mean + np.dot(self.Q, self.P.T)

    def mean_squared_error(self, prediction, truth):
        '''
        Return the mean of the squared differences between two equally long
        1-D sequences; NaN when they are empty.
        '''
        prediction = np.asarray(prediction, dtype=float)
        truth = np.asarray(truth, dtype=float)
        if prediction.size == 0:
            return float('nan')
        return np.mean((prediction - truth) ** 2)

    def rmse(self, prediction, actual):
        '''
        Root-mean-squared error of ``prediction`` (ndarray) against the
        non-zero entries of ``actual`` (DataFrame of the same shape).
        '''
        observed = actual.values.nonzero()
        return np.sqrt(self.mean_squared_error(prediction[observed].flatten(),
                                               actual.values[observed].flatten()))

    def fit(self, trainData, testData, user_feature_matrix, futsal_feature_matrix):
        '''
        Train biases and latent features on the non-zero entries of ``trainData``.

        Parameters
        ----------
        trainData, testData : pandas.DataFrame
            Futsals x users rating matrices of the same shape (0 = not rated).
        user_feature_matrix : array-like, shape (num_users, num_features)
            Initial user features. Copied, so the caller's array is not modified.
        futsal_feature_matrix : array-like, shape (num_futsals, num_features)
            Initial futsal features. Copied as well.

        Returns
        -------
        (P, Q, bias_user, bias_futsal, overall_mean)
            The trained parameters. Train/test RMSE per epoch is appended to
            ``errorTrain`` / ``errorTest``, and the elapsed time is printed.
        '''
        self.initialize(trainData)
        # Work on private float copies: the inputs may be views of other arrays.
        self.P = np.array(user_feature_matrix, dtype=float)
        self.Q = np.array(futsal_feature_matrix, dtype=float)
        ratings = trainData.values
        lr, reg = self.learning_rate, self.regularization_rate
        start_time = time.time()
        for epoch in range(self.max_epoch):
            for uid, fid in zip(self.uids, self.fids):
                error = ratings[fid][uid] - self.prediction(uid, fid)
                self.bias_user[uid] = self.bias_user[uid] + lr * (error - reg * self.bias_user[uid])
                self.bias_futsal[fid] = self.bias_futsal[fid] + lr * (error - reg * self.bias_futsal[fid])
                self.P[uid] = self.P[uid] + lr * (error * self.Q[fid] - reg * self.P[uid])
                # Sequential update: Q[fid] sees the P[uid] that was just updated.
                self.Q[fid] = self.Q[fid] + lr * (error * self.P[uid] - reg * self.Q[fid])
            current_prediction = self.actual_prediction_matrix(trainData)
            self.errorTrain.append(self.rmse(current_prediction, trainData))
            self.errorTest.append(self.rmse(current_prediction, testData))
        elapsed_time = time.time() - start_time
        print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        return self.P, self.Q, self.bias_user, self.bias_futsal, self.overall_mean

    def print_rmse(self):
        '''Print the per-epoch training and testing RMSE, one epoch per line.'''
        print("Training\t\tTesting")
        for train_error, test_error in zip(self.errorTrain, self.errorTest):
            print(f'{train_error}\t{test_error}')

    def get_rmse(self):
        '''Return the (training, testing) per-epoch RMSE lists.'''
        return self.errorTrain, self.errorTest

    def get_results(self):
        '''Return ``(P, Q, bias_user, bias_futsal, overall_mean)`` from the last fit.'''
        return self.P, self.Q, self.bias_user, self.bias_futsal, self.overall_mean

# Preprocessing data
+ Removing duplicate ratings
+ Removing futsals with fewer than 20 ratings
+ Keeping only users with more than 4 ratings (the code filters on `count() > 4`)

In [None]:
# Load the raw export and keep only the columns the recommender needs.
ratings = pd.read_csv('ratings.csv')
processed_ratings = (
    ratings[["User", "Rating", "Futsal"]]
    .drop_duplicates()
    .rename(columns={'Futsal': 'Futsal_Name'})
    # The first space-separated token of each "Rating" string is the integer score.
    .assign(Rating=lambda frame: frame['Rating'].str.split(' ').str.get(0).astype(int))
)
# Keep only users that have more than 4 (de-duplicated) ratings.
x = processed_ratings.groupby("User")["Rating"].count() > 4
users_who_rate = x[x].index
final_ratings = processed_ratings.loc[processed_ratings["User"].isin(users_who_rate)]
final_ratings

Unnamed: 0,User,Rating,Futsal_Name
25,Maheshwar Chaudhary,2,5 5 Soccer Center
27,Anup Khanal,5,5 5 Soccer Center
31,Ààyûsh Tîmàl ç ñá,2,5 5 Soccer Center
34,Pradeep Rai,3,5 5 Soccer Center
40,Milan Shrestha,1,5 5 Soccer Center
...,...,...,...
12186,Roshan Shrestha,5,Yala Futsal & Recreational Center
12195,Pranish Maharjan,4,Yala Futsal & Recreational Center
12208,Nikesh Maharjan,3,Yala Futsal & Recreational Center
12211,prabin shrestha,4,Yala Futsal & Recreational Center


In [None]:
# Non-null User and Rating entries per futsal, counted on the de-duplicated ratings.
processed_ratings.groupby("Futsal_Name").count()

Unnamed: 0_level_0,User,Rating
Futsal_Name,Unnamed: 1_level_1,Unnamed: 2_level_1
5 5 Soccer Center,155,155
BG Brothers Health Club Futsal,329,329
Baijanti Futsal,128,128
Balchhi Dhurbe Futsal,149,149
Buddhanagar Futsal,213,213
Chaitya Futsal,130,130
Champions Futsal,169,169
Dhanyentari Futsal,992,992
Dhuku Sports Hub,388,388
Easy Futsal,75,75


In [None]:
# Number of ratings per futsal as a two-column frame: Futsal_Name, Count.
futsals_count = (
    processed_ratings.groupby("Futsal_Name")["Rating"]
    .count()
    .reset_index()
    .rename(columns={"Rating": "Count"})
)

In [None]:
# Average rating per futsal as a two-column frame: Futsal_Name, Mean.
futsals_mean = (
    processed_ratings.groupby("Futsal_Name")["Rating"]
    .mean()
    .reset_index()
    .rename(columns={"Rating": "Mean"})
)

In [None]:
# Join count and mean per futsal and order by mean rating, best first.
# reset_index() keeps the pre-sort row number in an "index" column.
popular_futsals = (
    futsals_count.merge(futsals_mean, on="Futsal_Name")
    .sort_values(by='Mean', ascending=False)
    .reset_index()
)

In [None]:
popular_futsals

Unnamed: 0,index,Futsal_Name,Count,Mean
0,8,Dhuku Sports Hub,388,4.244845
1,44,Yala Futsal & Recreational Center,151,4.086093
2,11,Futsal Arena Boudha,283,4.084806
3,18,Kathmandu Futsal,283,4.084806
4,39,United Futsal,280,4.060714
5,7,Dhanyentari Futsal,992,4.055444
6,26,Prime Futsal Gyaneshwor,127,3.984252
7,3,Balchhi Dhurbe Futsal,149,3.979866
8,2,Baijanti Futsal,128,3.976562
9,34,Shankhamul Futsal,650,3.972308


In [None]:
# Futsals with more than 100 ratings, ordered by mean rating (highest first).
# Despite the name, the result is not capped at 20 rows.
top20futsals = (
    popular_futsals.loc[popular_futsals["Count"] > 100]
    .sort_values(by="Mean", ascending=False)
    .reset_index()
)

In [None]:
top20futsals

Unnamed: 0,index,Futsal_Name,Count,Mean
0,8,Dhuku Sports Hub,388,4.244845
1,44,Yala Futsal & Recreational Center,151,4.086093
2,11,Futsal Arena Boudha,283,4.084806
3,18,Kathmandu Futsal,283,4.084806
4,39,United Futsal,280,4.060714
5,7,Dhanyentari Futsal,992,4.055444
6,26,Prime Futsal Gyaneshwor,127,3.984252
7,3,Balchhi Dhurbe Futsal,149,3.979866
8,2,Baijanti Futsal,128,3.976562
9,34,Shankhamul Futsal,650,3.972308


In [None]:
def recommend_from_top20(n=10):
    '''
    Return up to ``n`` distinct futsal names drawn at random from the
    module-level ``top20futsals`` frame.

    The sample size is clamped to the number of available futsals, so the
    call does not raise when fewer than ``n`` futsals pass the popularity
    filter (sampling without replacement cannot exceed the population).
    '''
    names = top20futsals["Futsal_Name"].values
    sample_size = max(0, min(n, len(names)))
    return np.random.choice(names, sample_size, replace=False)

In [None]:
# Persist the popularity table; the context manager closes the file handle
# even if pickling fails.
with open('futsals.pkl', 'wb') as futsals_file:
    pickle.dump(popular_futsals, futsals_file)

In [None]:
top20futsals[top20futsals['Futsal_Name'] == "Imadol Futsal"]["Count"].values[0]

300

In [None]:
recommend_from_top20()

array(['Kathmandu Futsal', 'Premier Futsal', 'Manang Marshyangdi Futsal',
       'Prime Futsal Gyaneshwor', 'Futsal Arena Boudha',
       'Dhanyentari Futsal', 'Grande Sports Center', "Mate's Futsal",
       'Imadol Futsal', 'Dhuku Sports Hub'], dtype=object)

In [None]:
# Futsal x user rating matrix; combinations without a rating become 0.
pivoted_ratings = final_ratings.pivot_table(
    index="Futsal_Name",
    columns="User",
    values="Rating",
).fillna(0)

In [None]:
# Hold out ratings only for users with at least 7 ratings.
# NOTE(review): the split samples with np.random.choice and no seed is set in
# the visible cells, so the split differs between runs — confirm this is intended.
trainData, testData = Utility.train_test_split(pivoted_ratings, count=7)

In [None]:
testData.values[testData.values.nonzero()]

array([1., 4., 1., 4., 3., 5., 4., 5., 4., 4., 3., 3., 4., 1., 3., 3., 5.,
       3., 2., 2., 5., 5., 3., 3., 3., 5., 5., 4., 2., 5., 4., 4., 5., 4.,
       2., 4., 1., 4., 4., 3., 3., 3., 5., 5., 2., 5., 5., 1., 5., 2., 5.,
       3., 5., 3., 4., 5., 3., 1., 3., 4., 2., 5., 5., 2., 5., 4., 1., 2.,
       4., 3., 3., 3., 3., 4., 3., 4., 3., 4., 3., 5., 2., 5., 3., 5., 4.,
       5., 4., 3., 2., 3., 2., 4., 3., 4., 4., 3., 3., 1., 4., 4., 5., 1.,
       5., 2., 5., 3., 2., 4., 4., 5., 3., 5., 5., 5., 4., 5., 4., 4., 1.,
       4., 4., 3., 4., 4., 5., 2., 5., 5., 2., 1., 4., 5., 4., 4., 3., 4.,
       5., 5., 5., 4., 2., 4., 2., 3., 4., 5., 4., 1., 3., 3., 5., 3., 3.,
       5., 4., 3.])

### Calculating the SVD of the training data: its truncated factors give the two matrices with the smallest sum of squared errors on the training data

In [None]:
# %%time
# U, S, VT = svd(trainData)

In [None]:
# copy_U = U.copy()
# copy_S = S.copy()
# copy_VT = VT.copy()

In [None]:
model_svd = SVD()

In [None]:
# S: singular values; U: left vectors as columns (one row per futsal);
# VT: right vectors as rows (one column per user).
S, U, VT = model_svd.svd(trainData)

In [None]:
VT

array([[ 0.12503621,  0.0500787 ,  0.02969612, ...,  0.0692334 ,
         0.11661052,  0.09595611],
       [ 0.02278374, -0.03562302, -0.0555685 , ...,  0.10312669,
         0.0051651 ,  0.10962527],
       [-0.01671014,  0.03728805, -0.02712579, ..., -0.05294403,
         0.0175646 , -0.14069094],
       ...,
       [-0.051259  , -0.06163378, -0.08126742, ..., -0.10539902,
         0.00774786,  0.11342646],
       [-0.051259  , -0.06163378, -0.08126742, ..., -0.10539902,
         0.00774786,  0.11342646],
       [ 0.051259  ,  0.06163378,  0.08126742, ...,  0.10539902,
        -0.00774786, -0.11342646]])

In [None]:
copy_U = U.copy()
copy_S = S.copy()
copy_VT = VT.copy()

In [None]:
# Keep the first 7 components as the starting point for SGD.
# P: one row per futsal (first 7 columns of U).
# Q: singular values folded into the right vectors, shape (7, number of users).
P = copy_U[:, :7]
Q = np.diag(S[:7]) @ VT[:7, :]

In [None]:
pd.DataFrame(Q).shape

(7, 178)

### Training the model

In [None]:
# num_features=7 mirrors the 7 SVD components kept in P and Q.
model = SGD(learning_rate=5e-3,regularization_rate=1e-1,num_features=7,max_epoch=2000)

In [None]:
# fit(trainData, testData, user_feature_matrix, futsal_feature_matrix):
# Q.T (one row per user) seeds the user features and P (one row per futsal)
# seeds the futsal features, so new_P holds user factors and new_Q futsal factors.
new_P, new_Q, userB, futsalB, overall_mean = model.fit(trainData, testData, Q.T, P)

Execution time: 00:00:59


In [None]:
model.print_rmse()

Training		Testing
1.3926357561945397	1.7179351746594829
1.313768723437934	1.6667223655191918
1.2607104214452483	1.650257978882047
1.2179387926392207	1.6373729677164417
1.1808656798707289	1.6252296888297952
1.1476594255004609	1.6132646485472864
1.1173773858101406	1.601454601498701
1.0894605231843513	1.5898306779051508
1.0635404745916015	1.578428551533487
1.039355352756603	1.5672831694946479
1.0167079862601023	1.556427954025873
0.9954432511085799	1.5458932054994738
0.9754348430828007	1.5357045394828575
0.956577097745831	1.525881945996041
0.9387796858937852	1.516439535326592
0.921964034983282	1.507385802082849
0.9060608324505159	1.4987242011757895
0.8910082322644935	1.4904538679396033
0.8767505329698816	1.4825703685049172
0.8632371805010878	1.47506641266181
0.8504220001447186	1.4679324944626881
0.8382625937251519	1.4611574470742013
0.8267198582959674	1.4547289109429509
0.8157575958152138	1.4486337210648923
0.8053421920857264	1.442858222228768
0.7954423492316002	1.4373885220411589
0.786028

In [None]:
import pickle

In [None]:
# Persist the trained model; the context manager closes the file handle.
with open('recomm-model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Persist the split and the SVD factors, closing every file handle after its dump.
for pickle_path, pickle_obj in (
    ('trainData.pkl', trainData),
    ('testData.pkl', testData),
    ('svdU.pkl', U),
    ('svdVT.pkl', VT),
    ('svdSigma.pkl', S),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(pickle_obj, pickle_file)

In [None]:
# Rebuild the full futsal x user prediction matrix from the fitted parameters:
# futsal bias + user bias + global mean + futsal factors . user factors.
futsal_matrix = np.array([futsalB] * len(userB)).T
user_matrix = np.array([userB] * len(futsalB))
global_matrix = np.ones(trainData.shape) * overall_mean
d_matrix = new_Q @ new_P.T
actualpred = futsal_matrix + user_matrix + global_matrix + d_matrix

In [None]:
actualpred[testData.values.nonzero()]

array([2.90808502, 2.74311899, 1.57725087, 2.88067484, 2.25590813,
       3.99125829, 3.16439923, 4.83290724, 2.38380424, 4.09558404,
       4.71293938, 4.85156999, 3.2451123 , 3.30938114, 3.01192969,
       3.69396222, 3.10997971, 3.9472708 , 2.30224536, 1.41762305,
       4.8063077 , 4.091784  , 4.01093457, 4.09883011, 3.79941617,
       4.94688292, 3.21588967, 4.22070508, 2.92896304, 4.87855953,
       3.65187535, 3.70175544, 3.10637586, 4.89847981, 4.79571412,
       4.80651876, 4.20747848, 3.22301482, 3.90518176, 2.9952055 ,
       3.04700549, 3.94212152, 3.87986887, 3.83522507, 4.15448227,
       3.07669172, 4.40891716, 4.70801055, 4.76016927, 3.02602325,
       4.21200367, 3.13391679, 4.85494428, 3.12444742, 2.08479059,
       3.64609055, 4.50912351, 1.40354876, 3.37100421, 3.87706077,
       2.53301403, 3.72928204, 3.22376834, 3.96304982, 4.8006529 ,
       4.04086114, 4.89095798, 3.07297493, 3.96838683, 2.88599065,
       2.44868872, 3.1114435 , 3.95943944, 4.026484  , 3.96241

In [None]:
testData.values[testData.values.nonzero()]

array([1., 4., 1., 4., 3., 5., 4., 5., 4., 4., 3., 3., 4., 1., 3., 3., 5.,
       3., 2., 2., 5., 5., 3., 3., 3., 5., 5., 4., 2., 5., 4., 4., 5., 4.,
       2., 4., 1., 4., 4., 3., 3., 3., 5., 5., 2., 5., 5., 1., 5., 2., 5.,
       3., 5., 3., 4., 5., 3., 1., 3., 4., 2., 5., 5., 2., 5., 4., 1., 2.,
       4., 3., 3., 3., 3., 4., 3., 4., 3., 4., 3., 5., 2., 5., 3., 5., 4.,
       5., 4., 3., 2., 3., 2., 4., 3., 4., 4., 3., 3., 1., 4., 4., 5., 1.,
       5., 2., 5., 3., 2., 4., 4., 5., 3., 5., 5., 5., 4., 5., 4., 4., 1.,
       4., 4., 3., 4., 4., 5., 2., 5., 5., 2., 1., 4., 5., 4., 4., 3., 4.,
       5., 5., 5., 4., 2., 4., 2., 3., 4., 5., 4., 1., 3., 3., 5., 3., 3.,
       5., 4., 3.])

In [None]:
actualpred[trainData.values.nonzero()]

array([1.35569396, 2.74766034, 4.81668828, ..., 3.76234487, 4.10738676,
       3.77782628])

In [None]:
trainData.values[trainData.values.nonzero()]

array([1., 3., 5., ..., 4., 4., 4.])

In [None]:
np.mean(trainData.values[trainData.values.nonzero()])

3.675550405561993

In [None]:
# Rating vector for a hypothetical new user: one slot per futsal, 0 = not rated.
# NOTE(review): 45 is hard-coded; presumably it equals trainData.shape[0] — confirm.
data = np.zeros(45)

In [None]:
data[1] = 3
data[44] = 4
data[12] = 3

In [None]:
data.nonzero()

(array([ 1, 12, 44]),)

In [None]:
model.train_new_user(data)

array([3.12983559, 3.05003984, 3.50795183, 3.96358351, 3.50004416,
       3.75274175, 3.26284906, 3.65928252, 4.31554998, 3.66331262,
       3.59230844, 3.95516064, 3.03250234, 3.99823236, 3.70225671,
       3.59517457, 3.6330353 , 3.63806902, 4.0000687 , 3.63803392,
       3.77722867, 3.53022468, 3.36496626, 3.83582313, 3.36516424,
       3.78943125, 3.97955714, 3.27110986, 3.62225155, 3.54740399,
       3.71447342, 3.25353683, 3.55765232, 3.5862459 , 3.73777328,
       3.69491867, 3.53971333, 3.26752816, 4.01722465, 4.0672197 ,
       4.01118667, 3.52350137, 3.5115619 , 3.7279346 , 3.91489065])

In [None]:
# Reload the persisted model; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code — only load files created by this notebook.
with open('recomm-model.pkl', 'rb') as trained_model_file:
    trainedModel = pickle.load(trained_model_file)

In [None]:
trainedModel.get_rmse()

In [None]:
f = model.train_new_user(data)

In [None]:
# Predicted ratings for the new user, highest first; the "index" column is the futsal's row position.
pd.DataFrame(f).sort_values(by=0, ascending=False).reset_index()

Unnamed: 0,index,0
0,8,4.255342
1,39,4.110408
2,13,4.095818
3,38,4.041388
4,3,4.037776
5,26,3.991969
6,40,3.952291
7,18,3.922079
8,44,3.914865
9,43,3.901804


In [None]:
trainData.index.values[1]

'BG Brothers Health Club Futsal'

In [None]:
# Reload the persisted model; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code — only load files created by this notebook.
with open('recomm-model.pkl', 'rb') as fm_file:
    fm = pickle.load(fm_file)

In [None]:
gm = fm.train_new_user(data)

In [None]:
# Predicted ratings from the reloaded model, lowest first (sort_values defaults to ascending).
pd.DataFrame(gm).rename(columns={0:"r"}).sort_values(by="r")

Unnamed: 0,r
12,3.0325
1,3.050036
0,3.183074
27,3.282889
31,3.301831
37,3.322287
6,3.35954
22,3.425274
24,3.447467
36,3.496955
