In [0]:
#installation:
#!pip install googledrivedownloader

In [29]:
!  free -h

              total        used        free      shared  buff/cache   available
Mem:            25G        619M         22G        848K        2.0G         24G
Swap:            0B          0B          0B


In [0]:
#imports:
import pandas as pd
import numpy as np
import math
import random

from scipy.sparse import vstack
from scipy.sparse import csr_matrix

from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder

from matplotlib import pyplot as plt

# Data gen


In [0]:

df = pd.DataFrame(3 * np.random.random_sample((int(2*1e4),4)) -1) #random data columns user-item


for i in range((int)(df.shape[0]*df.shape[1]*0.9)): # let 50% of data are zeros
    col=random.randrange(0, df.shape[1]) 
    row=random.randrange(0, df.shape[0]) 
    df.iloc[row,col]=0
df['target']=0.1+df.iloc[:,0]+0.4*df.iloc[:,1]*df.iloc[:,2]+0.5*df.iloc[:,0]*df.iloc[:,2]

print(df.shape) 


(20000, 5)


In [0]:

cols=df.columns.to_list()
cols.remove('target')
X=df.loc[:,cols]
y=df.loc[:,'target']

X = csr_matrix(X) #for csr


# Functions

## metrics

In [0]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def custom_R2(target, pred):
    return 1-(np.sum((target-pred)**2))/(np.sum((target-np.mean(target))**2))

def custom_RMSE(target, pred, w):
    return np.sqrt(
        np.sum((target-pred)**2)/len(pred)#+gamma*np.linalg.norm(w)
    )

## learning funcs

In [0]:
def get_para_w(old_w, X, eps, lam): 
    return old_w+lam*(X.T.dot(eps))

def get_V(old_V, X, eps, lam, sum_in): 
    right=np.multiply(old_V, X.T.dot(eps.values)) 
    left=(X.T.dot(np.multiply(eps.values,sum_in).T)).T
    old_V += lam*(left-right)
    return old_V


def get_para_w0(old_w0, eps, lam):
    return old_w0+lam*(np.sum(eps))

def get_prediction(w0, w, V, X, sum_in):
    return w0 + X.dot(w) + (0.5 * np.sum((sum_in**2) - (X.power(2)).dot((V.T)**2), axis=1))
    
def svd_plus_plus(X, y, k, lam, terms_num, max_epoch, fig_flag):
    if fig_flag:
        RMSE_arr=np.array([],dtype=float)

    w=np.array([0.5]*(X.shape[1]))
    V=(np.array([0.5]*(X.shape[1]*k)).reshape(k,X.shape[1]))
    w0=0.0
    sum_in=X.dot(V.T)
    prediction=get_prediction(w0,w,V,X, sum_in)
    #minRMSE=custom_RMSE(y, prediction, w)

    #best_params=np.append(w,w0)
    #best_V=V
    #best_pred=prediction

    for iter_num in range(max_epoch):    
        X, y = shuffle(X, y)

        for batch_counter in range(math.ceil(X.shape[0]/terms_num)):
            X_batch=X[batch_counter*terms_num:(batch_counter+1)*terms_num]

            sum_in=X_batch.dot(V.T)
            prediction=get_prediction(w0,w,V, X_batch, sum_in)
            eps=(y[batch_counter*terms_num:(batch_counter+1)*terms_num]-prediction)

            w=get_para_w(w, X_batch, eps, lam)
            w0=get_para_w0(w0,  eps, lam)    
            V=get_V(V, X_batch, eps, lam, sum_in.T) 

           

            #if fig_flag:
            #    RMSE_arr=np.append(RMSE_arr,custom_RMSE(df['target'], get_prediction(w0,w,V, X_batch, sum_in), gamma, w))

            #if custom_RMSE(y, prediction, w)<minRMSE:
                #minRMSE=custom_RMSE(y, prediction, w)
                #best_params=np.append(w,w0)
                #best_V=V
                #best_pred=prediction
            
    #print('RMSE is:'+str(minRMSE))
    #print('RMSE is:'+str(minRMSE))
    #print('R2 is:'+str(custom_R2(y, best_pred)))
    #print('R2 is:'+str(r2_score(y, best_pred)))
    """
    if fig_flag:
        plt.figure(figsize=(15,7))
        plt.plot(RMSE_arr)
        plt.xlabel('iteration')
        plt.ylabel('RMSE')
        plt.title('RMSE changing')
        plt.show()
    """
    
    #return best_params, best_V
    return np.append(w,w0), V

In [0]:
cols=df.columns.to_list()
cols.remove('target')

cols.append('target')
df=df.reindex(columns=cols)

# learning

In [0]:
#set algo params:
k=3 #factors num
terms_num=100
lam=(2e-2*2)/terms_num
max_epoch=30

# cross-val

In [0]:
import time

In [0]:
res=pd.DataFrame()

idxs=[round(X.shape[0]/5)*(i-1) for i in range(1,6)]
idxs.append(X.shape[0])
fig_flag=False
start= time.time()
for part in range(1,6):
    #separate dataset
    test_part_X=X[idxs[part-1]:idxs[part]]
    test_part_y=y[idxs[part-1]:idxs[part]]

    train_part_X=vstack([X[idxs[0]:idxs[part-1]], X[idxs[part]:]], 'csr') #for csr
    #train_part_X=X.drop(X.index[idxs[part-1]:idxs[part]]) #for df
    train_part_y=y.drop(y.index[idxs[part-1]:idxs[part]])


    params, V=svd_plus_plus(train_part_X, train_part_y, k, lam, terms_num, max_epoch, fig_flag) #model    
    test_pred=get_prediction(params[-1],params[:-1], V, test_part_X, test_part_X.dot(V.T))

    #RMSE=custom_RMSE(test_part_y, test_pred, params[:-1])
    #r2=custom_R2(test_part_y, test_pred)

    #res[part]=[RMSE,r2]
    res[part]=[custom_RMSE(test_part_y, test_pred, params[:-1]),custom_R2(test_part_y, test_pred)]
    if fig_flag:
        fig_flag=False
print(time.time()-start)

29.653566360473633


In [0]:
res.index=["RMSE", "R2"]

res['mean']=np.mean(res, axis=1)
res['std']=np.std(res.iloc[:,:-1], axis=1)

In [0]:
res

Unnamed: 0,1,2,3,4,5,mean,std
RMSE,0.035819,0.03674,0.038051,0.037028,0.039007,0.037329,0.001101
R2,0.997398,0.997476,0.997361,0.997311,0.996957,0.997301,0.00018
