# Imports

In [None]:
from sklearn.metrics import mean_squared_error
import gc
from scipy import sparse

In [None]:
import math

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
from numpy import save

# Read Data

In [None]:
# Load the raw interaction tables used to build the interest scores.
# NOTE(review): these paths are rooted at "../recommendations/", whereas the
# train/test index files further down are read from "../input/recommendations/".
# Confirm which root is the right one for the target environment.
bookmarks = pd.read_csv("../recommendations/bookmarks.csv")
# The asset / genre tables are not needed by this notebook and stay disabled.
#assets = pd.read_csv("../recommendations/assets.csv", index_col=0)
#assets_genres = pd.read_csv("../recommendations/asset_genres.csv")
favorites = pd.read_csv("../recommendations/favorites.csv")
#genres = pd.read_csv("../recommendations/genres.csv")
ratings = pd.read_csv("../recommendations/ratings.csv")

In [None]:
# Preview the first rows of the raw bookmarks table.
bookmarks.head()

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
# Preview the first rows of the favorites table.
favorites.head()

# Compute Interests

In [None]:
def interest_generator(bookmarks, ratings_df=None, favorites_df=None):
    """Compute an interest score for every bookmark row.

    The bookmarks are left-joined with the ratings (without their ``time``
    column) and with the favorites on ``(id_profile, id_asset)``; values left
    missing by the joins are replaced by 0. The interest of a row is then::

        1 (the bookmark itself) + score + (5 if added_date > 0 else 0)

    Args:
        bookmarks: DataFrame with at least ``id_profile`` and ``id_asset``.
        ratings_df: ratings table holding ``id_profile``, ``id_asset``,
            ``time`` and the ``score`` column read below. Defaults to the
            module-level ``ratings`` frame.
        favorites_df: favorites table holding ``id_profile``, ``id_asset``
            and the ``added_date`` column read below. Defaults to the
            module-level ``favorites`` frame.

    Returns:
        A DataFrame with the columns ``id_profile``, ``id_asset`` and
        ``interest``, one row per row of the merged table.
    """
    # Fall back to the notebook-level tables so existing one-argument calls
    # keep working.
    if ratings_df is None:
        ratings_df = ratings
    if favorites_df is None:
        favorites_df = favorites

    keys = ["id_profile", "id_asset"]
    merged = (
        bookmarks
        .merge(ratings_df.drop("time", axis=1), on=keys, how="left")
        .merge(favorites_df, on=keys, how="left")
        .fillna(0)
    )

    # A row only counts as a favorite when the join found an added_date
    # (unmatched rows were filled with 0 above).
    favorite_bonus = np.where(merged["added_date"] > 0, 5, 0)
    merged["interest"] = 1 + merged["score"] + favorite_bonus
    return merged[keys + ["interest"]]

In [None]:
# Replace the raw bookmarks with their (id_profile, id_asset, interest) view,
# then drop the helper tables, which are not used again below.
bookmarks = interest_generator(bookmarks)
del ratings, favorites

In [None]:
# Preview the first rows of the computed interest table.
bookmarks.head()

In [None]:
# Load the precomputed index arrays of the train / test split; they are used
# below as positional (iloc) row selectors on `bookmarks`.
# NOTE(review): these files live under '../input/recommendations/' while the
# CSV tables above are read from '../recommendations/' - confirm the roots.
train = np.load('../input/recommendations/bookmarks_idx_train.npy')
test = np.load('../input/recommendations/bookmarks_idx_test.npy')

In [None]:
# Select the train / test rows by position and keep a single row per
# (id_profile, id_asset) pair (drop_duplicates keeps the first occurrence).
# Each helper object is deleted as soon as it is no longer needed.
train_set = bookmarks.iloc[train].drop_duplicates(subset=['id_profile', 'id_asset'])
del train
test_set = bookmarks.iloc[test].drop_duplicates(subset=['id_profile', 'id_asset'])
del test
del bookmarks

# SVD++

In [None]:
# Run a full garbage-collection pass after the `del` statements above.
gc.collect()

In [None]:
# Number of latent factors: first dimension of qi, pu and yi created below.
f = 18

In [None]:
# Sort the training interactions by profile id, in place.
train_set.sort_values(by=["id_profile"], inplace=True)

In [None]:
# Preview the first rows of the sorted training set.
train_set.head()

In [None]:
# Sparse COO matrix of interests, with raw profile ids as row coordinates and
# raw asset ids as column coordinates.
# NOTE(review): apart from being displayed in the next cell, this matrix is
# not referenced by the visible training or prediction code.
train_set_sparse = sparse.coo_matrix((train_set.interest.values, (train_set.id_profile.values, train_set.id_asset.values)))

In [None]:
# Display the summary of the sparse interest matrix.
train_set_sparse

In [None]:
# Global mean interest over the training set; used as the baseline term `mu`
# of the model below.
mu = np.mean(train_set.interest.values)

In [None]:
# Distinct asset / profile ids of the training set, in order of first
# appearance (pd.unique does not sort). Their lengths size the parameter
# arrays created below.
unique_assets_train = pd.unique(train_set.id_asset)
unique_profiles_train = pd.unique(train_set.id_profile)

# NOTE(review): these two counters are not used by the visible code, which
# calls len(unique_*_train) directly.
number_of_films = len(unique_assets_train)
number_of_users = len(unique_profiles_train)

In [None]:
def dict_values_to_array(d):
    """Return the contents of ``d`` as a NumPy array.

    Args:
        d: either a mapping, whose values are collected in iteration order,
            or any array-like (e.g. an ``ndarray``), which is converted with
            ``np.asarray``.

    Returns:
        ``np.ndarray`` holding the mapping's values, or the array-like itself
        viewed as an array.
    """
    # Mappings expose a callable ``values``; arrays do not, and pandas objects
    # expose ``values`` as a plain attribute, so both take the asarray path.
    values = getattr(d, "values", None)
    if callable(values):
        return np.array(list(values()))
    return np.asarray(d)

def calculer_sum_yj(Nu, yi):
    """Return the implicit-feedback factor term used by the model.

    ``Nu`` is accepted for interface compatibility but is not read: the
    function hands back ``yi`` unchanged (same object, no copy).
    """
    implicit_factors = yi
    return implicit_factors

def calculer_rui_chapeau(mu, bu, bi, qi, pu, Nu, Nu_count, yi, verbose=False):
    """Compute the predicted interest matrix for a batch of users and items.

    The prediction is ``mu + bu + bi.T + (qi.T @ (pu + (Nu_count**-0.5 @ yj.T).T)).T``
    where ``yj`` is whatever ``calculer_sum_yj(Nu, yi)`` returns.

    Shapes as used by the training cell of this notebook (n_u users, n_i
    items, f factors): ``bu`` (n_u, 1), ``bi`` (n_i, 1), ``qi`` (f, n_i),
    ``pu`` (f, n_u), ``Nu_count`` (n_u, 1), ``yi`` (f, 1).

    Args:
        mu: global mean (NumPy or plain Python scalar).
        bu, bi: user / item bias columns.
        qi, pu: item / user factor matrices.
        Nu: forwarded to ``calculer_sum_yj``.
        Nu_count: per-user counts; must be strictly positive because they are
            used as ``1 / sqrt(Nu_count)``.
        yi: implicit-feedback factors.
        verbose: when True, print the shape of every operand and of the
            result (debugging aid, off by default because this function is
            called for every batch).

    Returns:
        Array of predictions that broadcasts to (n_u, n_i).
    """
    yj = calculer_sum_yj(Nu, yi)
    if verbose:
        # np.shape also works when mu is a plain Python float.
        print("mu", np.shape(mu))
        print("bu", np.shape(bu))
        print("bi.T", bi.T.shape)
        print("qi.T", qi.T.shape)
        print("pu", np.shape(pu))
        print("yj", np.shape(yj))
        print("Nu_count", np.shape(Nu_count))

    # (n_u, 1) @ (1, f) -> (n_u, f); transposed so it lines up with pu (f, n_u).
    implicit_term = np.dot(1 / np.sqrt(Nu_count), yj.T).T
    # (n_i, f) @ (f, n_u) -> (n_i, n_u); transposed to users x items.
    interaction = np.dot(qi.T, pu + implicit_term).T
    rui_chapeau = mu + bu + bi.T + interaction

    if verbose:
        print("rui_chapeau", rui_chapeau.shape)
    return rui_chapeau

def calculer_difference(rui, mu, bu, bi, qi, pu, Nu, Nu_count, yi):
    """Return the prediction error ``rui - rui_chapeau`` on observed entries.

    ``rui`` is built with ``DataFrame.pivot`` by the training cell, so every
    (user, item) pair without an interaction is NaN. Those entries are set to
    0 in the result: they carry no error, and leaving them as NaN would turn
    every downstream ``np.sum`` (cost and gradients) into NaN.

    Args:
        rui: observed interest matrix, NaN where unobserved.
        mu, bu, bi, qi, pu, Nu, Nu_count, yi: model terms, forwarded as-is to
            ``calculer_rui_chapeau``.

    Returns:
        Array shaped like ``rui`` holding the error for observed entries and
        0.0 for unobserved ones.
    """
    diff = rui - calculer_rui_chapeau(mu, bu, bi, qi, pu, Nu, Nu_count, yi)
    # Mask on rui (not on diff) so that a NaN produced by the model itself is
    # still visible instead of being silently zeroed.
    return np.where(np.isnan(rui), 0.0, diff)

def calculer_cost(list_u, list_i, rui, mu, bu, bi, qi, pu, Nu, Nu_count, yi, gamma=0.007, lambda6=0.005, lambda7=0.015):
    """Return the regularized squared-error cost of the current parameters.

    cost = sum(diff**2)
           + lambda6 * (sum(bi**2) + sum(bu**2))
           + lambda7 * (sum(qi**2) + sum(pu**2) + sum(yi**2))

    where ``diff`` comes from ``calculer_difference``. ``list_u``, ``list_i``
    and ``gamma`` are part of the signature but are not read here.
    """
    residuals = calculer_difference(rui, mu, bu, bi, qi, pu, Nu, Nu_count, yi)
    squared_error = np.sum(np.power(residuals, 2))

    bias_penalty = np.sum(np.power(bi, 2)) + np.sum(np.power(bu, 2))
    factor_penalty = (
        np.sum(np.power(qi, 2))
        + np.sum(np.power(pu, 2))
        + np.sum(np.power(yi, 2))
    )

    return 0 + squared_error + lambda6 * bias_penalty + lambda7 * factor_penalty

def estimer_parametres(list_u, list_i, rui, mu, bu, bi, qi, pu, Nu, Nu_count, yi, gamma=0.007, lambda6=0.005, lambda7=0.015, verbose=False):
    """Perform one gradient step on the batch parameters.

    The error matrix is computed once with ``calculer_difference`` and the
    parameters are then updated in this order: ``bu``, ``pu``, ``bi``, ``qi``,
    ``yi``. The order matters: the ``qi`` step reads the freshly updated
    ``pu`` and the ``yi`` step reads the freshly updated ``qi``.

    Args:
        list_u, list_i: kept for interface compatibility; not read here.
        rui: observed interest matrix of the batch (users x items).
        mu: global mean.
        bu, bi: user / item bias columns of the batch.
        qi, pu: item / user factor matrices of the batch.
        Nu: forwarded to ``calculer_difference`` / ``calculer_sum_yj``.
        Nu_count: per-user counts, used as ``1 / sqrt(Nu_count)``.
        yi: implicit-feedback factors.
        gamma: learning rate.
        lambda6: regularization weight of the biases.
        lambda7: regularization weight of the factors.
        verbose: when True, print the shapes of the error matrix and of its
            row / column sums (debugging aid, off by default).

    Returns:
        Tuple ``(bu, bi, qi, pu, yi)`` with the updated parameters; the
        inputs are not modified in place.
    """
    diff = calculer_difference(rui, mu, bu, bi, qi, pu, Nu, Nu_count, yi)
    # Per-user and per-item error totals, kept as column vectors so they line
    # up with bu and bi.
    diff_sum_1 = np.sum(diff, axis=1).reshape(-1, 1)
    diff_sum_0 = np.sum(diff, axis=0).reshape(-1, 1)

    if verbose:
        print("diff", diff.shape)
        print("diff sum axis1", diff_sum_1.shape)
        print("diff sum axis0", diff_sum_0.shape)

    # Shared by the qi and yi steps.
    inv_sqrt_nu = 1 / np.sqrt(Nu_count)

    bu = bu + gamma * (diff_sum_1 - lambda6 * bu)
    pu = pu + gamma * (np.dot(diff, qi.T).T - lambda7 * pu)

    bi = bi + gamma * (diff_sum_0 - lambda6 * bi)
    # Uses the pu updated just above.
    user_repr = pu.T + np.dot(inv_sqrt_nu, calculer_sum_yj(Nu, yi).T)
    qi = qi + gamma * (np.dot(diff.T, user_repr).T - lambda7 * qi)
    # Uses the qi updated just above.
    yi = yi + gamma * (np.dot(np.dot(diff.T, inv_sqrt_nu).T, qi.T).T - lambda7 * yi)
    return bu, bi, qi, pu, yi

In [None]:
# Random (standard normal, unseeded) initialisation of the model parameters.
# Biases: one row per training profile / asset.
bu = np.random.randn(len(unique_profiles_train),1)
bi = np.random.randn(len(unique_assets_train),1)

# Latent factors: f rows, one column per asset (qi) / profile (pu).
# yi is a single (f, 1) column, i.e. one implicit-feedback vector for the
# whole model rather than one per asset.
qi = np.random.randn(f,len(unique_assets_train))
pu = np.random.randn(f,len(unique_profiles_train))
yi = np.random.randn(f,1)

In [None]:
# Mini-batch training of the SVD++-style model: every epoch walks over the
# training profiles in batches of `user_batch_size`, evaluates the cost of the
# batch and applies one gradient step to the parameters of the profiles and
# assets that appear in it.
epochs = 250
gamma = 0.007
lambda6 = 0.005
lambda7 = 0.015

# Average batch cost of every epoch.
costs = list()

user_batch_size = 2
iters = math.ceil(len(unique_profiles_train)/user_batch_size)

# Row k of bu / column k of pu belongs to unique_profiles_train[k]; row k of
# bi / column k of qi belongs to unique_assets_train[k]. These indexes map the
# ids of a batch back to those positions.
profile_positions = pd.Index(unique_profiles_train)
asset_positions = pd.Index(unique_assets_train)

for epoch in range(1, epochs+1):

    average_cost = 0
    for i in range(iters):
        print(i)
        # Profiles of batch i only (the last batch may be shorter).
        batch_profiles = unique_profiles_train[i*user_batch_size:(i+1)*user_batch_size]
        train_set_subset = train_set.loc[train_set.id_profile.isin(batch_profiles)]

        group = train_set_subset[train_set_subset.interest > 0].groupby("id_profile")
        Nu = group['id_asset'].apply(list)
        Nu_count = group['id_asset'].count().to_numpy()
        Nu_count = Nu_count.reshape(Nu_count.shape[0], 1)
        del group

        # Users x items matrix of the batch; pairs without an interaction
        # are NaN.
        rui_pivot = train_set_subset.pivot(index='id_profile', columns='id_asset', values='interest')
        rui_subset = rui_pivot.values

        # Positions, inside the global parameter arrays, of the profiles
        # (pivot rows) and assets (pivot columns) of this batch.
        user_idx = profile_positions.get_indexer(rui_pivot.index)
        item_idx = asset_positions.get_indexer(rui_pivot.columns)
        del rui_pivot

        print("Calculer Cost")

        cost = calculer_cost(train_set_subset.id_profile.values,
                             train_set_subset.id_asset.values,
                             rui_subset,
                             mu,
                             bu[user_idx],
                             bi[item_idx],
                             qi[:, item_idx],
                             pu[:, user_idx],
                             Nu,
                             Nu_count,
                             yi)
        gc.collect()

        average_cost += cost
        print(cost)

        print("Estimer Parametres")

        new_bu, new_bi, new_qi, new_pu, yi = estimer_parametres(
            train_set_subset.id_profile.values,
            train_set_subset.id_asset.values,
            rui_subset,
            mu,
            bu[user_idx],
            bi[item_idx],
            qi[:, item_idx],
            pu[:, user_idx],
            Nu,
            Nu_count,
            yi,
            gamma=gamma,
            lambda6=lambda6,
            lambda7=lambda7)

        # Write the updated batch parameters back to their global positions.
        bu[user_idx] = new_bu
        bi[item_idx] = new_bi
        qi[:, item_idx] = new_qi
        pu[:, user_idx] = new_pu

        gc.collect()
    costs.append(average_cost/iters)
    # Geometric learning-rate decay: 10% smaller after every epoch.
    gamma *= 0.9

In [None]:
# Plot the average batch cost of every epoch. The cost is the regularized sum
# of squared errors returned by calculer_cost (no mean, no square root).
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(costs)
ax.set_title("Cost Function Evolution (SVD++)")
ax.set_xlabel('Epoch')
ax.set_ylabel('Regularized squared error')

In [None]:
# Persist the learned parameters. They are NumPy arrays (not dicts), so they
# are saved directly.
np.save('bu.npy', bu)
np.save('bi.npy', bi)
np.save('pu.npy', pu)
np.save('qi.npy', qi)
np.save('yi.npy', yi)

In [None]:
# Lookup tables shared by all predict_interest calls; filled on first use and
# reset whenever this cell is executed again.
_PREDICTION_LOOKUPS = {}


def _prediction_lookups():
    """Build (once) the id -> position maps and per-profile item counts."""
    if not _PREDICTION_LOOKUPS:
        profiles = unique_profiles_train.tolist()
        assets = unique_assets_train.tolist()
        # Row k of bu / column k of pu belongs to unique_profiles_train[k];
        # likewise bi / qi and unique_assets_train.
        _PREDICTION_LOOKUPS["user_pos"] = dict(zip(profiles, range(len(profiles))))
        _PREDICTION_LOOKUPS["item_pos"] = dict(zip(assets, range(len(assets))))
        # Same definition as Nu_count in the training cell: number of
        # training assets with a positive interest, per profile.
        positive = train_set[train_set.interest > 0]
        _PREDICTION_LOOKUPS["n_items"] = (
            positive.groupby("id_profile")["id_asset"].count().to_dict()
        )
    return _PREDICTION_LOOKUPS


def predict_interest(u, i):
    """Predict the interest of profile ``u`` for asset ``i``.

    The prediction is ``mu + bu[u] + bi[i] + qi[:, i] . (pu[:, u] + yi / sqrt(n_u))``
    where ``n_u`` is the number of training assets of the profile. Terms that
    need an id unseen during training are left out, so an unknown profile
    and/or asset falls back to the remaining baseline terms.

    Args:
        u: profile id (raw id, not a position).
        i: asset id (raw id, not a position).

    Returns:
        The predicted interest as a Python float.
    """
    lookups = _prediction_lookups()
    user_pos = lookups["user_pos"].get(u)
    item_pos = lookups["item_pos"].get(i)

    pred = float(mu)
    if user_pos is not None:
        pred += float(bu[user_pos, 0])
    if item_pos is not None:
        pred += float(bi[item_pos, 0])

    if user_pos is not None and item_pos is not None:
        user_vector = pu[:, user_pos]
        n_items = lookups["n_items"].get(u, 0)
        if n_items > 0:
            user_vector = user_vector + yi[:, 0] / np.sqrt(n_items)
        pred += float(np.dot(qi[:, item_pos], user_vector))

    return pred

In [None]:
# Predict an interest for every (profile, asset) pair of the test set, one
# predict_interest call per row.
test_set["predicted_interest"] = [predict_interest(u, i) for u,i in zip(test_set.id_profile.values, test_set.id_asset.values)]

In [None]:
# Show the first 50 test rows, including the new predicted_interest column.
test_set.head(50)

In [None]:
# Mean squared error between the true and the predicted test interests
# (called with its defaults, so no square root is applied: MSE, not RMSE).
mean_squared_error(test_set.interest.values, test_set.predicted_interest.values)