# Imports

In [12]:
from sklearn.metrics import mean_squared_error
import gc

In [13]:
import math

In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
from numpy import save

# Read Data

In [15]:
# Load the three interaction tables used by this notebook (bookmarks, then
# favorites, then ratings). The assets / asset_genres / genres CSVs of the same
# dataset are left unloaded; their read_csv calls were commented out.
bookmarks, favorites, ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./recommendation/bookmarks.csv",
        "./recommendation/favorites.csv",
        "./recommendation/ratings.csv",
    )
)

In [16]:
# Sanity check: show the first rows of the raw bookmarks table.
bookmarks.head()

Unnamed: 0,id_profile,id_asset,time
0,10,24016,0
1,10,24042,0
2,10,24130,0
3,10,24250,0
4,10,24359,19000


In [17]:
# Sanity check: show the first rows of the raw ratings table.
ratings.head()

Unnamed: 0,id_profile,score,time,id_asset
0,581352,5.0,1555296701000,23568
1,1043474,5.0,1539800341000,385428
2,420488,3.5,1518482338000,26464
3,1277812,5.0,1565056946000,443500
4,327953,5.0,1508631887000,34243


In [18]:
# Sanity check: show the first rows of the raw favorites table.
favorites.head()

Unnamed: 0,id_profile,id_asset,added_date
0,144,1486,1567660088461
1,144,374333,1567660088461
2,144,443500,1567660088461
3,2367,24056,1567660088461
4,2472,24152,1567660088461


# Compute Interests

In [19]:
def interest_generator(bookmarks, ratings=None, favorites=None):
    """Compute an implicit-feedback "interest" score for every bookmark row.

    Each bookmark is left-joined with the rating and the favourite entry that
    share its ``(id_profile, id_asset)`` pair. The score is the sum of:

    * ``1`` for the bookmark itself,
    * the rating ``score`` (``0`` when the pair has no rating),
    * ``5`` when the pair has a favourite with ``added_date > 0``, else ``0``.

    Args:
        bookmarks: DataFrame with at least ``id_profile`` and ``id_asset``.
        ratings: DataFrame with ``id_profile``, ``id_asset``, ``score`` and
            ``time`` columns. Defaults to the module-level ``ratings`` frame.
        favorites: DataFrame with ``id_profile``, ``id_asset`` and
            ``added_date`` columns. Defaults to the module-level ``favorites``
            frame.

    Returns:
        A new DataFrame with the columns ``id_profile``, ``id_asset`` and
        ``interest``; the input frames are not modified.
    """
    # Fall back to the notebook-level tables so one-argument calls keep working.
    if ratings is None:
        ratings = globals()["ratings"]
    if favorites is None:
        favorites = globals()["favorites"]

    keys = ["id_profile", "id_asset"]

    # NOTE(review): a left merge repeats a bookmark row once per matching row on
    # the right, so duplicate (id_profile, id_asset) pairs in ratings/favorites
    # would change the row count and shift positional indices — confirm the
    # pairs are unique in both tables.
    merged = (
        bookmarks
        # The rating timestamp is dropped before the join; only `score` is used.
        .merge(ratings.drop(columns="time"), on=keys, how="left")
        .merge(favorites, on=keys, how="left")
        # Pairs without a rating / favourite get score 0 / added_date 0.
        .fillna(0)
    )

    is_favorite = merged["added_date"] > 0
    merged["interest"] = 1 + merged["score"] + 5 * is_favorite
    return merged[keys + ["interest"]]

In [20]:
# Replace the raw bookmarks with the (id_profile, id_asset, interest) frame,
# then release the source tables, which are not used again in the cells below.
bookmarks = interest_generator(bookmarks)
del ratings, favorites

In [21]:
# Show the first rows of bookmarks after the interest score has been computed.
bookmarks.head()

Unnamed: 0,id_profile,id_asset,interest
0,10,24016,1.0
1,10,24042,1.0
2,10,24130,1.0
3,10,24250,1.0
4,10,24359,1.0


In [22]:
# Load the saved train / test index arrays; they are applied to bookmarks
# with .iloc (i.e. as row positions) in the next cell.
train, test = (
    np.load(idx_path)
    for idx_path in (
        './recommendation/bookmarks_idx_train.npy',
        './recommendation/bookmarks_idx_test.npy',
    )
)

In [23]:
# Select each split by row position and keep a single row (the first one)
# per (id_profile, id_asset) pair.
pair_cols = ['id_profile', 'id_asset']

train_set = bookmarks.iloc[train].drop_duplicates(subset=pair_cols)
del train  # release the index array as soon as it has been used

test_set = bookmarks.iloc[test].drop_duplicates(subset=pair_cols)
del test, bookmarks, pair_cols

# Baseline Estimates

In [24]:
from scipy import sparse

In [25]:
# Show the first rows of the training split.
train_set.head()

Unnamed: 0,id_profile,id_asset,interest
39963458,326534,202094,1.0
41808284,133318,5343689,1.0
37973227,1101174,86739,1.0
43528603,1264136,5349884,1.0
9434040,1093487,1547,1.0


In [26]:
# Per-user and per-asset totals of the observed interest.
#
# sort=False keeps the groups in order of first appearance, which is the same
# order pd.unique() yields for the id columns. With the default (keys sorted
# ascending) the k-th total, and therefore the k-th learned bias, would belong
# to a different id than the k-th entry of the pd.unique() arrays the biases
# are later zipped with.
rui_bu = train_set.groupby("id_profile", sort=False)["interest"].sum().to_numpy()
rui_bi = train_set.groupby("id_asset", sort=False)["interest"].sum().to_numpy()

In [27]:
# Distinct asset / profile ids of the training split, in order of first
# appearance, and how many of each there are.
unique_assets_train = train_set["id_asset"].unique()
unique_profiles_train = train_set["id_profile"].unique()

number_of_films = unique_assets_train.shape[0]
number_of_users = unique_profiles_train.shape[0]

In [28]:
# Global mean of the interest values over the rows of the training split.
mu = train_set["interest"].to_numpy().mean()

In [29]:
# train_set is not read again below; only the aggregates computed from it
# (rui_bu, rui_bi, the unique id arrays, the counts and mu) are kept.
del train_set

In [30]:
# Initial biases: one standard-normal draw per user (bu) and per film (bi).
bu = np.random.standard_normal(number_of_users)
bi = np.random.standard_normal(number_of_films)

In [31]:
# Batch gradient descent on the user biases (bu) and the film biases (bi).
epochs = 100
learning_rate = 0.0000001
reg = 0.02  # weight of the L2 penalty on bu and bi

# Mean of each gradient vector per epoch, kept for the convergence plot.
mean_cost_derivatives_bu = list() 
mean_cost_derivatives_bi = list() 

for epoch in range(1, epochs+1):
    # Cost being minimised (left disabled; `rui` is not defined in this notebook):
    #cost = np.sum(np.power(rui - sparse.csc_matrix(mu) - sparse.csc_matrix(bu + bi.T), 2)) + reg*(np.sum(np.power(bu, 2)) + np.sum(np.power(bi, 2)))
    #print(cost)
     
    # Gradient of that cost with respect to each bias. rui_bu / rui_bi (the
    # per-user / per-film interest totals) are the only data-dependent terms.
    # NOTE(review): the number_of_films*... and np.sum(bi) terms correspond to a
    # residual summed over every user-film pair (pairs without a row counting as
    # interest 0), whereas mu is the mean over existing rows only — confirm that
    # this is intended.
    cost_derivative_bu = 2*(-rui_bu + number_of_films*mu + number_of_films*bu + np.sum(bi, axis=None) + reg*bu)
    cost_derivative_bi = 2*(-rui_bi + number_of_users*mu + number_of_users*bi + np.sum(bu, axis=None) + reg*bi)
    
    mean_cost_derivatives_bu.append(np.mean(cost_derivative_bu))
    mean_cost_derivatives_bi.append(np.mean(cost_derivative_bi))
    
    # Both gradients above were computed from the pre-update bu and bi, so the
    # two updates below are simultaneous.
    bu = bu - learning_rate*cost_derivative_bu
    bi = bi - learning_rate*cost_derivative_bi
    # Force a garbage-collection pass at the end of every epoch.
    gc.collect()

In [None]:
# The interest totals were only needed by the training loop; release them.
del rui_bu, rui_bi

In [None]:
# Plot the per-epoch mean gradient of each bias vector.
# Everything is drawn through the Axes returned by plt.subplots, so `ax` stays
# bound to that Axes, and each curve is labelled so the legend tells them apart.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(mean_cost_derivatives_bu, label="bu (user biases)")
ax.plot(mean_cost_derivatives_bi, label="bi (film biases)")
ax.set_title("Mean Derivative Cost Function Evolution (Baseline Estimates++)")
ax.set_xlabel('Epoch')
ax.set_ylabel('Regularized Cost Derivative')
ax.legend()

In [None]:
# Lookup tables from id to bias: the k-th id of each unique-id array is paired
# with the k-th entry of the corresponding bias vector.
hashmap_bu = dict(zip(unique_profiles_train, bu))
hashmap_bi = dict(zip(unique_assets_train, bi))

In [None]:
# Persist the learned bias vectors (values only; the ids they belong to are
# not written alongside them).
np.save(file='bu.npy', arr=bu)
np.save(file='bi.npy', arr=bi)

In [None]:
# Show the first rows of the test split before predictions are added.
test_set.head()

In [None]:
def predict_interest(u, i):
    """Return the baseline interest estimate for user ``u`` and film ``i``.

    The estimate starts from the global mean ``mu`` and adds the user bias
    looked up in ``hashmap_bu`` and the film bias looked up in ``hashmap_bi``.
    An id that is missing from its table contributes nothing to the estimate;
    a "not found" message is printed for it instead.
    """
    estimate = mu

    if u in hashmap_bu:
        estimate += hashmap_bu[u]
    else:
        print("User {} not found".format(u))

    if i in hashmap_bi:
        estimate += hashmap_bi[i]
    else:
        print("Film {} not found".format(i))

    return estimate

In [None]:
# Score every (id_profile, id_asset) row of the test split with predict_interest.
test_set["predicted_interest"] = list(
    map(predict_interest, test_set.id_profile.values, test_set.id_asset.values)
)

In [None]:
# Inspect the first 50 rows of the test split (interest next to
# predicted_interest once the prediction cell above has run).
test_set.head(50)

In [None]:
# Mean squared error between the observed and the predicted interest on the
# test split.
mean_squared_error(
    y_true=test_set["interest"].values,
    y_pred=test_set["predicted_interest"].values,
)