In [1]:
import numpy as np
import pandas as pd

from surprise import SVD, accuracy
from surprise import Reader, Dataset
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

# Rassemblement du dataset et génération de la "user rating matrix"

In [2]:
articles_metadata = pd.read_csv("../Ressources/articles_metadata.csv")  
articles_metadata

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,1513144419000,0,168
1,1,1,1405341936000,0,189
2,2,1,1408667706000,0,250
3,3,1,1408468313000,0,230
4,4,1,1407071171000,0,162
...,...,...,...,...,...
364042,364042,460,1434034118000,0,144
364043,364043,460,1434148472000,0,463
364044,364044,460,1457974279000,0,177
364045,364045,460,1515964737000,0,126


In [3]:
clicks= pd.read_csv('clicks.csv', index_col=False)
clicks

Unnamed: 0.1,Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2
1,1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2
2,2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2
3,3,1,1506825426267738,1506825426000,2,96663,1506827047951,4,1,17,1,16,2
4,4,2,1506825435299739,1506825435000,2,119592,1506827090575,4,1,17,1,24,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2988176,2988176,10051,1508211372158328,1508211372000,2,84911,1508211557302,4,3,2,1,25,1
2988177,2988177,322896,1508211376302329,1508211376000,2,30760,1508211672520,4,1,17,1,25,2
2988178,2988178,322896,1508211376302329,1508211376000,2,157507,1508211702520,4,1,17,1,25,2
2988179,2988179,123718,1508211379189330,1508211379000,2,234481,1508211513583,4,3,2,1,25,2


In [4]:
%%time
dfUsersAndArticleCatego = clicks.merge(articles_metadata, left_on='click_article_id', right_on='article_id')
dfUsersAndArticleCatego = dfUsersAndArticleCatego[['user_id', 'article_id', 'category_id']]
dfUsersAndArticleCatego

Wall time: 795 ms


Unnamed: 0,user_id,article_id,category_id
0,0,157541,281
1,20,157541,281
2,44,157541,281
3,45,157541,281
4,76,157541,281
...,...,...,...
2988176,195186,2221,1
2988177,75658,271117,399
2988178,217129,20204,9
2988179,217129,70196,136


In [18]:
%%time
series = dfUsersAndArticleCatego.groupby(['user_id', 'category_id']).size()
user_rating_matrix = series.to_frame()
user_rating_matrix = user_rating_matrix.reset_index()
user_rating_matrix.rename(columns = {0:'rate'}, inplace = True)

user_rating_matrix

Wall time: 750 ms


Unnamed: 0,user_id,category_id,rate
0,0,136,1
1,0,186,2
2,0,209,1
3,0,281,2
4,0,375,1
...,...,...,...
1882297,322894,297,1
1882298,322895,133,1
1882299,322895,418,1
1882300,322896,26,1


# Mise en oeuvre de la lib "Surprise"

In [21]:
user_rating_matrix.loc[user_rating_matrix.rate > 1]

Unnamed: 0,user_id,category_id,rate
1,0,186,2
3,0,281,2
9,1,281,2
12,1,375,2
13,1,412,2
...,...,...,...
1882258,322874,228,2
1882261,322876,412,2
1882266,322879,281,2
1882277,322884,340,2


In [22]:
reader = Reader(rating_scale=(1,10))

_x = user_rating_matrix.loc[user_rating_matrix.rate > 1]

data = Dataset.load_from_df(_x[['user_id', 'category_id', 'rate']], reader)

print(f'Nous avons sélectionné {len(_x)} interactions.')

Nous avons sélectionné 503616 interactions.


In [23]:
trainset, testset = train_test_split(data, test_size=0.25)
print('Taile du set de test :', len(testset))
print("Taile du set d'entrainement :", len(_x) - len(testset))

Taile du set de test : 125904
Taile du set d'entrainement : 377712


In [24]:
%%time
algo = SVD()
algo.fit(trainset)

Wall time: 15.2 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1160074d130>

In [27]:
%%time
predictions = algo.test(testset)
print("Nombre de prédictions dans l'ensemble de test :", len(predictions))

Nombre de prédictions dans l'ensemble de test : 125904
Wall time: 973 ms


In [28]:
accuracy.rmse(predictions)

RMSE: 7.5527


7.552733113567636

Ok pas terrible ?! Test de **GridSearchCV**

In [30]:
param_grid = {
    'n_epochs': [5, 10, 20]# defaut 20
    ,'lr_all': [0.002, 0.005, 0.008]# learningRate pour tout les paramètres / defaut 0.005
    ,'reg_all': [0.4, 0.6]# terme de régulation pour tous les paramètres / defaut 0.02
}

In [31]:
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

In [32]:
gs.fit(data)
# 8min

In [33]:
# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

3.171062698588257
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}


Ajustons:

In [34]:
param_grid = {
    'n_epochs': [3, 5, 7]# defaut 20
    ,'lr_all': [0.004, 0.005, 0.006]# learningRate pour tout les paramètres / defaut 0.005
    ,'reg_all': [0.6, 0.7]# terme de régulation pour tous les paramètres / defaut 0.02
}

In [36]:
%%time
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

print(f"Meilleur score RMSE: {gs.best_score['rmse']}")

# combination of parameters that gave the best RMSE score
print(f'"Combinaison gagnante": {gs.best_params["rmse"]}')

Meilleur score RMSE: 3.1653930281927245
"Combinaison gagnante": {'n_epochs': 7, 'lr_all': 0.005, 'reg_all': 0.7}
Wall time: 4min 26s


In [39]:
param_grid = {
    'n_epochs': [6, 7, 8]# defaut 20
    ,'lr_all': [0.0045, 0.005, 0.0055]# learningRate pour tout les paramètres / defaut 0.005
    ,'reg_all': [0.7, 0.8, 0.9]# terme de régulation pour tous les paramètres / defaut 0.02
}

In [40]:
%%time
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

print(f"Meilleur score RMSE: {gs.best_score['rmse']}")

# combination of parameters that gave the best RMSE score
print(f'"Combinaison gagnante": {gs.best_params["rmse"]}')

Meilleur score RMSE: 3.161777807409089
"Combinaison gagnante": {'n_epochs': 8, 'lr_all': 0.005, 'reg_all': 0.8}
Wall time: 8min 24s
