In [1]:
import os
import pandas as pd

from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate

# Visualisation des données

In [2]:
file_path_metadata = "../data/news-portal-user-interactions-by-globocom/articles_metadata.csv"
df_metadata = pd.read_csv(file_path_metadata)
df_metadata = pd.DataFrame(df_metadata)
df_metadata

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,1513144419000,0,168
1,1,1,1405341936000,0,189
2,2,1,1408667706000,0,250
3,3,1,1408468313000,0,230
4,4,1,1407071171000,0,162
...,...,...,...,...,...
364042,364042,460,1434034118000,0,144
364043,364043,460,1434148472000,0,463
364044,364044,460,1457974279000,0,177
364045,364045,460,1515964737000,0,126


In [3]:
file_path_clicks = "../data/news-portal-user-interactions-by-globocom/clicks_sample.csv"
df_click_sample = pd.read_csv(file_path_clicks)
df = pd.DataFrame(df_click_sample)
df_click_sample

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2
1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2
2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2
3,1,1506825426267738,1506825426000,2,96663,1506827047951,4,1,17,1,16,2
4,2,1506825435299739,1506825435000,2,119592,1506827090575,4,1,17,1,24,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1878,705,1506828968165442,1506828968000,2,119592,1506830912301,4,1,17,1,21,2
1879,705,1506828968165442,1506828968000,2,284847,1506830942301,4,1,17,1,21,2
1880,706,1506828979881443,1506828979000,3,108854,1506829027334,4,3,2,1,25,1
1881,706,1506828979881443,1506828979000,3,96663,1506829095732,4,3,2,1,25,1


# Agrégation des données du dossier clicks dans un dataframe

In [4]:
clicks_directory = "../data/news-portal-user-interactions-by-globocom/clicks/"
columns = ["user_id", "session_id", "session_start", "session_size", "click_article_id",
           "click_timestamp", "click_environment", "click_deviceGroup", "click_os",
           "click_country", "click_region", "click_referrer_type"]
df_clicks = pd.DataFrame(columns = columns)

for i in os.listdir(clicks_directory):
    temp_ = pd.read_csv(clicks_directory + i, index_col=None, header=0)
    liste_concat = [df_clicks, temp_]
    df_clicks = pd.concat(liste_concat, ignore_index=True)

In [5]:
df_clicks

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,93863,1507865792177843,1507865792000,2,96210,1507865832925,4,3,2,1,21,2
1,93863,1507865792177843,1507865792000,2,158094,1507865862925,4,3,2,1,21,2
2,294036,1507865795185844,1507865795000,2,20691,1507865819095,4,3,20,1,9,2
3,294036,1507865795185844,1507865795000,2,96210,1507865849095,4,3,20,1,9,2
4,77136,1507865796257845,1507865796000,2,336245,1507866133178,4,3,2,1,25,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2988176,15107,1507872960434128,1507872960000,4,20691,1507873325254,4,4,2,1,14,1
2988177,15107,1507872960434128,1507872960000,4,327984,1507873616575,4,4,2,1,14,1
2988178,15107,1507872960434128,1507872960000,4,172760,1507873646575,4,4,2,1,14,1
2988179,294270,1507872968214129,1507872968000,2,96210,1507873274995,4,3,2,1,25,2


In [6]:
df_merged = df_clicks.merge(df_metadata, left_on='click_article_id', right_on='article_id')
df_merged = df_merged[['user_id', 'article_id', 'category_id']]
df_merged["value"] = 1
df_merged

Unnamed: 0,user_id,article_id,category_id,value
0,93863,96210,209,1
1,93863,158094,281,1
2,294036,20691,9,1
3,294036,96210,209,1
4,77136,336245,437,1
...,...,...,...,...
2988176,15107,20691,9,1
2988177,15107,327984,435,1
2988178,15107,172760,299,1
2988179,294270,96210,209,1


In [7]:
grouped = df_merged.groupby(['user_id', 'category_id'])
df_aggregated = grouped['value'].sum().reset_index()
df_aggregated = df_aggregated.rename(columns={'value': 'rate'})
df_aggregated

Unnamed: 0,user_id,category_id,rate
0,0,136,1
1,0,186,2
2,0,209,1
3,0,281,2
4,0,375,1
...,...,...,...
1882297,322894,297,1
1882298,322895,133,1
1882299,322895,418,1
1882300,322896,26,1


In [8]:
final_df = '../data/csv/df_aggregated.csv'
df_aggregated.to_csv(final_df, index=False)

In [9]:
df_aggregated = pd.read_csv('../data/csv/df_aggregated.csv')

In [10]:
print("Nombre de valeurs pour 1:", len(df_aggregated[df_aggregated["rate"] == 1]))
print("Nombre de valeurs pour 2:", len(df_aggregated[df_aggregated["rate"] == 2]))
print("Nombre de valeurs pour 3:", len(df_aggregated[df_aggregated["rate"] == 3]))
print("Nombre de valeurs pour 4:", len(df_aggregated[df_aggregated["rate"] == 4]))
print("Nombre de valeurs pour 5:", len(df_aggregated[df_aggregated["rate"] == 5]))
print("Nombre de valeurs supérieur à 5:", len(df_aggregated[df_aggregated["rate"] > 5]))

Nombre de valeurs pour 1: 1378686
Nombre de valeurs pour 2: 293327
Nombre de valeurs pour 3: 97043
Nombre de valeurs pour 4: 43989
Nombre de valeurs pour 5: 23658
Nombre de valeurs supérieur à 5: 45599


In [12]:
df_sample = pd.DataFrame(columns=["user_id", "category_id", "rate"])

for i in range(1,6):
    df_sample = pd.concat([df_sample, df_aggregated[df_aggregated["rate"] == i].sample(20000)],
                          ignore_index=True)

In [13]:
df_sample

Unnamed: 0,user_id,category_id,rate
0,83240,250,1
1,159143,353,1
2,194001,331,1
3,42870,418,1
4,167047,348,1
...,...,...,...
99995,1359,375,5
99996,46338,281,5
99997,130260,281,5
99998,136964,412,5


In [14]:
reader = Reader(rating_scale=(1,5))
df_significant = df_sample.loc[df_sample["rate"] > 1]
data = Dataset.load_from_df(df_significant[['user_id', 'category_id', 'rate']], reader)

In [15]:
print(len(df_significant))

80000


In [16]:
trainset, testset = train_test_split(data, test_size=0.25)
print('Test set lenght :', len(testset))
print('Train set lenght :', len(df_significant) - len(testset))

Test set lenght : 20000
Train set lenght : 60000


In [17]:
from surprise import SVD, accuracy

algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fee6c43b670>

In [18]:
predictions = algo.test(testset)
print('Number of predictions in Test set :', len(predictions))

Number of predictions in Test set : 20000


In [19]:
accuracy.rmse(predictions)

RMSE: 1.1030


1.1030016610631952

In [20]:
predictions

[Prediction(uid=284432, iid=418, r_ui=3.0, est=2.999406654248274, details={'was_impossible': False}),
 Prediction(uid=80116, iid=281, r_ui=4.0, est=3.6451191274977264, details={'was_impossible': False}),
 Prediction(uid=227441, iid=281, r_ui=3.0, est=3.6451191274977264, details={'was_impossible': False}),
 Prediction(uid=140083, iid=209, r_ui=2.0, est=3.202318734196316, details={'was_impossible': False}),
 Prediction(uid=27795, iid=437, r_ui=3.0, est=3.4420884131962137, details={'was_impossible': False}),
 Prediction(uid=73185, iid=323, r_ui=4.0, est=3.06171098724281, details={'was_impossible': False}),
 Prediction(uid=126390, iid=136, r_ui=2.0, est=3.3759413492890826, details={'was_impossible': False}),
 Prediction(uid=182039, iid=375, r_ui=5.0, est=3.530645942445059, details={'was_impossible': False}),
 Prediction(uid=201287, iid=331, r_ui=3.0, est=3.481457763086321, details={'was_impossible': False}),
 Prediction(uid=84997, iid=375, r_ui=4.0, est=3.530645942445059, details={'was_imp