In [1]:
import os
import pandas as pd

from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate

In [2]:
file_path_metadata = "../data/news-portal-user-interactions-by-globocom/articles_metadata.csv"
df_metadata = pd.read_csv(file_path_metadata)
df_metadata = pd.DataFrame(df_metadata)
df_metadata

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,1513144419000,0,168
1,1,1,1405341936000,0,189
2,2,1,1408667706000,0,250
3,3,1,1408468313000,0,230
4,4,1,1407071171000,0,162
...,...,...,...,...,...
364042,364042,460,1434034118000,0,144
364043,364043,460,1434148472000,0,463
364044,364044,460,1457974279000,0,177
364045,364045,460,1515964737000,0,126


In [3]:
file_path_clicks = "../data/news-portal-user-interactions-by-globocom/clicks_sample.csv"
df_click_sample = pd.read_csv(file_path_clicks)
df = pd.DataFrame(df_click_sample)
df_click_sample

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2
1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2
2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2
3,1,1506825426267738,1506825426000,2,96663,1506827047951,4,1,17,1,16,2
4,2,1506825435299739,1506825435000,2,119592,1506827090575,4,1,17,1,24,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1878,705,1506828968165442,1506828968000,2,119592,1506830912301,4,1,17,1,21,2
1879,705,1506828968165442,1506828968000,2,284847,1506830942301,4,1,17,1,21,2
1880,706,1506828979881443,1506828979000,3,108854,1506829027334,4,3,2,1,25,1
1881,706,1506828979881443,1506828979000,3,96663,1506829095732,4,3,2,1,25,1


In [4]:
clicks_directory = "../data/news-portal-user-interactions-by-globocom/clicks/"
columns = ["user_id", "session_id", "session_start", "session_size", "click_article_id",
           "click_timestamp", "click_environment", "click_deviceGroup", "click_os",
           "click_country", "click_region", "click_referrer_type"]
df_clicks = pd.DataFrame(columns = columns)

for i in os.listdir(clicks_directory):
    temp_ = pd.read_csv(clicks_directory + i, index_col=None, header=0)
    liste_concat = [df_clicks, temp_]
    df_clicks = pd.concat(liste_concat, ignore_index=True)


In [5]:
df_clicks

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,93863,1507865792177843,1507865792000,2,96210,1507865832925,4,3,2,1,21,2
1,93863,1507865792177843,1507865792000,2,158094,1507865862925,4,3,2,1,21,2
2,294036,1507865795185844,1507865795000,2,20691,1507865819095,4,3,20,1,9,2
3,294036,1507865795185844,1507865795000,2,96210,1507865849095,4,3,20,1,9,2
4,77136,1507865796257845,1507865796000,2,336245,1507866133178,4,3,2,1,25,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2988176,15107,1507872960434128,1507872960000,4,20691,1507873325254,4,4,2,1,14,1
2988177,15107,1507872960434128,1507872960000,4,327984,1507873616575,4,4,2,1,14,1
2988178,15107,1507872960434128,1507872960000,4,172760,1507873646575,4,4,2,1,14,1
2988179,294270,1507872968214129,1507872968000,2,96210,1507873274995,4,3,2,1,25,2


In [6]:
df_merged = df_clicks.merge(df_metadata, left_on='click_article_id', right_on='article_id')
df_merged = df_merged[['user_id', 'article_id', 'category_id']]
df_merged["value"] = 1
df_merged

Unnamed: 0,user_id,article_id,category_id,value
0,93863,96210,209,1
1,93863,158094,281,1
2,294036,20691,9,1
3,294036,96210,209,1
4,77136,336245,437,1
...,...,...,...,...
2988176,15107,20691,9,1
2988177,15107,327984,435,1
2988178,15107,172760,299,1
2988179,294270,96210,209,1


In [7]:
grouped = df_merged.groupby(['user_id', 'category_id'])
df_aggregated = grouped['value'].sum().reset_index()
df_aggregated = df_aggregated.rename(columns={'value': 'rate'})
df_aggregated

Unnamed: 0,user_id,category_id,rate
0,0,136,1
1,0,186,2
2,0,209,1
3,0,281,2
4,0,375,1
...,...,...,...
1882297,322894,297,1
1882298,322895,133,1
1882299,322895,418,1
1882300,322896,26,1


In [8]:
df_aggregated.rate.unique()

array([  1,   2,  13,   3,  19,  17,  12,   4,   5,   7,   8,  11,   6,
        14,  10,  24,  98,  49,   9,  16,  21,  31,  34,  20,  30,  69,
        40,  42,  25,  32,  15,  33,  18,  27,  44,  29,  22,  23,  57,
        35,  36,  28,  26,  41,  61,  43,  39,  58, 103,  68,  90, 100,
        54,  71,  45,  67,  46,  52, 110,  37,  38,  53, 116,  66, 199,
        50,  59, 179,  48, 145,  89, 156,  56, 117, 121,  55,  82,  62,
       122,  47,  94, 348,  80,  78, 181, 211, 535,  83, 130, 256,  51,
        96, 109,  74, 150,  64,  84,  75,  76, 113, 107, 162,  99, 104,
        60,  95, 172,  65,  86, 102,  73,  93,  72, 126, 118, 140,  70,
       101, 193, 303,  88,  63,  79,  87, 136, 148, 138])

In [9]:
final_df = '../data/csv/df_aggregated.csv'
df_aggregated.to_csv(final_df, index=False)

In [10]:
df_aggregated = pd.read_csv('../data/csv/df_aggregated.csv')

In [11]:
print("Nombre de valeurs pour 1:", len(df_aggregated[df_aggregated["rate"] == 1]))
print("Nombre de valeurs pour 2:", len(df_aggregated[df_aggregated["rate"] == 2]))
print("Nombre de valeurs pour 3:", len(df_aggregated[df_aggregated["rate"] == 3]))
print("Nombre de valeurs pour 4:", len(df_aggregated[df_aggregated["rate"] == 4]))
print("Nombre de valeurs pour 5:", len(df_aggregated[df_aggregated["rate"] == 5]))
print("Nombre de valeurs supérieur à 5:", len(df_aggregated[df_aggregated["rate"] > 5]))

Nombre de valeurs pour 1: 1378686
Nombre de valeurs pour 2: 293327
Nombre de valeurs pour 3: 97043
Nombre de valeurs pour 4: 43989
Nombre de valeurs pour 5: 23658
Nombre de valeurs supérieur à 5: 45599


In [12]:
df_aggregated.loc[df_aggregated["rate"] == 1, "rate"] = 0
df_aggregated.loc[df_aggregated["rate"] == 2, "rate"] = 1
df_aggregated.loc[df_aggregated["rate"] == 3, "rate"] = 2
df_aggregated.loc[df_aggregated["rate"] == 4, "rate"] = 3
df_aggregated.loc[df_aggregated["rate"] == 5, "rate"] = 4
df_aggregated.loc[df_aggregated["rate"] > 5, "rate"] = 5

In [13]:
print("Nombre de valeurs pour 0:", len(df_aggregated[df_aggregated["rate"] == 0]))
print("Nombre de valeurs pour 1:", len(df_aggregated[df_aggregated["rate"] == 1]))
print("Nombre de valeurs pour 2:", len(df_aggregated[df_aggregated["rate"] == 2]))
print("Nombre de valeurs pour 3:", len(df_aggregated[df_aggregated["rate"] == 3]))
print("Nombre de valeurs pour 4:", len(df_aggregated[df_aggregated["rate"] == 4]))
print("Nombre de valeurs pour 5:", len(df_aggregated[df_aggregated["rate"] == 5]))

Nombre de valeurs pour 0: 1378686
Nombre de valeurs pour 1: 293327
Nombre de valeurs pour 2: 97043
Nombre de valeurs pour 3: 43989
Nombre de valeurs pour 4: 23658
Nombre de valeurs pour 5: 45599


In [19]:
df_sample = pd.DataFrame(columns=["user_id", "category_id", "rate"])

for i in range(6):
    df_sample = pd.concat([df_sample, df_aggregated[df_aggregated["rate"] == i].sample(20000)],
                          ignore_index=True)

In [21]:
reader = Reader(rating_scale=(1,5))
df_significant = df_sample.copy()
data = Dataset.load_from_df(df_significant[['user_id', 'category_id', 'rate']], reader)

In [22]:
print('We have selects', len(df_significant), 'interactions.')

We have selects 120000 interactions.


In [23]:
trainset, testset = train_test_split(data, test_size=0.25)
print('Test set lenght :', len(testset))
print('Train set lenght :', len(df_significant) - len(testset))

Test set lenght : 30000
Train set lenght : 90000


In [24]:
from surprise import SVD, accuracy

algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fea5c4bd5d0>

In [25]:
predictions = algo.test(testset)
print('Number of predictions in Test set :', len(predictions))

Number of predictions in Test set : 30000


In [26]:
accuracy.rmse(predictions)

RMSE: 1.5969


1.5969156573184726

In [42]:
df_predict = []
for i in range(len(predictions)):
    df_predict.append([
        predictions[i].uid,
        predictions[i].iid,
        predictions[i].r_ui,
        predictions[i].est]
    )

In [43]:
df_predict = pd.DataFrame(df_predict, columns=["user_id", "category_id", "y_true", "y_predict"])
df_predict

Unnamed: 0,user_id,category_id,y_true,y_predict
0,37549,375,3.0,2.678944
1,103234,442,2.0,1.926248
2,61150,418,4.0,1.403326
3,198424,26,0.0,1.423172
4,154494,281,3.0,2.460934
...,...,...,...,...
29995,160515,437,1.0,2.024088
29996,93387,209,2.0,2.099982
29997,39291,418,1.0,1.590695
29998,209750,26,0.0,2.004572


In [48]:
temporary_ = df_predict[df_predict["category_id"] == 375].copy()
temporary_

Unnamed: 0,user_id,category_id,y_true,y_predict
0,37549,375,3.0,2.678944
8,29491,375,3.0,3.020118
37,67122,375,5.0,2.657218
43,26059,375,4.0,2.422123
48,139007,375,3.0,2.678944
...,...,...,...,...
29968,130973,375,1.0,2.678944
29973,65508,375,5.0,2.920403
29988,149759,375,3.0,2.678944
29990,251790,375,4.0,2.678944


In [53]:
temporary_["y_predict"].max()

4.800513885620038

In [54]:
temporary_[temporary_["y_predict"]==4.800513885620038]

Unnamed: 0,user_id,category_id,y_true,y_predict
26041,49524,375,4.0,4.800514
