In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import os
import pickle
import seaborn as sns
import glob
import datetime
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
from pandas.core.common import SettingWithCopyWarning
import scipy.sparse as sparse
import implicit 
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise import KNNWithMeans, SVD
from surprise import accuracy
from collections import defaultdict
from itertools import chain
from sklearn.metrics.pairwise import cosine_similarity

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [2]:
# dir = r"C:\Users\louis\Dropbox\PC\Downloads\news-portal-user-interactions-by-globocom" #Laptop
dir = r"news-portal-user-interactions-by-globocom/"
data_csv = os.path.join(dir, "articles_metadata.csv")
click_sample_csv = os.path.join(dir, "clicks_sample.csv")
articles_embedding_file = os.path.join(dir, "articles_embeddings.pickle")

#### Dataset Content
- articles_metadata.csv : CSV file with metadata information about all (364047) published articles
- articles_embeddings.pickle : Pickle (Python 3) of a NumPy matrix containing the Article Content Embeddings (250-dimensional vectors), trained upon articles' text and metadata by the CHAMELEON's ACR module for 364047 published articles.
- clicks : Folder with CSV files (one per hour), containing user sessions interactions in the news portal.
- clicks_sample.csv : CSV file containing user sessions interactions in the news portal.

## Exploratory Data Analysis

In [3]:
data = pd.read_csv(data_csv, sep=',')
data.head()

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,1513144419000,0,168
1,1,1,1405341936000,0,189
2,2,1,1408667706000,0,250
3,3,1,1408468313000,0,230
4,4,1,1407071171000,0,162


Concert "created_at_ts" (timestamp) to datetime column

In [4]:
data["created_at_ts"] = pd.to_datetime(data["created_at_ts"], unit='ms').dt.strftime('%Y/%m/%d')
data.head()

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,2017/12/13,0,168
1,1,1,2014/07/14,0,189
2,2,1,2014/08/22,0,250
3,3,1,2014/08/19,0,230
4,4,1,2014/08/03,0,162


In [5]:
data.shape

(364047, 5)

In [6]:
data.describe()

Unnamed: 0,article_id,category_id,publisher_id,words_count
count,364047.0,364047.0,364047.0,364047.0
mean,182023.0,283.108239,0.0,190.897727
std,105091.461061,136.72347,0.0,59.502766
min,0.0,0.0,0.0,0.0
25%,91011.5,199.0,0.0,159.0
50%,182023.0,301.0,0.0,186.0
75%,273034.5,399.0,0.0,218.0
max,364046.0,460.0,0.0,6690.0


In [7]:
data.category_id.nunique()

461

User interaction = Page view by the user
<br>A session represents a sequence of user clicks with no more than 30 minutes between interactions.

- user_id: The user id
- session_id: the unique identifier of the session; in a session, a user may have consulted several articles
- session_start: Timestamp of the first interaction of the session
- session_size: Number of interactions during the session 
- click_article_id: Arcticle id interacted by the user 
- click_timestamp: Timestamp of the interaction
- click_environment: Id of the Environment: 1=Facebook instant article / 2=Mobile App / 3=AMP (Accelerated Mobile Pages) / 4: Web
- click_deviceGroup: id of the device type: 1=Tablet / 2=TV / 3=Empty / 4=Mobile / 5=Desktop 
- click_os: Id of the operationnal system: 1 - Other, 2 - iOS, 3 - Android, 4 - Windows Phone, 5 - Windows Mobile, 6 - Windows, 7 - Mac OS X, 8 - Mac OS, 9 - Samsung, 10 - FireHbbTV, 11 - ATV OS X, 12 - tvOS, 13 - Chrome OS, 14 - Debian, 15 - Symbian OS, 16 - BlackBerry OS, 17 - Firefox OS, 18 - Android, 19 - Brew MP, 20 - Chromecast, 21 - webOS, 22 - Gentoo, 23 - Solaris
- click_country : id of the Country 

#### Users csv files DataFrame
Open and read users csv files and concatenate them into one dataframe 

In [8]:
df_user = pd.DataFrame()
list_df = []

for csv_file in glob.glob(os.path.join(dir+"clicks", "*.csv")):
    df = pd.read_csv(csv_file)
    list_df.append(df)

df_user = pd.concat(list_df, axis=0, ignore_index=True)

In [9]:
df_user.head()

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2
1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2
2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2
3,1,1506825426267738,1506825426000,2,96663,1506827047951,4,1,17,1,16,2
4,2,1506825435299739,1506825435000,2,119592,1506827090575,4,1,17,1,24,2


Convert "Timestamp" columns to datetime & change "click_article_id" to "article_id"

In [10]:
for col in ["session_start", "click_timestamp"]:
    df_user[col] = pd.to_datetime(df_user[col], unit='ms')

df_user.rename(columns={"click_article_id": "article_id"}, inplace=True)

df_user.head()

Unnamed: 0,user_id,session_id,session_start,session_size,article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,2017-10-01 02:37:03,2,157541,2017-10-01 03:00:28.020,4,3,20,1,20,2
1,0,1506825423271737,2017-10-01 02:37:03,2,68866,2017-10-01 03:00:58.020,4,3,20,1,20,2
2,1,1506825426267738,2017-10-01 02:37:06,2,235840,2017-10-01 03:03:37.951,4,1,17,1,16,2
3,1,1506825426267738,2017-10-01 02:37:06,2,96663,2017-10-01 03:04:07.951,4,1,17,1,16,2
4,2,1506825435299739,2017-10-01 02:37:15,2,119592,2017-10-01 03:04:50.575,4,1,17,1,24,2


In [11]:
df_user.shape

(2988181, 12)

In [12]:
print('Number of unique values:')

for col in ["user_id", "session_id", "session_size", "article_id", "click_environment", "click_deviceGroup", "click_country", "click_region"]:
    print(col, ': ', df_user[col].nunique())

Number of unique values:
user_id :  322897
session_id :  1048594
session_size :  72
article_id :  46033
click_environment :  3
click_deviceGroup :  5
click_country :  11
click_region :  28


We can see that the number of unique article is 46033 whereas we have 364 047 articles in "data" metadata dataframe

For the collaborative filtering, we only care about the following features: ["user_id", "article_id", "click_timestamp", "session_start", "session_size"]

In [13]:
df = df_user[["user_id", "article_id", "session_id", "session_size", "session_start", "click_timestamp"]]
df.head()

Unnamed: 0,user_id,article_id,session_id,session_size,session_start,click_timestamp
0,0,157541,1506825423271737,2,2017-10-01 02:37:03,2017-10-01 03:00:28.020
1,0,68866,1506825423271737,2,2017-10-01 02:37:03,2017-10-01 03:00:58.020
2,1,235840,1506825426267738,2,2017-10-01 02:37:06,2017-10-01 03:03:37.951
3,1,96663,1506825426267738,2,2017-10-01 02:37:06,2017-10-01 03:04:07.951
4,2,119592,1506825435299739,2,2017-10-01 02:37:15,2017-10-01 03:04:50.575


We will create Delta_time score which is the time difference between the session_start and the "click_timestamp"

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2988181 entries, 0 to 2988180
Data columns (total 6 columns):
 #   Column           Dtype         
---  ------           -----         
 0   user_id          object        
 1   article_id       object        
 2   session_id       object        
 3   session_size     object        
 4   session_start    datetime64[ns]
 5   click_timestamp  datetime64[ns]
dtypes: datetime64[ns](2), object(4)
memory usage: 136.8+ MB


In [15]:
df["delta_time"] = (df["click_timestamp"] - df["session_start"])
df

Unnamed: 0,user_id,article_id,session_id,session_size,session_start,click_timestamp,delta_time
0,0,157541,1506825423271737,2,2017-10-01 02:37:03,2017-10-01 03:00:28.020,0 days 00:23:25.020000
1,0,68866,1506825423271737,2,2017-10-01 02:37:03,2017-10-01 03:00:58.020,0 days 00:23:55.020000
2,1,235840,1506825426267738,2,2017-10-01 02:37:06,2017-10-01 03:03:37.951,0 days 00:26:31.951000
3,1,96663,1506825426267738,2,2017-10-01 02:37:06,2017-10-01 03:04:07.951,0 days 00:27:01.951000
4,2,119592,1506825435299739,2,2017-10-01 02:37:15,2017-10-01 03:04:50.575,0 days 00:27:35.575000
...,...,...,...,...,...,...,...
2988176,10051,84911,1508211372158328,2,2017-10-17 03:36:12,2017-10-17 03:39:17.302,0 days 00:03:05.302000
2988177,322896,30760,1508211376302329,2,2017-10-17 03:36:16,2017-10-17 03:41:12.520,0 days 00:04:56.520000
2988178,322896,157507,1508211376302329,2,2017-10-17 03:36:16,2017-10-17 03:41:42.520,0 days 00:05:26.520000
2988179,123718,234481,1508211379189330,2,2017-10-17 03:36:19,2017-10-17 03:38:33.583,0 days 00:02:14.583000


In [16]:
df.delta_time.max(), df.delta_time.min()

(Timedelta('28 days 00:17:47.886000'), Timedelta('0 days 00:00:00'))

#### Select rows with a delta_time under 30 minutes (which is the time limit of a session)
And also drop rows of session_id where delta_time above 30 minutes because the session_size is therefore biased. For example, session_id=1507563657895091 has a session_size of 124 because this session has a delta_time of 9 hours. It may deform the results and the score based on click_timestamp

In [17]:
session_id_above_30 = df[df["delta_time"]>datetime.timedelta(minutes=30)]["session_id"].unique().tolist()

In [18]:
df1 = df[~df["session_id"].isin(session_id_above_30)] #We use ~ operator to check if values "is not" in the dataframe

In [19]:
df1

Unnamed: 0,user_id,article_id,session_id,session_size,session_start,click_timestamp,delta_time
0,0,157541,1506825423271737,2,2017-10-01 02:37:03,2017-10-01 03:00:28.020,0 days 00:23:25.020000
1,0,68866,1506825423271737,2,2017-10-01 02:37:03,2017-10-01 03:00:58.020,0 days 00:23:55.020000
2,1,235840,1506825426267738,2,2017-10-01 02:37:06,2017-10-01 03:03:37.951,0 days 00:26:31.951000
3,1,96663,1506825426267738,2,2017-10-01 02:37:06,2017-10-01 03:04:07.951,0 days 00:27:01.951000
4,2,119592,1506825435299739,2,2017-10-01 02:37:15,2017-10-01 03:04:50.575,0 days 00:27:35.575000
...,...,...,...,...,...,...,...
2988176,10051,84911,1508211372158328,2,2017-10-17 03:36:12,2017-10-17 03:39:17.302,0 days 00:03:05.302000
2988177,322896,30760,1508211376302329,2,2017-10-17 03:36:16,2017-10-17 03:41:12.520,0 days 00:04:56.520000
2988178,322896,157507,1508211376302329,2,2017-10-17 03:36:16,2017-10-17 03:41:42.520,0 days 00:05:26.520000
2988179,123718,234481,1508211379189330,2,2017-10-17 03:36:19,2017-10-17 03:38:33.583,0 days 00:02:14.583000


# Train_test_split selection depending on timestamp

I will split my data with 80-20 ratio in which the training set will be composed of firsts read articles after session start

In [20]:
df1 

Unnamed: 0,user_id,article_id,session_id,session_size,session_start,click_timestamp,delta_time
0,0,157541,1506825423271737,2,2017-10-01 02:37:03,2017-10-01 03:00:28.020,0 days 00:23:25.020000
1,0,68866,1506825423271737,2,2017-10-01 02:37:03,2017-10-01 03:00:58.020,0 days 00:23:55.020000
2,1,235840,1506825426267738,2,2017-10-01 02:37:06,2017-10-01 03:03:37.951,0 days 00:26:31.951000
3,1,96663,1506825426267738,2,2017-10-01 02:37:06,2017-10-01 03:04:07.951,0 days 00:27:01.951000
4,2,119592,1506825435299739,2,2017-10-01 02:37:15,2017-10-01 03:04:50.575,0 days 00:27:35.575000
...,...,...,...,...,...,...,...
2988176,10051,84911,1508211372158328,2,2017-10-17 03:36:12,2017-10-17 03:39:17.302,0 days 00:03:05.302000
2988177,322896,30760,1508211376302329,2,2017-10-17 03:36:16,2017-10-17 03:41:12.520,0 days 00:04:56.520000
2988178,322896,157507,1508211376302329,2,2017-10-17 03:36:16,2017-10-17 03:41:42.520,0 days 00:05:26.520000
2988179,123718,234481,1508211379189330,2,2017-10-17 03:36:19,2017-10-17 03:38:33.583,0 days 00:02:14.583000


In [21]:
training_ratio = int(0.8 * df1.shape[0])
training_list = df1.sort_values(by='delta_time').iloc[:training_ratio].index.tolist()
testing_list = df1.sort_values(by='delta_time').iloc[training_ratio:].index.tolist()

In [22]:
df_train = df1[["user_id", "article_id"]].loc[training_list]
df_test = df1[["user_id", "article_id"]].loc[testing_list]

Add rating column to df_train

In [23]:
df_train["rating"] = int(1)

# Test Implicit 

In [24]:
df_for_matrix = df_train.copy()
df_for_matrix

Unnamed: 0,user_id,article_id,rating
2697877,91152,199474,1
1422755,226728,272218,1
130394,33014,225764,1
1405362,224927,243728,1
759813,150981,313431,1
...,...,...,...
2746673,72571,289196,1
2910548,279994,114919,1
2289475,195718,119534,1
7098,2600,205897,1


In [25]:
# check if user_id and article_id are counted in 1 to 1

print("Total unique article_id: ", df_for_matrix.article_id.nunique())
print("Min article_id: ", df_for_matrix.article_id.min())
print("Max article_id: ", df_for_matrix.article_id.max())
print('\n')
print("Total unique user_id: ", df_for_matrix.user_id.nunique())
print("Min user_id: ", df_for_matrix.user_id.min())
print("Max user_id: ", df_for_matrix.user_id.max())


Total unique article_id:  33542
Min article_id:  27
Max article_id:  364046


Total unique user_id:  291604
Min user_id:  0
Max user_id:  322896


In [26]:
df_for_matrix["encoded_user_id"] = df_for_matrix["user_id"].astype('category').cat.codes
df_for_matrix["encoded_article_id"] = df_for_matrix["article_id"].astype('category').cat.codes

In [27]:
df_for_matrix

Unnamed: 0,user_id,article_id,rating,encoded_user_id,encoded_article_id
2697877,91152,199474,1,85668,19170
1422755,226728,272218,1,209786,25637
130394,33014,225764,1,31520,21710
1405362,224927,243728,1,208203,23530
759813,150981,313431,1,141437,29637
...,...,...,...,...,...
2746673,72571,289196,1,68423,27598
2910548,279994,114919,1,255862,11901
2289475,195718,119534,1,181853,12384
7098,2600,205897,1,2509,19807


## New rating

In [28]:
# new_df is df whose rating corresponds to number of time a user read an article and not binary (if he reads this article or not)

new_df = df_for_matrix.groupby(["user_id", "article_id"]).size().reset_index(level=['user_id', "article_id"])
new_df.rename(columns={0: "rating"}, inplace=True)
new_df

Unnamed: 0,user_id,article_id,rating
0,0,87205,1
1,0,87224,1
2,0,96755,1
3,0,313996,1
4,2,30760,1
...,...,...,...
1859693,322894,168401,1
1859694,322895,63746,1
1859695,322895,289197,1
1859696,322896,30760,1


In [29]:
# Re_indexing of each user_id & article_id with ".cat.codes" to sort ids between 0 and max(id) for each
new_df["encoded_user_id"] = new_df["user_id"].astype('category').cat.codes
new_df["encoded_article_id"] = new_df["article_id"].astype('category').cat.codes

In [30]:
new_df

Unnamed: 0,user_id,article_id,rating,encoded_user_id,encoded_article_id
0,0,87205,1,0,9211
1,0,87224,1,0,9225
2,0,96755,1,0,10118
3,0,313996,1,0,29712
4,2,30760,1,1,2266
...,...,...,...,...,...
1859693,322894,168401,1,291601,16583
1859694,322895,63746,1,291602,6290
1859695,322895,289197,1,291602,27599
1859696,322896,30760,1,291603,2266


#### Save new_df to csv

In [31]:
# new_df.to_csv("Implicit_files/df_train.csv", index=False)

#### Create function to retrieve original article_id encoded

In [32]:
def encoded_article_id(original_article_id):
    dict_encoded = dict(enumerate(df_for_matrix["article_id"].astype("category").cat.categories))
    list_dict = list(dict_encoded.values())
    index = list_dict.index(original_article_id)

    return index

def decoded_article_id(encoded_article_id):
    dict_encoded = dict(enumerate(df_for_matrix["article_id"].astype("category").cat.categories))
    decoded_id = dict_encoded.get(encoded_article_id)

    return decoded_id

def encoded_user_id(original_user_id):
    dict_encoded = dict(enumerate(df_for_matrix["user_id"].astype("category").cat.categories))
    list_dict = list(dict_encoded.values())
    index = list_dict.index(original_user_id)

    return index

def decoded_user_id(encoded_user_id):
    dict_encoded = dict(enumerate(df_for_matrix["user_id"].astype("category").cat.categories))
    decoded_id = dict_encoded.get(encoded_user_id)

    return decoded_id

## Create Sparses matrices

In [33]:
#with df_for_matrix rating's
# sparse_article_user = sparse.csr_matrix((df_for_matrix["rating"].astype(float), (df_for_matrix["encoded_article_id"], df_for_matrix["encoded_user_id"])))
# sparse_user_article = sparse.csr_matrix((df_for_matrix["rating"].astype(float), (df_for_matrix["encoded_user_id"], df_for_matrix["encoded_article_id"])))

#with new_df rating's
sparse_article_user = sparse.csr_matrix((new_df["rating"].astype(float), (new_df["encoded_article_id"], new_df["encoded_user_id"])))
sparse_user_article = sparse.csr_matrix((new_df["rating"].astype(float), (new_df["encoded_user_id"], new_df["encoded_article_id"])))

In [34]:
sparse_matrix = sparse.load_npz('Implicit_files/sparse_matrix.npz')

In [35]:
os.environ['MKL_NUM_THREADS'] = '1'

In [36]:
model_als = implicit.als.AlternatingLeastSquares(factors=50, regularization=0.1, iterations=50)

In [37]:
# Calculate the confidence by multiplying it by our alpha value.

alpha = 40
# data = (sparse_article_user* alpha).astype('float') # Users recommender model
# data = (sparse_user_article* alpha).astype('float') # Articles recommender model
data = (sparse_matrix* alpha).astype('float')
model_als.fit(data)

  0%|          | 0/50 [00:00<?, ?it/s]

#### Save model

In [38]:
new_df.encoded_user_id.max()

291603

In [39]:
# model_als.save("Implicit_files/model_als.npz")

In [41]:
def recommend_articles(encoded_user_id):
    """ recommend 5 article ids for a user """
    if encoded_user_id > 291603:
        #Cold-start = if user never read an article
        #Chose the 5 most read articles by unique user but not the 5 most read articles 
        articles_id = df_for_matrix.groupby("encoded_article_id")["encoded_user_id"].nunique().sort_values(ascending=False).index.tolist()[:5]
        recommend_df = pd.DataFrame({"encoded_article_id_recommended": articles_id})

    else:
        articles_id, score = model_als.recommend(encoded_user_id, sparse_user_article[encoded_user_id], 5)
        recommend_df = pd.DataFrame({"encoded_article_id_recommended": list(articles_id), "score": list(score)})

    decoded_article_id_recommended = []
    for id in articles_id:
        decoded_article_id_recommended.append(decoded_article_id(id))
    
    recommend_df["article_id_recommended"] = decoded_article_id_recommended

    return recommend_df

#### Exemple articles recommendation 

In [43]:
recommend_articles(4)

Unnamed: 0,encoded_article_id_recommended,score,article_id_recommended
0,15100,1.281832,156355
1,6955,1.269672,68924
2,15317,1.263524,158047
3,25637,1.113204,272218
4,15183,1.072672,157078


In [44]:
#Get article_id from df_test that are not in df_for_matrix (because they don't have encoded_article_id)
list_intersection_article_id = list(set((df_test.article_id.unique().tolist())) - set(df_for_matrix.article_id.unique().tolist()))
len(list_intersection_article_id)

#Create new df_test with only article_id that are in df_for_matrix (because they have encoded_article_id)
df_test_read_article = df_test[~df_test.article_id.isin(list_intersection_article_id)] # ~ means "not in" (= inverse of isin())

## Add encoded_article_id column to df_test_read_article

In [45]:
df_test_read_article.head()

Unnamed: 0,user_id,article_id
226924,15945,84289
2863531,317000,63842
2586228,301781,293513
2724148,11501,237822
571931,96928,313431


In [46]:
#Dict of k, v where k is encoded_article_id and v is original_article_id
dict_article_encoded = dict(enumerate(df_for_matrix["article_id"].astype("category").cat.categories))

#Change k, v dictionnary to v, k dictionnary
invert_article_dict = {v: k for k, v in dict_article_encoded.items()}

#Create encoded_article_id column in df_test_read_article
df_test_read_article["encoded_article_id"] = df_test_read_article["article_id"].iloc[:].apply(lambda x: invert_article_dict.get(x))

In [47]:
df_test_read_article.head()

Unnamed: 0,user_id,article_id,encoded_article_id
226924,15945,84289,8752
2863531,317000,63842,6316
2586228,301781,293513,28046
2724148,11501,237822,23122
571931,96928,313431,29637


#### Save df_test_read_article

In [48]:
# df_test_read_article.to_csv("Implicit_files/df_test_read_article.csv", index=False)

# Check if user read a recommended article

In [59]:
def true_recommended_als(encoded_user_id_list):
    """ Return encoded articles ids that are recommended for a user and read by him (in df_test)"""
    if type(encoded_user_id_list) != list:
        encoded_user_id_list = [encoded_user_id_list]

    list_recommended = []
    for user_id in encoded_user_id_list:
        # array_recommended_article_id, score = model_als.recommend(user_id, sparse_user_article[user_id],5)
        array_recommended_article_id, score = model_als.recommend(user_id, sparse_user_article[user_id],5)
        list_recommended_article_id = list(array_recommended_article_id) #list of encoded_article_id that are recommended 

        #list of decoded_article_id that are in df_test for encoded_user_id
        list_encoded_article_df_test = df_test_read_article[df_test_read_article["user_id"]==decoded_user_id(user_id)]["encoded_article_id"].tolist()

        #print encoded articles ids that are recommended and in df_test for read articles
        if set(list_recommended_article_id).intersection(list_encoded_article_df_test):
            print({"encoded_user_id": user_id, "encoded_article_id": set(list_recommended_article_id).intersection(list_encoded_article_df_test)})
            list_recommended.append({"encoded_user_id": user_id, "encoded_article_id": set(list_recommended_article_id).intersection(list_encoded_article_df_test)})
        
    return list_recommended


In [60]:
df_test_read_article

Unnamed: 0,user_id,article_id,encoded_article_id
226924,15945,84289,8752
2863531,317000,63842,6316
2586228,301781,293513,28046
2724148,11501,237822,23122
571931,96928,313431,29637
...,...,...,...
74684,25934,156025,15063
838004,94232,283996,26780
469681,120542,129004,13043
1904396,58870,208237,20178


In [None]:
true_recommended_als(list(range(100)))

{'encoded_user_id': 4, 'encoded_article_id': {25637}}
{'encoded_user_id': 19, 'encoded_article_id': {29649}}
{'encoded_user_id': 20, 'encoded_article_id': {27946}}
{'encoded_user_id': 28, 'encoded_article_id': {5568, 27404}}
{'encoded_user_id': 42, 'encoded_article_id': {27576, 25627, 12315}}
{'encoded_user_id': 45, 'encoded_article_id': {26088}}
{'encoded_user_id': 56, 'encoded_article_id': {22427}}
{'encoded_user_id': 69, 'encoded_article_id': {22596}}
{'encoded_user_id': 72, 'encoded_article_id': {22427}}
{'encoded_user_id': 75, 'encoded_article_id': {11378, 12398}}
{'encoded_user_id': 79, 'encoded_article_id': {17811}}
{'encoded_user_id': 84, 'encoded_article_id': {12398}}
{'encoded_user_id': 85, 'encoded_article_id': {19978}}
{'encoded_user_id': 92, 'encoded_article_id': {20154}}
{'encoded_user_id': 93, 'encoded_article_id': {20002}}


[{'encoded_user_id': 4, 'encoded_article_id': {25637}},
 {'encoded_user_id': 19, 'encoded_article_id': {29649}},
 {'encoded_user_id': 20, 'encoded_article_id': {27946}},
 {'encoded_user_id': 28, 'encoded_article_id': {5568, 27404}},
 {'encoded_user_id': 42, 'encoded_article_id': {12315, 25627, 27576}},
 {'encoded_user_id': 45, 'encoded_article_id': {26088}},
 {'encoded_user_id': 56, 'encoded_article_id': {22427}},
 {'encoded_user_id': 69, 'encoded_article_id': {22596}},
 {'encoded_user_id': 72, 'encoded_article_id': {22427}},
 {'encoded_user_id': 75, 'encoded_article_id': {11378, 12398}},
 {'encoded_user_id': 79, 'encoded_article_id': {17811}},
 {'encoded_user_id': 84, 'encoded_article_id': {12398}},
 {'encoded_user_id': 85, 'encoded_article_id': {19978}},
 {'encoded_user_id': 92, 'encoded_article_id': {20154}},
 {'encoded_user_id': 93, 'encoded_article_id': {20002}}]

In [61]:
true_recommended_als(4)

{'encoded_user_id': 4, 'encoded_article_id': {25637}}


[{'encoded_user_id': 4, 'encoded_article_id': {25637}}]

# SVD - Collaborative Filtering

In [36]:
df_train = df1[["user_id", "article_id", "click_timestamp"]].loc[training_list]
df_test = df1[["user_id", "article_id", "click_timestamp"]].loc[testing_list]

In [37]:
df_train.head()

Unnamed: 0,user_id,article_id,click_timestamp
2697877,91152,199474,2017-10-15 02:29:10
1422755,226728,272218,2017-10-08 01:55:53
130394,33014,225764,2017-10-02 08:09:36
1405362,224927,243728,2017-10-07 22:18:41
759813,150981,313431,2017-10-04 14:35:02


## Create New rating for SVD
click ratio = number of clicks of user_id for article_id / total number of click of user_id

In [38]:
#Dictionnary of k, v where k is user_id and v is total_click by user_id
dict_total_click = df_train.user_id.value_counts().sort_index().to_dict()

In [39]:
df_train = df_train.groupby(["user_id", "article_id"]).count().reset_index(level=[0,1]) #Get count of click for each user_id/article_id pair in df_train
df_train = df_train.rename(columns={"click_timestamp":"nbr_click_article"}) 
df_train["total_click_user"] = df_train.user_id.iloc[:].apply(lambda x: dict_total_click.get(x)).astype(np.int64) 
df_train["click_ratio"] = df_train["nbr_click_article"] / df_train["total_click_user"]

In [40]:
df_train.head()

Unnamed: 0,user_id,article_id,nbr_click_article,total_click_user,click_ratio
0,0,87205,1,4,0.25
1,0,87224,1,4,0.25
2,0,96755,1,4,0.25
3,0,313996,1,4,0.25
4,2,30760,1,2,0.5


# Model

In [41]:
reader = Reader(rating_scale=(0,1))

data = Dataset.load_from_df(df_train[['user_id', 'article_id', 'click_ratio']], reader)
train_set, test_set = train_test_split(data, test_size=0.4)
print(f'Size of test set : {len(test_set)}')

Size of test set : 743880


In [42]:
%%time

model = SVD().fit(train_set)

CPU times: total: 1min 5s
Wall time: 1min 5s


In [43]:
%%time

predict = model.test(test_set)

CPU times: total: 7.56 s
Wall time: 7.55 s


In [44]:
predict[:5]

[Prediction(uid=68288, iid=160417, r_ui=0.08333333333333333, est=0.3467518460576517, details={'was_impossible': False}),
 Prediction(uid=52402, iid=48403, r_ui=0.1111111111111111, est=0.2833397303637268, details={'was_impossible': False}),
 Prediction(uid=136745, iid=181873, r_ui=0.125, est=0.26146610657424685, details={'was_impossible': False}),
 Prediction(uid=47285, iid=172084, r_ui=0.03571428571428571, est=0.12050640091025733, details={'was_impossible': False}),
 Prediction(uid=80883, iid=160974, r_ui=0.5, est=0.4544983474002058, details={'was_impossible': False})]

In [45]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [52]:
top_n = get_top_n(predict, n=5)

## Get true recommendations with df_test_reast_articles for a list of users

In [73]:
def true_recommendation_svd(list_user_id, top_n):
    """ Return encoded articles ids that are recommended for a user and read by him (in df_test)"""
    if type(list_user_id) != list:
        list_user_id = [list_user_id]

    list_true_recommendation = []    
    for user_id in list_user_id:
        recommended_list_article = []
        list_score = []
        for id, score in top_n.get(user_id):
            recommended_list_article.append(id)
            list_score.append(score)

        list_article_test = df_test_read_article[df_test_read_article["user_id"]==user_id]["article_id"].unique().tolist()

        if set(recommended_list_article).intersection(list_article_test):
            # print({"user_id": user_id, "article_id": set(recommended_list_article).intersection(list_article_test)})
            list_true_recommendation.append({"user_id": user_id, "article_id": set(recommended_list_article).intersection(list_article_test)})
    
    # print("Number of true recommended articles:", len(list_true_recommendation))
    print(f"Number of true recommended articles for {len(list_user_id)} users: {len(list_true_recommendation)}")

    return list_true_recommendation


In [74]:
#Create list of all user_id in test_set

list_test_set_users = []
for tuple in test_set:
    list_test_set_users.append(tuple[0])

len(list_test_set_users)

743880

In [75]:
true_recommendation_svd(list_test_set_users[:500], top_n)

Number of true recommended articles for 500 users: 7


[{'user_id': 180143, 'article_id': {42298}},
 {'user_id': 39784, 'article_id': {156560}},
 {'user_id': 92939, 'article_id': {331116}},
 {'user_id': 79618, 'article_id': {336223}},
 {'user_id': 1804, 'article_id': {234698}},
 {'user_id': 4555, 'article_id': {160474}},
 {'user_id': 92651, 'article_id': {123909}}]

# Content-based embeddings

In [76]:
pickle = pd.read_pickle(articles_embedding_file)
df_embedding = pd.DataFrame(pickle)
print(df_embedding.shape)
df_embedding.head()

(364047, 250)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,-0.161183,-0.957233,-0.137944,0.050855,0.830055,0.901365,-0.335148,-0.559561,-0.500603,0.165183,...,0.321248,0.313999,0.636412,0.169179,0.540524,-0.813182,0.28687,-0.231686,0.597416,0.409623
1,-0.523216,-0.974058,0.738608,0.155234,0.626294,0.485297,-0.715657,-0.897996,-0.359747,0.398246,...,-0.487843,0.823124,0.412688,-0.338654,0.320787,0.588643,-0.594137,0.182828,0.39709,-0.834364
2,-0.619619,-0.97296,-0.20736,-0.128861,0.044748,-0.387535,-0.730477,-0.066126,-0.754899,-0.242004,...,0.454756,0.473184,0.377866,-0.863887,-0.383365,0.137721,-0.810877,-0.44758,0.805932,-0.285284
3,-0.740843,-0.975749,0.391698,0.641738,-0.268645,0.191745,-0.825593,-0.710591,-0.040099,-0.110514,...,0.271535,0.03604,0.480029,-0.763173,0.022627,0.565165,-0.910286,-0.537838,0.243541,-0.885329
4,-0.279052,-0.972315,0.685374,0.113056,0.238315,0.271913,-0.568816,0.341194,-0.600554,-0.125644,...,0.238286,0.809268,0.427521,-0.615932,-0.503697,0.61445,-0.91776,-0.424061,0.185484,-0.580292


#### DataFrame with only article embeddings we use in df_train

In [77]:
list_article_ids_used = df_train["article_id"].unique().tolist()

In [78]:
df_embedding = df_embedding.loc[list_article_ids_used].sort_index()
df_embedding

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
27,-0.700814,-0.965927,-0.181869,-0.423504,-0.024668,0.187061,-0.674657,-0.108778,-0.762116,0.161282,...,0.165887,0.694188,0.495848,-0.629343,-0.231966,0.564555,-0.762103,-0.388876,0.696865,-0.532873
81,0.221564,-0.978803,0.614596,-0.049789,0.159638,-0.029031,-0.716393,-0.287863,-0.622962,-0.040135,...,0.413513,0.150322,0.073361,-0.152517,-0.645792,0.680978,-0.519698,-0.248209,0.717366,-0.816945
94,-0.379733,-0.971806,0.762224,0.572289,0.373779,-0.252386,-0.849416,0.075758,-0.642696,-0.306346,...,0.212263,0.208385,-0.399368,-0.629359,-0.454191,0.649570,-0.606913,-0.508764,0.367875,-0.770282
125,-0.421352,-0.976883,0.679862,0.014020,0.635972,-0.174772,-0.751978,-0.834340,-0.726507,-0.819717,...,0.270599,0.483142,0.374510,-0.745426,-0.558665,0.368720,-0.676668,-0.597302,0.512930,-0.947722
137,-0.177647,-0.969250,0.439703,-0.186062,0.241945,-0.330587,-0.694466,-0.472638,-0.586550,-0.167608,...,0.480440,0.520094,-0.209589,-0.429436,-0.534269,0.509226,-0.690509,-0.650315,0.738715,-0.939406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364014,0.695122,-0.976821,0.526489,0.459688,-0.591979,0.613581,-0.456412,0.538633,0.135446,0.672410,...,0.262247,-0.412093,-0.469951,0.143977,-0.045607,0.439361,0.137141,0.566291,-0.056841,0.527219
364017,0.290270,-0.963398,-0.019136,0.596347,-0.589016,-0.091982,0.223495,-0.072997,0.472463,0.119069,...,0.850100,0.234533,0.005308,-0.169417,0.443876,0.299790,0.257217,0.615377,0.048421,-0.309620
364022,0.529817,-0.967979,0.503438,0.448184,0.181285,-0.245775,0.094118,0.253014,-0.301180,0.295364,...,0.518466,0.591505,-0.332256,-0.330351,-0.270494,0.700624,-0.471617,-0.323244,-0.237431,-0.838007
364043,-0.136932,-0.995471,0.991298,0.031871,-0.915622,-0.658517,0.633090,-0.564356,0.676551,-0.446068,...,-0.681986,-0.574185,-0.536908,0.688934,0.528204,0.162435,0.940364,0.989298,-0.761595,-0.414652


In [79]:
#Dict of k, v where k is encoded_article_id and v is original_article_id
dict_article_encoded = dict(enumerate(df_for_matrix["article_id"].astype("category").cat.categories))
list_from_dict_article_encoded = list(dict_article_encoded.values())
#because when calcul all cos_sim with the 33542 articles, we have a dict where values are cos_sim score but keys are indexes 0 to 33541 which are encoded_article_id

In [80]:
def recommendation_embedding(user_id):
    list_articles = df_train[df_train["user_id"]==user_id]["article_id"].tolist()
    mean_embedding = df_embedding.loc[list_articles].mean() #Calculate mean embedding from list of articles read by user
    mean_embedding = np.array(mean_embedding).reshape((1,250))

    cos_sim = cosine_similarity(df_embedding, mean_embedding) #Calculate cosine similarity btw most_article_embedding and all article embeddings (=33542) an return array of unit lists
    cos_sim = list(map(float, chain.from_iterable(cos_sim.tolist()))) #transform array of lists in one list 
    dict_sim = dict(zip(list_from_dict_article_encoded, cos_sim)) #index cos_sim score from each cosine similarity btw mean_emb and articles_emb
    dict_sim = dict(sorted(dict_sim.items(), key=lambda item: item[1], reverse=True)) #Sort cos_sim score from highest to smallest

    df = pd.DataFrame({"recommended_article_id":list(dict_sim.keys())[:5], "cosine_sim":list(dict_sim.values())[:5]}) #Chose 5 articles with highest cos_sim score

    return df

In [81]:
def recommendation_embedding_most(user_id):
    
    article_user = new_df[new_df["user_id"]==24].sort_values(by="rating", ascending=False)["article_id"].iloc[0]
    embedding = df_embedding.loc[article_user] #Get embedding of most read article by user_id
    embedding = np.array(embedding).reshape((1,250))

    cos_sim = cosine_similarity(df_embedding, embedding) #Calculate cosine similarity btw most_article_embedding and all article embeddings (=33542) an return array of unit lists
    cos_sim = list(map(float, chain.from_iterable(cos_sim.tolist()))) #transform array of lists in one list 
    dict_sim = dict(zip(list_from_dict_article_encoded, cos_sim)) #index cos_sim score from each cosine similarity btw most_article_embedding and all_articles_emb
    dict_sim = dict(sorted(dict_sim.items(), key=lambda item: item[1], reverse=True)) #Sort cos_sim score from highest to smallest

    df = pd.DataFrame({"recommended_article_id":list(dict_sim.keys())[:5], "cosine_sim":list(dict_sim.values())[:5]}) #Chose 5 articles with highest cos_sim score

    return df

In [82]:
recommendation_embedding_most(0), recommendation_embedding(0)

(   recommended_article_id  cosine_sim
 0                  157798    1.000000
 1                  159266    0.928678
 2                  157791    0.903507
 3                  157792    0.879511
 4                  159450    0.878364,
    recommended_article_id  cosine_sim
 0                   87656    0.776542
 1                   86699    0.769056
 2                   86223    0.762267
 3                   86989    0.757305
 4                   86038    0.747682)

In [83]:
def true_recommendation_embedding(list_user_id):
    """ Check if user_id has read recommended_article in df_test_read_article"""
    
    if type(list_user_id) != list:
        list_user_id = [list_user_id]

    list_true_recommended = []
    for user_id in list_user_id:    
        # print(user_id)
        # list_recommended = recommendation_embedding(user_id)["recommended_article_id"].tolist()
        list_recommended = recommendation_embedding(user_id)["recommended_article_id"].tolist()
        list_article_df_test = df_test_read_article[df_test_read_article["user_id"]==user_id]["article_id"].tolist()

        if set(list_recommended).intersection(list_article_df_test):
            print(set(list_recommended).intersection(list_article_df_test), user_id)
            list_true_recommended.append(set(list_recommended).intersection(list_article_df_test))
    
    return list_true_recommended

In [84]:
#Create list of user_id that are in df_train and df_test_read_article

list_df_train_user = df_train["user_id"].unique().tolist()
list_df_article = df_test_read_article["user_id"].unique().tolist()
liste_train_test = list(set(list_df_train_user).intersection(list_df_article))
len(liste_train_test)

95479

In [85]:
true_recommendation_embedding(liste_train_test[:100])

{157541} 45
{106886} 60
{217852} 69
{271053} 72


[{157541}, {106886}, {217852}, {271053}]

In [86]:
true_recommendation_embedding(45)

{157541} 45


[{157541}]