# Constitution des fichiers "Users" et "EmbeddingMatrix"

In [2]:
import os
import pandas as pd
import pickle
from sklearn import decomposition

class ProductionStartUp():

    def __init__(self, pathClicks, pathEmbedMatrix):
        self.pathClicks = pathClicks
        self.pathEmbedMatrix = pathEmbedMatrix

    def getArticleRef(self, sr):
        sr['article_ref_id'] = dfClicks[dfClicks.user_id==sr['user_id']].loc[dfClicks.click_timestamp==sr['most_resent_clk'],['click_article_id']].values[0][0]
        return sr

    def mergeUsersClicks(self):
        clicks_path = []
        clicks_dir = self.pathClicks# contient tous les fichiers de log

        clicks_path = clicks_path + sorted(
                [
                    os.path.join(clicks_dir, fname)
                    for fname in os.listdir(clicks_dir)
                    if fname.endswith(".csv")
                ]
            )
        print("Nombre de fichiers CSV: ", len(clicks_path))

        _li = []
        for filename in clicks_path:
            df = pd.read_csv(filename, index_col=None, header=0)
            _li.append(df)

        if not os.path.exists('clicks.csv'):
            clicks = pd.concat(_li, axis=0, ignore_index=True)# concaténation de tout les fichiers trouvés
            clicks.to_csv('clicks.csv')
        else:
            # dans ce cas il faut ajouter les nouveaux fichiers aux existants
            df = pd.read_csv('clicks.csv', index_col=None, header=0)
            _li.append(df)
            clicks = pd.concat(_li, axis=0, ignore_index=True)# concaténation de tout les fichiers trouvés
            clicks.to_csv('clicks.csv')

        # agrégation des dataSets
        dfArtMeta = pd.read_csv(os.path.join(self.pathClicks, "articles_metadata.csv"))

        srClicksUniq = clicks.groupby('user_id')['click_article_id'].agg('nunique')
        dfUsers = pd.DataFrame(
            {
                'user_id': srClicksUniq.index.values
                ,'nb_click_article': srClicksUniq.values
                ,'most_resent_clk': clicks.groupby('user_id')['click_timestamp'].agg('max')
            }
        )

        dfUsers = dfUsers.apply(self.getArticleRef, axis=1)
        dfUsers.to_csv(os.path.join(self.pathClicks, "Users.csv"))


    def zipEmbeddingMatrix(self):

        with open(pathEmbedMatrix,'rb') as f:
            ndArtEmbed_pca = pickle.load(f)

        print("Dimensions dataset avant réduction PCA : ", ndArtEmbed.shape)
        pca = decomposition.PCA(n_components=0.99)# nous prenons soins de garder 99% des informations "utiles"
        ndArtEmbed_pca= pca.fit_transform(ndArtEmbed)
        print("Dimensions dataset après réduction PCA : ", ndArtEmbed_pca.shape)
        
        with open(pathEmbedMatrix,'wb') as f:
            pickle.dump(ndArtEmbed_pca,f)

In [None]:
startUp = ProductionStartUp('../Ressources/clicks/', '../Ressources/embedMatrix.pkl')
startUp.mergeUsersClicks()
startUp.zipEmbeddingMatrix()

# Constitution des fichiers de la FunctionApp

In [None]:
import os

os.makedirs("productStartUp")
os.makedirs("productStartUp/functionApp")
os.makedirs("productStartUp/functionApp/HttpTrigger1")

In [None]:
%%writefile productStartUp/functionApp/requirements.txt
azure-functions==1.11.2
certifi==2022.5.18.1
joblib==1.1.0
numpy==1.22.4
pandas==1.4.2
python-dateutil==2.8.2
pytz==2022.1
scikit-learn==1.1.1
scipy==1.8.1
six==1.16.0
sklearn==0.0
threadpoolctl==3.1.0
wincertstore==0.2

Writing productStartUp/functionApp/requirements.txt


In [None]:
%%writefile productStartUp/functionApp/local.settings.json
{
  "IsEncrypted": false,
  "Values": {
    "AzureWebJobsStorage": "DefaultEndpointsProtocol=https;AccountName=bookshelfrbe2;AccountKey=8FdjkQDCRNgsisEag1CqgqNGUozCyIhygfVhyqZk5y77TeZ0908T8pG8DUfoYNk3EaVpAE4LuaAT+AStwZhwsw==;EndpointSuffix=core.windows.net",
    "FUNCTIONS_WORKER_RUNTIME": "python"
  }
}

Writing productStartUp/functionApp/local.settings.json


In [None]:
%%writefile productStartUp/functionApp/HttpTrigger1/function.json
{
  "scriptFile": "__init__.py",
  "bindings": [
    {
      "authLevel": "anonymous",
      "type": "httpTrigger",
      "direction": "in",
      "name": "req",
      "methods": [
        "get",
        "post"
      ]
    },
    {
      "type": "http",
      "direction": "out",
      "name": "$return"
    },
    {
      "type": "blob",
      "direction": "in",
      "name": "users",
      "path": "azure-webjobs-hosts/locks/bookshelflastversion/Users.csv",
      "connection": "AzureWebJobsStorage"
    },
    {
      "type": "blob",
      "direction": "in",
      "name": "embedMatrix",
      "dataType": "binary",
      "path": "azure-webjobs-hosts/locks/bookshelflastversion/embedMatrix.pkl",
      "connection": "AzureWebJobsStorage"
    }
  ]
}

Writing productStartUp/functionApp/HttpTrigger1/function.json


In [None]:
%%writefile productStartUp/functionApp/HttpTrigger1/__init__.py
import azure.functions as func
import os
import tempfile

import json

import numpy as np
import pandas as pd
from operator import itemgetter
import pickle

from io import BytesIO

def find_top_n_indices(data, top=5):
    indexed = enumerate(data)
    sorted_data = sorted(indexed, 
                         key=itemgetter(1), 
                         reverse=True) 
    return [d[0] for d in sorted_data[:top]]

def recommendFromArticle(article_id, top):
    score = []
    for i in range(0, len(ndArtEmbed)):
        if(article_id != i):
            cos_sim = np.dot(ndArtEmbed[article_id], ndArtEmbed[i])/(np.linalg.norm(ndArtEmbed[article_id])*np.linalg.norm(ndArtEmbed[i]))
            score.append(cos_sim)

    _best_scores = find_top_n_indices(score, top)
            
    return _best_scores

def main(req: func.HttpRequest,
        users: func.InputStream,
        embedMatrix: func.InputStream) -> func.HttpResponse:

    global ndArtEmbed

    # Chargement de la dataFrame utilisateurs
    bUsers = users.read()
    dfUsers = pd.read_csv(BytesIO(bUsers), index_col=False)

    # chargement de la matrice d'embedding
    temp_path = tempfile.gettempdir()
    file_name = os.path.join(temp_path, "embedMatrix.pkl")
    with open(file_name, "w+b") as local_file:
        local_file.write(embedMatrix.read())

    with open(local_file.name,'rb') as f:
        ndArtEmbed = pickle.load(f)

    jsonIdUser = req.get_json()

    # récupération de l'id de l'article de référence de l'utilisateur
    articleRef = dfUsers.loc[dfUsers.user_id==jsonIdUser['userId'],['article_ref_id']].values[0,0]
    lReco = recommendFromArticle(articleRef, 5)

    headers = {"Content-Type": "application/json"}
    return func.HttpResponse(
        json.dumps(lReco)
        ,headers=headers
    )


Writing productStartUp/functionApp/HttpTrigger1/__init__.py


# Paramétrage de l'application BookShelf

In [None]:
%%writefile config.json
{
  "API_URL": "https://bookshelflastversion.azurewebsites.net/api/httptrigger1"
}

# Pour aller plus loin et test du Collaborative-Filtering

In [None]:
# contitution DF utilisateur
dfArtMeta = pd.read_csv("../Ressources/articles_metadata.csv")
dfClicks = pd.read_csv('clicks.csv', index_col=False)

In [None]:
dfClics = dfClicks[['user_id','click_article_id','click_timestamp']]
dfClicsAndMeta = dfClics.merge(dfArtMeta[['article_id','category_id']], left_on='click_article_id', right_on='article_id')
dfClicsAndMeta

Unnamed: 0,user_id,click_article_id,click_timestamp,article_id,category_id
0,0,157541,1506826828020,157541,281
1,20,157541,1506836548634,157541,281
2,44,157541,1506857278141,157541,281
3,45,157541,1506827309970,157541,281
4,76,157541,1506828823469,157541,281
...,...,...,...,...,...
2988176,195186,2221,1508210469562,2221,1
2988177,75658,271117,1508210951703,271117,399
2988178,217129,20204,1508210990810,20204,9
2988179,217129,70196,1508211020810,70196,136


In [None]:
dfCatego = dfClicsAndMeta.groupby(['user_id', 'category_id']).size().to_frame().reset_index()

In [None]:
def switchCategoRef(sr):
    grCatego = grDfArtByCatego.get_group(sr['category_id'])
    dfArticleClics = grCatego[grCatego.user_id == sr['user_id'] ][['click_timestamp','article_id']]
    tsLastClic = dfArticleClics.click_timestamp.max()

    sr['article_id'] = dfArticleClics.loc[dfArticleClics.click_timestamp==tsLastClic]['article_id'].values[0]
    return sr


In [None]:
dfCatego.rename(columns = {0:'rate'}, inplace = True)
dfCatego = dfCatego.loc[dfCatego.rate > 1]

In [None]:
grDfArtByCatego = dfClicsAndMeta.groupby('category_id')
dfCatego = dfCatego.apply(switchCategoRef, axis=1)
# 41 min

Enregistrement et chargement

In [None]:
dfCatego.to_csv("collabFilteringMatrix.csv")

In [3]:
dfCatego = pd.read_csv("collabFilteringMatrix.csv")
dfCatego

Unnamed: 0.1,Unnamed: 0,user_id,category_id,rate,article_id
0,1,0,186,2,87205
1,3,0,281,2,160158
2,9,1,281,2,156723
3,12,1,375,2,234481
4,13,1,412,2,283392
...,...,...,...,...,...
503611,1882258,322874,228,2,107190
503612,1882261,322876,412,2,285300
503613,1882266,322879,281,2,158331
503614,1882277,322884,340,2,211455


## Implémentation de Surprise

In [4]:
from surprise import SVD, accuracy
from surprise import Reader, Dataset
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

In [10]:
reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(dfCatego[['user_id', 'article_id', 'rate']], reader)

print(f'Nous avons sélectionné {len(dfCatego)} interactions.')

Nous avons sélectionné 503616 interactions.


In [11]:
%%time
trainset, testset = train_test_split(data, test_size=0.25)
print('Taille du set de test :', len(testset))
print("Taille du set d'entrainement :", len(dfCatego) - len(testset))

Taille du set de test : 125904
Taille du set d'entrainement : 377712
Wall time: 1.01 s


In [7]:
%%time
algo = SVD()
algo.fit(trainset)

Wall time: 15.5 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x14770633a00>

In [8]:
%%time
predictions = algo.test(testset)
print("Nombre de prédictions dans l'ensemble de test :", len(predictions))

Nombre de prédictions dans l'ensemble de test : 125904
Wall time: 1.09 s


In [9]:
accuracy.rmse(predictions)

RMSE: 2.8109


2.810854409632504

Autre point de vue:

In [None]:
reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(dfCatego[['user_id', 'article_id', 'rate']], reader)
print(f'Nous avons sélectionnés {len(dfCatego)} interactions.')

Nous avons sélectionné 503616 interactions.


In [None]:
trainset = data.build_full_trainset()

In [None]:
%%time
algo = SVD()
algo.fit(trainset)

Wall time: 18.1 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20f0eae63a0>

In [None]:
res = algo.predict('3','31005',verbose=True)
res

user: 3          item: 31005      r_ui = None   est = 3.20   {'was_impossible': False}


Prediction(uid='3', iid='31005', r_ui=None, est=3.1958774145380606, details={'was_impossible': False})

In [None]:
from collections import defaultdict

In [None]:
top_n = defaultdict(list)
for iIdPnt in lArticleId:
    uId, iId, r_ui, esti, _ = algo.predict(str(3),str(iIdPnt))
    top_n[uId].append((iId, esti))

In [None]:
for uid, user_ratings in top_n.items():
    user_ratings.sort(key=lambda x: x[1], reverse=True)