# Constitution des fichiers "Users" et "EmbeddingMatrix"

In [1]:
import os
import pandas as pd
import pickle
from sklearn import decomposition

class ProductionStartUp():
    """Build the ``Users.csv`` and reduced embedding-matrix files.

    Attributes:
        pathClicks: folder holding the click-log CSV files; ``Users.csv`` is
            written into this folder as well.
        pathEmbedMatrix: path of the pickled article-embedding matrix, which
            ``zipEmbeddingMatrix`` overwrites in place.
        dfClicks: merged click log, set by ``mergeUsersClicks`` (``None``
            before that call).
    """

    # Files living in ``pathClicks`` that are not click logs and must never
    # be concatenated with them.
    _NON_CLICK_FILES = frozenset({"articles_metadata.csv", "Users.csv"})

    def __init__(self, pathClicks, pathEmbedMatrix):
        self.pathClicks = pathClicks
        self.pathEmbedMatrix = pathEmbedMatrix
        self.dfClicks = None

    def getArticleRef(self, sr):
        """Add ``article_ref_id`` to a user row and return the row.

        ``article_ref_id`` is the ``click_article_id`` of the first click of
        the merged log whose user and timestamp match ``sr['user_id']`` and
        ``sr['most_resent_clk']``.

        Raises:
            RuntimeError: if ``mergeUsersClicks`` has not populated
                ``self.dfClicks`` yet.
        """
        dfClicks = getattr(self, 'dfClicks', None)
        if dfClicks is None:
            raise RuntimeError(
                "getArticleRef needs the merged clicks; call mergeUsersClicks() first"
            )
        mask = (
            (dfClicks.user_id == sr['user_id'])
            & (dfClicks.click_timestamp == sr['most_resent_clk'])
        )
        sr['article_ref_id'] = dfClicks.loc[mask, 'click_article_id'].values[0]
        return sr

    def mergeUsersClicks(self):
        """Merge the click logs and write one summary row per user.

        Side effects:
            * ``clicks.csv`` (current working directory) receives the merged,
              de-duplicated click log, without an index column.
            * ``Users.csv`` (inside ``pathClicks``) receives the columns
              ``user_id``, ``nb_click_article`` (number of distinct articles
              clicked), ``most_resent_clk`` (latest click timestamp) and
              ``article_ref_id`` (article of that latest click).
            * ``self.dfClicks`` holds the merged click log.

        Raises:
            ValueError: if there is no click data at all to merge.
        """
        clicks_dir = self.pathClicks  # holds every click-log file
        merged_path = 'clicks.csv'
        merged_abs = os.path.abspath(merged_path)

        # Only genuine click logs: skip the metadata file, our own Users.csv
        # output and the merged file itself (when cwd is the clicks folder).
        clicks_path = sorted(
            os.path.join(clicks_dir, fname)
            for fname in os.listdir(clicks_dir)
            if fname.endswith(".csv")
            and fname not in self._NON_CLICK_FILES
            and os.path.abspath(os.path.join(clicks_dir, fname)) != merged_abs
        )
        print("Nombre de fichiers CSV: ", len(clicks_path))

        frames = [
            pd.read_csv(filename, index_col=None, header=0)
            for filename in clicks_path
        ]

        if os.path.exists(merged_path):
            # Add the new files to the clicks merged by a previous run.
            previous = pd.read_csv(merged_path, index_col=None, header=0)
            # Earlier versions saved the index too; drop that stray column.
            previous = previous.loc[:, ~previous.columns.str.startswith('Unnamed:')]
            frames.append(previous)

        if not frames:
            raise ValueError("no click data found in %r" % (clicks_dir,))

        # De-duplicate so that re-running over the same logs does not make
        # clicks.csv grow with repeated rows.
        clicks = (
            pd.concat(frames, axis=0, ignore_index=True)
            .drop_duplicates()
            .reset_index(drop=True)
        )
        clicks.to_csv(merged_path, index=False)
        self.dfClicks = clicks

        # Per-user aggregation.
        per_user = clicks.groupby('user_id')
        dfUsers = pd.DataFrame(
            {
                'nb_click_article': per_user['click_article_id'].nunique(),
                'most_resent_clk': per_user['click_timestamp'].max(),
            }
        )
        # idxmax gives the first row carrying each user's latest timestamp,
        # which replaces a full scan of the log for every user.
        latest = clicks.loc[per_user['click_timestamp'].idxmax()]
        dfUsers['article_ref_id'] = latest.set_index('user_id')['click_article_id']

        dfUsers = dfUsers.reset_index()
        dfUsers.to_csv(os.path.join(self.pathClicks, "Users.csv"), index=False)

    def zipEmbeddingMatrix(self):
        """Reduce the embedding matrix with PCA and overwrite the pickle.

        NOTE(review): the reduced matrix replaces the original file, so
        calling this twice reduces an already reduced matrix; keep a copy of
        the source pickle if that matters.
        """
        # pickle must only be used on trusted files; this one is a local
        # project resource.
        with open(self.pathEmbedMatrix, 'rb') as f:
            ndArtEmbed = pickle.load(f)

        print("Dimensions dataset avant réduction PCA : ", ndArtEmbed.shape)
        # Keep the components explaining 99% of the variance.
        pca = decomposition.PCA(n_components=0.99)
        ndArtEmbed_pca = pca.fit_transform(ndArtEmbed)
        print("Dimensions dataset après réduction PCA : ", ndArtEmbed_pca.shape)

        with open(self.pathEmbedMatrix, 'wb') as f:
            pickle.dump(ndArtEmbed_pca, f)

In [None]:
# Run the two preparation steps in order: merge the click logs into the
# per-user file, then shrink the pickled embedding matrix.
startUp = ProductionStartUp(
    pathClicks='../Ressources/clicks/',
    pathEmbedMatrix='../Ressources/embedMatrix.pkl',
)
startUp.mergeUsersClicks()
startUp.zipEmbeddingMatrix()

# Constitution des fichiers de la FunctionApp

In [None]:
import os

# Create the Function App folder tree in one call. makedirs builds the
# intermediate folders itself, and exist_ok=True lets the cell be re-run
# without raising FileExistsError.
os.makedirs("productStartUp/functionApp/HttpTrigger1", exist_ok=True)

In [None]:
%%writefile productStartUp/functionApp/requirements.txt
azure-functions==1.11.2
certifi==2022.5.18.1
joblib==1.1.0
numpy==1.22.4
pandas==1.4.2
python-dateutil==2.8.2
pytz==2022.1
scikit-learn==1.1.1
scipy==1.8.1
six==1.16.0
# "sklearn" is only a deprecated PyPI alias; scikit-learn above is the real package.
threadpoolctl==3.1.0
wincertstore==0.2

Writing productStartUp/functionApp/requirements.txt


In [None]:
%%writefile productStartUp/functionApp/local.settings.json
{
  "IsEncrypted": false,
  "Values": {
    "AzureWebJobsStorage": "DefaultEndpointsProtocol=https;AccountName=bookshelfrbe2;AccountKey=<STORAGE_ACCOUNT_KEY>;EndpointSuffix=core.windows.net",
    "FUNCTIONS_WORKER_RUNTIME": "python"
  }
}

Writing productStartUp/functionApp/local.settings.json


In [None]:
%%writefile productStartUp/functionApp/HttpTrigger1/function.json
{
  "scriptFile": "__init__.py",
  "bindings": [
    {
      "authLevel": "anonymous",
      "type": "httpTrigger",
      "direction": "in",
      "name": "req",
      "methods": [
        "get",
        "post"
      ]
    },
    {
      "type": "http",
      "direction": "out",
      "name": "$return"
    },
    {
      "type": "blob",
      "direction": "in",
      "name": "users",
      "path": "azure-webjobs-hosts/locks/bookshelfrbe/Users.csv",
      "connection": "AzureWebJobsStorage"
    },
    {
      "type": "blob",
      "direction": "in",
      "name": "embedMatrix",
      "dataType": "binary",
      "path": "azure-webjobs-hosts/locks/bookshelfrbe/embedMatrix.pkl",
      "connection": "AzureWebJobsStorage"
    }
  ]
}

Writing productStartUp/functionApp/HttpTrigger1/function.json


In [None]:
%%writefile productStartUp/functionApp/HttpTrigger1/__init__.py
import azure.functions as func
import os
import tempfile

import json

import numpy as np
import pandas as pd
from operator import itemgetter
import pickle

from io import BytesIO

def find_top_n_indices(data, top=5):
    """Return the positions of the ``top`` largest values of ``data``.

    Positions are ordered from the largest value to the smallest; equal
    values keep their original relative order.
    """
    ranked = list(enumerate(data))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return [position for position, _ in ranked[:top]]

def recommendFromArticle(article_id, top, embeddings=None):
    """Return the ids of the ``top`` articles most similar to ``article_id``.

    Similarity is the cosine similarity between rows of the embedding
    matrix; an article id is its row index. The reference article itself is
    never part of the result.

    Args:
        article_id: row index of the reference article.
        top: maximum number of article ids to return.
        embeddings: 2-D embedding matrix; defaults to the module-level
            ``ndArtEmbed`` set by ``main``.

    Returns:
        A list of plain ``int`` article ids, most similar first.

    Raises:
        IndexError: if ``article_id`` is not a valid row of the matrix.
    """
    if embeddings is None:
        embeddings = ndArtEmbed
    matrix = np.asarray(embeddings, dtype=float)

    ref = int(article_id)
    if not 0 <= ref < len(matrix):
        raise IndexError("article_id %r is outside the embedding matrix" % (article_id,))

    norms = np.linalg.norm(matrix, axis=1)
    dots = matrix @ matrix[ref]
    denom = norms * norms[ref]
    # A zero-norm vector has no direction: score it 0 instead of NaN.
    scores = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)

    # Rank on the full score vector so positions stay real article ids, and
    # only then drop the reference article. Removing it before ranking would
    # shift the index of every article stored after it.
    order = np.argsort(-scores, kind="stable")
    return [int(idx) for idx in order if idx != ref][:top]

def main(req: func.HttpRequest,
        users: func.InputStream,
        embedMatrix: func.InputStream) -> func.HttpResponse:
    """HTTP entry point: recommend five articles for a user.

    The user id is read from the ``userId`` field of the JSON body or, when
    there is no usable body (e.g. a GET request), from the ``userId`` query
    parameter.

    Args:
        req: incoming HTTP request.
        users: blob input holding the users CSV (``user_id`` and
            ``article_ref_id`` columns are read).
        embedMatrix: blob input holding the pickled embedding matrix.

    Returns:
        A JSON response: the list of recommended article ids on success, or
        an ``{"error": ...}`` object with status 400 (missing/invalid
        ``userId``) or 404 (user not found).
    """
    global ndArtEmbed

    headers = {"Content-Type": "application/json"}

    def _error(message, status_code):
        # Small helper so every failure path answers with the same JSON shape.
        return func.HttpResponse(
            json.dumps({"error": message})
            ,status_code=status_code
            ,headers=headers
        )

    # get_json raises ValueError when the body is missing or is not JSON.
    try:
        payload = req.get_json()
    except ValueError:
        payload = None

    rawUserId = payload.get('userId') if isinstance(payload, dict) else None
    if rawUserId is None:
        rawUserId = req.params.get('userId')

    try:
        userId = int(rawUserId)
    except (TypeError, ValueError):
        return _error("'userId' is required and must be an integer", 400)

    # Load the users DataFrame.
    dfUsers = pd.read_csv(BytesIO(users.read()), index_col=False)

    # Look up the user's reference article.
    refArticles = dfUsers.loc[dfUsers.user_id == userId, 'article_ref_id']
    if refArticles.empty:
        return _error("unknown user %d" % userId, 404)

    # Load the embedding matrix straight from the blob bytes; no temporary
    # file is needed. NOTE: pickle must only be used on trusted data; this
    # blob comes from the app's own storage account.
    ndArtEmbed = pickle.loads(embedMatrix.read())

    lReco = recommendFromArticle(refArticles.values[0], 5)

    return func.HttpResponse(
        json.dumps(lReco)
        ,headers=headers
    )


Writing productStartUp/functionApp/HttpTrigger1/__init__.py


# Paramétrage de l'application BookShelf

In [None]:
%%writefile config.json
{
  "API_URL": "https://bookshelfrbe.azurewebsites.net/api/httptrigger1"
}