In [156]:
# dataframe manipulation
import numpy as np
import pandas as pd

# vectorization
from sklearn.preprocessing import LabelEncoder, StandardScaler

# cosine similarity
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import numpy as np

# system
from sys import exit as sysexit
import statistics
from datetime import datetime

In [157]:
# import original dataset
df = pd.read_csv("../datasets/KaDo.csv", sep=",")

In [158]:
# define vectorized dataframe and reindex it
vec_df = df.reindex(columns=['CLI_ID', 'MOIS_VENTE', 'PRIX_NET', 'FAMILLE', 'UNIVERS', 'MAILLE', 'LIBELLE'])
# vectorized_df = df.sort_values(by=['CLI_ID'])
# vectorized_df.cov()

In [159]:
# # normalize numerical variables
# scaler = StandardScaler()
# df[["MOIS_VENTE", "PRIX_NET"]] = scaler.fit_transform(df[["MOIS_VENTE", "PRIX_NET"]])

# vectorize text variables
encoder = LabelEncoder()
vec_df["FAMILLE"] = encoder.fit_transform(vec_df["FAMILLE"])
vec_df["UNIVERS"] = encoder.fit_transform(vec_df["UNIVERS"])
vec_df["MAILLE"] = encoder.fit_transform(vec_df["MAILLE"])

vec_df["LIBELLE"] = vec_df["LIBELLE"].apply(hash)

# normalize every variables
scaler = StandardScaler()
vec_df[["MOIS_VENTE", "PRIX_NET", "FAMILLE", "UNIVERS", "MAILLE", "LIBELLE"]] = scaler.fit_transform(vec_df[["MOIS_VENTE", "PRIX_NET", "FAMILLE", "UNIVERS", "MAILLE", "LIBELLE"]])

# remove unused columns
# vec_df.drop(['MOIS_VENTE', 'PRIX_NET', 'FAMILLE', 'UNIVERS', 'MAILLE', 'LIBELLE'], axis=1, inplace=True)

In [165]:
# export dataframe to csv
# vec_df.to_csv("../datasets/vectorized_KaDo.csv", sep=",", index=False)

In [181]:
# pairwise_distances(previous_purchases, other_purchases, metric='cosine')
# ACP (stat) (concat des colonnes entre-elles)

# cli_id = 1490281
# cli_id = 21351166
cli_id = 90822328
# cli_id = 126716008

# get the 5 most similar product depending client's previous purchases
client_purchases = vec_df[vec_df['CLI_ID'] == cli_id]
other_purchases = vec_df[vec_df['CLI_ID'] != cli_id]
similarities = cosine_similarity(client_purchases, other_purchases)
top_indices = similarities.argsort()[:, -5:]

# get input items from client purchases
product_input_label = df[df['CLI_ID'] == cli_id]['LIBELLE'].tolist()
product_input = df[df['CLI_ID'] == cli_id].values.tolist()

# format output items and similarities in a single dataframe
base_recommendation = pd.DataFrame()
similarities_percentages = []
for elem, i in zip(top_indices, range(len(top_indices))):
    base_recommendation = pd.concat([base_recommendation, df.iloc[elem]])
    top_articles = df.iloc[elem]['LIBELLE'].tolist()
    for line, n in zip(top_articles, range(len(top_articles))):
        similarities_percentages.append(str(similarities[i][top_indices[i]][n]))
base_recommendation['SIMIL'] = similarities_percentages

print(base_recommendation)

     TICKET_ID  MOIS_VENTE  PRIX_NET          FAMILLE  \
444   35473166           9     15.50       MAQUILLAGE   
166   33187009           1      1.50          HYGIENE   
252   34367874           6      3.00          HYGIENE   
482   35942197          11      4.45   SOINS DU CORPS   
86    33923246           4      3.45  SOINS DU VISAGE   
..         ...         ...       ...              ...   
449   36113648          12      1.90   SOINS DU CORPS   
545   33079331           1      5.95  SOINS DU VISAGE   
209   34083250           5      5.94       MAQUILLAGE   
312   36129395          12      1.95          HYGIENE   
88    34205098           5      2.50  SOINS DU VISAGE   

                         UNIVERS                       MAILLE  \
444             MAQ_LEV BRILLANT              MAQ_LEV_BASPRIX   
166       HYG_DOUCHE JARDINMONDE                      HYG_JDM   
252            HYG_DOUCHE HOMMES                    HYG_HOMME   
482   CORPS_SOIN PIEDS ET JAMBES  CORPS_HYDRA_NOURRI_ET

In [182]:
# client's preferences depending on its purchase history
current_month = datetime.now().month
current_month_df = df.loc[((df['CLI_ID'] == cli_id) & (df['MOIS_VENTE'] == current_month))]
# use purchases made at current time of year if available, otherwise use every purchases
input_df = current_month_df if not current_month_df.empty else df[df['CLI_ID'] == cli_id]

# print("Based on current month purchases : {}".format(not current_month_df.empty))

prefered_families = input_df["FAMILLE"].value_counts().index.to_list()
prefered_universes = input_df["UNIVERS"].value_counts().index.to_list()
prefered_mailles = input_df["MAILLE"].value_counts().index.to_list()

average_budget = round(statistics.fmean(input_df["PRIX_NET"].values), 2)

print("{}\n{}\n{}\n{}".format(
    prefered_families,
    prefered_universes,
    prefered_mailles,
    average_budget))
# print(base_recommendation['FAMILLE'])

['SOINS DU VISAGE', 'MAQUILLAGE', 'HYGIENE', 'SOINS DU CORPS']
['VIS_DEMAQ Jeunes Specifique', 'MAQ_LEV RAL Lum4', 'VIS_LOTIONS Jeunes Specifique', 'VIS_MASQUE Jeunes Specifique', 'MAQ_YEUX Crayons', 'VIS_CJOUR Jeunes Specifique', 'VIS_SOIN LEVRES', 'CORPS_HYDRA NOURRISANT', 'VIS_CJOUR AAAR', 'HYG_DOUCHE JARDINMONDE', 'HYG_DOUCHE PLAISIRNATURE', 'HYG_DOUCHE MONOI']
['VIS_JEUNE_ET_LEVRE', 'MAQ_LEV_BASPRIX', 'MAQ_YEUX_CLASSIQUE', 'CORPS_HYDRA_NOURRI_ET_SOINS', 'VIS_AAAR_HORS_DEMAQLOTION', 'HYG_JDM', 'HYG_PLAISIRNAT_BAIN_SAVON', 'HYG_MONOI_ET_EDIT_SPEC']
6.58


In [189]:
# df.loc[((df['CLI_ID'] == cli_id) & (df['MOIS_VENTE'] == current_month))]

# filter articles depending on the client's preferences
filtered_articles = base_recommendation.loc[(
    (base_recommendation['UNIVERS'].isin(prefered_families)) |
    (base_recommendation['FAMILLE'].isin(prefered_universes)) |
    (base_recommendation['MAILLE'].isin(prefered_mailles))
)]
# filter articles based on high similarity and low price
filtered_articles.sort_values(by=['SIMIL', 'PRIX_NET'])

print(filtered_articles.head(2).values.tolist())

[[35473166, 9, 15.5, 'MAQUILLAGE', 'MAQ_LEV BRILLANT', 'MAQ_LEV_BASPRIX', 'GLOSS SEXYPULP CRISTAL 08 CN3 10ML', 365845758, '1.0'], [33187009, 1, 1.5, 'HYGIENE', 'HYG_DOUCHE JARDINMONDE', 'HYG_JDM', 'CD JDM4 MACADAMIA FL 200ML', 93806295, '1.0']]
