# Imports

In [1]:
# General Imports
import numpy as np
import pandas as pd
import pickle
import os

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer

# PCA
from sklearn.decomposition import PCA
from matplotlib.patches import Circle

import warnings
warnings.simplefilter(action = 'ignore')

# Sommaire
#!pip install jyquickhelper
from jyquickhelper import add_notebook_menu # attention, ne prend que les titres de niveau 2
add_notebook_menu()

In [2]:
# Configure how pandas renders DataFrames in cell outputs
# (maximum rows / columns shown and number of decimals).
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 5)

In [3]:
# à run pour colab uniquement
#from google.colab import drive
#drive.mount('/content/drive')

# Import des données

## Import

In [4]:
# Load the dataset

# Directory holding the data file (edit it when running locally)
#path = '/content/drive/MyDrive/OpenClassrooms/Projet5/' #-> for Colab
path = '' #-> local run

filepath = os.path.join(path, 'df')

# NOTE(review): unpickling runs whatever code the file contains, so only
# load a 'df' file that comes from a trusted source.
with open(filepath, mode = 'rb') as pickle_file:
    df_final = pickle.load(pickle_file)

df_final.head()

Unnamed: 0,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,payment_sequential,payment_installments,payment_value,review_score,review_time,order_time,delivery_error_time,order_preparation_time,shipping_limit_respect,purchase_month,purchase_dayofweek,purchase_day,purchase_hour,full_written_review,written_review_title,distance,order_status,payment_type,product_category
0,41.0,1141.0,1.0,8683.0,1.0,2.0,146.87,4.0,4.94074,8.8125,-10.55862,6.80955,-0.80955,5.0,1.0,16.0,15.0,0.0,0.0,228.79538,delivered,credit_card,Furniture and Home Decor
1,43.0,1002.0,3.0,10150.0,1.0,8.0,335.48,5.0,11.94686,16.66175,-7.47131,2.84476,3.15524,1.0,4.0,12.0,20.0,0.0,0.0,302.71858,delivered,credit_card,Furniture and Home Decor
2,55.0,955.0,1.0,8267.0,1.0,7.0,157.73,5.0,0.50763,26.07715,1.7492,21.92488,-5.92488,5.0,5.0,19.0,16.0,0.0,0.0,29.14657,delivered,credit_card,Furniture and Home Decor
3,48.0,1066.0,1.0,12160.0,1.0,1.0,173.3,5.0,4.77554,14.99846,-12.33027,14.24541,-0.28572,3.0,1.0,13.0,16.0,0.0,0.0,18.11744,delivered,credit_card,Furniture and Home Decor
4,61.0,407.0,1.0,5200.0,1.0,8.0,252.25,5.0,7.08324,11.46132,-5.12792,1.2124,0.7876,7.0,6.0,29.0,9.0,1.0,1.0,187.4596,delivered,credit_card,


# Jeu de données complet

## Preprocessing

In [5]:
df_final.columns

Index(['product_name_lenght', 'product_description_lenght',
       'product_photos_qty', 'product_weight_g', 'payment_sequential',
       'payment_installments', 'payment_value', 'review_score', 'review_time',
       'order_time', 'delivery_error_time', 'order_preparation_time',
       'shipping_limit_respect', 'purchase_month', 'purchase_dayofweek',
       'purchase_day', 'purchase_hour', 'full_written_review',
       'written_review_title', 'distance', 'order_status', 'payment_type',
       'product_category'],
      dtype='object')

In [6]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 116467 entries, 0 to 117738
Data columns (total 23 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   product_name_lenght         116467 non-null  float64
 1   product_description_lenght  116467 non-null  float64
 2   product_photos_qty          116467 non-null  float64
 3   product_weight_g            116467 non-null  float64
 4   payment_sequential          116467 non-null  float64
 5   payment_installments        116467 non-null  float64
 6   payment_value               116467 non-null  float64
 7   review_score                116467 non-null  float64
 8   review_time                 116467 non-null  float64
 9   order_time                  116467 non-null  float64
 10  delivery_error_time         116467 non-null  float64
 11  order_preparation_time      116467 non-null  float64
 12  shipping_limit_respect      116467 non-null  float64
 13  purchase_month

In [7]:
df_final

Unnamed: 0,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,payment_sequential,payment_installments,payment_value,review_score,review_time,order_time,delivery_error_time,order_preparation_time,shipping_limit_respect,purchase_month,purchase_dayofweek,purchase_day,purchase_hour,full_written_review,written_review_title,distance,order_status,payment_type,product_category
0,41.0,1141.0,1.0,8683.0,1.0,2.0,146.87,4.0,4.94074,8.81250,-10.55862,6.80955,-0.80955,5.0,1.0,16.0,15.0,0.0,0.0,228.79538,delivered,credit_card,Furniture and Home Decor
1,43.0,1002.0,3.0,10150.0,1.0,8.0,335.48,5.0,11.94686,16.66175,-7.47131,2.84476,3.15524,1.0,4.0,12.0,20.0,0.0,0.0,302.71858,delivered,credit_card,Furniture and Home Decor
2,55.0,955.0,1.0,8267.0,1.0,7.0,157.73,5.0,0.50763,26.07715,1.74920,21.92488,-5.92488,5.0,5.0,19.0,16.0,0.0,0.0,29.14657,delivered,credit_card,Furniture and Home Decor
3,48.0,1066.0,1.0,12160.0,1.0,1.0,173.30,5.0,4.77554,14.99846,-12.33027,14.24541,-0.28572,3.0,1.0,13.0,16.0,0.0,0.0,18.11744,delivered,credit_card,Furniture and Home Decor
4,61.0,407.0,1.0,5200.0,1.0,8.0,252.25,5.0,7.08324,11.46132,-5.12792,1.21240,0.78760,7.0,6.0,29.0,9.0,1.0,1.0,187.45960,delivered,credit_card,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117734,51.0,489.0,2.0,611.0,1.0,6.0,88.78,4.0,14.46880,6.17940,-11.16207,3.41656,1.58344,4.0,5.0,7.0,15.0,0.0,1.0,329.23572,delivered,boleto,Leisure and Miscellaneous
117735,51.0,1193.0,1.0,1211.0,1.0,3.0,129.06,5.0,4.44172,7.44054,-8.21198,1.42179,4.57821,4.0,2.0,4.0,8.0,0.0,0.0,188.57356,delivered,credit_card,Leisure and Miscellaneous
117736,60.0,575.0,1.0,870.0,1.0,5.0,56.04,1.0,4.07022,30.95237,7.79392,0.89044,3.10956,4.0,6.0,8.0,20.0,1.0,1.0,1662.21122,delivered,credit_card,Leisure and Miscellaneous
117737,59.0,452.0,1.0,710.0,1.0,2.0,711.07,5.0,2.98215,12.95146,-18.16760,2.87038,3.11887,11.0,4.0,3.0,21.0,0.0,0.0,628.26583,delivered,credit_card,Electronics and Technology


In [8]:
df_quali = df_final.select_dtypes(include = 'object')
df_quali.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 116467 entries, 0 to 117738
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   order_status      116467 non-null  object
 1   payment_type      116464 non-null  object
 2   product_category  116467 non-null  object
dtypes: object(3)
memory usage: 3.6+ MB


In [9]:
########## ORDINAL ENCODER ##########

# OrdinalEncoder can encode several columns in one call;
# here only 'order_status' needs it.
order_status_column = df_quali[['order_status']]
enc = OrdinalEncoder().fit(order_status_column)
arr_ordinal = enc.transform(order_status_column)

# Categories learnt by the encoder (one array per encoded column)
enc.categories_

[array(['approved', 'delivered', 'invoiced', 'processing', 'shipped',
        'unavailable'], dtype=object)]

In [10]:
# Mapping des catégories :
[dict(enumerate(mapping)) for mapping in enc.categories_]

[{0: 'approved',
  1: 'delivered',
  2: 'invoiced',
  3: 'processing',
  4: 'shipped',
  5: 'unavailable'}]

In [11]:
arr_ordinal

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]])

In [12]:
arr_ordinal = arr_ordinal.reshape(-1, )

In [13]:
# Recréer un dataframe avec les données encodées
quali_order = pd.DataFrame()
quali_order['order_status'] = arr_ordinal.tolist()

In [14]:
quali_order

Unnamed: 0,order_status
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
116462,1.0
116463,1.0
116464,1.0
116465,1.0


In [15]:
########### LABEL ENCODER ##########
df_quali[['payment_type']]

Unnamed: 0,payment_type
0,credit_card
1,credit_card
2,credit_card
3,credit_card
4,credit_card
...,...
117734,boleto
117735,credit_card
117736,credit_card
117737,credit_card


In [16]:
le = LabelEncoder()

# LabelEncoder works on a 1-D array of labels, so columns are encoded one at
# a time and the column is passed as a Series. Passing a one-column DataFrame
# only works through an implicit flattening that raises a
# DataConversionWarning (hidden here by the global warnings filter).
le.fit(df_quali['payment_type'])
le.classes_

array(['boleto', 'credit_card', 'debit_card', 'voucher', nan],
      dtype=object)

In [17]:
# Mapping des catégories
dict(zip(le.classes_, le.transform(le.classes_)))

{'boleto': 0, 'credit_card': 1, 'debit_card': 2, 'voucher': 3, nan: 4}

In [18]:
arr_label_payment = le.transform(df_quali[['payment_type']])
arr_label_payment

array([1, 1, 1, ..., 1, 1, 0])

In [19]:
# Recréer un dataframe avec les données encodées

quali_payment = pd.DataFrame()
quali_payment[ 'payment_type'] = arr_label_payment

In [20]:
le = LabelEncoder()
# encoder variables une à une
le.fit(df_quali[['product_category']])
le.classes_

array(['', 'Electronics and Technology', 'Fashion and Accessories',
       'Furniture and Home Decor', 'Leisure and Miscellaneous'],
      dtype=object)

In [21]:
# Mapping des catégories
dict(zip(le.classes_, le.transform(le.classes_)))

{'': 0,
 'Electronics and Technology': 1,
 'Fashion and Accessories': 2,
 'Furniture and Home Decor': 3,
 'Leisure and Miscellaneous': 4}

In [22]:
arr_label_product = le.transform(df_quali[['product_category']])
arr_label_product

array([3, 3, 3, ..., 4, 1, 3])

In [23]:
# Recréer un dataframe avec les données encodées

quali_product = pd.DataFrame()
quali_product['product_category'] = arr_label_product

In [24]:
"""
########## ONE HOT ENCODER ##########

# Visualiser df simplifié
print("----------------------------------------------------")
print('df simplifié')
print("----------------------------------------------------")
display(df_quali.head())

# OneHotEncoder
enc = OneHotEncoder(categories = 'auto')

for col in df_quali.columns:
  # Convertir type des colonnes en categories
  df_quali[col] = df_quali[col].astype('category')

  # Leur assigner des valeurs numériques et créer nouvelles colonnes
  df_quali[col] = df_quali[col].cat.codes

# Appliquer l'encoding
feature_arr = enc.fit_transform(df_quali).toarray()

feature_labels = enc.get_feature_names_out(df_quali.columns)

# Recréer un dataframe avec les données encodées
features = pd.DataFrame(feature_arr, columns = feature_labels)
print("----------------------------------------------------")
print('df encodé')
print("----------------------------------------------------")
display(features.head())"""

'\n########## ONE HOT ENCODER ##########\n\n# Visualiser df simplifié\nprint("----------------------------------------------------")\nprint(\'df simplifié\')\nprint("----------------------------------------------------")\ndisplay(df_quali.head())\n\n# OneHotEncoder\nenc = OneHotEncoder(categories = \'auto\')\n\nfor col in df_quali.columns:\n  # Convertir type des colonnes en categories\n  df_quali[col] = df_quali[col].astype(\'category\')\n\n  # Leur assigner des valeurs numériques et créer nouvelles colonnes\n  df_quali[col] = df_quali[col].cat.codes\n\n# Appliquer l\'encoding\nfeature_arr = enc.fit_transform(df_quali).toarray()\n\nfeature_labels = enc.get_feature_names_out(df_quali.columns)\n\n# Recréer un dataframe avec les données encodées\nfeatures = pd.DataFrame(feature_arr, columns = feature_labels)\nprint("----------------------------------------------------")\nprint(\'df encodé\')\nprint("----------------------------------------------------")\ndisplay(features.head())'

In [25]:
########## CONCATENATION ##########
# Concatenate the encoded qualitative columns with the quantitative columns.

# reset_index() is called without drop=True, so each frame gains an 'index'
# column holding its previous index; those columns are removed in the next cell.
# NOTE(review): this cell reassigns its own inputs, so running it a second
# time on the same kernel adds further index columns - restart before re-running.
quali_product = quali_product.reset_index()
quali_payment = quali_payment.reset_index()
quali_order = quali_order.reset_index()

#features = features.reset_index()
# Numeric columns of the original frame; its index is reset as well so that
# the column-wise concat below aligns the four frames on the same 0..n-1 index.
df_quanti = df_final.select_dtypes(include = ['int', 'float'])
df_quanti = df_quanti.reset_index()

# axis = 1 -> frames are placed side by side (columns are appended)
df = pd.concat([quali_product, quali_payment, quali_order, df_quanti], axis = 1)
df.head()

Unnamed: 0,index,product_category,index.1,payment_type,index.2,order_status,index.3,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,payment_sequential,payment_installments,payment_value,review_score,review_time,order_time,delivery_error_time,order_preparation_time,shipping_limit_respect,purchase_month,purchase_dayofweek,purchase_day,purchase_hour,full_written_review,written_review_title,distance
0,0,3,0,1,0,1.0,0,41.0,1141.0,1.0,8683.0,1.0,2.0,146.87,4.0,4.94074,8.8125,-10.55862,6.80955,-0.80955,5.0,1.0,16.0,15.0,0.0,0.0,228.79538
1,1,3,1,1,1,1.0,1,43.0,1002.0,3.0,10150.0,1.0,8.0,335.48,5.0,11.94686,16.66175,-7.47131,2.84476,3.15524,1.0,4.0,12.0,20.0,0.0,0.0,302.71858
2,2,3,2,1,2,1.0,2,55.0,955.0,1.0,8267.0,1.0,7.0,157.73,5.0,0.50763,26.07715,1.7492,21.92488,-5.92488,5.0,5.0,19.0,16.0,0.0,0.0,29.14657
3,3,3,3,1,3,1.0,3,48.0,1066.0,1.0,12160.0,1.0,1.0,173.3,5.0,4.77554,14.99846,-12.33027,14.24541,-0.28572,3.0,1.0,13.0,16.0,0.0,0.0,18.11744
4,4,0,4,1,4,1.0,4,61.0,407.0,1.0,5200.0,1.0,8.0,252.25,5.0,7.08324,11.46132,-5.12792,1.2124,0.7876,7.0,6.0,29.0,9.0,1.0,1.0,187.4596


In [26]:
# Remove every leftover 'index' column created by the reset_index() calls
# (the label is duplicated after the concat, so all of them are dropped at once).
df.drop(columns = ['index'], inplace = True)

In [27]:
df

Unnamed: 0,product_category,payment_type,order_status,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,payment_sequential,payment_installments,payment_value,review_score,review_time,order_time,delivery_error_time,order_preparation_time,shipping_limit_respect,purchase_month,purchase_dayofweek,purchase_day,purchase_hour,full_written_review,written_review_title,distance
0,3,1,1.0,41.0,1141.0,1.0,8683.0,1.0,2.0,146.87,4.0,4.94074,8.81250,-10.55862,6.80955,-0.80955,5.0,1.0,16.0,15.0,0.0,0.0,228.79538
1,3,1,1.0,43.0,1002.0,3.0,10150.0,1.0,8.0,335.48,5.0,11.94686,16.66175,-7.47131,2.84476,3.15524,1.0,4.0,12.0,20.0,0.0,0.0,302.71858
2,3,1,1.0,55.0,955.0,1.0,8267.0,1.0,7.0,157.73,5.0,0.50763,26.07715,1.74920,21.92488,-5.92488,5.0,5.0,19.0,16.0,0.0,0.0,29.14657
3,3,1,1.0,48.0,1066.0,1.0,12160.0,1.0,1.0,173.30,5.0,4.77554,14.99846,-12.33027,14.24541,-0.28572,3.0,1.0,13.0,16.0,0.0,0.0,18.11744
4,0,1,1.0,61.0,407.0,1.0,5200.0,1.0,8.0,252.25,5.0,7.08324,11.46132,-5.12792,1.21240,0.78760,7.0,6.0,29.0,9.0,1.0,1.0,187.45960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116462,4,0,1.0,51.0,489.0,2.0,611.0,1.0,6.0,88.78,4.0,14.46880,6.17940,-11.16207,3.41656,1.58344,4.0,5.0,7.0,15.0,0.0,1.0,329.23572
116463,4,1,1.0,51.0,1193.0,1.0,1211.0,1.0,3.0,129.06,5.0,4.44172,7.44054,-8.21198,1.42179,4.57821,4.0,2.0,4.0,8.0,0.0,0.0,188.57356
116464,4,1,1.0,60.0,575.0,1.0,870.0,1.0,5.0,56.04,1.0,4.07022,30.95237,7.79392,0.89044,3.10956,4.0,6.0,8.0,20.0,1.0,1.0,1662.21122
116465,1,1,1.0,59.0,452.0,1.0,710.0,1.0,2.0,711.07,5.0,2.98215,12.95146,-18.16760,2.87038,3.11887,11.0,4.0,3.0,21.0,0.0,0.0,628.26583


In [28]:
# Vérifier qu'il ne reste pas de NaN
df.isna().sum().sum()

0

In [29]:
# Standardisation (open question: should it be moved before the encoding step?)

scaler = StandardScaler()
scaled_values = scaler.fit(df).transform(df) # takes a little while
df_std = pd.DataFrame(scaled_values, columns = df.columns)

In [30]:
# Question -> utiliser MinMaxScaler à la place ?

In [31]:
df_std.shape

(116467, 23)

## Kmeans

In [32]:
# K Means Clustering
# = algo itératif qui fonctionne en 2 étapes :

# 1) on affecte les points du dataset au centroïde le plus proche
# 2) puis on calcule la moyenne de chaque cluster et on déplace chaque centroïde au centre de son cluster

# et ainsi de suite jusqu'à ce que les centroïdes convergent à une position d'équilibre

# selon la position initiale des centroïdes, l'algo k-means peut donner de mauvais clusters (converger vers de mauvaises positions)
# pour éviter ce pb, on execute algo plusieurs fois en modifiant les positions de départ/la position initiale du centroïde

# pour chaque résultat donné, on mesure la distance entre les points d'un cluster et le centre de ce dernier et on retient la solution pour laquelle la somme de ces distances
# est la plus petite

# en gros, cherche la position des centres qui minimise la distance entre les points d'un cluster et le centre de ce dernier
# équivaut plus ou moins à minimiser la variance des clusters

### K optimal

In [33]:
# choix nb cluster -> voir méthodes elbow et coefficients de silhouette

In [None]:
%%time
# Elbow method
# -> detecter une zone de coude dans la minimisation du coût (inertia)

inertia = []
K_range = range(1, 20)
for k in K_range:
    model = KMeans(n_clusters = k).fit(df_std)
    inertia.append(model.inertia_)

# loooooooooong

In [None]:
# plot le coude
plt.plot(K_range, inertia)
plt.xlabel('nb de clusters')
plt.ylabel('coût du modèle')

In [None]:
# https://www.kaggle.com/code/mesofianeyou/customer-segmentation-with-k-means

In [None]:
%%time
# autre méthode
#!pip install yellowbrick

# Instantiate the clustering model and visualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k = (1, 20), size = (1080, 500)) # on teste de k = 1 à k = 20 - pas pertinent d'aller au dessus dans notre cas

visualizer.fit(df_std)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure

In [None]:
# extraire le nombre de clusters optimal (même si pas franchement un joli coude)
k = visualizer.elbow_value_
print(f'Nombre optimal de clusters : k = {k}')

In [None]:
%%time
# attention, long

# metric par défaut du KElbowVisualizer = distortion
# comparer le résultat obtenu avec d'autres métriques d'évaluation

# Elbow method with differents metrics
metrics = ['distortion', 'silhouette', 'calinski_harabasz']
i = 0

#fig, axes = plt.subplots(nrows = 1, ncols = 3, sharex = False, sharey = False, figsize = (24, 8))
plt.figure()
for m in metrics:
    i+=1
    plt.subplot(int(f'13{i}'))
    visualizer = KElbowVisualizer(KMeans(), k = (4, 15), metric = m)
    visualizer.fit(df_std)
    visualizer.show()

plt.show()

In [None]:
# essayer les différentes métriques séparément

In [None]:
# voir si on a le même k optimal avec les différentes métriques

In [None]:
# Silhouette score

In [None]:
%%time

kmeans1 = KMeans(n_clusters = k, init = 'k-means++', random_state = 42) # attention -> prendre même random state partout
visualizer = SilhouetteVisualizer(kmeans1, size = (1080, 500))

visualizer.fit(df_std)    # Fit the data to the visualizer
visualizer.show()    # Finalize and render the figure

In [None]:
# Silhouette analysis can be used to study the separation distance between the resulting clusters. The silhouette plot displays a measure of
# how close each point in one cluster is to points in the neighboring clusters and thus provides a way to assess parameters like number of
# clusters visually. This measure has a range of [-1, 1].

# Silhouette coefficients (as these values are referred to as) near +1 indicate that the sample is far away from the neighboring clusters.
# A value of 0 indicates that the sample is on or very close to the decision boundary between two neighboring clusters and negative values
# indicate that those samples might have been assigned to the wrong cluster.

# Silhouette plots may be used to analyze the appropriateness of a chosen number of clusters for the given data, by bumping up the cluster
# count every time that you see a poorly fitted cluster. They are a heavier approach to cluster number selection than the elbow plot,
# but also tell you much more about the composition of the clusters themselves.

In [None]:
# Nb de points par 'blob'
# Regarder si un blob ( = cluster) a des données plus faiblement 'fittées'

# Silhouette Coefficient
# The silhouette coefficient measures the quality of clustering by computing the average distance between each data point and other points within the same cluster,
# relative to the distance between the data point and points in neighboring clusters. The coefficient ranges from -1 to +1, where a higher value indicates better clustering.

### Clustering

In [None]:
# Maintenant qu'on a notre k optimal -> on peut faire notre clustering avec Kmeans

In [None]:
%%time

model = KMeans(n_clusters = k # nb de clusters qu'on veut avoir
               #n_init = 10, # nb d'initialisations qu'on veut faire - 10 = valeur par défaut
               #max_iter = 300, # nb d'itérations max - par défaut 300
               #init = 'k-means++'' # quelle stratégie d'initialisation on utilise
               )

# K-Means++ -> méthode d'initialisation qui consiste à placer les centroïdes sur des points du dataset éloignés les uns des autres - facilite convergence

model.fit(df_std) # pour entrainer modèle
# model.labels_ ou
model.predict(df_std)

In [None]:
# Clustering evaluation: silhouette score
# Score the labels of the model that is already fitted; calling fit_predict
# here would train KMeans a second time only to compute the score.
silhouette_kmeans = round(silhouette_score(df_std, model.labels_), 2)
print('The average silhouette score is:', silhouette_kmeans)

In [None]:
# A score closer to 1 indicates that the clustering is good, while a score closer to -1 indicates that the clustering is bad.

### Description des clusters

In [None]:
# Create an actual copy of the standardized frame to hold the cluster labels.
# A plain assignment would only give df_std a second name, so every column
# added to df_clusters would also be added to df_std, which is fed to PCA
# later in the notebook.
df_clusters = df_std.copy()

In [None]:
# Description des clusters obtenus

labels = model.labels_
df_clusters['cluster_num'] = labels
df_clusters.head()

In [None]:
df_clusters.groupby('cluster_num').mean().reset_index()

In [None]:
df_clusters.columns

In [None]:
from collections import Counter
def get_most_common(srs):
    """Return the value that occurs most often in ``srs``.

    Parameters
    ----------
    srs : iterable
        Values to count (for instance a pandas Series); it is materialised
        into a list before counting.

    Returns
    -------
    The most frequent value.

    Raises
    ------
    IndexError
        If ``srs`` contains no value.
    """
    occurrences = Counter(list(srs))
    ranked = occurrences.most_common(1)
    top_value, _ = ranked[0]
    return top_value

# Per-cluster summary: mean for the continuous variables, most frequent value
# for the others. The helper is passed as a function object: a string is
# looked up as the name of a built-in pandas aggregation, and
# 'get_most_common' is not one of them.
df_clusters.groupby('cluster_num').agg({'payment_type' : get_most_common,
                                        'payment_value' : 'mean',
                                        'full_written_review' : get_most_common,
                                        'product_category' : get_most_common,
                                        'order_time' : 'mean',
                                        'shipping_limit_respect' : get_most_common,
                                        'purchase_dayofweek' : get_most_common,
                                        'purchase_hour' : get_most_common,
                                        'purchase_day' : get_most_common
                                        })

In [None]:
# https://stackoverflow.com/questions/46844654/how-to-visualize-kmeans-clustering-on-multidimensional-data
plt.figure()
pd.plotting.parallel_coordinates(df_clusters, 'cluster_num')
plt.show()

In [None]:
# illisible mais il y a de l'idée... à creuser

In [None]:
# autre idée de visualisation -> pairplot et colorer par cluster

# Pairplot
plt.figure()
#sns.pairplot(df_clusters, hue = 'cluster_num') # sera surement trop lourd, à faire avec df réduit
plt.show()

In [None]:
# Idées de graphs
# https://echarts.apache.org/examples/en/index.html#chart-type-line

In [None]:
# Plotter chaque feature

df_clusters['Constant'] = 'patate' # juste là pour 'accrocher' mon graph en x
df_clusters.shape # pour voir combien de features

In [None]:
df_clusters.head()

In [None]:
# Strip plot of the first features, coloured by cluster

df_clusters['Constant'] = 'patate' # dummy constant, only used as the shared x position of every strip plot

# Number of leading columns of df_clusters that get a panel
n_plots = 5

# A single row with exactly one axes per plotted feature (the previous 4x5
# grid only ever used its first row and left 15 empty panels in the figure).
f, axes = plt.subplots(1, n_plots, figsize = (20, 6), sharex = False)
f.subplots_adjust(wspace = 0.7) # leave some room between the panels
for i in range(0, n_plots):
    col = df_clusters.columns[i]
    ax = sns.stripplot(x = df_clusters['Constant'], y = df_clusters[col].values, hue = df_clusters['cluster_num'], jitter = True, ax = axes[i])
    ax.set_title(col)

In [None]:
import plotly.graph_objs as go
def plot_radars(data, group):
    """Draw a radar chart comparing the per-variable means of each cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``group`` column plus the variables to compare.
        It can hold raw observations or rows that are already aggregated
        (one row per cluster); in both cases one trace per distinct value
        of ``group`` is drawn.
    group : str
        Name of the column holding the cluster label.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Reduce to one row of means per cluster. Looping over the raw column
    # instead would add one trace per observation, each carrying every row
    # of its cluster - far too heavy on a large dataset.
    # Non-numeric columns cannot be averaged and are left out.
    cluster_means = data.groupby(group).mean(numeric_only = True)

    fig = go.Figure()

    for k, means in cluster_means.iterrows():
        fig.add_trace(go.Scatterpolar(
            r = means.values,
            theta = cluster_means.columns,
            fill = 'toself',
            name = 'Cluster ' + str(k)
        ))

    # NOTE(review): the radial axis is fixed to [0, 1]; this presumably
    # expects variables rescaled to that interval - confirm against the data passed in.
    fig.update_layout(
        polar = dict(
        radialaxis = dict(
          visible = True,
          range = [0, 1]
        )),
        showlegend = True,
        title = {
            'text': 'Comparaison des moyennes par variable des clusters',
            'y':0.95,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        title_font_color = "blue",
        title_font_size = 18)

    fig.show()

In [None]:
plot_radars(data = df_clusters,
            group = 'cluster_num')

# trop gourmand en ram -> fait planter la session... essayer avec jeu de données réduit

In [None]:
# position des centroïdes
# model.cluster_centers_

# model.inertia_ -> somme des carrés des distances aux centroïdes
# model.score(X) -> idem mais exprimé de manière négative

In [None]:
# autres algo à tester:
# - DBSCAN
# - AgglomerativeClustering
# - Spectral Clustering

## ACP

In [None]:
# Df total
df_std.shape

In [None]:
# Réduction dimensionnelle avec 'Principal Component Analysis'
pca = PCA()
df_pca = pca.fit_transform(df_std)
df_pca

In [None]:
# Représentation graphique
fig = plt.figure(1, figsize = (12, 4))
plt.bar(range(len(pca.explained_variance_ratio_)), 100*pca.explained_variance_ratio_)
plt.ylim(0, 1.05*100)
plt.plot(range(len(pca.explained_variance_ratio_)), 100*np.cumsum(pca.explained_variance_ratio_), 's-')
plt.ylabel('Pourcentage de variance')
plt.xlabel('Nombre de composantes')
plt.title('Variance expliqué par PCA')
plt.show()

In [None]:
# meh

In [None]:
# autre représentation de la variance expliquée :
plt.figure(figsize = (12, 4))
sns.barplot(x = np.arange(1, len(pca.explained_variance_ratio_) + 1),
            y = np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance Ratio')
plt.show()

In [None]:
pca.explained_variance_ratio_

In [None]:
# Scree plot
# ( = line plot of the eigenvalues of factors or principal components in an analysis)

# One x position per fitted component: a hard-coded length breaks as soon as
# the number of components differs from it.
ind = np.arange(len(pca.explained_variance_ratio_))
(fig, ax) = plt.subplots(figsize = (8, 6))
sns.pointplot(x = ind, y = pca.explained_variance_ratio_, ax = ax)
ax.set_title('Scree plot')
ax.set_xticks(ind)
ax.set_xticklabels(ind)
ax.set_xlabel('Component Number')
ax.set_ylabel('Explained Variance')
plt.show()

In [None]:
# Test en ne gardant que 2 dimensions :

pca = PCA(n_components = 2)
df_pca = pca.fit_transform(df_std)

In [None]:
print('Explained variance regarding first dimension : ' + '{:.1%}'.format(pca.explained_variance_ratio_[0]))
print('Explained variance regarding second dimension : ' + '{:.1%}'.format(pca.explained_variance_ratio_[1]))
print('Total variance included in those 2 dimensions : ' + '{:.1%}'.format(pca.explained_variance_ratio_.sum()))

In [None]:
# nuuuuuul

In [None]:
# Test en conservant 90% de la variance

pca_90 = PCA(n_components = 0.90)  # 90% de la variance
df_pca_90 = pca_90.fit_transform(df_std)

# Mise sous forme de DataFrame
df_pca_90 = pd.DataFrame(df_pca_90, columns = ['Composante_' + str(i) for i in range(df_pca_90.shape[1])])
df_pca_90.shape

In [None]:
# 32 variables...

In [None]:
# Plot the correlation circle
from matplotlib.patches import Ellipse

def plot_correlation_circle(pca, features, dimensions = (0, 1), scale_factor = 1):
    """Plot the variables on the plane formed by two principal components.

    Parameters
    ----------
    pca : fitted PCA-like object
        Must expose ``components_`` (one row per component, one column per
        variable). When it also exposes ``explained_variance_``, each row is
        multiplied by the square root of its variance before plotting.
    features : sequence of str
        Variable names, in the same order as the columns of ``components_``.
    dimensions : tuple of int, default (0, 1)
        Zero-based indices of the components used for the x and y axes.
    scale_factor : float, default 1
        Multiplier applied to the position of the text labels only.

    Raises
    ------
    ValueError
        If one of ``dimensions`` is not below the number of components.

    Notes
    -----
    The rows of ``components_`` are unit-norm direction vectors, so their
    raw entries are not comparable with the unit circle. Scaling them by
    the square root of the component variance gives the variable/component
    correlation, assuming the PCA was fitted on standardized variables.
    """
    num_features = len(features)
    num_principal_components = pca.components_.shape[0]

    if dimensions[0] >= num_principal_components or dimensions[1] >= num_principal_components:
        raise ValueError('Invalid dimensions. Ensure that dimensions are within the range of available principal components.')

    # Turn the eigenvectors into coordinates on the correlation scale
    coords = np.asarray(pca.components_, dtype = float)
    component_variances = getattr(pca, 'explained_variance_', None)
    if component_variances is not None:
        coords = coords * np.sqrt(np.asarray(component_variances, dtype = float))[:, np.newaxis]

    x_coords = coords[dimensions[0], :]
    y_coords = coords[dimensions[1], :]

    plt.figure(figsize = (10, 10))

    # Scatter plot (with transparency)
    plt.scatter(x_coords, y_coords, alpha = 0.7)

    # One arrow and one label per variable
    for i in range(num_features):
        plt.arrow(0,
                  0,
                  x_coords[i],
                  y_coords[i],
                  head_width = 0.05,
                  head_length = 0.05,
                  length_includes_head = True, # the tip stops exactly at the variable's coordinates
                  fc = 'grey',
                  ec = 'grey',
                  alpha = 0.5)

        plt.text(x_coords[i] * scale_factor,
                 y_coords[i] * scale_factor,
                 features[i],
                 ha = 'left',
                 va = 'bottom',
                 fontsize = 8)

    # Unit circle (an ellipse whose width and height are both 2)
    ellipse = Ellipse((0, 0),
                      2,
                      2,
                      edgecolor = 'black',
                      facecolor = 'none',
                      linewidth = 2)
    plt.gca().add_patch(ellipse)

    # Same scale on both axes, otherwise the circle is drawn as an oval
    plt.gca().set_aspect('equal')

    # Add reference lines
    plt.axhline(0, color = 'black', linestyle = '--', linewidth = 1)
    plt.axvline(0, color = 'black', linestyle = '--', linewidth = 1)

    # Set axis labels
    plt.xlabel(f'Principal Component {dimensions[0] + 1}')
    plt.ylabel(f'Principal Component {dimensions[1] + 1}')

    # Set plot title
    plt.title('Correlation Circle')

# Plot correlation circle using the first two principal components
plot_correlation_circle(pca_90, features = df_std.columns, dimensions = (0, 1))

plt.show()

In [None]:
# illisible...

In [None]:
# à tester :
"""
# https://rasbt.github.io/mlxtend/user_guide/plotting/plot_pca_correlation_graph/

from mlxtend.plotting import plot_pca_correlation_graph

X, y = df_pca

X_norm = X / X.std(axis=0) # Normalizing the feature columns is recommended

feature_names = [
  'sepal length',
  'sepal width',
  'petal length',
  'petal width']

figure, correlation_matrix = plot_pca_correlation_graph(X_norm,
                                                        feature_names,
                                                        dimensions=(1, 2),
                                                        figure_axis_size=10)

correlation_matrix"""

In [None]:
# https://nirpyresearch.com/pca-correlation-circle/
# regarder ça, super interessant

# ARI Score

In [None]:
from sklearn.metrics.cluster import adjusted_rand_score

In [None]:
#https://amueller.github.io/aml/04-model-evaluation/17-cluster-evaluation.html

In [None]:
# https://reval.readthedocs.io/en/latest/

In [None]:
# https://github.com/FlorentF9/skstab