# Clustering Client

In [1]:
from dask.distributed import Client

client = Client("tcp://127.0.0.1:33285")
client

0,1
Client  Scheduler: tcp://127.0.0.1:33285  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 16  Memory: 33.58 GB


In [22]:
import cudf
import pandas as pd
import numpy as np
import dask.array as da
import dask.dataframe as dd

from cuml.cluster import KMeans

from sklearn.metrics import silhouette_score
from sklearn import preprocessing

## 1 - Import des jeux de données

In [3]:
data_3_months = dd.read_csv("data/clean/data_3_months.csv", parse_dates=["order_purchase_timestamp"])
data_6_months = dd.read_csv("data/clean/data_6_months.csv", parse_dates=["order_purchase_timestamp"])
data_9_months = dd.read_csv("data/clean/data_9_months.csv", parse_dates=["order_purchase_timestamp"])
data_12_months = dd.read_csv("data/clean/data_12_months.csv", parse_dates=["order_purchase_timestamp"])
data_15_months = dd.read_csv("data/clean/data_15_months.csv", parse_dates=["order_purchase_timestamp"])
data_18_months = dd.read_csv("data/clean/data_18_months.csv", parse_dates=["order_purchase_timestamp"])
data_all = dd.read_csv("data/clean/data_all.csv", parse_dates=["order_purchase_timestamp"])

## 2 - Features engineering

In [4]:
def review(df):
    grouped = df.groupby("customer_unique_id").agg({"review_score": ["count", "min", "max", "mean"]})
    grouped.columns = ["_".join(x) for x in grouped.columns.ravel()]
    return grouped

def value(df):
    grouped = df.groupby("customer_unique_id").agg({
        "payment_value": ["mean", "min", "max"]})
    grouped.columns = ["_".join(x) for x in grouped.columns.ravel()]
    return grouped

def payment_facility(df):
    df_pf = df.groupby("customer_unique_id").agg({
        "payment_installments": "sum"
    })
    cond_mask = (df_pf["payment_installments"] > 1)

    df_pf["payment_installments"] = df_pf["payment_installments"].mask(cond=cond_mask, other=0)
    df_pf = df_pf.rename(columns={"payment_installments": "payment_facility"})
    return df_pf

def category_transf(df):
    df_ct = dd.get_dummies(df[["product_category_name", "customer_unique_id"]].categorize(), columns=['product_category_name'], prefix="PCat").groupby(['customer_unique_id']).sum()
    return df_ct

def state(df):
    df_state = dd.get_dummies(df[["customer_state", "customer_unique_id"]].categorize(), columns=["customer_state"], prefix="CState").groupby(["customer_unique_id"]).sum()
    return df_state

def payment(df):
    df_p = dd.get_dummies(df[["payment_type", "customer_unique_id"]].categorize(), columns=['payment_type'], prefix="PType").groupby(['customer_unique_id']).sum()
    return df_p

def rfm(df):
    df = df.compute()
    return df.groupby("customer_unique_id").agg(
        recency=("order_purchase_timestamp", lambda date: (df["order_purchase_timestamp"].max() - date.max()).days),
        frequency=("order_id", "count"),
        monetary=("payment_value", sum))

def features_engineering(df):
    df_fe = dd.merge(rfm(df), value(df))
    df_fe = dd.merge(df_fe, review(df))
    df_fe = dd.merge(df_fe, payment(df))
    df_fe = dd.merge(df_fe, state(df))
    df_fe = dd.merge(df_fe, category_transf(df))
    df_fe = dd.merge(df_fe, payment_facility(df))
    return df_fe

In [5]:
data_3_months = features_engineering(data_3_months).compute()
data_3_months.head()

Unnamed: 0_level_0,recency,frequency,monetary,payment_value_mean,payment_value_min,payment_value_max,review_score_count,review_score_min,review_score_max,review_score_mean,...,PCat_fashion_underwear_beach,PCat_books_technical,PCat_dvds_blu_ray,PCat_security_and_services,PCat_tablets_printing_image,PCat_furniture_bedroom,PCat_cine_photo,PCat_fashio_female_clothing,PCat_books_imported,payment_facility
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000f46a3911fa3c0805444483337064,20,1,86.22,86.22,86.22,86.22,1,3,3,3.0,...,0,0,0,0,0,0,0,0,0,0
0005e1862207bf6ccc02e4228effd9a0,26,1,150.12,150.12,150.12,150.12,1,4,4,4.0,...,0,0,0,0,0,0,0,0,0,0
00115fc7123b5310cf6d3a3aa932699e,68,1,76.11,76.11,76.11,76.11,1,4,4,4.0,...,0,0,0,0,0,0,0,0,0,1
001f3c4211216384d5fe59b041ce1461,11,1,35.84,35.84,35.84,35.84,1,3,3,3.0,...,0,0,0,0,0,0,0,0,0,0
002043098f10ba39a4600b6c52fbfe3c,4,1,237.81,237.81,237.81,237.81,1,4,4,4.0,...,0,0,0,0,0,0,0,0,0,0


## 3 - Normalisation

In [15]:
standardize = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
data_3_months_std = standardize.fit_transform(data_3_months).tolist()
data_3_months = cudf.DataFrame(data_3_months_std, index=data_3_months.index, columns=data_3_months.columns)

## 4 - Clustering à l'aide des baselines
### 4.1 - KMeans

In [26]:
base_km_cluster_number = []
base_km_score = []
for n in range(2, 25):
    kmean = KMeans(n_clusters=n, max_iter=3000)
    kmean.fit(data_3_months)
    base_km_cluster_number.append(n)
    base_km_score.append(silhouette_score(data_3_months.to_pandas(), kmean.labels_, metric='euclidean'))

plt.figure(figsize=(10, 5))
plt.plot(base_km_cluster_number, base_km_score, color="skyblue", lw=2)
plt.xlabel("Nombre de cluster")
plt.ylabel("Score de silhouette")
plt.title("Baseline Kmean")
plt.tight_layout()
plt.show()




ValueError: object __array__ method not producing an array