# KMeans

In [1]:
from sklearn.cluster import KMeans
import pandas as pd
import pickle
import os

#### Import CSV

In [2]:
# Load the raw sales export; the path is relative to the notebook's folder.
df = pd.read_csv("..{}data{}KaDo.csv".format(os.sep, os.sep), sep=',')

# Shared vocabulary used to integer-encode the text columns: the distinct
# values of each listed column, concatenated in this order and de-duplicated
# while keeping first-seen order (dict keys preserve insertion order).
CATEGORIES = list(dict.fromkeys(
    value
    for column_name in ("LIBELLE", "FAMILLE", "MAILLE", "UNIVERS", "TICKET_ID")
    for value in df[column_name].unique().tolist()
))

df.head()

Unnamed: 0,TICKET_ID,MOIS_VENTE,PRIX_NET,FAMILLE,UNIVERS,MAILLE,LIBELLE,CLI_ID
0,35592159,10,1.67,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 PAMPLEMOUSSE FL 200ML,1490281
1,35592159,10,1.66,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 PAMPLEMOUSSE FL 200ML,1490281
2,35592159,10,7.45,SOINS DU VISAGE,VIS_CJOUR Jeunes Specifique,VIS_JEUNE_ET_LEVRE,CR JR PARF BIO.SPE AC.SENT.50ML,1490281
3,35592159,10,5.95,SOINS DU VISAGE,VIS_DEMAQ AAAR,VIS_AAAR_DEMAQLOTION,EAU MICELLAIRE 3 THES FL200ML,1490281
4,35592159,10,1.67,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 TIARE FL 200ML,1490281


#### Prepare DF

In [3]:
# Work on a copy so the raw frame `df` keeps its original text values.
df_all = df.copy()

# Replace every text column by the integer position of its value in the
# shared CATEGORIES vocabulary. The codes are written back to the column
# itself: setting an attribute on `df_all["UNIVERS"]` (e.g. `.cat = ...`)
# only touches a temporary Series and leaves the column unencoded unless
# pandas happens to hand back that same cached object.
df_all = df_all.assign(**{
    column_name: pd.Categorical(df_all[column_name], categories=CATEGORIES).codes
    for column_name in ("FAMILLE", "UNIVERS", "MAILLE", "LIBELLE")
})

df_all.head()

Unnamed: 0,TICKET_ID,MOIS_VENTE,PRIX_NET,FAMILLE,UNIVERS,MAILLE,LIBELLE,CLI_ID
0,35592159,10,1.67,1484,1527,1493,0,1490281
1,35592159,10,1.66,1484,1527,1493,0,1490281
2,35592159,10,7.45,1485,1528,1494,1,1490281
3,35592159,10,5.95,1485,1529,1495,2,1490281
4,35592159,10,1.67,1484,1527,1493,3,1490281


#### Prepare DF by Products

In [None]:
# Product-level view: keep only the product descriptors and the net price.
# `.copy()` makes the selection an independent frame, so `df` is not affected.
df_products = df[['LIBELLE', 'FAMILLE', 'MAILLE', 'UNIVERS', 'PRIX_NET']].copy()

# Replace every text column by the integer position of its value in the
# shared CATEGORIES vocabulary. The codes are assigned to the column itself;
# assigning to `df_products["UNIVERS"].cat` would only set an attribute on a
# temporary Series and would not encode the column reliably.
df_products = df_products.assign(**{
    column_name: pd.Categorical(df_products[column_name], categories=CATEGORIES).codes
    for column_name in ("FAMILLE", "UNIVERS", "MAILLE", "LIBELLE")
})

df_products.head()

#### Prepare DF by Tickets

In [None]:
# Ticket-level view: product descriptors plus ticket id, net price and month.
df_tickets = df[
    ['LIBELLE', 'FAMILLE', 'MAILLE', 'UNIVERS', 'TICKET_ID', 'PRIX_NET', 'MOIS_VENTE']
].copy()

# Swap each text column for the integer position of its value in CATEGORIES.
df_tickets = df_tickets.assign(**{
    column_name: pd.Categorical(df_tickets[column_name], categories=CATEGORIES).codes
    for column_name in ("FAMILLE", "LIBELLE", "MAILLE", "UNIVERS")
})

df_tickets.head()

#### Train KMeans

In [4]:
# A fixed seed makes the centroid initialisation repeatable, so re-running the
# notebook on the same data yields the same cluster assignment for the model
# that is pickled afterwards.
kmeans = KMeans(n_clusters=10, random_state=42)

# Fit on exactly one of the prepared frames; the commented lines are the
# alternatives for the product-level and ticket-level models.
# kmeans.fit(df_products)
kmeans.fit(df_all)
# kmeans.fit(df_tickets)

# One cluster id per row of the fitted frame.
kmeans.labels_

array([7, 7, 7, ..., 6, 6, 6])

#### Save Model

In [5]:
# Persist the fitted model. The context manager guarantees the file is flushed
# and closed even if pickling fails.
# Alternative targets when the model was fitted on another frame:
#   "..{}models{}model_tickets.pkl"
#   "..{}models{}model_products.pkl"
with open("..{}models{}model_all.pkl".format(os.sep, os.sep), "wb") as model_file:
    pickle.dump(kmeans, model_file)

# Testing KMeans

In [None]:
from sklearn.cluster import KMeans
import pandas as pd
import pickle

#### Import Data

In [None]:
# Load the raw sales export; the path is relative to the notebook's folder.
df = pd.read_csv("..{}data{}KaDo.csv".format(os.sep, os.sep), sep=",")

# Rebuild the encoding vocabulary: distinct values of each listed column,
# concatenated in this order, duplicates removed while keeping the first
# occurrence (dict keys preserve insertion order).
CATEGORIES = list(dict.fromkeys(
    value
    for column_name in ("LIBELLE", "FAMILLE", "MAILLE", "UNIVERS", "TICKET_ID")
    for value in df[column_name].unique().tolist()
))

df.head()

#### Format Data

In [None]:
# Product-level view: product descriptors and net price only.
df_products = df[['LIBELLE', 'FAMILLE', 'MAILLE', 'UNIVERS', 'PRIX_NET']].copy()

# Swap each text column for the integer position of its value in CATEGORIES.
df_products = df_products.assign(**{
    column_name: pd.Categorical(df_products[column_name], categories=CATEGORIES).codes
    for column_name in ("FAMILLE", "UNIVERS", "MAILLE", "LIBELLE")
})

df_products.head()

In [None]:
# Ticket-level view: product descriptors plus ticket id, net price and month.
df_tickets = df[
    ['LIBELLE', 'FAMILLE', 'MAILLE', 'UNIVERS', 'TICKET_ID', 'PRIX_NET', 'MOIS_VENTE']
].copy()

# Swap each text column for the integer position of its value in CATEGORIES.
df_tickets = df_tickets.assign(**{
    column_name: pd.Categorical(df_tickets[column_name], categories=CATEGORIES).codes
    for column_name in ("FAMILLE", "UNIVERS", "MAILLE", "LIBELLE")
})

df_tickets.head()

#### Import Model

In [None]:
# Load the ticket-level model. The context manager closes the file handle as
# soon as unpickling is done.
# SECURITY: pickle.load executes arbitrary code from the file; only load model
# files produced by this project.
# NOTE(review): the save cell of this notebook currently writes only
# model_all.pkl, so model_tickets.pkl must already exist on disk.
with open("..{}models{}model_tickets.pkl".format(os.sep, os.sep), "rb") as model_file:
    model = pickle.load(model_file)

model.labels_

#### Predict

In [None]:
# Take the row at position 500 and wrap its values in a list, so that
# `predict` receives a 2-D input holding a single sample.
sample_row = df_tickets.iloc[500]
data = [sample_row.tolist()]

# `predict` returns one cluster id per sample; show the only one.
predicted_clusters = model.predict(data)
predicted_clusters[0]

# Applying to Client

In [6]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import pickle

In [7]:
# Identifier (CLI_ID value) of the client the rest of this section works on.
CLIENT_ID = 984_943_411

In [8]:
# Read the full export, keep an untouched copy of every row in `df_all`,
# then narrow `df` down to the rows of the selected client.
df = pd.read_csv("..{}data{}KaDo.csv".format(os.sep, os.sep), sep=',')
df_all = df.copy()
is_client_row = df["CLI_ID"] == CLIENT_ID
df = df[is_client_row]

# The vocabulary is built from the full data set (not just the client's rows):
# distinct values of each listed column, concatenated in this order and
# de-duplicated while keeping first-seen order.
CATEGORIES = list(dict.fromkeys(
    value
    for column_name in ("LIBELLE", "FAMILLE", "MAILLE", "UNIVERS", "TICKET_ID")
    for value in df_all[column_name].unique().tolist()
))

df.head()


Unnamed: 0,TICKET_ID,MOIS_VENTE,PRIX_NET,FAMILLE,UNIVERS,MAILLE,LIBELLE,CLI_ID
3643163,32991432,1,2.95,CAPILLAIRES,CAP_AP SHAMP,CAPILLAIRE_AUTRE,SVC ECLAT COULEUR AP SH 150ML,984943411
3643164,32991432,1,2.95,CAPILLAIRES,CAP_AP SHAMP,CAPILLAIRE_AUTRE,SVC ECLAT COULEUR AP SH 150ML,984943411
3643165,32991432,1,1.95,HYGIENE,HYG_DOUCHE FRAICHEUR VEG,HYG_AUTRES,GEL MOUSS THE VERT FV FL200 REF,984943411
3643166,32991432,1,17.5,PARFUMAGE,PARF_EAUX PARFUMS,PARF_PARFUM,EDT NATURELLE VAPO 75ML,984943411
3643167,32991432,1,1.95,HYGIENE,HYG_DOUCHE FRAICHEUR VEG,HYG_AUTRES,GEL MOUSS FL200 VERVEINE FV,984943411


In [9]:
# Number of rows belonging to the selected client.
len(df)

205

In [10]:
# Swap each text column of the client's rows for the integer position of its
# value in CATEGORIES.
df = df.assign(**{
    column_name: pd.Categorical(df[column_name], categories=CATEGORIES).codes
    for column_name in ("FAMILLE", "LIBELLE", "MAILLE", "UNIVERS")
})

df.head()

Unnamed: 0,TICKET_ID,MOIS_VENTE,PRIX_NET,FAMILLE,UNIVERS,MAILLE,LIBELLE,CLI_ID
3643163,32991432,1,2.95,1489,1536,1501,234,984943411
3643164,32991432,1,2.95,1489,1536,1501,234,984943411
3643165,32991432,1,1.95,1484,1597,1510,577,984943411
3643166,32991432,1,17.5,1486,1530,1496,884,984943411
3643167,32991432,1,1.95,1484,1597,1510,243,984943411


In [11]:
# Load the model fitted on the full frame. The context manager closes the
# file handle as soon as unpickling is done.
# SECURITY: pickle.load executes arbitrary code from the file; only load model
# files produced by this project.
with open("..{}models{}model_all.pkl".format(os.sep, os.sep), "rb") as model_file:
    model = pickle.load(model_file)

model.labels_

array([7, 7, 7, ..., 6, 6, 6])

#### Attach Client to Cluster

In [12]:
# `transform` gives, for each client row, its distance to every cluster centre.
row_to_centre = model.transform(df)

# Total distance per cluster: add up each column (one column per cluster).
distance = [sum(cluster_column) for cluster_column in row_to_centre.T]

# The client is attached to the cluster with the smallest total distance;
# `index` returns the first position holding that minimum.
min_value = min(distance)
client_cluster = distance.index(min_value)

client_cluster

1

#### Get products corresponding to clusters

In [13]:
# Positions, within the model's training labels, of every row that was
# assigned to the client's cluster.
in_client_cluster = model.labels_ == client_cluster
cluster = np.flatnonzero(in_client_cluster)

cluster

array([3549092, 3549093, 3549094, ..., 5222894, 5222895, 5222896],
      dtype=int64)

In [14]:
# `cluster` holds positions taken from `model.labels_`, so the rows must be
# selected positionally. Label-based `.loc` only gives the same rows while
# `df_all` still carries a default 0..n-1 index.
# NOTE(review): assumes `df_all` has the same row order as the frame the
# model was fitted on — confirm.
df_client = df_all.iloc[cluster]

df_client.head()

Unnamed: 0,TICKET_ID,MOIS_VENTE,PRIX_NET,FAMILLE,UNIVERS,MAILLE,LIBELLE,CLI_ID
3549092,35221468,8,45.5,SOINS DU VISAGE,VIS_CNUIT AAAR,VIS_AAAR_HORS_DEMAQLOTION,REPACK AR BIEN NUIT RC2 P 50ml,981961119
3549093,33758611,4,1.66,SOINS DU VISAGE,VIS_SOIN LEVRES,VIS_JEUNE_ET_LEVRE,BAUME LEVR SENT FRAMBOISE 4G,984238969
3549094,33758611,4,1.67,SOINS DU VISAGE,VIS_SOIN LEVRES,VIS_JEUNE_ET_LEVRE,BAUME LEVR SENT FRAMBOISE 4G,984238969
3549095,33758611,4,2.5,SOINS DU VISAGE,VIS_SOIN LEVRES,VIS_JEUNE_ET_LEVRE,BAUME LEVR SENT FRAMBOISE 4G,984238969
3549096,33758611,4,1.67,SOINS DU VISAGE,VIS_SOIN LEVRES,VIS_JEUNE_ET_LEVRE,BAUME LEVR SENT FRAMBOISE 4G,984238969


In [15]:
# Number of rows that fall in the client's cluster.
len(df_client)

1652841

#### Remove already bought items (?)

In [16]:
# Drop the selected client's own rows from the cluster. No KeyError guard is
# needed: `df_client` comes from `df_all`, whose "CLI_ID" column this notebook
# already relies on, and silently skipping the filter would hide a real
# schema problem.
# NOTE(review): this filters on the client id, not on the product; a product
# the client already bought stays in the result when another client of the
# cluster bought it too — confirm this matches the intent of the heading.
df_client = df_client[df_client["CLI_ID"] != CLIENT_ID]

df_client.shape[0]

1652636