In [None]:
# Mount Google Drive so that its contents are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd "drive/MyDrive/Customer Segmentation in US"

/content/drive/MyDrive/Customer Segmentation in US


# 3_Modeling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from src.model_selection import Modeling
from src.wrangle import wrangle
from sklearn.decomposition import PCA

In [None]:
# Load the dataset from data/data.csv (path is relative to the current working directory)
data=pd.read_csv("data/data.csv")
# Preview the first rows of the loaded table
data.head()

Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
0,12200.0,-6710.0,0.0,3900.0,5490.0
1,12600.0,-4710.0,0.0,6300.0,7890.0
2,15300.0,-8115.0,0.0,5600.0,7185.0
3,14100.0,-2510.0,0.0,10000.0,11590.0
4,15400.0,-5715.0,0.0,8100.0,9685.0


## Model training and hyperparameter tuning

In [None]:
# Instantiate the project's Modeling helper and run its tuning routine on `data`.
# NOTE(review): presumably `k_max=15` bounds the candidate cluster counts and the two
# returned sequences hold one silhouette score / inertia value per candidate —
# confirm against src/model_selection.py.
m = Modeling()
sil_scores, inertia_vals = m.model_tuning(data, k_max=15)


In [None]:
# Plot the silhouette and inertia curves against the number of clusters.
# The x-axis is derived from the length of each result sequence instead of a
# hard-coded range(2, 15), so the plots stay valid if `k_max` changes in the
# tuning cell.
# NOTE(review): assumes the first entry of each sequence corresponds to k=2,
# as the original hard-coded range did — confirm against Modeling.model_tuning.
K_MIN = 2

# Silhouette
fig1 = px.line(
    x=range(K_MIN, K_MIN + len(sil_scores)),
    y=sil_scores,
    title="Silhouette Score",
    labels={"x":"Number of Clusters","y":"Silhouette Score"}
)
fig1.show()

# Inertia
fig2 = px.line(
    x=range(K_MIN, K_MIN + len(inertia_vals)),
    y=inertia_vals,
    title="Inertia",
    labels={"x":"Number of Clusters","y":"Inertia"}
)
fig2.show()

From the previous curves, the best value of n_clusters appears to be 4, so we will build the final model with n_clusters = 4.

In [None]:
# Build the final model via the project's `model_pip` helper and fit it on `data`.
# NOTE(review): the argument 4 is presumably the number of clusters — confirm
# against src/model_selection.py.
final_model=m.model_pip(4)
final_model.fit(data)

## Communication

Extract the cluster labels from the final model.

In [None]:
# Read the fitted cluster assignments from the pipeline step named "kmeans"
labels=final_model.named_steps["kmeans"].labels_
# Display the label array
labels

array([1, 1, 1, ..., 0, 0, 0], dtype=int32)

Create a DataFrame `xgb` that contains the mean value of each feature for every cluster.

In [None]:
# Group the rows by their cluster label, then average every feature per group
cluster_groups = data.groupby(labels)
xgb = cluster_groups.mean()

# Report the container type and dimensions before displaying the table
for caption, value in (("xgb type:", type(xgb)), ("xgb shape:", xgb.shape)):
    print(caption, value)
xgb

xgb type: <class 'pandas.core.frame.DataFrame'>
xgb shape: (4, 5)


Unnamed: 0,DEBT,NETWORTH,HOUSES,NFIN,ASSET
0,205111.440049,231433.3,261927.688504,351265.3,436544.7
1,28650.810927,11765.36,14647.482838,28252.56,40416.17
2,669360.967742,581950.7,788306.451613,1118355.0,1251312.0
3,266576.27451,1432137.0,339117.647059,1295769.0,1698713.0


Create a side-by-side bar chart from `xgb`.

In [None]:
# Grouped (side-by-side) bar chart built from the `xgb` table
axis_titles = {
    "xaxis_title": "Cluster",
    "yaxis_title": "Mean Value",
    "legend_title": "Feature",
}
fig = px.bar(xgb, barmode="group", title="Mean Values of Features by Cluster")
# Apply the axis and legend captions in a single layout update
fig.update_layout(**axis_titles)
fig.show()

Create a PCA transformer and use it to reduce the data to two dimensions.

In [None]:
# Two-component PCA with a fixed random_state
pca = PCA(n_components=2, random_state=42)

# Fit the transformer on `data` and project it in a single step
X_t = pca.fit_transform(data)

# Wrap the projected array in a DataFrame with named component columns
X_pca = pd.DataFrame(data=X_t, columns=["Pca1", "Pca2"])

# Report the container type and dimensions, then preview the first rows
for caption, value in (("X_pca type:", type(X_pca)), ("X_pca shape:", X_pca.shape)):
    print(caption, value)
X_pca.head()

X_pca type: <class 'pandas.core.frame.DataFrame'>
X_pca shape: (4418, 2)


Unnamed: 0,Pca1,Pca2
0,-221525.42453,22052.273003
1,-217775.100722,22851.358068
2,-219519.642175,19023.646333
3,-212195.720367,22957.107039
4,-215540.507551,20259.749306


In [None]:
# Scatter plot of the second principal component against the first,
# coloured by cluster label (labels are cast to strings before being passed as colours)
cluster_ids = labels.astype(str)
layout_opts = {
    "title": "PCA Scatter Plot",
    "xaxis_title": "PC1",
    "yaxis_title": "PC2",
    "legend_title": "Cluster",
}
fig = px.scatter(data_frame=X_pca, x="Pca1", y="Pca2", color=cluster_ids)
# Set the title, axis captions and legend caption in one layout update
fig.update_layout(**layout_opts)
fig.show()