In [None]:
import pandas  as pd 
import seaborn as sns 
import matplotlib.pyplot as plt

In [2]:
# Load the raw customer table from the CSV file (path is relative to the working directory).
df = pd.read_csv("smartcart_customers.csv")

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,172,88,88,3,8,10,4,7,0,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,2,1,6,2,1,1,2,5,0,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,111,21,42,1,8,2,10,4,0,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,10,3,5,2,2,0,4,6,0,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,46,27,15,5,5,3,6,5,0,0


In [4]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [5]:
# Number of missing values in each column.
df.isna().sum()

ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
Complain                0
Response                0
dtype: int64

# DATA PREPROCESSING

# 1. HANDLE MISSING VALUES

In [6]:
# Fill missing incomes with the column median.
income_median = df["Income"].median()
df["Income"] = df["Income"].fillna(income_median)

# 2. FEATURE ENGINEERING

In [7]:
# Age relative to a fixed reference year.
# NOTE(review): the reference year is a hard-coded constant, not the current date — confirm it is intended.
reference_year = 2026
df["Age"] = reference_year - df["Year_Birth"]

In [8]:
# Customer tenure in days, measured against the latest enrolment date in the data.
# The raw values are DD-MM-YYYY strings (e.g. "21-08-2013"). An explicit format is
# used because `dayfirst=True` is only a parsing preference, not a strict rule, so
# an ambiguous date could otherwise be read month-first without any error.
df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"], format="%d-%m-%Y")
recent_year = df["Dt_Customer"].max()  # a Timestamp (latest enrolment date), despite the name
df["Customer_Tenure_Days"] = (recent_year - df["Dt_Customer"]).dt.days

In [9]:
# List the columns after adding Age and Customer_Tenure_Days.
df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'Complain', 'Response', 'Age', 'Customer_Tenure_Days'],
      dtype='object')

In [10]:
# Total spending: row-wise sum over the six product-category amounts.
product_cols = [
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
]
# skipna=False keeps the NaN-propagating behaviour of chained `+`.
df["Total_Spending"] = df[product_cols].sum(axis="columns", skipna=False)

In [11]:
# Preview the frame with the engineered Age, Customer_Tenure_Days and Total_Spending columns.
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Response,Age,Customer_Tenure_Days,Total_Spending
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,3,8,10,4,7,0,1,69,663,1617
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,2,1,1,2,5,0,0,72,113,27
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,1,8,2,10,4,0,0,61,312,776
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,2,2,0,4,6,0,0,42,139,53
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,5,3,6,5,0,0,45,161,422


In [12]:
# Children in the household: young kids plus teenagers.
df["Total_Children"] = df["Kidhome"].add(df["Teenhome"])

In [13]:
# Education: collapse the raw levels into three broader groups.
education_groups = {
    "Basic": "Undergraduate",
    "2n Cycle": "Undergraduate",
    "Graduation": "Graduate",
    "Master": "Postgraduate",
    "PhD": "Postgraduate",
}
df["Education"] = df["Education"].replace(education_groups)

df["Education"].value_counts()

Education
Graduate         1127
Postgraduate      856
Undergraduate     257
Name: count, dtype: int64

In [14]:
# Marital status: reduce to a two-level "Partner" / "Alone" indicator.
partner_statuses = ["Married", "Together"]
alone_statuses = ["Single", "Divorced", "Widow", "Alone", "Absurd", "YOLO"]
living_with_map = {
    **dict.fromkeys(partner_statuses, "Partner"),
    **dict.fromkeys(alone_statuses, "Alone"),
}
df["Living_With"] = df["Marital_Status"].replace(living_with_map)

df["Living_With"].value_counts()

Living_With
Partner    1444
Alone       796
Name: count, dtype: int64

# 3. DROP COLUMNS

In [15]:
# List all columns, raw and engineered, before choosing which to drop.
df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'Complain', 'Response', 'Age', 'Customer_Tenure_Days', 'Total_Spending',
       'Total_Children', 'Living_With'],
      dtype='object')

In [16]:
# Raw columns already summarised by engineered features (or identifiers), plus
# the per-category spend columns that Total_Spending replaces.
cols = ["ID", "Year_Birth", "Marital_Status", "Kidhome", "Teenhome", "Dt_Customer"]
spending_cols = [
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
]

cols_to_drop = [*cols, *spending_cols]
df_cleaned = df.drop(columns=cols_to_drop)

In [17]:
# Preview the reduced frame used for the rest of the analysis.
df_cleaned.head()

Unnamed: 0,Education,Income,Recency,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Response,Age,Customer_Tenure_Days,Total_Spending,Total_Children,Living_With
0,Graduate,58138.0,58,3,8,10,4,7,0,1,69,663,1617,0,Alone
1,Graduate,46344.0,38,2,1,1,2,5,0,0,72,113,27,2,Alone
2,Graduate,71613.0,26,1,8,2,10,4,0,0,61,312,776,0,Partner
3,Graduate,26646.0,26,2,2,0,4,6,0,0,42,139,53,1,Partner
4,Postgraduate,58293.0,94,5,5,3,6,5,0,0,45,161,422,1,Partner


# ANOMALY DETECTION (manual)

In [None]:
# Pairwise scatter plots of selected features, used to eyeball outliers.
cols = ["Income", "Recency", "Response", "Age", "Total_Spending", "Total_Children"]

pair_data = df_cleaned.loc[:, cols]
sns.pairplot(pair_data)


<seaborn.axisgrid.PairGrid at 0x1ac00b30d70>

In [None]:
# Remove outliers using fixed upper bounds on income and age.
# NOTE(review): the thresholds look hand-picked from the pair plot — confirm.
INCOME_CAP = 600_000
AGE_CAP = 80

keep = (df_cleaned["Income"] < INCOME_CAP) & (df_cleaned["Age"] < AGE_CAP)
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
df_cleaned = df_cleaned.loc[keep].copy()

len(df_cleaned)

# HEATMAP

In [None]:
# Pairwise correlations between the numeric columns (non-numeric columns are skipped).
corr = df_cleaned.corr(numeric_only=True)
corr

In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
heatmap_kwargs = {
    "annot": True,
    "annot_kws": {"size": 6},
    "cmap": "coolwarm",
}
sns.heatmap(corr, ax=ax, **heatmap_kwargs)

In [None]:
# One-hot encode the two categorical columns (the result is a sparse matrix).
from sklearn.preprocessing import OneHotEncoder

cat_cols = ["Education", "Living_With"]
ohe = OneHotEncoder()
categorical_part = df_cleaned.loc[:, cat_cols]
enc_cols = ohe.fit_transform(categorical_part)

In [None]:
# Densify the encoded matrix and keep df_cleaned's index so rows stay aligned.
enc_df = pd.DataFrame(
    data=enc_cols.toarray(),
    index=df_cleaned.index,
    columns=ohe.get_feature_names_out(cat_cols),
)

In [None]:
# Preview the one-hot encoded columns.
enc_df.head()

In [None]:
# Swap the categorical columns for their one-hot encoded counterparts.
numeric_part = df_cleaned.drop(columns=cat_cols)
df_encoded = pd.concat([numeric_part, enc_df], axis="columns")

In [None]:
# Preview the fully numeric frame.
df_encoded.head()

# SCALING

In [None]:
# Feature matrix for scaling. This binds X to the same DataFrame object as
# df_encoded (no copy is made), so in-place changes to X also change df_encoded.
X = df_encoded

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature (zero mean, unit variance) before PCA.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# VISUALIZE

In [None]:
from sklearn.decomposition import PCA

# Project the standardized features onto their first three principal components
# so the data can be drawn in 3D.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# 3D scatter of the three principal components (no cluster colouring).
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d")
# X_pca has exactly three columns, so its transpose unpacks into x, y, z.
ax.scatter(*X_pca.T)
ax.set(xlabel="PCA1", ylabel="PCA2")
ax.set_zlabel("PCA3")

In [None]:
import plotly.express as px

# Interactive 3D scatter of the three principal components.
pc1, pc2, pc3 = X_pca.T
fig = px.scatter_3d(x=pc1, y=pc2, z=pc3, opacity=0.7)

# Fixed canvas size with all margins removed.
no_margin = {"l": 0, "r": 0, "b": 0, "t": 0}
fig.update_layout(width=900, height=800, margin=no_margin)
fig.show()

# ELBOW METHOD 

In [None]:
from sklearn.cluster import KMeans

# Within-cluster sum of squares (inertia) for k = 1..10 on the PCA coordinates.
wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_pca)
    wcss.append(kmeans.inertia_)
    

In [None]:
# Elbow plot: WCSS against the number of clusters k = 1..10.
sns.lineplot(x=range(1,11) , y=wcss , marker="o")

# KNEELOCATOR 

In [None]:
from kneed import KneeLocator

# Locate the elbow of the WCSS curve, treated as convex and decreasing.
k_values = range(1, 11)
knee = KneeLocator(k_values, wcss, curve="convex", direction="decreasing")
print(knee.elbow)

# SILHOUETTE SCORE 

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score for k = 2..10 (the score is undefined for a single cluster).
scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X_pca)
    scores.append(silhouette_score(X_pca, labels))

plt.plot(range(2, 11), scores, marker="o")
plt.xlabel("k")
plt.ylabel("scores")


# COMBINED GRAPH (WCSS AND SILHOUETTE SCORE)

In [None]:
# WCSS and silhouette score on a shared k axis (two y-axes).
k_range = range(2, 11)

# `wcss` was computed for k = 1..10 while `scores` starts at k = 2, so each value
# is looked up by its k. Slicing wcss from the front (wcss[:9]) would plot the
# WCSS for k = 1..9 against the k = 2..10 axis, shifting the curve by one.
wcss_by_k = dict(zip(range(1, 11), wcss))
score_by_k = dict(zip(range(2, 11), scores))

fig, ax1 = plt.subplots(figsize=(10, 8))
ax1.plot(k_range, [wcss_by_k[k] for k in k_range], color="blue", marker="o")
ax1.set_xlabel("K")
ax1.set_ylabel("wcss")

# Second y-axis for the silhouette scores, sharing the same x (k) axis.
ax2 = ax1.twinx()
ax2.plot(k_range, [score_by_k[k] for k in k_range], color="red", marker="o", linestyle="--")
ax2.set_ylabel("scores")

# CLUSTERING

 # 1. KMEANS

In [None]:
# Final k-means model with five clusters, fitted on the PCA coordinates.
kmeans = KMeans(n_clusters=5, random_state=42).fit(X_pca)
labels = kmeans.labels_

In [None]:
# 3D scatter of the principal components, coloured by k-means label.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d")
# X_pca has exactly three columns, so its transpose unpacks into x, y, z.
ax.scatter(*X_pca.T, c=labels)
ax.set(xlabel="PCA1", ylabel="PCA2")
ax.set_zlabel("PCA3")


# 2. AGGLOMERATIVE CLUSTERING

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ward-linkage hierarchical clustering into four groups on the PCA coordinates.
agl = AgglomerativeClustering(n_clusters=4, linkage="ward")
agl.fit(X_pca)
labels2 = agl.labels_

In [None]:
# 3D scatter of the principal components, coloured by agglomerative label.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d")
# X_pca has exactly three columns, so its transpose unpacks into x, y, z.
ax.scatter(*X_pca.T, c=labels2)
ax.set(xlabel="PCA1", ylabel="PCA2")
ax.set_zlabel("PCA3")


# CHARACTERIZATION

In [None]:
# Attach the agglomerative labels. `assign` returns a new frame, so no column is
# written into a filtered slice (avoids pandas' SettingWithCopyWarning).
df_cleaned = df_cleaned.assign(cluster=labels2)
pal = ["green", "blue", "red", "yellow"]  # one colour per cluster (n_clusters=4)


In [None]:
# Income against total spending, coloured by cluster.
sns.scatterplot(
    data=df_cleaned,
    x="Total_Spending",
    y="Income",
    hue="cluster",
    palette=pal,
)

In [None]:
# Per-cluster mean of every feature. `X` is the same object as `df_encoded`, so the
# labels are attached to a temporary frame via `assign` rather than written into
# the feature matrix; otherwise `cluster` would be scaled and fed to PCA as a
# feature whenever the earlier cells are re-run.
summary_clusters = X.assign(cluster=labels2).groupby("cluster").mean()
print(summary_clusters)