# Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
import scipy
from sklearn.preprocessing import StandardScaler , LabelEncoder , RobustScaler , StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans 
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score , davies_bouldin_score
from sklearn.manifold import TSNE

In [None]:
# Raise the pandas display limit to 99 columns so wide frames are shown in full.
pd.set_option('display.max_columns',99)

# Read Dataset

<h5><strong>Source Dataset</strong>:</h5>

[Link dataset](https://www.kaggle.com/datasets/vishakhdapat/customer-segmentation-clustering/data)

In [None]:
# Load the customer segmentation CSV from the Kaggle input directory into `data`.
data = pd.read_csv('/kaggle/input/customer-segmentation-clustering/customer_segmentation.csv')
# Echo the frame as the cell output.
data

<h4 style='text-align:center;'><strong> CUSTOMER SEGMENTATION DATASET </strong></h4>
<ol>
    <li><span style='font-weight:bold;'>ID</span> : A unique identifier for each customer. It is used to identify customers individually. </li>
    <li><strong>Year_Birth </strong>: The year the customer was born </li>
    <li><strong>Education </strong>: The customer's education level. This provides insights into the educational background of customers and how it might relate to their buying behavior or segmentation.</li>
    <li><strong>Marital_Status </strong>: The customer's marital status </li>
    <li><strong>Income </strong>: The customer's annual income.</li>
    <li><strong>Kidhome </strong>: The number of children living at the customer's home.</li>
    <li><strong>Teenhome </strong>: The number of teenagers living at the customer's home. </li>
    <li><strong>Dt_Customer </strong>: The date when the customer first became a customer. This can be used to calculate the duration of the customer relationship or for retention analysis. </li>
    <li><strong>Recency </strong>: The number of days since the customer last interacted or made a purchase with the company.</li>
    <li><strong>MntWines </strong>: The amount the customer spent on wine.</li>
    <li><strong>MntFruits </strong>: The amount the customer spent on fruit.</li>
    <li><strong>MntMeatProducts </strong>: The amount the customer spent on meat products.</li>
    <li><strong>MntFishProducts </strong>: The amount the customer spent on fish products.</li>
    <li><strong>MntSweetProducts </strong>: The amount the customer spent on sweet products.</li>
    <li><strong>MntGoldProds </strong>: The amount the customer spent on gold products.</li>
    <li><strong>NumDealsPurchases </strong>: The number of purchases made using a discount or promotion by customer.</li>
    <li><strong>NumWebPurchases </strong>: The number of purchases made via the website.</li>
    <li><strong>NumCatalogPurchases </strong>: The number of purchases made through a catalog.</li>
    <li><strong>NumStorePurchases </strong>: The number of purchases made in a physical store. </li>
    <li><strong>NumWebVisitsMonth </strong>: The number of times the customer visits the website per month.</li>
    <li><strong>AcceptedCmp3 </strong>: Indicates whether the customer accepted campaign 3 (1 = accepted, 0 = not accepted).</li>
    <li><strong>AcceptedCmp4 </strong>: Indicates whether the customer accepted campaign 4 (1 = accepted, 0 = not accepted).</li>
    <li><strong>AcceptedCmp5 </strong>: Indicates whether the customer accepted campaign 5 (1 = accepted, 0 = not accepted).</li>
    <li><strong>AcceptedCmp1 </strong>: Indicates whether the customer accepted campaign 1 (1 = accepted, 0 = not accepted).</li>
    <li><strong>AcceptedCmp2 </strong>: Indicates whether the customer accepted campaign 2 (1 = accepted, 0 = not accepted).</li>
    <li><strong>Complain </strong>: Indicates whether the customer filed a complaint (1 = complained, 0 = did not complain).</li>
    <li><strong>Z_CostContact </strong>: This refers to the cost incurred by the company each time they interact with a customer (e.g., through emails, calls, or advertisements). </li>
    <li><strong>Z_Revenue </strong>: The revenue generated from each contact with the customer.</li>
    <li><strong>Response </strong>: Indicates whether the customer responded to or participated in a marketing campaign (1 = responded, 0 = did not respond).</li>



# Exploratory Data Analysis 

In [None]:
# List the column names of the raw dataset.
data.columns

In [None]:
# Print the dtype and non-null count of every column.
data.info()

In [None]:
# Count the distinct values in each column.
data.nunique()

In [None]:
# Count the missing values in each column.
data.isna().sum()

The Income column has 24 missing values; we will handle them later.

## - Check Duplicate Data

In [None]:
# Count rows that exactly duplicate an earlier row.
data.duplicated().sum()

## - Check Descriptive Statistics 

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
data.describe(include='all')

## -   Handling Missing Values

In [None]:
# Show the per-column missing-value counts again before cleaning.
data.isna().sum()

In [None]:
# Remove every row that has at least one missing value.
# dropna() defaults to axis=0 and how='any', i.e. row-wise removal on any NaN.
data_clean = data.dropna()
data_clean

In [None]:
# Confirm that the cleaned frame has no missing values left.
data_clean.isna().sum()

In [None]:
# Split the cleaned data into categorical (object) and numeric columns.

object_data = data_clean.select_dtypes(include='object')

# Drop the two Z_* columns (treated as uninformative by this analysis) while
# building the numeric frame. Chaining returns a fresh frame; the previous
# drop(inplace=True) mutated a frame derived from data_clean, which can raise
# SettingWithCopyWarning.
numeric_data = (
    data_clean
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=['Z_CostContact', 'Z_Revenue'])
)

# Echo the shapes and types as a quick sanity check.
object_data.shape , numeric_data.shape , type(object_data) , type(numeric_data)

## -  Check Distribution Data

In [None]:
# Bar chart of the number of customers per education level.
edu_ax = sns.countplot(data=object_data, x='Education', palette='Set2')

# Annotate each bar with its count, placed one unit above the bar top.
for bar in edu_ax.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    edu_ax.text(
        x=bar_center,
        y=bar_top + 1,
        s=int(bar_top),
        ha='center',
        va='bottom',
        fontsize=10,
    )

edu_ax.set_title('Education Distribution')
edu_ax.set_ylabel('Frequency Customer')
plt.show()

In [None]:
# Bar chart of the number of customers per marital status.
marital_ax = sns.countplot(data=object_data, x='Marital_Status', palette='Set2')

# Annotate each bar with its count at the bar top.
for bar in marital_ax.patches:
    bar_top = bar.get_height()
    marital_ax.text(
        x=bar.get_x() + bar.get_width() / 2,
        y=bar_top,
        s=int(bar_top),
        ha='center',
        va='bottom',
        fontsize=10,
    )

marital_ax.set_title('Marital Distribution')
marital_ax.set_ylabel('Frequency Customer')
plt.show()

In [None]:

# Count customers per marital status, with one bar per education level (hue).
sns.countplot(data= object_data, x='Marital_Status', hue='Education')
plt.title('Distribution of marital status based on education')


In [None]:
# Histogram of customer birth years in 5-year bins, with a KDE overlay.
birth_ax = sns.histplot(data=numeric_data, x='Year_Birth', kde=True, binwidth=5)

# Write each bin's count on top of its bar.
for bar in birth_ax.patches:
    bar_top = bar.get_height()
    birth_ax.text(
        x=bar.get_x() + bar.get_width() / 2,
        y=bar_top,
        s=int(bar_top),
        ha='center',
        va='bottom',
        fontsize=8,
    )

birth_ax.set_title('Year Birth Customer')
birth_ax.set_ylabel('Frequency Customer')
plt.show()

In [None]:
# Wide figure: one group of bars per birth year, split by marital status.
# NOTE: this plot reads the raw `data` frame, not the cleaned `data_clean`.
plt.figure(figsize=(35,12))
sns.countplot(data= data, x='Year_Birth', hue='Marital_Status')
plt.title('Distribution of Birth Year Based on Marital Status',fontsize=25)

In [None]:
# Side-by-side count plots for the number of kids and teenagers at home.

fig, panels = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))

for panel, column in zip(panels, ('Kidhome', 'Teenhome')):
    sns.countplot(data=numeric_data, x=column, palette='pastel', ax=panel)
    panel.set_title(f'{column} Distribution')

    # Annotate each bar with its count.
    for bar in panel.patches:
        bar_top = bar.get_height()
        panel.text(
            x=bar.get_x() + bar.get_width() / 2,
            y=bar_top,
            s=int(bar_top),
            ha='center',
            va='bottom',
            fontsize=12,
        )

plt.show()

In [None]:
# Histogram of customer income with a KDE overlay.

plt.figure(figsize=(14, 6))
income_ax = sns.histplot(data=numeric_data, x='Income', kde=True)
income_ax.set_title('Income Distribution')
plt.show()

In [None]:
# Three views of the income distribution: box, violin and strip plots.

plt.figure(figsize=(16, 6))

# (panel title, seaborn plotting function, colour keyword arguments)
income_views = [
    ('Box Plot', sns.boxplot, {'color': 'skyblue'}),
    ('Violin Plot', sns.violinplot, {'palette': 'deep'}),
    ('Strip Plot', sns.stripplot, {'palette': 'bright'}),
]

for position, (panel_title, draw, colour_kwargs) in enumerate(income_views, start=1):
    plt.subplot(1, 3, position)
    draw(data=numeric_data, y='Income', **colour_kwargs)
    plt.title(panel_title)

plt.suptitle('Income Distribution')
plt.show()

There is one sample whose income is far larger than all the others (an extreme outlier).

In [None]:
# Remove income outliers: keep only customers with Income below 120000.
# .copy() makes data_clean an independent frame, so the column assignments
# done on it later (label encoding, Age, Total_Items_Purchases) do not hit
# pandas' SettingWithCopyWarning on a filtered slice.

data_clean = data_clean[data_clean['Income'] < 120000].copy()
data_clean

In [None]:
# Re-plot Income after outlier removal: box plot on the left, violin plot on the right.

fig, (box_ax, violin_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

sns.boxplot(data=data_clean, y='Income', ax=box_ax)
sns.violinplot(data=data_clean, y='Income', ax=violin_ax)

plt.show()


In [None]:
# Histogram of Recency with a KDE overlay.

axes = sns.histplot(data= numeric_data, x='Recency', kde=True)

# Annotate each bar with its count. The height is cast to int so the labels
# read "123" rather than "123.0", matching the other annotated charts.
for ax in axes.patches:
    height = ax.get_height()
    axes.text(x= ax.get_x() + ax.get_width() / 2 , y = height , s= int(height), ha='center', va='bottom', fontsize=10)

plt.title('Recency Distribution')
plt.show()

In [None]:
# Box plot and violin plot of Recency, side by side.

plt.figure(figsize=(11, 6))

recency_views = [(sns.boxplot, 'pastel'), (sns.violinplot, 'deep')]
for position, (draw, palette_name) in enumerate(recency_views, start=1):
    plt.subplot(1, 2, position)
    draw(data=data_clean, y='Recency', palette=palette_name)
    plt.ylabel('')  # hide the y label; the figure title names the variable

plt.suptitle('Recency Distribution')
plt.show()

In [None]:

# Interactive Recency histograms, coloured by marital status and by education.
recency_titles = {
    'Marital_Status': 'Recency Distribution based on Marital Status',
    'Education': 'Recency Distribution based on Education',
}

fig1, fig2 = (
    px.histogram(data_frame=data, x='Recency', color=group_column, title=chart_title)
    for group_column, chart_title in recency_titles.items()
)

for figure in (fig1, fig2):
    figure.show()

In [None]:
# 2x3 grid of histograms (with KDE), one per product-spend column.

spend_titles = {
    'MntWines': 'Amount of Wines Purchases by Customer',
    'MntFruits': 'Amount of Fruit Purchases by Customer',
    'MntMeatProducts': 'Amount of Meat Purchases by Customer',
    'MntFishProducts': 'Amount of Fish Purchases by Customer',
    'MntSweetProducts': 'Amount of Sweet Purchases by Customer',
    'MntGoldProds': 'Amount of Gold Purchases by Customer',
}

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(20, 16))

# axes.flat walks the grid row by row, matching the order of spend_titles.
for panel, (column, panel_title) in zip(axes.flat, spend_titles.items()):
    sns.histplot(data=numeric_data, x=column, kde=True, ax=panel)
    panel.set_title(panel_title)

plt.show()


In [None]:
# Distribution of wine purchases by Education, Marital_Status, Kidhome and Teenhome,
# followed by two independent-sample t-tests on MntWines.

# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` is loaded on every SciPy version.
from scipy import stats

fig1 = px.histogram(data_frame= data, x='MntWines', color='Education', title='Distribution of Wine Purchases by Education')
fig2 = px.histogram(data_frame= data, x='MntWines', color='Marital_Status', title='Distribution of Wines Purchases by Marital Status')
fig3 = px.histogram(data_frame= data, x='MntWines', color='Kidhome', title='Distribution of Wines Purchases by Kidhome')
fig4 = px.histogram(data_frame= data, x='MntWines', color='Teenhome', title='Distribution of Wines Purchases by Teenhome')

fig1.show()
fig2.show()
fig3.show()
fig4.show()


# CHECK USING T-TEST (significance level 0.05)

# Is there a significant difference in wine purchases between customers who have a kid and those who do not?
customer_no_kid = data[data['Kidhome'] == 0]['MntWines']
customer_with_kid = data[data['Kidhome'] >= 1]['MntWines']

t_stat , p_value = stats.ttest_ind(a = customer_no_kid, b= customer_with_kid)

print('Result T-Test:\n')
if p_value < 0.05:
    print("There is a significant difference in Purchasing Wines between Customers who have Kid and those who dont")
else:
    print("There is no significant difference in Purchasing Wines between Customers who have Kid and those who dont")
print('\n\n')

# Is there a significant difference in wine purchases between customers who have a teen and those who do not?
customer_no_teen = data[data['Teenhome'] == 0]['MntWines']
customer_with_teen = data[data['Teenhome'] >= 1]['MntWines']

# Run the test for the teen groups. Previously this call was missing and the
# p-value of the Kidhome test above was reused for the Teenhome conclusion.
t_stat , p_value = stats.ttest_ind(a = customer_no_teen, b= customer_with_teen)

print('Result T-Test:\n')
if p_value < 0.05:
    print("There is a significant difference in Purchasing Wines between Customers who have Teen and those who dont")
else:
    print("There is no significant difference in Purchasing Wines between Customers who have Teen and those who dont")

<ul>
    <li>Most customers spend less than 200 on wine.</li>
    <li>basic education level only spends 0-50 for wine</li>
    <li>The majority of customers have a graduate education level</li>
    <li>Customers who don't have children spend more wines than those who have children</li>
</ul>

In [None]:
# Interactive histograms of fruit spend, coloured by Kidhome and by Teenhome.

fig1, fig2 = (
    px.histogram(
        data_frame=data,
        x='MntFruits',
        color=household_column,
        title=f'Distribution of Fruits Purchases based {household_column}',
    )
    for household_column in ('Kidhome', 'Teenhome')
)

for figure in (fig1, fig2):
    figure.show()

In [None]:
# 2x2 grid of histograms (with KDE), one per purchase-channel count column.

channel_titles = {
    'NumDealsPurchases': 'Number of Purchases using Discount',
    'NumWebPurchases': 'Number of Purchases via Website',
    'NumCatalogPurchases': 'Number of Purcases through Catalog',
    'NumStorePurchases': 'Number of Purchases via Physical Store',
}

plt.figure(figsize=(18, 8))

for position, (column, panel_title) in enumerate(channel_titles.items(), start=1):
    plt.subplot(2, 2, position)
    sns.histplot(data=numeric_data, x=column, kde=True)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# List the column names again as a reference for the plots that follow.
data.columns

In [None]:
# Count plot of monthly website visits per customer.

visits_ax = sns.countplot(data=numeric_data, x='NumWebVisitsMonth', palette='bright')

# Annotate each bar with its count.
for bar in visits_ax.patches:
    bar_top = bar.get_height()
    visits_ax.text(
        x=bar.get_x() + bar.get_width() / 2,
        y=bar_top,
        s=int(bar_top),
        ha='center',
        va='bottom',
        fontsize=8,
    )

visits_ax.set_title('Number of Web Visited per Month by Customer')
visits_ax.set_xlabel('Web Visited')
plt.show()

In [None]:
# Pie charts of campaign acceptance for AcceptedCmp1 .. AcceptedCmp5.

fig, axes = plt.subplots(nrows= 2, ncols=3, figsize=(20,12))

# value_counts() orders by frequency, not by value, so the slice labels are
# derived from the actual index values (1 = accepted, 0 = not accepted).
# A fixed ['Yes','No'] list would mislabel the slices whenever 0 is the more
# frequent value.
acceptance_labels = {1: 'Yes', 0: 'No'}

for campaign, panel in zip(range(1, 6), axes.flat):
    counts = data[f'AcceptedCmp{campaign}'].value_counts()
    slice_labels = [acceptance_labels.get(value, str(value)) for value in counts.index]

    # Campaigns 1 and 4 keep their original custom start angle and colours.
    if campaign in (1, 4):
        pie_style = {'startangle': 25, 'colors': ['lightblue', 'red']}
    else:
        pie_style = {'startangle': 10}

    panel.pie(counts, labels=slice_labels, autopct='%1.1f%%', shadow=True, **pie_style)
    panel.set_title(f'Customer who Accepted Campaign {campaign}')

# Hide the unused sixth panel. Assigning None to axes[1,2] only replaced the
# array entry and left the empty axes drawn in the figure.
axes[1, 2].set_axis_off()

plt.show()

In [None]:
# Pie chart of the Response flag (1 = responded, 0 = did not respond).

# value_counts() orders by frequency, so the labels are derived from the
# index values rather than from a fixed ['Yes','No'] list, which would
# mislabel the slices whenever 0 is the more frequent value.
response_counts = data['Response'].value_counts()
response_labels = ['Yes' if value == 1 else 'No' for value in response_counts.index]

plt.pie(response_counts, autopct='%1.1f%%', shadow=True, explode=[0,0.15], startangle= 50, colors=['cyan','orange'], labels=response_labels)
plt.title('Customer that Response to the Campaign')

<h5>About 85% of customers did not respond to the campaign; only about 15% responded.</h5>

In [None]:
# Pie chart and bar chart of the Complain flag (1 = complained, 0 = did not complain).

plt.figure(figsize=(14,6))

# SHOW PIE CHART
# value_counts() orders by frequency, so the labels are derived from the
# index values rather than from a fixed ['Yes','No'] list, which would
# mislabel the slices whenever 0 is the more frequent value.
complain_counts = data['Complain'].value_counts()
complain_labels = ['Yes' if value == 1 else 'No' for value in complain_counts.index]

plt.subplot(1,2,1)
plt.pie(x = complain_counts, labels=complain_labels, autopct='%1.2f%%', shadow=True)
plt.title('Pie Chart')

# SHOW BAR CHART
plt.subplot(1,2,2)
axes = sns.countplot(data= data, x='Complain', palette='muted')
plt.title('Bar Chart')

# Annotate each bar with its count.
for ax in axes.patches:
    axes.text(x= ax.get_x() + ax.get_width() / 2, y= ax.get_height(), s= int(ax.get_height()), ha='center', va='bottom', fontsize=13)

plt.suptitle('Customer who Complain',fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Data Preprocessing

## - Label Encoding

In [None]:
# Convert Education and Marital_Status to integer codes with LabelEncoder.

# Work on an independent copy so the column assignments below never write
# into a filtered slice of the original frame (SettingWithCopyWarning).
data_clean = data_clean.copy()

# One encoder per column. Refitting a single shared encoder overwrote the
# Education mapping, so its codes could no longer be translated back with
# inverse_transform.
encoders = {}
for column in ('Education', 'Marital_Status'):
    encoders[column] = LabelEncoder()
    data_clean[column] = encoders[column].fit_transform(data_clean[column])

# Kept for backward compatibility: `encoder` was previously left fitted on Marital_Status.
encoder = encoders['Marital_Status']

data_clean.head(4)

## - Feature Engineering

In [None]:
# Derive two features: Age, and Total_Items_Purchases (sum of the six Mnt* spend columns).

spend_columns = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]

# Age is measured against the year in which the notebook is run.
current_year = datetime.now().year
data_clean['Age'] = current_year - data_clean['Year_Birth']

# Row-wise total of the spend columns.
data_clean['Total_Items_Purchases'] = data_clean[spend_columns].sum(axis=1)

data_clean.head(4)

## - Feature Selection

In [None]:

# Choose the columns used for clustering.
cols_to_cluster = ['Income','Age','Education','Marital_Status','Kidhome','Teenhome','Recency','MntWines','MntFruits','MntMeatProducts','MntFishProducts','MntSweetProducts','MntGoldProds','Total_Items_Purchases','NumDealsPurchases','NumWebPurchases','NumCatalogPurchases','NumStorePurchases']

# .copy() makes data_cluster independent of data_clean, so the in-place
# scaling of Income done on it later does not trigger
# SettingWithCopyWarning or write back into data_clean.
data_cluster = data_clean[cols_to_cluster].copy()

data_cluster

<h5> FEATURE THAT I CHOOSE TO CLUSTER:</h5>
<ul>
    <li><strong>Income </strong>: Customer income is one of the important factors in customer segmentation. Typically, customers with high income have different purchasing behavior compared to customers with lower income.</li><br>
    <li><strong>Age </strong>: Age can provide important insights into customer preferences and needs. For example, younger customers may be more likely to shop online, while older customers may prefer offline purchases or specific products. </li><br>
    <li><strong>Education </strong>: Education can be related to product preferences or purchasing channels, as education levels are often associated with different incomes or consumption patterns. </li><br>
    <li><strong>Marital_Status </strong>: Marital status can influence purchasing patterns, especially in the context of products that are more related to the family or individual. For example, customers who are married or have a partner may purchase products with a higher value or shop more often for the family.</li><br>
    <li><strong>Kidhome, Teenhome </strong>: The number of children in the household can be an important indicator in determining customer segments. For example, customers with children tend to buy different products than customers without children. </li><br>
    <li><strong>Recency </strong>: This column measures how many days have passed since a customer’s last interaction or purchase with the company. This is important for understanding how active or engaged the customer is. Customers with low recency (a recent purchase) tend to be “warmer” or more ready to engage or purchase again, while those with high recency have been out of touch for longer and may have lost interest. </li><br>
    <li><strong>MntWines, MntFruits, MntMeatProducts, MntFishProducts, MntSweetProducts, MntGoldProds , Total_Items_Purchases</strong>: These columns show how much customers spend on different product categories. This is very useful for identifying customer consumption behavior and product preferences. </li><br>
    <li><strong>NumDealsPurchases, NumWebPurchases, NumCatalogPurchases, NumStorePurchases </strong>: This column shows the purchasing channels used by customers. Customers who shop more often online (NumWebPurchases) may have different preferences and habits than those who shop more often in physical stores (NumStorePurchases). </li><br>

## - Feature Scaling

In [None]:
# 

# Rescale the Income column only; every other feature keeps its original scale.
cols_to_normalize = ['Income']

# Fit the scaler and overwrite the selected column(s) with the scaled values.
# `robust` is kept so the transformation can be inverted later.
robust = RobustScaler()
scaled_values = robust.fit_transform(data_cluster[cols_to_normalize])
data_cluster[cols_to_normalize] = scaled_values

data_cluster.head(5)

In [None]:
# Summary statistics of Income after scaling.
data_cluster['Income'].describe()

## - Check Correlation 

In [None]:
# Pearson correlation matrix of the clustering features, drawn as an annotated heatmap.

data_corr = data_cluster.corr(method='pearson')

plt.figure(figsize=(20, 20))
sns.heatmap(data=data_corr, annot=True)

Check relationship between customer income and Products Purchases

In [None]:

# Scatter plots with a fitted regression line: Income against each product-spend column.
regression_titles = {
    'MntWines': 'Wines Purchases',
    'MntFruits': 'Fruits Purchases',
    'MntMeatProducts': 'Meat Products Purchases',
    'MntFishProducts': 'Fish Products Purchases',
    'MntSweetProducts': 'Sweet Products Purchases',
    'MntGoldProds': 'Gold Products Purchases',
}

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(24, 18))

# axes.flat walks the 3x2 grid row by row, matching the order of regression_titles.
for panel, (spend_column, panel_title) in zip(axes.flat, regression_titles.items()):
    sns.regplot(data=data_clean, x='Income', y=spend_column, ax=panel)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

<h5><strong>There is a moderate correlation between income and item purchases. This means that the higher a person's income, the more they tend to spend on items.</strong></h5>

<h5><strong>CLUSTER MAP</strong>

In [None]:
# Hierarchically clustered heatmap of the clustering features, using Ward
# linkage on Euclidean distances. standard_scale=1 asks seaborn to rescale
# along the column dimension before plotting.
sns.clustermap(data=data_cluster, cmap='coolwarm', figsize=(20,20), standard_scale=1, metric='euclidean', method='ward')

## - Dimensionality Reduction

In [None]:
# Project the clustering features onto their principal components.

# n_components=None keeps all components (no truncation).
pca = PCA(n_components= None)

# Fit the PCA on data_cluster and return the transformed coordinates.
data_reduced = pca.fit_transform(data_cluster)

# Report the fitted model: number of components, share of variance per
# component, the loading vectors, the eigenvalues and the input feature names.
print(f'Number of Components : {pca.n_components_}\n')
print(f'Ratio every Component / PC : \n{pca.explained_variance_ratio_}\n')
print(f'PCA Components : \n{pca.components_}\n')
print(f'Eigen Value of Components : \n{pca.explained_variance_}\n')
print(f'Feature of Components : \n{pca.feature_names_in_}')

<h5>The first principal component (PC1) explains the overwhelming majority (92.12%) of the variance in the data. PC2 explains 6.10%, and after that, the contributions drop even further. ok lets plot it on dataframe</h5>

In [None]:

# Loadings of the first three principal components, one row per input feature.
highest_variance_PC = pca.components_[:3]

component_names = ['PC1', 'PC2', 'PC3']
PC_df = (
    pd.DataFrame(highest_variance_PC.T, index=data_cluster.columns, columns=component_names)
    .sort_values(by=component_names, ascending=False)  # largest PC1 loading first
)
PC_df

<h5><ol><li><strong>PC1 : 92.12%</strong>
<ul>
    <li>The first component (PC1) has the highest weight on the Total_Items_Purchases variable with a value of 0.855084, followed by MntWines (0.439485), MntMeatProducts (0.264177), and MntFishProducts (0.047442).</li>
    <li>PC1 appears to reflect dimensions related to purchasing patterns, especially in terms of frequency or total purchases of larger items.</li>
</ul></li>
<li><strong>PC2 : 6.10% </strong>
<ul>
    <li>PC2 shows a significant contribution from the variable MntWines (-0.729632), which has a very high negative loading. This indicates that this variable has an inverse relationship with this component. MntMeatProducts (0.650785) has positive loading</li>
    <li>This shows that PC2 is more focused on purchasing patterns for certain, more specific products.</li>
</ul></li>
<li><strong>PC 3 : 0.9% </strong>
<ul>
    <li>It can be seen that variables with negative contributions such as MntGoldProds (-0.403708) and MntFishProducts (-0.388070) have large weights, indicating that PC3 is related to the purchase of certain goods such as jewelry and gold-related products, and smaller factors related to demographics.</li>



# K-Means Clustering

## - Find Optimal K - Cluster

In [None]:
# Elbow method: fit K-Means for k = 1..10 and plot the within-cluster sum of squares.

k_range = range(1, 11)

# inertia_ is the cost (WCSS) of each fitted model.
inertia = [
    KMeans(n_clusters=k, init='k-means++', random_state=12, n_init=10)
    .fit(data_reduced)
    .inertia_
    for k in k_range
]

plt.plot(k_range, inertia)
plt.title('OPTIMAL K VALUE')
plt.xlabel('CLUSTER')
plt.ylabel('WCSS')
plt.show()

ok lets choose K = 3

In [None]:
# Silhouette coefficient for k = 2..10 (the score needs at least two clusters).

k_range = range(2, 11)
sil_scores = []

for k in k_range:
    candidate = KMeans(n_clusters=k, random_state=12, init='k-means++', n_init=10)
    cluster_labels = candidate.fit_predict(data_reduced)
    sil_scores.append(
        silhouette_score(X=data_reduced, labels=cluster_labels, metric='euclidean')
    )

plt.plot(k_range, sil_scores)
plt.title('Silhouette Coefficient')
plt.xlabel('CLUSTER')
plt.ylabel('Silhouette Score')
plt.show()

In [None]:
# Davies-Bouldin score for k = 2..10.

k_range = range(2, 11)
davies_score = []

for k in k_range:
    candidate = KMeans(n_clusters=k, init='k-means++', random_state=12, n_init=10)
    cluster_labels = candidate.fit_predict(data_reduced)
    davies_score.append(davies_bouldin_score(X=data_reduced, labels=cluster_labels))

plt.plot(k_range, davies_score)
plt.title('Davies Bouldin Score')
plt.xlabel('CLUSTER')
plt.ylabel('Davies Score')
plt.show()

ok lets create K-means model with K = 3

## - Model Fitting

In [None]:
# Final K-Means model with k = 3, fitted on the PCA-transformed data.

final_kmeans_params = {
    'n_clusters': 3,
    'init': 'k-means++',
    'random_state': 12,
    'n_init': 1,
}
fixed_kmean = KMeans(**final_kmeans_params)
fixed_kmean.fit(data_reduced)

## - Evaluation

In [None]:
# Evaluate the fitted model with WCSS, silhouette coefficient and Davies-Bouldin score.

final_labels = fixed_kmean.labels_

wcss = fixed_kmean.inertia_
sil_coef = silhouette_score(X=data_reduced, labels=final_labels)
davies_bouldin = davies_bouldin_score(X=data_reduced, labels=final_labels)

print(f'WCSS Score : {wcss}')
print(f'Silhouette Coefficient Score : {sil_coef}')
print(f'Davies Bouldin Score : {davies_bouldin}')

now lets Visualize it

## - Result

In [None]:
# Visualize the K-Means result on the first two PCA components.

centroids = fixed_kmean.cluster_centers_

# Scatter plot: customers coloured by cluster, centroids as red crosses.
plt.figure(figsize=(20,12))
plt.scatter(x= data_reduced[:,0], y= data_reduced[:,1], c= fixed_kmean.labels_, cmap='viridis', s=250, alpha=0.7)
plt.scatter(x= centroids[:,0], y=centroids[:,1], c='red', marker='X', label='Centroid', s=500)

# The centroid label only appears once a legend is drawn; the plot also had
# no axis names or title.
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('K-Means Clusters (first two principal components)')
plt.legend()
plt.show()

In [None]:
# Visualize the clusters in 3D using a three-component t-SNE embedding.

# A fixed random_state makes the embedding reproducible between runs
# (12 matches the seed used for K-Means).
tsne = TSNE(n_components=3, random_state=12)
tsne_data = tsne.fit_transform(data_reduced)

fig = plt.figure(figsize=(14,7))
ax = fig.add_subplot(1,1,1,projection='3d')

# Draw on the 3D axes with the third component as the z coordinate.
# The earlier plt.scatter(..., s=tsne_data[:,2]) passed it as the marker
# size, so every point was drawn at z = 0 with meaningless (and partly
# negative) sizes.
scatter = ax.scatter(tsne_data[:,0], tsne_data[:,1], tsne_data[:,2], c=fixed_kmean.labels_)

ax.set_title('CLUSTER')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')

fig.colorbar(scatter)

plt.show()

In [None]:
# Interactive 3D scatter of the first three PCA components, coloured by cluster label.

pc1, pc2, pc3 = (data_reduced[:, component] for component in range(3))

fig = px.scatter_3d(
    x=pc1,
    y=pc2,
    z=pc3,
    color=fixed_kmean.labels_,
    title='K-Mean Clustering',
    labels={'color' : 'Cluster'},
    width=1000,
    height=600,
)
fig.update_traces(marker={'size': 8})

fig.show()

In [None]:
# Build the result frame: clustering features plus the assigned cluster.

# Explicit copy: pd.DataFrame(data_cluster) may share its underlying data
# with data_cluster, so restoring Income below could also change the scaled
# values in data_cluster.
data_kmean = data_cluster.copy()
prediction = fixed_kmean.predict(data_reduced)

# Restore Income to its original scale by inverting the RobustScaler.
# ravel() flattens the (n, 1) result into a 1-D column.
data_kmean['Income'] = robust.inverse_transform(data_kmean[['Income']]).ravel()

# Add the cluster column; +1 makes the cluster ids start at 1 instead of 0.
data_kmean['Cluster'] = prediction + 1
data_kmean.head(5)

In [None]:
# Number of customers in each cluster (non-null Age values counted per cluster).
data_kmean.groupby('Cluster')['Age'].count()

Cluster 2 has 1236 points, Cluster 3 has 600 points, and Cluster 1 has 379 points.

ok lets try some visualize

In [None]:
# Annotated correlation heatmap of the clustering features, now including the Cluster column.
plt.figure(figsize=(20,20))
sns.heatmap(data= data_kmean.corr(), annot=True, cmap='coolwarm')

In [None]:
# Income histogram with one KDE-smoothed series per cluster.
sns.histplot(data=data_kmean, x='Income', hue='Cluster', kde=True)
plt.show()

In [None]:
# Average income per cluster.
data_kmean.groupby('Cluster')['Income'].mean()

Cluster 1 has a higher average income than other clusters

In [None]:
# Histogram of Total_Items_Purchases, split by cluster.
sns.histplot(data=data_kmean, x='Total_Items_Purchases', hue='Cluster')

In [None]:
# Average Total_Items_Purchases per cluster.
data_kmean.groupby('Cluster')['Total_Items_Purchases'].mean()

Cluster 1 has the highest total purchases, indicating high activity, while Cluster 2 shows the lowest, suggesting less frequent buying. Cluster 3 falls in between, with moderate purchase totals. This variation highlights different purchasing behaviors across the clusters.

In [None]:
# Histogram of customer age, split by cluster.
sns.histplot(data=data_kmean, x='Age', hue='Cluster')
plt.title('Age')

In [None]:
# Kidhome (left) and Teenhome (right) counts, split by cluster.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
kid_panel, teen_panel = axes

sns.countplot(data=data_kmean, x='Kidhome', hue='Cluster', ax=kid_panel)
sns.countplot(data=data_kmean, x='Teenhome', hue='Cluster', ax=teen_panel)

In [None]:
# Histogram of Recency, split by cluster.
sns.histplot(data=data_kmean, x='Recency', hue='Cluster')

In [None]:
# Count of discounted purchases (NumDealsPurchases), split by cluster.
sns.countplot(data=data_kmean, x='NumDealsPurchases', hue='Cluster')

# Average number of discounted purchases per cluster (echoed as the cell output).
data_kmean.groupby('Cluster')['NumDealsPurchases'].mean()

On average, cluster 1 uses less discount than other clusters.

OK, now we separate the data for each cluster.

In [None]:
# Split the result frame into one DataFrame per cluster id (1, 2, 3).

Cluster_1, Cluster_2, Cluster_3 = (
    data_kmean[data_kmean['Cluster'] == cluster_id]
    for cluster_id in (1, 2, 3)
)

# Echo the size of each cluster.
tuple(len(members) for members in (Cluster_1, Cluster_2, Cluster_3))

# Cluster Summary

<h4 style='color:deepskyblue;'><strong>CLUSTER 1

In [None]:
# Summary statistics of every feature for the customers in cluster 1.
Cluster_1.describe()

<h5><ul><li>The average income in Cluster 1 is around 76,596 with a relatively high standard deviation (10,204)</li>
        <li>The mean age in this cluster was 56 years, with a large age variation (13 years).</li>
        <li>This cluster has an average value in the Marital Status category (3.7) which shows that most individuals in this cluster are married, while the average education level is 2.6, which shows that the majority have education levels up to high school or university.</li>
        <li>Average spending in categories such as MntWines (849), MntMeatProducts (502), and MntGoldProds (77) shows that members of this cluster tend to purchase more premium products or make more expensive consumer goods purchases more frequently than other clusters.</li>
        <li>Purchases made through multiple channels show a higher number of purchases, such as at NumStorePurchases with an average of 8.37 purchases per individual.</li>

<h4 style='color:yellow'><strong>CLUSTER 2

In [None]:
# Summary statistics of every feature for the customers in cluster 2.
Cluster_2.describe()

<h5><ul><li>The average income in this cluster is 37,221, much lower than Cluster 1. With a standard deviation of 13,620, there is still variation, but in general, their incomes tend to be lower, ranging from 1,730 to 113,734.</li>
        <li>The average age was 53.7 years. Compared to Cluster 1, the average age was younger, but still older than the general population average.</li>
        <li>Most of the members of this cluster are married with an average score of 3.7 on Marital Status. They have an average education level of 2.3,</li>
        <li>This cluster has lower spending than Cluster 1. For example, MntWines has an average of only 69, which is much lower than Cluster 1.</li>
        <li>This cluster tends to shop less across channels than Cluster 1.</li>

<h4 style='color:green;'><strong>CLUSTER 3

In [None]:
# Summary statistics of every feature for the customers in cluster 3.
Cluster_3.describe()

<h5><ul><li>The average income in this cluster is 65,583, higher than Cluster 2, but lower than Cluster 1. With a standard deviation of 11,003, it shows that although the income of this cluster is higher than Cluster 2, there is little variation in income.</li>
        <li>The average age is 57.8 years, indicating that this cluster is older than both Cluster 1 (56 years) and Cluster 2 (53.7 years).</li>
        <li>Most of the members of this cluster have a married status with an average of 3.7 in Marital Status. Their education level is slightly higher with an average of 2.4</li>
        <li>Spending on premium products such as MntWines (451) is lower than Cluster 1, but higher than Cluster 2. This indicates a tendency to make higher consumption product purchases, but still tends to be more economical than Cluster 1.</li>
        <li>This cluster makes purchases more frequently than Cluster 2, especially in the NumStorePurchases category with an average of 8.6 purchases.</li>

<h4><strong>CONCLUSION : </strong></h4>
<h5>
<ol>
    <li style='color:deepskyblue; font-weight:bold;'>CLUSTER 1</li>
    <ul>
        <li>This cluster consists of individuals with higher incomes and older ages. They make more frequent purchases of premium consumer goods, especially in the food sector and luxury products such as gold. They are more active in purchasing goods offline (through stores).</li>
        <li><strong>Name Cluster : Premium Consumers </strong></li>
    </ul><br>
    <li style='color:yellow; font-weight:bold;'>CLUSTER 2</li>
    <ul>
        <li>This cluster consists of individuals with lower incomes and younger ages than Cluster 1. They tend to be more frugal in spending and make fewer premium purchases. Their purchases are mostly made online or through other more affordable channels.</li>
        <li><strong>Name Cluster : Value Seekers </strong></li>
    </ul><br>
    <li style='color:green; font-weight:bold;'>CLUSTER 3 </li>
    <ul>
        <li>Individuals in Cluster 3 have higher incomes than Cluster 2, with slightly higher expenditures than Cluster 2 but lower than Cluster 1. They tend to be older and shop more frequently through multiple channels.</li>
        <li><strong>Name Cluster : Middle-Class Shoppers </strong></li>


<h4><strong><center>BUSINESS ANALYSIS</center></strong></h4>
<h4><strong>MARKETING STRATEGY : </strong></h4>

<ol>
    <li style='color:deepskyblue; font-weight:bold;'>CLUSTER 1 (PREMIUM CONSUMERS)</li>
    <ul>
        <li>Focus on premium and luxury products, such as wine, high-quality meat products, or luxury goods such as gold jewelry.</li>
        <li>Offer an exclusive and personalized shopping experience, for example through more personalized customer service or special events.</li>
        <li>Consider offering exclusive deals to loyal customers in physical stores or through loyalty programs.</li>
    </ul><br>
    <li style='color:yellow; font-weight:bold;'>CLUSTER 2 (VALUE SEEKERS)</li>
    <ul>
        <li>Focus on offering more affordable products with added value, such as big discounts or bundled packages.</li>
        <li>Strengthen online sales channels, including promotions through e-commerce and digital campaigns that highlight price and value.</li>
        <li>Provide special offers for younger customers, such as discounts or seasonal promotions that can attract their attention.</li>
    </ul><br>
    <li style='color:green; font-weight:bold;'>CLUSTER 3 (MIDDLE-CLASS SHOPPERS)</li>
    <ul>
        <li>Focus on products that provide a balance between quality and price, such as everyday necessities or products with affordable but premium brands.</li>
        <li>Take advantage of multiple channels, both online and offline, with offers tailored to both in-store and online shopping.</li>
        <li>Offer loyalty programs that can attract frequent shoppers, such as reward points for repeat purchases.</li>
    </ul>