In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install kneed
from kneed import KneeLocator
import seaborn as sns



In [None]:
# Read the mall-customers CSV straight from GitHub and preview the first rows
url = "https://raw.githubusercontent.com/NathaliaMinoque/datasets/refs/heads/main/mall_customers.csv"
df = pd.read_csv(filepath_or_buffer=url)
df.head(5)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the frame
df.describe()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Give the income and spending-score columns shorter, unit-free names
short_names = {
    "Annual Income (k$)": "Income",
    "Spending Score (1-100)": "Spending Score",
}
df = df.rename(columns=short_names)
df.head()

# 3 Visualizations

In [None]:
# One 15x4 figure holding three side-by-side panels
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
gender_ax, age_ax, scatter_ax = axes

# Panel 1: number of customers per gender
sns.countplot(x='Gender', data=df, palette='pastel', ax=gender_ax)
gender_ax.set_title('Gender Distribution')

# Panel 2: age histogram (15 bins) with a KDE overlay
sns.histplot(df['Age'], bins=15, kde=True, color='skyblue', ax=age_ax)
age_ax.set_title('Age Distribution')

# Panel 3: income against spending score, coloured by gender
sns.scatterplot(x='Income', y='Spending Score', hue='Gender', data=df, ax=scatter_ax)
scatter_ax.set_title('Income vs Spending Score')

fig.tight_layout()
plt.show()

## **Explanations**

1. **Gender Distribution:** The number of male and female customers is relatively balanced, meaning that marketing strategies can be tailored for both segments.

2. **Age Distribution:** Most customers are between 25-40 years old, indicating that the mall's primary target audience consists of young, productive adults.

3. **Income vs Spending Score:** An interesting pattern appears: there are groups with high income but low spending, and others with low income but high spending. This suggests differences in shopping behaviour that can be further identified through clustering.

# Data Transformation

In [None]:
# Encode the Gender column as integers (Male -> 0, Female -> 1).
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell on the already-encoded column is a no-op. A plain
# dict-based .map() would instead turn every 0/1 into NaN on the second run.
gender_codes = {'Male': 0, 'Female': 1}
df['Gender'] = df['Gender'].map(lambda value: gender_codes.get(value, value))

# Check the encoded column
print("Data setelah encoding 'Gender':")
print(df.head())
df.info()

# Data Scaling

In [None]:
# scale dataset with decimal scaling
def decimal_scaling(df):
    """Return a copy of ``df`` with every numeric column decimal-scaled.

    Each numeric column is divided by ``10 ** j``, where ``j`` is the number
    of times the column's largest absolute value has to be divided by 10
    until it is no longer greater than 1. Non-numeric columns are copied
    unchanged and the input frame is not modified.

    The magnitude is taken from the absolute values, so columns containing
    negative numbers are scaled by their largest magnitude rather than by
    their (possibly small or negative) signed maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``df``.

    Notes
    -----
    A column whose largest magnitude is NaN (empty / all-missing) or infinite
    is divided by 1, i.e. its values are left as they are.
    A column whose largest magnitude is an exact power of ten (e.g. 100) is
    scaled so that this value becomes exactly 1.0.
    """
    normalized_df = df.copy()
    # "number" covers every numeric dtype (int32, float32, ...), not only
    # int64/float64, so no numeric column is silently skipped.
    numeric_cols = df.select_dtypes(include="number").columns
    for column in numeric_cols:
        max_abs = df[column].abs().max()
        j = 0
        # Guard against NaN and +/-inf: dividing inf by 10 never drops below
        # 1, so without this check the loop below would never terminate.
        if pd.notna(max_abs) and np.isfinite(max_abs):
            while max_abs > 1:
                max_abs /= 10
                j += 1
        normalized_df[column] = df[column] / (10 ** j)
    return normalized_df

# Apply decimal scaling to the customer data
df_decimal_scaled = decimal_scaling(df)

# A bare expression in the middle of a cell is evaluated and discarded, so
# the preview has to be printed explicitly to show up in the output.
print(df_decimal_scaled.head())

# Scatter plot of the decimal-scaled income against the spending score
plt.scatter(df_decimal_scaled["Income"], df_decimal_scaled["Spending Score"])
plt.xlabel("Annual Income")
plt.ylabel("Spending Score")
plt.title("Income vs Spending Score (Decimal Scaled)")
plt.grid(axis='both', linestyle='--')
plt.show()


In [None]:
# Standardise the clustering features with StandardScaler

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Columns that take part in the clustering
features = df[['Age', 'Income', 'Spending Score']]

# Fit the scaler on those columns, then transform them
sc = StandardScaler()
sc.fit(features)
scaled_features = sc.transform(features)

# Wrap the scaled array in a DataFrame that keeps the column names
df_sc_scaled = pd.DataFrame(scaled_features, columns=list(features.columns))

# Preview the scaled values
print(df_sc_scaled.head())

# Scatter plot: scaled spending score (x) against scaled income (y)
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(df_sc_scaled["Spending Score"], df_sc_scaled["Income"], alpha=0.6)
ax.set_xlabel("Spending Score (scaled)")
ax.set_ylabel("Income (scaled)")
ax.set_title("Spending Score and Income Patterns (Standard Scaled)")
ax.grid(axis='both', linestyle='--')
plt.show()

In [None]:
# Scale the clustering features with MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

# Columns that take part in the clustering
features = ["Age", "Income", "Spending Score"]

scaler = MinMaxScaler()

# Fit and transform in one step, keeping the column names on the result
df_minmax_scaled = pd.DataFrame(scaler.fit_transform(df[features]), columns=features)

# A bare expression in the middle of a cell is evaluated and discarded, so
# the preview has to be printed explicitly to show up in the output.
print(df_minmax_scaled.head())

# Scatter plot of the min-max scaled income against the spending score
plt.figure(figsize=(6,5))
plt.scatter(df_minmax_scaled["Income"], df_minmax_scaled["Spending Score"], alpha=0.7)
plt.xlabel("Annual Income (scaled)")
plt.ylabel("Spending Score (scaled)")
plt.title("Income vs Spending Score (Min-Max Scaled)")
plt.grid(axis='both', linestyle='--')
plt.show()

# Choosing K

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Candidate numbers of clusters, defined once so the fitting loop and the
# plot's x-axis cannot drift apart.
k_values = range(2, 11)

wcss = []  # Within-Cluster Sum of Squares (KMeans.inertia_) for each K

# Fit K-Means on the min-max scaled features for every candidate K.
# n_init is set explicitly: its default differs between scikit-learn
# releases, so leaving it out makes the curve version-dependent. The value
# matches the one used by the elbow loop in the modelling section.
for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(df_minmax_scaled)
    wcss.append(kmeans.inertia_)

# Plot WCSS against K (Elbow Method)
plt.figure(figsize=(7,5))
plt.plot(k_values, wcss, marker='o', linestyle='--', color='b')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('WCSS')
plt.grid(axis='both', linestyle='--')
plt.show()

## **Conclusion: Choosing the Final K**

Based on the Elbow Method, the Within-Cluster Sum of Squares (WCSS) decreases sharply at the beginning and then starts to flatten after a certain point, forming an “elbow” shape around K = 5. This indicates that five clusters provide the best balance between cluster compactness and separation. Therefore, the optimal number of clusters chosen for this analysis is K = 5, which will be used in the next step to perform K-Means modeling and identify distinct customer segments.

# Modeling with K-Means

In [None]:
from sklearn.cluster import KMeans

# Standard-scaled feature matrix used for the model
X = df_sc_scaled[["Age", "Income", "Spending Score"]]

# Settings shared by every K-Means run in the sweep below
kmeans_params = dict(init='k-means++', max_iter=300, n_init=10, random_state=0)

# Record the inertia (WCSS) for 1 to 10 clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, **kmeans_params).fit(X)
    wcss += [kmeans.inertia_]

# Elbow curve: WCSS against the number of clusters
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(range(1, 11), wcss, marker='o', linestyle='--', color='b')
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('WCSS')
ax.grid(axis='both', linestyle='--')
plt.show()

In [None]:
# Tabulate the WCSS value recorded for each number of clusters (1 to 10)
wcss_columns = {}
wcss_columns["Number of Clusters"] = range(1, 11)
wcss_columns["WCSS"] = wcss
wcss_table = pd.DataFrame(data=wcss_columns)

# Show the table as plain text
print(wcss_table)

In [None]:
# Cluster counts that the WCSS values computed above correspond to.
# NOTE: the name `clusters` is rebound to the predicted label array by the
# model-fitting cell further down.
clusters = range(1, 11)

# Automatically find the elbow point using the KneeLocator
knee_locator = KneeLocator(clusters, wcss, curve="convex", direction="decreasing")
# `knee` is None when no elbow can be detected on the curve
optimal_clusters = knee_locator.knee

# Plot the Elbow Method graph
plt.figure(figsize=(8, 5))
plt.plot(clusters, wcss, marker='o', linestyle='--', label="WCSS")
if optimal_clusters is not None:
    # Only mark the elbow when one was found; indexing wcss or drawing a
    # vertical line with None would raise.
    plt.axvline(optimal_clusters, linestyle='--', color='red', label=f'Optimal Clusters: {optimal_clusters}')
    # `clusters` starts at 1, so K maps to position K - 1 in `wcss`
    plt.scatter(optimal_clusters, wcss[optimal_clusters-1], c='red', s=100, zorder=5)  # Highlight elbow point
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.legend()
plt.grid(True)
plt.show()

# Print the optimal number of clusters
if optimal_clusters is None:
    print("No elbow point could be detected in the WCSS curve.")
else:
    print(f"The optimal number of clusters is: {optimal_clusters}")

In [None]:
# Final model: K-Means with K = 3 fitted on the standard-scaled features
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=0)
kmeans.fit(df_sc_scaled)

# Label assigned to each row by the fitted model
clusters = kmeans.labels_

# Store the labels next to the original (unscaled) customer data
df['KMeans_Cluster'] = clusters

# Preview the labelled frame
df.head()

In [None]:
# Visualize the clustering result with a scatter chart on the scaled data
plt.figure(figsize=(10,7))

# The model was fitted on df_sc_scaled, so each centroid has one coordinate
# per column of that frame, in the same order. Look the positions up by name:
# hard-coding columns 0 and 1 would plot the Age/Income coordinates on the
# Income/Spending Score axes.
income_idx = df_sc_scaled.columns.get_loc("Income")
spending_idx = df_sc_scaled.columns.get_loc("Spending Score")
centroids = kmeans.cluster_centers_

# Colour the points by the labels of the same model the centroids come from
sns.scatterplot(x = "Income", y = "Spending Score", hue=kmeans.labels_, palette="viridis", data=df_sc_scaled, s = 60)
plt.scatter(centroids[:, income_idx], centroids[:, spending_idx], s=200, c="black", label="Centroids")
plt.xlabel("Scaled Income")
plt.ylabel("Scaled Spending Score")
plt.title("Clustering Result with K-Means Method (Scaled Data)")
plt.legend()
plt.show()

In [None]:
# Number of customers assigned to each cluster
cluster_counts = df['KMeans_Cluster'].value_counts()

# Bar chart of the membership counts
fig, ax = plt.subplots(figsize=(8, 6))
cluster_counts.plot.bar(ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Cluster Membership Counts')
ax.set_xlabel('K-Means Cluster')
ax.set_ylabel('Number of Customers')
ax.grid(axis='y')
plt.show()

In [None]:
# Per-cluster mean / min / max of age, annual income and spending score.
# Grouping uses the label column stored on df rather than the global
# `clusters`, because that name is also used for the range of candidate K
# values in the elbow cell and may no longer hold the labels.
summary_stats = ['mean', 'min', 'max']
cluster_summary = df.groupby('KMeans_Cluster').agg({'Age': summary_stats,
                                                    'Income': summary_stats,
                                                    'Spending Score': summary_stats})

cluster_summary = cluster_summary.rename_axis('KMeans_cluster')

cluster_summary

# Managerial Implications Based on the Three Clusters (Income & Spending Score)

Cluster 0 – Middle Income, Low Spending

Profile:

- Income: Moderate (~$46K average).

- Age: Older group (around 52 years on average).

- Tend to be cautious or conservative spenders.

- Likely prioritize practical and essential goods over luxury or lifestyle products.

Implications:

- Focus marketing on value-for-money and product durability.

- Introduce discounts, bundled packages, or loyalty rewards to increase purchase frequency.

- Emphasize trust, reliability, and customer support in brand communication.


Cluster 1 – High Income, High Spending

Profile:

- Income: Highest among all clusters (~$86K average).

- Age: Middle-aged (around 39 years old).

- Represent affluent and active spenders who enjoy luxury and premium experiences.

- This group contributes the most to profitability.

Implications:

- Focus marketing on premium, exclusive, and personalized offerings.

- Provide VIP memberships, priority services, and luxury experiences to maintain loyalty.

- Emphasize brand prestige, status, and lifestyle appeal in campaigns.

Cluster 2 – Moderate Income, High Spending

Profile:

- Income: Medium (~$61K average).

- Age: Youngest group (around 28 years old).

- Often impulsive and trend-oriented buyers, influenced by visuals and social media.

- Seek emotional satisfaction and social recognition from purchases.

Implications:

- Use digital marketing, influencer collaborations, and limited-time offers to capture attention.

- Provide flexible payment options (installments, digital wallets, etc.).

- Maintain brand freshness through modern designs and youth-oriented campaigns.

# Overall Insights

The analysis reveals three distinct customer segments based on income, spending score, and age:

- Cluster 1: High-value customers who should be the focus of premium and loyalty strategies.

- Cluster 2: Growth-oriented segment—young, active, and easily influenced by trends.

- Cluster 0: Conservative customers who require practical, value-based marketing approaches.

By customizing marketing strategies for each cluster, businesses can optimize resource allocation, improve customer engagement, and maximize profitability across all segments.