In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Step 1: Load the dataset
# The raw frame is kept under its own name so the cleaned frame below can
# always be rebuilt from it without re-reading the file.
df_raw = pd.read_csv('customer_segmentation_dataset.csv')

# Step 2: Data Exploration and Cleaning

# Check for missing values (count of nulls per column, before any imputation)
print(df_raw.isnull().sum())

# Handle any missing values if necessary.
# numeric_only=True restricts the column means to numeric columns; without it,
# pandas >= 2.0 raises TypeError as soon as the frame contains a non-numeric
# column. Missing values in non-numeric columns (if any) are left untouched.
df = df_raw.fillna(df_raw.mean(numeric_only=True))

# Summary statistics of the cleaned frame
print(df.describe())

In [None]:
# Step 3: Descriptive Statistics

# Mean of the 'Income' column over all rows of the cleaned frame.
income_column = df['Income']
average_income = income_column.mean()
print(f"Average Income: {average_income}")

In [None]:
# Step 4: Customer Segmentation

# Select relevant features for clustering
features = [
    'Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
    'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
    'NumWebVisitsMonth', 'Age', 'Customer_Days', 'MntTotal', 'MntRegularProds'
]

# Tunable settings for the clustering runs below.
random_seed = 42
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and warns about it in 1.2-1.3), so leaving it unset
# makes the fitted clusters depend on the installed version.
n_init = 10
# Candidate cluster counts for the elbow plot; shared by the loop and the
# x-axis so the two can never drift apart.
cluster_range = range(1, 11)

# Standardize the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[features])

# Determine the optimal number of clusters using the elbow method:
# fit one model per candidate k and record its inertia.
inertia = [
    KMeans(n_clusters=n_clusters, n_init=n_init, random_state=random_seed)
    .fit(df_scaled)
    .inertia_
    for n_clusters in cluster_range
]

fig, ax = plt.subplots()
ax.plot(list(cluster_range), inertia, marker='o')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method')
plt.show()

# Fit the K-means model with the optimal number of clusters (e.g., 3).
# NOTE(review): 3 is a hand-picked value; confirm it against the elbow plot.
optimal_clusters = 3
kmeans = KMeans(n_clusters=optimal_clusters, n_init=n_init, random_state=random_seed)
df['Cluster'] = kmeans.fit_predict(df_scaled)

In [None]:
# Step 5: Visualization

# Scatter plot of clusters based on Income and MntTotal
fig, ax = plt.subplots()
sns.scatterplot(x='Income', y='MntTotal', hue='Cluster', data=df, palette='viridis', ax=ax)
ax.set_title('Customer Segments by Income and Total Spending')
plt.show()

# Bar plot of average total purchase value per cluster
cluster_summary = df.groupby('Cluster')['MntTotal'].mean().reset_index()
fig, ax = plt.subplots()
# seaborn 0.13 deprecated passing `palette` without `hue` (to be removed in
# 0.14), so the x variable is also assigned to hue. dodge=False keeps the
# bars full-width and centred on their tick on older seaborn releases too.
sns.barplot(
    x='Cluster', y='MntTotal', hue='Cluster', data=cluster_summary,
    palette='viridis', dodge=False, ax=ax,
)
# The hue legend would only repeat the x-axis labels; drop it if one was drawn.
bar_legend = ax.get_legend()
if bar_legend is not None:
    bar_legend.remove()
ax.set_title('Average Total Purchase Value per Cluster')
plt.show()

In [None]:
# Step 6: Insights and Recommendations

# Summarize the characteristics of each segment: one row per cluster holding
# the mean of every clustering feature. The aggregation spec is built from
# the `features` list defined in the segmentation step, which names the same
# columns in the same order.
mean_per_feature = dict.fromkeys(features, 'mean')
summary = df.groupby('Cluster').agg(mean_per_feature).reset_index()

print(summary)




In [None]:
# Provide recommendations based on segment characteristics.
#
# K-means label numbers are arbitrary (cluster 0 is not guaranteed to be the
# highest-spending segment), so each cluster is ranked by its average
# MntTotal and the recommendation is chosen from that rank rather than from
# the raw label.
# NOTE(review): the rank -> message mapping (top spender = retain, lowest =
# prospect for upsell, everything in between = raise frequency) is an
# assumption about the intended meaning of the three messages; confirm.
clusters_by_spend = summary.sort_values('MntTotal', ascending=False)['Cluster'].tolist()
spend_rank = {label: rank for rank, label in enumerate(clusters_by_spend)}
lowest_rank = len(clusters_by_spend) - 1

# itertuples keeps each column's own dtype, so cluster labels print as
# integers; rows are visited in the summary's original (label) order.
for segment in summary.itertuples(index=False):
    print(f"Cluster {segment.Cluster}:")
    print(f" - Average Income: {segment.Income}")
    print(f" - Average Total Purchase Value: {segment.MntTotal}")
    print(" - Recommendations:")
    rank = spend_rank[segment.Cluster]
    if rank == 0:
        print("   * Focus on retaining these high-value customers with loyalty programs and exclusive offers.")
    elif rank < lowest_rank:
        print("   * Encourage higher purchase frequency with targeted promotions and reminders.")
    else:
        print("   * Identify potential high-value customers in this segment and target them with personalized marketing.")
    print()