# In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns  # Import for additional visualizations (optional)
from sklearn.metrics import silhouette_score

# Step 2: Load the customer data from CSV
data = pd.read_csv('Mall_Customers.csv')

# Step 3: Data Exploration
# Summary statistics as produced by DataFrame.describe()
summary_stats = data.describe()
print(summary_stats)

# How often each value occurs in the categorical 'Gender' column
gender_counts = data['Gender'].value_counts()
print(gender_counts)

# Number of missing values in each column (all zeros means nothing to impute)
missing_per_column = data.isna().sum()
print("Missing values", missing_per_column)


# Data Visualization
# Pairwise plots across the DataFrame's columns to eyeball relationships
sns.pairplot(data)
plt.show()

# Step 4: Feature Selection
# Segment customers on the income and spending-score columns only
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
features = data[feature_columns]

# Step 5: Data Standardization (recommended for K-means)
# Fit the scaler on the selected features, then transform those same features
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

# Step 6: WCSS (Within-Cluster Sum of Squares) values for different k
# n_init is passed explicitly so that every candidate k is fitted with the
# same number of initialisations as the final model in Step 7. Relying on the
# library default would make the elbow curve depend on the installed
# scikit-learn version and disagree with the model it is used to choose.
k_values = range(2, 11)
wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(scaled_features)
    wcss.append(kmeans.inertia_)  # inertia_ is scikit-learn's name for WCSS

# Plot the elbow curve (same k values as the loop above)
plt.plot(k_values, wcss)
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS')
plt.title('Elbow Method for KMeans Clustering')
plt.show()

# Identify the elbow point (visually)
# Look for a sharp decrease in WCSS followed by a more gradual decrease

# Step 7: K-means Clustering (k chosen from the elbow plot in Step 6)
k = 5
kmeans = KMeans(n_init=10, random_state=42, n_clusters=k).fit(scaled_features)

# Step 8: Predict the cluster label of every customer with the fitted model
cluster_labels = kmeans.predict(scaled_features)

# Keep the assignment next to the original columns for the analysis below
data['Cluster'] = cluster_labels

# Step 9: Analyze Customer Segments

# A. Cluster Distribution Visualization
# Income vs. spending score, each point coloured by its cluster label
income = data['Annual Income (k$)']
spending = data['Spending Score (1-100)']
plt.scatter(income, spending, c=cluster_labels)
plt.title('Customer Clusters')
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.show()

# B. Cluster Characteristics (descriptive statistics per cluster)
cluster_summary = data.groupby('Cluster').describe()
print(cluster_summary)

# C. Analyze Spending Habits by Cluster (Optional)
# Box plot of the spending score within each cluster, with the mean marked
box_ax = sns.boxplot(data=data, x="Cluster", y="Spending Score (1-100)", showmeans=True)
box_ax.set(
    xlabel='Cluster',
    ylabel='Spending Score',
    title='Spending Score Distribution by Cluster',
)
plt.show()

# D. Analyze Income Distribution by Cluster (Optional)
# Violin plot of the annual income within each cluster
violin_ax = sns.violinplot(data=data, x="Cluster", y="Annual Income (k$)")
violin_ax.set(
    xlabel='Cluster',
    ylabel='Annual Income',
    title='Annual Income Distribution by Cluster',
)
plt.show()

# E. Analyze Gender Distribution by Cluster (Optional)
# Count each gender within every cluster, then pivot the gender level into
# columns so the result has one row per cluster.
gender_by_cluster = data.groupby('Cluster')['Gender']
per_cluster_counts = gender_by_cluster.value_counts()
gender_distribution = per_cluster_counts.unstack()
print(gender_distribution)

