In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Customer segmentation on the mall-customers dataset: quick exploratory
# plots, an elbow plot to choose the cluster count, then a final K-Means fit
# visualised on annual income vs. spending score.

# Column names used repeatedly below.
INCOME_COL = 'Annual Income (k$)'
SCORE_COL = 'Spending Score (1-100)'

# Pin the number of K-Means restarts. scikit-learn 1.4 changed the default
# from 10 to 'auto'; setting it explicitly keeps results identical across
# versions and avoids the FutureWarning raised by 1.2/1.3.
N_INIT = 10

# Candidate cluster counts evaluated by the elbow method.
CLUSTER_RANGE = range(1, 11)

# Load dataset
df = pd.read_csv("Mall_Customers.csv")

# Display basic info.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a spurious "None" line.
df.info()
print(df.describe())

# Visualizing the distribution of Age, Annual Income, and Spending Score
for column, title, color in (
    ('Age', "Age Distribution", 'blue'),
    (INCOME_COL, "Annual Income Distribution", 'green'),
    (SCORE_COL, "Spending Score Distribution", 'red'),
):
    plt.figure(figsize=(12, 5))
    sns.histplot(df[column], bins=20, kde=True, color=color)
    plt.title(title)
    plt.show()

# Selecting features for clustering
X = df[[INCOME_COL, SCORE_COL]]

# Finding optimal number of clusters using Elbow Method:
# record the within-cluster sum of squares (inertia) for each candidate k.
wcss = []
for k in CLUSTER_RANGE:
    model = KMeans(n_clusters=k, init='k-means++', n_init=N_INIT, random_state=42)
    model.fit(X)
    wcss.append(model.inertia_)

plt.figure(figsize=(10, 5))
plt.plot(CLUSTER_RANGE, wcss, marker='o', linestyle='--', color='b')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method')
plt.show()

# Applying K-Means Clustering (Assuming optimal clusters = 5)
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=N_INIT, random_state=42)
df['Cluster'] = kmeans.fit_predict(X)

# Visualizing clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x=INCOME_COL, y=SCORE_COL, hue='Cluster', palette='viridis', s=100)
# Centroid columns follow the feature order of X: income first, score second.
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='red', marker='X', label='Centroids')
plt.title("Customer Segments")
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.legend()
plt.show()
