# Customer Segmentation using K-Means Clustering
In this notebook, we perform customer segmentation using the K-Means clustering algorithm. We also determine the optimal number of clusters for better segmentation.

### Question 1: Customer Segmentation using K-Means Clustering
1. Load the dataset
2. Clean the data (handle missing values and duplicates)
3. Normalize or standardize the features
4. Apply K-Means clustering
5. Visualize the resulting clusters

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Load dataset
url = 'https://path_to_your_customer_data.csv'  # Replace with your dataset URL
df = pd.read_csv(url)

# Clean the data
# Mean imputation only makes sense for numeric columns; numeric_only=True also
# avoids the TypeError that DataFrame.mean() raises on text columns in
# pandas >= 2.0. Non-numeric columns are left untouched.
df = df.fillna(df.mean(numeric_only=True))
# Reset the index after dropping duplicates so the frame keeps a contiguous
# 0..n-1 index. Without this, index-aligned assignments from freshly built
# frames (such as the PCA frame below) would pair rows incorrectly.
df = df.drop_duplicates().reset_index(drop=True)

# Normalize features
# 'number' selects every numeric dtype (int32, float32, ...), not just the
# 64-bit ones, so no numeric feature is silently left out of the clustering.
numerical_cols = df.select_dtypes(include='number').columns
if len(numerical_cols) == 0:
    raise ValueError('The dataset has no numeric columns to cluster on.')
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[numerical_cols])

# Apply KMeans clustering
# n_init is set explicitly so the number of restarts (and thus the result)
# does not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(df_scaled)
df['Cluster'] = cluster_labels

# Visualize the resulting clusters
# Project the scaled features onto the first two principal components so the
# clusters can be drawn on a 2-D scatter plot.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(df_scaled)
df_pca = pd.DataFrame(pca_components, columns=['PC1', 'PC2'])
# Assign the label array positionally: row i of df_pca corresponds to row i
# of df_scaled, regardless of df's index labels.
df_pca['Cluster'] = cluster_labels
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PC1', y='PC2', hue='Cluster', data=df_pca, palette='Set1')
plt.title('Customer Segmentation using K-Means Clustering')
plt.show()

### Question 2: Determining the Optimal Number of Clusters
1. Apply the Elbow Method
2. Optionally, compute the Silhouette Score
3. Select the optimal value of k
4. Re-run the K-Means algorithm with the optimal k
5. Visualize the final clustering results

In [None]:
from sklearn.metrics import silhouette_score

# Apply Elbow Method and compute Silhouette Scores in a single pass.
# Each k is fitted once; with a fixed random_state the fitted model is the
# same one both metrics would otherwise be computed from separately.
k_values = range(1, 11)  # Try k from 1 to 10
wcss = []
silhouette_scores = []
for i in k_values:
    # n_init is explicit so results do not depend on the scikit-learn
    # version's default number of restarts.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(df_scaled)
    wcss.append(kmeans.inertia_)
    # The silhouette score is undefined for a single cluster, so it is only
    # recorded for k >= 2.
    if i >= 2:
        silhouette_scores.append(silhouette_score(df_scaled, kmeans.labels_))

# Plot the Elbow Graph
plt.figure(figsize=(8, 6))
plt.plot(k_values, wcss, marker='o', color='blue')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.show()

# Plot silhouette scores (k from 2 to 10)
plt.figure(figsize=(8, 6))
plt.plot(range(2, 11), silhouette_scores, marker='o', color='green')
plt.title('Silhouette Scores for Different k')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.show()

# Choose optimal k (e.g., k=3 based on elbow/silhouette)
# Adjust this value after inspecting the two plots above.
optimal_k = 3
final_kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
final_labels = final_kmeans.fit_predict(df_scaled)
df['Cluster'] = final_labels

# Final Visualization
final_pca_components = pca.fit_transform(df_scaled)
df_pca_final = pd.DataFrame(final_pca_components, columns=['PC1', 'PC2'])
# Assign the label array positionally rather than copying df['Cluster']:
# df may carry a non-contiguous index after duplicate removal, and an
# index-aligned assignment would then attach labels to the wrong points.
df_pca_final['Cluster'] = final_labels
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PC1', y='PC2', hue='Cluster', data=df_pca_final, palette='Set1')
plt.title(f'Final Customer Segmentation with k={optimal_k}')
plt.show()