# Day 9: Unsupervised Learning and Dimensionality Reduction

This notebook covers K-Means clustering, hierarchical clustering, PCA, and t-SNE, applied to a synthetic customer-segmentation dataset.

## 1. Setting up the Environment

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import dendrogram, linkage

# Set random seed for reproducibility.
# This seeds NumPy's global random state, which the np.random.normal and
# np.random.poisson calls in the data-generation cell draw from. The
# scikit-learn estimators later in the notebook are seeded separately through
# their own random_state=42 arguments.
np.random.seed(42)

## 2. Generate Customer Dataset

In [None]:
# Generate customer data
n_samples = 1000

# Split n_samples across the three customer segments. Using n_samples // 3 for
# every segment drops the remainder (3 * 333 = 999 when n_samples is 1000),
# which leaves the income/spending columns shorter than `age` and
# `shopping_frequency` and makes pd.DataFrame raise
# "All arrays must be of the same length". The last segment therefore absorbs
# the remainder so the sizes always sum to n_samples.
base_size = n_samples // 3
segment_sizes = (base_size, base_size, n_samples - 2 * base_size)

# Features
# Income and spending are drawn segment by segment in the same order, so row i
# of both columns belongs to the same segment (low / middle / high).
annual_income = np.concatenate([
    np.random.normal(40000, 10000, segment_sizes[0]),  # Low income
    np.random.normal(80000, 15000, segment_sizes[1]),  # Middle income
    np.random.normal(150000, 20000, segment_sizes[2])  # High income
])

spending_score = np.concatenate([
    np.random.normal(30, 10, segment_sizes[0]),  # Low spenders
    np.random.normal(50, 15, segment_sizes[1]),  # Medium spenders
    np.random.normal(80, 10, segment_sizes[2])   # High spenders
])

# Age and shopping frequency are drawn for all samples at once, independently
# of the segment, and then clipped to a fixed range.
age = np.random.normal(40, 15, n_samples)
age = np.clip(age, 18, 80)

shopping_frequency = np.random.poisson(5, n_samples)
shopping_frequency = np.clip(shopping_frequency, 0, 20)

# Create DataFrame (one row per customer, one column per feature)
customer_data = pd.DataFrame({
    'annual_income': annual_income,
    'spending_score': spending_score,
    'age': age,
    'shopping_frequency': shopping_frequency
})

# Standardise every feature with StandardScaler and keep the result as a
# DataFrame that carries the original column labels.
scaler = StandardScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(customer_data),
    columns=customer_data.columns,
)

# Preview the raw (unscaled) table: first rows, then summary statistics.
print("Sample of the dataset:", customer_data.head(), sep="\n")
print("\nDataset description:", customer_data.describe(), sep="\n")

## 3. K-Means Clustering

In [None]:
# Elbow Method
# Fit K-Means for k = 1..10 on the scaled features and record each fit's
# inertia, so the "elbow" where extra clusters stop paying off can be read
# from the curve.
inertias = []
K = range(1, 11)

for k in K:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' (emitting a FutureWarning in the releases before the switch), so
    # leaving it unset makes the curve depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(data_scaled)
    inertias.append(kmeans.inertia_)

# Plot Elbow Curve
plt.figure(figsize=(10, 6))
plt.plot(K, inertias, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()

# Apply K-Means with optimal k
optimal_k = 3  # chosen by hand; the synthetic data is built from three segments
# n_init is pinned explicitly because its scikit-learn default changed from 10
# to 'auto' between releases; pinning keeps the labels version-independent.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
customer_data['Cluster'] = kmeans.fit_predict(data_scaled)

# Visualize clusters in the original (unscaled) units, two features per panel
fig, (ax_income, ax_age) = plt.subplots(1, 2, figsize=(12, 6))

sns.scatterplot(data=customer_data, x='annual_income', y='spending_score',
                hue='Cluster', palette='deep', ax=ax_income)
ax_income.set_title('Customer Segments: Income vs Spending')

sns.scatterplot(data=customer_data, x='age', y='shopping_frequency',
                hue='Cluster', palette='deep', ax=ax_age)
ax_age.set_title('Customer Segments: Age vs Shopping Frequency')

plt.tight_layout()
plt.show()

# Analyze clusters
# Only the feature columns are averaged. customer_data gains further label
# columns later in the notebook, and re-running this cell would otherwise
# report the mean of those label ids as if they were a feature.
feature_cols = list(data_scaled.columns)
print("\nCluster Analysis:")
print(customer_data.groupby('Cluster')[feature_cols].mean().round(2))

## 4. Hierarchical Clustering

In [None]:
# Build the Ward linkage tree from the scaled features
linkage_matrix = linkage(data_scaled, method='ward')

# Draw the full dendrogram on its own figure
fig, ax = plt.subplots(figsize=(12, 8))
dendrogram(linkage_matrix, ax=ax)
ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Distance')
plt.show()

# Apply Hierarchical Clustering
n_clusters = 3
# linkage='ward' is the AgglomerativeClustering default; it is spelled out so
# the labels visibly correspond to the Ward linkage used for the dendrogram.
hc = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
customer_data['HC_Cluster'] = hc.fit_predict(data_scaled)

# Compare with K-Means
# Cluster ids are arbitrary per algorithm, so the same colour in the two panels
# does not necessarily denote the same group; compare the shapes of the groups.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

panels = (
    ('Cluster', 'K-Means Clustering'),
    ('HC_Cluster', 'Hierarchical Clustering'),
)
for ax, (label_col, panel_title) in zip(axes, panels):
    # The title is set on the Axes. seaborn forwards unknown keyword arguments
    # to matplotlib's scatter, which rejects `title` and raises, so it must not
    # be passed to sns.scatterplot.
    sns.scatterplot(data=customer_data, x='annual_income', y='spending_score',
                    hue=label_col, palette='deep', ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 5. Dimensionality Reduction with PCA

In [None]:
# Fit PCA on the scaled features (no n_components given) and project the data
pca = PCA()
data_pca = pca.fit_transform(data_scaled)

# Share of the total variance captured by each component
print("Explained Variance Ratio:", pca.explained_variance_ratio_, sep="\n")

# Running total of the explained variance, plotted against the component count
cumulative_variance_ratio = pca.explained_variance_ratio_.cumsum()
component_counts = np.arange(1, cumulative_variance_ratio.size + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance_ratio, 'bo-')
plt.title('Explained Variance vs Number of Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.show()

# Visualize data in 2D PCA space with cluster labels
fig, (ax_scatter, ax_loadings) = plt.subplots(1, 2, figsize=(12, 6))

ax_scatter.scatter(data_pca[:, 0], data_pca[:, 1], c=customer_data['Cluster'],
                   cmap='viridis')
ax_scatter.set_xlabel('First Principal Component')
ax_scatter.set_ylabel('Second Principal Component')
ax_scatter.set_title('PCA of Customer Data\nColored by K-Means Clusters')

# Feature importance in PCA
# pca.components_ has one row per component and one column per feature PCA was
# fitted on, so the table is labelled from data_scaled (the fitted input).
# customer_data cannot be used for the labels: by this point it also carries
# the 'Cluster' and 'HC_Cluster' columns, and the mismatched label count makes
# the DataFrame constructor raise a shape error.
feature_importance = pd.DataFrame(
    pca.components_.T,
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
    index=data_scaled.columns
)

sns.heatmap(feature_importance, annot=True, cmap='coolwarm', ax=ax_loadings)
ax_loadings.set_title('PCA Components Heatmap')

plt.tight_layout()
plt.show()

## 6. t-SNE Visualization

In [None]:
# Embed the scaled features in two dimensions with t-SNE
tsne = TSNE(n_components=2, random_state=42)
data_tsne = tsne.fit_transform(data_scaled)

# Same embedding in both panels; only the colouring (label column) differs
tsne_x, tsne_y = data_tsne[:, 0], data_tsne[:, 1]
fig, (ax_kmeans, ax_hier) = plt.subplots(1, 2, figsize=(12, 6))

ax_kmeans.scatter(tsne_x, tsne_y, c=customer_data['Cluster'], cmap='viridis')
ax_kmeans.set_title('t-SNE Visualization\nColored by K-Means Clusters')

ax_hier.scatter(tsne_x, tsne_y, c=customer_data['HC_Cluster'], cmap='viridis')
ax_hier.set_title('t-SNE Visualization\nColored by Hierarchical Clusters')

plt.tight_layout()
plt.show()

## 7. Customer Segmentation Analysis

In [None]:
# Detailed analysis of clusters
def analyze_clusters(data, cluster_col, feature_cols=None):
    """Print the size, per-column mean and per-column std of every cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with one row per sample, including the label column.
    cluster_col : str
        Name of the column in ``data`` that holds the cluster labels.
    feature_cols : list of str, optional
        Columns to summarise. When omitted, every column other than
        ``cluster_col`` is summarised (the original behaviour). Pass it
        explicitly when ``data`` also holds columns whose mean is meaningless,
        such as the labels produced by another clustering method.

    Returns
    -------
    None
        All results are printed to standard output.
    """
    print(f"\nAnalysis for {cluster_col}:")

    # Cluster sizes
    print("\nCluster Sizes:")
    print(data[cluster_col].value_counts())

    # Group once and reuse it for both summaries.
    grouped = data.groupby(cluster_col)
    if feature_cols is not None:
        grouped = grouped[list(feature_cols)]

    # Cluster profiles
    print("\nCluster Profiles:")
    profiles = grouped.mean()
    print(profiles.round(2))

    # Cluster standard deviations
    print("\nCluster Standard Deviations:")
    std_devs = grouped.std()
    print(std_devs.round(2))

# Analyze both clustering methods
# Each call receives only the feature columns plus its own label column, so the
# profile tables do not average the other method's cluster ids.
feature_cols = list(data_scaled.columns)
analyze_clusters(customer_data[feature_cols + ['Cluster']], 'Cluster')  # K-Means
analyze_clusters(customer_data[feature_cols + ['HC_Cluster']], 'HC_Cluster')  # Hierarchical Clustering

# Visualize cluster profiles
# Profiles are computed on the scaled features. Raw means mix units (income is
# generated in the tens of thousands, the other features stay below 100), so
# income would dominate the colour scale and center=0 would carry no meaning.
# On standardised features, 0 is the overall mean of each feature.
fig, (ax_kmeans, ax_hier) = plt.subplots(1, 2, figsize=(15, 6))

# K-Means profiles
cluster_profiles = data_scaled.groupby(customer_data['Cluster']).mean()
sns.heatmap(cluster_profiles, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            ax=ax_kmeans)
ax_kmeans.set_title('K-Means Cluster Profiles')

# Hierarchical clustering profiles
hc_profiles = data_scaled.groupby(customer_data['HC_Cluster']).mean()
sns.heatmap(hc_profiles, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            ax=ax_hier)
ax_hier.set_title('Hierarchical Cluster Profiles')

plt.tight_layout()
plt.show()