# Unsupervised Learning - Comprehensive Guide

Learn clustering (K-Means, hierarchical clustering, DBSCAN) and dimensionality reduction (PCA) with scikit-learn.

## Table of Contents
1. [K-Means Clustering](#kmeans)
2. [Hierarchical Clustering](#hierarchical)
3. [DBSCAN](#dbscan)
4. [PCA](#pca)

In [None]:
# Numerics, data handling and plotting.
import numpy as np
import pandas as pd  # NOTE(review): not referenced in the cells visible here
import matplotlib.pyplot as plt
import seaborn as sns
# scikit-learn: clustering estimators, PCA, scaling, sample data, evaluation.
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs, load_iris
from sklearn.metrics import silhouette_score
# SciPy: linkage matrix construction and dendrogram drawing.
from scipy.cluster.hierarchy import dendrogram, linkage

# Apply seaborn's default theme to the plots created below.
sns.set_theme()

## K-Means Clustering <a id='kmeans'></a>

In [None]:
# Generate a synthetic 2-D dataset: 300 points drawn around 4 blob centers.
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=42)

# Fit K-Means and assign every sample to a cluster.
# n_init is pinned explicitly: its default changed between scikit-learn
# releases (10 -> 'auto'), so relying on it gives version-dependent results
# and a FutureWarning on the releases in the transition window.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
y_pred = kmeans.fit_predict(X)

# Visualize the generating labels (left) next to the K-Means result (right).
# Cluster ids are arbitrary, so matching groups may get different colors in
# the two panels.
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis')
plt.title('True Labels')

plt.subplot(1, 2, 2)
plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=300, c='red', marker='X', edgecolors='black', label='Centroids')
plt.title('K-Means Clustering')
plt.legend()
plt.tight_layout()
plt.show()

# Score the predicted partition with the silhouette score.
score = silhouette_score(X, y_pred)
print(f"Silhouette Score: {score:.4f}")

In [None]:
# Elbow method: fit K-Means for k = 1..10 on X and record each fit's inertia.
K_range = range(1, 11)

# The models are built inside a comprehension so the fitted 4-cluster `kmeans`
# from the previous cell is not overwritten by the last model of this sweep.
# n_init is pinned because its default differs between scikit-learn releases.
inertias = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(X).inertia_
    for k in K_range
]

plt.figure(figsize=(10, 6))
plt.plot(K_range, inertias, 'bo-')
plt.xticks(list(K_range))  # k is an integer: one tick per candidate value
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.grid(True)
plt.show()

## Hierarchical Clustering <a id='hierarchical'></a>

In [None]:
# Small synthetic dataset: 50 points around 3 centers (true labels discarded).
X, _ = make_blobs(n_samples=50, centers=3, random_state=42)

# Compute the Ward linkage matrix and draw it as a dendrogram.
linkage_matrix = linkage(X, method='ward')

fig, ax = plt.subplots(figsize=(12, 6))
dendrogram(linkage_matrix, ax=ax)
ax.set_xlabel('Sample Index')
ax.set_ylabel('Distance')
ax.set_title('Hierarchical Clustering Dendrogram')
plt.show()

# Assign each sample to one of 3 clusters with agglomerative clustering.
hierarchical = AgglomerativeClustering(n_clusters=3)
y_pred = hierarchical.fit_predict(X)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis')
ax.set_title('Hierarchical Clustering Result')
plt.show()

## DBSCAN <a id='dbscan'></a>

In [None]:
# Generate three tight blobs, then append 50 uniformly scattered noise points.
X, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=42)
# Use a seeded generator so the noise, and therefore the clustering and the
# printed counts, is reproducible like the random_state=42 calls elsewhere.
rng = np.random.default_rng(42)
noise = rng.uniform(-5, 5, size=(50, 2))  # uniform on [-5, 5) in both axes
X = np.vstack([X, noise])

# Apply DBSCAN; samples it treats as noise receive the label -1.
dbscan = DBSCAN(eps=0.5, min_samples=5)
y_pred = dbscan.fit_predict(X)

# Visualize: one scatter call per label so each gets its own legend entry.
plt.figure(figsize=(10, 6))
unique_labels = sorted(set(y_pred))  # sorted -> stable legend order, noise first
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

for label, color in zip(unique_labels, colors):
    is_noise = label == -1
    mask = (y_pred == label)
    plt.scatter(X[mask, 0], X[mask, 1],
                c=['black'] if is_noise else [color],
                # -1 is not a cluster, so name it for what it is.
                label='Noise' if is_noise else f'Cluster {label}',
                alpha=0.7, edgecolors='black' if is_noise else None)

plt.title('DBSCAN Clustering')
plt.legend()
plt.show()

# Count noise with a vectorized comparison; the noise label is excluded from
# the cluster count.
n_noise = int(np.sum(y_pred == -1))
n_clusters = len(unique_labels) - (1 if n_noise else 0)
print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")

## PCA (Principal Component Analysis) <a id='pca'></a>

In [None]:
# Load the iris dataset: feature matrix X and integer species labels y.
iris = load_iris()
X = iris.data
y = iris.target

# Standardize the features before PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project the standardized data onto the first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Visualize the 2-D projection, colored by species.
plt.figure(figsize=(10, 6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolors='black')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
plt.title('PCA of Iris Dataset')
# Species is categorical, so show a legend with the class names rather than a
# continuous colorbar. Handles come back in ascending label order, which
# matches the order of iris.target_names.
handles, _ = scatter.legend_elements()
plt.legend(handles, list(iris.target_names), title='Species')
plt.show()

print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total explained variance:", pca.explained_variance_ratio_.sum())

In [None]:
# Scree plot: fit PCA with every component kept, then show how much variance
# each component explains (bars) together with the running total (line).
pca_full = PCA()
pca_full.fit(X_scaled)

ratios = pca_full.explained_variance_ratio_
components = np.arange(1, len(ratios) + 1)

plt.figure(figsize=(10, 6))
plt.bar(components, ratios, alpha=0.5, label='Per-component variance')
plt.plot(components, np.cumsum(ratios), 'bo-', label='Cumulative variance')
plt.xticks(components)  # component counts are integers: one tick each
plt.xlabel('Number of Components')
# The axis now carries both per-component and cumulative ratios.
plt.ylabel('Explained Variance Ratio')
plt.title('Scree Plot')
plt.grid(True)
plt.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')
plt.legend()
plt.show()

## Summary

Covered:
- K-Means clustering with elbow method
- Hierarchical clustering with dendrograms
- DBSCAN for density-based clustering
- PCA for dimensionality reduction
- Evaluating a clustering with the silhouette score

Congratulations on completing the ML Revision course! 🎉