In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, cut_tree
from scipy.spatial.distance import pdist, squareform
from sklearn.datasets import make_blobs, load_iris, load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, silhouette_score
import warnings
# Install a catch-all "ignore" filter so no warning of any category is shown
# in cell output for the rest of the session.
warnings.simplefilter('ignore')

# Seed NumPy's legacy global RNG so later np.random.* draws are repeatable
# after a kernel restart.
SEED = 42
np.random.seed(SEED)

print("Libraries imported successfully!")


In [None]:
# --- Dataset 1: synthetic blobs -------------------------------------------
# Four 2-D blobs with a fixed random_state, so the points and their
# generating labels (y_true) are the same on every run.
blob_settings = dict(
    n_samples=150,
    centers=4,
    n_features=2,
    cluster_std=1.5,
    random_state=42,
)
X_synthetic, y_true = make_blobs(**blob_settings)

# --- Dataset 2: Iris --------------------------------------------------------
# Keep the loaded bunch object around, and pull out the feature matrix,
# class labels, and column names under their own names.
iris = load_iris()
X_iris, y_iris = iris.data, iris.target
iris_feature_names = iris.feature_names

# --- Dataset 3: Wine --------------------------------------------------------
wine = load_wine()
X_wine, y_wine = wine.data, wine.target
wine_feature_names = wine.feature_names

# Quick check of (n_samples, n_features) for each feature matrix.
print("Dataset shapes:")
print(f"Synthetic: {X_synthetic.shape}")
print(f"Iris: {X_iris.shape}")
print(f"Wine: {X_wine.shape}")

# Show the three datasets side by side, each point colored by its true label.
# For Iris and Wine only columns 0 and 1 of the feature matrix are drawn.
fig, (ax_synthetic, ax_iris, ax_wine) = plt.subplots(1, 3, figsize=(12, 4))

# Panel 1: synthetic blobs (the only panel that gets a colorbar).
scatter = ax_synthetic.scatter(
    X_synthetic[:, 0], X_synthetic[:, 1], c=y_true, cmap='viridis', alpha=0.7
)
ax_synthetic.set(
    title='Synthetic Data (True Clusters)',
    xlabel='Feature 1',
    ylabel='Feature 2',
)
fig.colorbar(scatter, ax=ax_synthetic)

# Panel 2: Iris, first two columns.
ax_iris.scatter(X_iris[:, 0], X_iris[:, 1], c=y_iris, cmap='viridis', alpha=0.7)
ax_iris.set(
    title='Iris Dataset (Sepal Length vs Width)',
    xlabel='Sepal Length',
    ylabel='Sepal Width',
)

# Panel 3: Wine, first two columns.
ax_wine.scatter(X_wine[:, 0], X_wine[:, 1], c=y_wine, cmap='viridis', alpha=0.7)
ax_wine.set(
    title='Wine Dataset (First 2 Features)',
    xlabel='Alcohol',
    ylabel='Malic Acid',
)

fig.tight_layout()
plt.show()


In [None]:
# Compare different linkage methods using synthetic data.
#
# For each method this cell draws the dendrogram (top row) and the resulting
# flat clustering (bottom row), then prints a silhouette / ARI summary.
# Each method's linkage matrix, labels, and scores are computed exactly once
# and reused for both the figure and the printed table.
linkage_methods = ['single', 'complete', 'average', 'ward']

# Number of flat clusters requested from every tree; equals the number of
# blob centers used when X_synthetic was generated.
n_clusters = 4

fig, axes = plt.subplots(2, len(linkage_methods), figsize=(20, 10))

# One (method, silhouette, ARI) row per linkage method, in plotting order.
method_scores = []

for i, method in enumerate(linkage_methods):
    dendrogram_ax = axes[0, i]
    cluster_ax = axes[1, i]

    # Compute linkage matrix
    Z = linkage(X_synthetic, method=method)

    # Create dendrogram
    dendrogram_ax.set_title(f'{method.capitalize()} Linkage Dendrogram')
    dendrogram(Z, ax=dendrogram_ax, leaf_rotation=90)
    dendrogram_ax.set_xlabel('Sample Index')
    dendrogram_ax.set_ylabel('Distance')

    # Cut the tree into flat clusters; 'maxclust' yields at most n_clusters.
    clusters = fcluster(Z, n_clusters, criterion='maxclust')

    # Plot clusters
    cluster_ax.scatter(X_synthetic[:, 0], X_synthetic[:, 1],
                       c=clusters, cmap='viridis', alpha=0.7)
    cluster_ax.set_title(f'{method.capitalize()} Linkage Clusters')
    cluster_ax.set_xlabel('Feature 1')
    cluster_ax.set_ylabel('Feature 2')

    # Silhouette uses only the data and the predicted labels; ARI compares the
    # predicted labels against the labels the blobs were generated with.
    sil_score = silhouette_score(X_synthetic, clusters)
    ari_score = adjusted_rand_score(y_true, clusters)
    method_scores.append((method, sil_score, ari_score))

    cluster_ax.text(0.02, 0.98, f'Silhouette: {sil_score:.3f}',
                    transform=cluster_ax.transAxes, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

plt.tight_layout()
plt.show()

# Print comparison of linkage methods (reusing the scores gathered above).
print("Linkage Method Comparison:")
print("="*50)
for scored_method, scored_sil, scored_ari in method_scores:
    print(f"{scored_method.capitalize():>10}: Silhouette={scored_sil:.3f}, ARI={scored_ari:.3f}")
