# Breast Cancer Clustering Analysis

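This notebook clusters the breast cancer dataset bundled with scikit-learn (`load_breast_cancer`) using two algorithms: K-means and hierarchical (agglomerative) clustering. The resulting clusters are assessed with silhouette scores, visualized in two dimensions with PCA, and compared against the dataset's class labels using the adjusted Rand index and adjusted mutual information.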

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.metrics import (
    adjusted_mutual_info_score,
    adjusted_rand_score,
    silhouette_score,
)
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random generator so any np.random-based step is repeatable.
# (The KMeans models further down are seeded separately through random_state.)
np.random.seed(seed=42)

## Load and Prepare Data

In [None]:
# Fetch the bundled dataset and wrap it in pandas objects.
# y is not used to fit any model below; it is only used later to colour the
# PCA plot and to score the clusterings against the known classes.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name='target')

# Quick sanity check on what was loaded
print("Features shape:", X.shape)
print("\nFeature names:", data.feature_names, sep="\n")

## Data Preprocessing

In [None]:
# Rescale every feature with StandardScaler (zero mean, unit variance by default).
# Both K-means and Ward linkage are distance based, so without this step the
# features measured on larger scales would dominate the clustering.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Preview the standardized features
X_scaled.head()

## K-means Clustering

In [None]:
# Sweep candidate cluster counts and record two diagnostics for each k:
#   * inertia (within-cluster sum of squares) for the elbow plot
#   * mean silhouette score
inertias = []
silhouette_scores = []
K = range(2, 11)

for k in K:
    # n_init is set explicitly: scikit-learn changed the default from 10 to
    # 'auto' in version 1.4 (with a FutureWarning in 1.2/1.3), so relying on
    # the default makes the fitted clusters depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# Plot both diagnostics side by side
fig, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(12, 5))

ax_elbow.plot(K, inertias, 'bx-')
ax_elbow.set_xlabel('k')
ax_elbow.set_ylabel('Inertia')
ax_elbow.set_title('Elbow Method')
ax_elbow.set_xticks(list(K))  # k is discrete, so show one tick per candidate

ax_silhouette.plot(K, silhouette_scores, 'rx-')
ax_silhouette.set_xlabel('k')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.set_title('Silhouette Analysis')
ax_silhouette.set_xticks(list(K))

fig.tight_layout()
plt.show()

In [None]:
# Fit the final K-means model.
# k=2 is chosen because the dataset has 2 classes; check it against the
# elbow/silhouette plots above rather than treating it as a given.
optimal_k = 2
# n_init is pinned explicitly because scikit-learn's default changed from 10
# to 'auto' in version 1.4; this keeps the result stable across versions.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)

# Internal quality (no ground-truth labels involved) and cluster sizes
print("K-means Clustering Results:")
print("Silhouette Score:", silhouette_score(X_scaled, kmeans_labels))
print("\nCluster Distribution:")
print(pd.Series(kmeans_labels).value_counts())

## Hierarchical Clustering

In [None]:
# Build the Ward-linkage merge tree over the standardized samples
linkage_matrix = linkage(X_scaled, method='ward')

# Draw the full dendrogram (one leaf per sample) on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 7))
dendrogram(linkage_matrix, ax=ax)
ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Distance')
plt.show()

In [None]:
# Agglomerative clustering cut at the same number of clusters as K-means.
# linkage='ward' is the estimator's default; it is spelled out here so the
# match with the Ward dendrogram is visible.
hierarchical = AgglomerativeClustering(n_clusters=optimal_k, linkage='ward')
hierarchical_labels = hierarchical.fit_predict(X_scaled)

# Internal quality (no ground-truth labels involved) and cluster sizes
print("Hierarchical Clustering Results:")
print("Silhouette Score:", silhouette_score(X_scaled, hierarchical_labels))
print(
    "\nCluster Distribution:",
    pd.Series(hierarchical_labels).value_counts(),
    sep="\n",
)

## Visualize Clusters

In [None]:
# Project the standardized features onto their first two principal components.
# PCA is used for plotting only; both clusterings were fitted on all of X_scaled.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# One panel per colouring of the same 2-D projection.
# Cluster ids are arbitrary, so the same group may get a different colour in
# each panel; compare the shapes of the groups, not the colours.
panels = [
    ('K-means Clustering', kmeans_labels),
    ('Hierarchical Clustering', hierarchical_labels),
    ('Actual Labels', y),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (panel_title, point_colors) in zip(axes, panels):
    ax.scatter(X_pca[:, 0], X_pca[:, 1], c=point_colors, cmap='viridis')
    ax.set_title(panel_title)
    ax.set_xlabel('First Principal Component')
    ax.set_ylabel('Second Principal Component')

fig.tight_layout()
plt.show()

## Compare Clustering Results with Actual Labels

In [None]:
# Score each clustering against the known class labels.
# Both metrics are invariant to how the cluster ids are numbered, so no
# label matching is needed before comparing with y.
evaluations = [
    ("K-means Clustering Evaluation:", kmeans_labels),
    ("\nHierarchical Clustering Evaluation:", hierarchical_labels),
]

for heading, cluster_labels in evaluations:
    print(heading)
    print("Adjusted Rand Score:", adjusted_rand_score(y, cluster_labels))
    print(
        "Adjusted Mutual Information Score:",
        adjusted_mutual_info_score(y, cluster_labels),
    )