In [5]:
# 1. Install Kaggle library
!pip install kaggle

# 2. Upload your kaggle.json file
from google.colab import files
print("Please upload your kaggle.json file")
files.upload()

# 3. Set up the Kaggle directory and permissions
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

print("Kaggle API setup complete.")

Please upload your kaggle.json file


Saving kaggle (1).json to kaggle (1).json
mv: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Kaggle API setup complete.


In [12]:
# Manual credential install: copies the uploaded key into ~/.kaggle/kaggle.json.
# The source file name below is hard-coded to the name reported by the upload
# output earlier in this notebook; it must be edited if the upload is saved
# under a different name.

# Create the directory the Kaggle API expects (-p: no error if it already exists)
!mkdir -p ~/.kaggle

# Copy your key file to the correct location with the correct name
# (the quotes are required because the source name contains a space and parentheses)
!cp 'kaggle (1).json' ~/.kaggle/kaggle.json

# Set the required permissions so the API will read the file
# (600 = read/write for the owner only)
!chmod 600 ~/.kaggle/kaggle.json

In [13]:
# Download the dataset from Kaggle

!kaggle datasets download -d paultimothymooney/chest-xray-pneumonia

# Unzip the downloaded file
print("Unzipping dataset...")
!unzip -q chest-xray-pneumonia.zip

# List the contents to confirm
!ls

print("Dataset is ready in the 'chest_xray' folder.")

Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia
License(s): other
Downloading chest-xray-pneumonia.zip to /content
 99% 2.28G/2.29G [00:19<00:00, 255MB/s]
100% 2.29G/2.29G [00:19<00:00, 125MB/s]
Unzipping dataset...
 chest_xray   chest-xray-pneumonia.zip	'kaggle (1).json'   sample_data
Dataset is ready in the 'chest_xray' folder.


In [14]:
# --- Cell 1: Imports and Helper Functions ---
import cv2
import numpy as np
import os
from skimage.feature import hog, local_binary_pattern
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import pandas as pd
import seaborn as sns
import warnings

# Suppress warnings for cleaner output.
# NOTE(review): with no category/module filter this ignores every warning for
# the rest of the session, including deprecation notices from the libraries
# imported above — consider narrowing it if outputs look suspicious.
warnings.filterwarnings('ignore')

# --- Part 1: Data Acquisition and Preprocessing ---

def preprocess_image(image_path, size=(256, 256)):
    """Load an image from disk as a grayscale array resized to ``size``.

    Args:
        image_path: Path of the image file to read.
        size: Target size passed to ``cv2.resize``.

    Returns:
        The resized grayscale image, or ``None`` if the file could not be
        read or any processing step raised. Failures are reported with a
        printed message rather than an exception so that a single bad file
        does not abort a bulk load.
    """
    try:
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals an unreadable/missing file by returning None.
            print(f"Warning: Could not read image {image_path}")
            return None

        # Collapse 3-channel (BGR) input to a single grayscale channel.
        if len(img.shape) == 3 and img.shape[2] == 3:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        return cv2.resize(img, size, interpolation=cv2.INTER_AREA)
    except Exception as e:
        # Deliberate best-effort: report and skip this file.
        print(f"Error processing {image_path}: {e}")
        return None

def load_data(base_dir):
    """Load every image under ``base_dir/{train,test,val}/{NORMAL,PNEUMONIA}``.

    Files are visited in sorted name order within each folder so that the
    row order of the returned data is reproducible between runs
    (``os.listdir`` alone returns entries in arbitrary order).

    Args:
        base_dir: Root folder containing the ``train``, ``test`` and ``val``
            sub-folders.

    Returns:
        A tuple ``(images, labels)`` where ``images`` is a list of the values
        returned by ``preprocess_image`` (files for which it returned ``None``
        are skipped) and ``labels`` is an integer NumPy array with 0 for
        NORMAL and 1 for PNEUMONIA, aligned with ``images``.
    """
    images = []
    labels = []

    sets = ['train', 'test', 'val']
    categories = ['NORMAL', 'PNEUMONIA']
    # Compared against the lower-cased name, so '.JPEG' etc. are accepted too.
    image_extensions = ('.jpeg', '.jpg', '.png')

    for s in sets:
        for i, category in enumerate(categories):
            path = os.path.join(base_dir, s, category)
            if not os.path.isdir(path):
                print(f"Warning: Directory not found {path}")
                continue

            for filename in sorted(os.listdir(path)):
                if not filename.lower().endswith(image_extensions):
                    continue
                img = preprocess_image(os.path.join(path, filename))
                if img is not None:
                    images.append(img)
                    labels.append(i)  # 0 for NORMAL, 1 for PNEUMONIA

    # Explicit dtype keeps the label array integer-typed even when empty.
    return images, np.array(labels, dtype=int)

# --- Part 2: Feature Vector Generation ---

def extract_features(image):
    """Build one flat descriptor for ``image`` from four feature families.

    The returned 1-D array is the concatenation, in this order, of:
      1. a 256-bin intensity histogram,
      2. a HOG descriptor (9 orientations, 8x8 cells, 2x2 blocks),
      3. a histogram of uniform LBP codes (P=8, R=1; P + 2 bins),
      4. the Hu moments of the image.
    """
    n_points, radius = 8, 1

    # Intensity histogram over the single channel.
    intensity_hist = cv2.calcHist([image], [0], None, [256], [0, 256]).flatten()

    # Histogram of oriented gradients.
    hog_descriptor = hog(
        image,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        visualize=False,
    ).flatten()

    # Uniform local binary patterns, summarised as raw bin counts.
    lbp_codes = local_binary_pattern(image, n_points, radius, method='uniform')
    lbp_counts, _ = np.histogram(
        lbp_codes.ravel(),
        bins=np.arange(0, n_points + 3),
        range=(0, n_points + 2),
    )
    lbp_hist = lbp_counts.astype("float")

    # Hu moments derived from the image moments.
    hu_moments = cv2.HuMoments(cv2.moments(image)).flatten()

    return np.hstack([intensity_hist, hog_descriptor, lbp_hist, hu_moments])

In [15]:
# --- Cell 2: Main Execution ---
# Builds the standardized feature matrix X_scaled and the label vector y_true
# used by the clustering and evaluation cells.

# 1. Load Data
# This is the correct path in Colab after unzipping
DATASET_PATH = 'chest_xray'
print("Loading and preprocessing images...")
images, y_true = load_data(DATASET_PATH)
print(f"Loaded {len(images)} images.")

# Fail early with a clear message: with zero images the scaler below would
# otherwise raise an unrelated-looking array-shape error.
if len(images) == 0:
    raise RuntimeError(
        f"No images were loaded from '{DATASET_PATH}'. "
        "Check that the dataset was downloaded and unzipped."
    )

# 2. Extract Features (This will take a few minutes)
print("Extracting features from all images...")
all_features = []
for img in tqdm(images):
    features = extract_features(img)
    all_features.append(features)

# One row per image, one column per feature.
X = np.array(all_features)
print(f"Feature matrix shape: {X.shape}")

# 3. Normalization
# Standardize each feature column so that no single feature family dominates
# the distance computations purely because of its scale.
print("Normalizing feature matrix...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Setup complete. X_scaled and y_true are ready.")

Loading and preprocessing images...
Loaded 5856 images.
Extracting features from all images...


100%|██████████| 5856/5856 [04:34<00:00, 21.32it/s]


Feature matrix shape: (5856, 34869)
Normalizing feature matrix...
Setup complete. X_scaled and y_true are ready.


In [None]:
# --- Cell 3: Clustering (Part 3) ---

K = 2 # Target K
results = {}   # method name -> cluster label array
metrics = {}

# 1. K-Means
print("\nRunning K-Means...")
inertias = []
K_range = range(2, 11)
kmeans = None
for k_i in K_range:
    km = KMeans(n_clusters=k_i, random_state=42, n_init=10).fit(X_scaled)
    inertias.append(km.inertia_)
    if k_i == K:
        # Keep the model for the target K: refitting it afterwards with the
        # same parameters and seed would only repeat this expensive fit.
        kmeans = km

# Elbow plot: inertia as a function of the number of clusters.
plt.figure(figsize=(8, 4))
plt.plot(K_range, inertias, 'bx-')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Inertia')
plt.title('K-Means Elbow Method')
plt.show()

# Only fit separately when the target K was not covered by the elbow sweep.
if kmeans is None:
    kmeans = KMeans(n_clusters=K, random_state=42, n_init=10).fit(X_scaled)
kmeans_labels = kmeans.labels_
results['K-Means'] = kmeans_labels
print(f"K-Means (K={K}) complete.")


# 2. Hierarchical Clustering (Agglomerative)
print("\nRunning Hierarchical Clustering...")

# The dendrogram is drawn from a random subset of rows rather than the full
# matrix. min() caps the subset at the dataset size so the no-replacement
# draw stays valid for small datasets; the seed makes the subset reproducible.
np.random.seed(42)
sample_size = min(5000, X_scaled.shape[0])
sample_indices = np.random.choice(X_scaled.shape[0], sample_size, replace=False)
X_sample = X_scaled[sample_indices]

print("Calculating linkage for dendrogram...")
linkage_matrix = linkage(X_sample, method='ward')

# Truncated dendrogram: only the last 5 merged clusters are shown.
fig, ax = plt.subplots(figsize=(15, 7))
dendrogram(
    linkage_matrix,
    p=5,
    truncate_mode='lastp',
    show_leaf_counts=True,
    ax=ax,
)
ax.set_title('Hierarchical Clustering Dendrogram (Sampled Data)')
ax.set_xlabel('Sample index')
ax.set_ylabel('Distance (Ward)')
plt.show()

# The actual cluster assignment uses every row, not just the sampled subset.
agglo = AgglomerativeClustering(n_clusters=K, linkage='ward')
agglo_labels = agglo.fit_predict(X_scaled)
results['Hierarchical (Ward)'] = agglo_labels
print("Hierarchical Clustering complete.")


# 3. DBSCAN
print("\nRunning DBSCAN...")
min_pts = 10

# Neighbor distances for every row, used to draw the k-distance curve.
nn = NearestNeighbors(n_neighbors=min_pts)
nn.fit(X_scaled)
distances, indices = nn.kneighbors(X_scaled)

# Last returned neighbor distance per row, sorted ascending.
k_distances = np.sort(distances[:, min_pts - 1], axis=0)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(k_distances)
ax.set_ylabel(f"{min_pts}-th Nearest Neighbor Distance")
ax.set_xlabel("Points sorted by distance")
ax.set_title("DBSCAN k-distance graph (for finding eps)")
ax.grid(True)
plt.show()

# Read EPSILON off the plot above: it is the y-value at the curve's elbow.
# NOTE(review): 30.0 is a hand-set placeholder — confirm it against the plot
# before trusting the DBSCAN result.
EPSILON = 30.0 #TUNE THIS VALUE BASED ON YOUR K-DISTANCE PLOT
# -------------------------

dbscan = DBSCAN(eps=EPSILON, min_samples=min_pts)
dbscan_labels = dbscan.fit_predict(X_scaled)
results['DBSCAN'] = dbscan_labels

# Label -1 marks noise, so it is excluded from the cluster count.
n_clusters_dbscan = len(set(dbscan_labels) - {-1})
n_noise = int((dbscan_labels == -1).sum())
print(f"DBSCAN complete. Found {n_clusters_dbscan} clusters and {n_noise} noise points.")


Running K-Means...


In [None]:
# --- Cell 4: Evaluation (Part 4) ---

# 1. Dimensionality Reduction for Visualization (t-SNE)
print("\nRunning t-SNE for visualization (this will take a few minutes)...")

# We can run PCA first to reduce to 50 dims, then t-SNE. This is much faster.
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# The iteration count is left at the library default (1000). The keyword for
# it was renamed from `n_iter` to `max_iter` in scikit-learn 1.5 and the old
# name was later removed, so omitting it keeps this cell working on both
# older and newer releases with the same number of iterations.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_2D = tsne.fit_transform(X_pca) # Fit t-SNE on the PCA-reduced data
print("t-SNE complete.")

# 2. Plotting Cluster Visualizations
# Four side-by-side views of the same 2-D embedding, each colored by a
# different labeling: ground truth, K-Means, hierarchical and DBSCAN.
fig, axes = plt.subplots(1, 4, figsize=(28, 7), sharex=True, sharey=True)
fig.suptitle('Cluster Visualization using t-SNE (on PCA-reduced data)', fontsize=20)

# One entry per panel, left to right: (hue values, palette, panel title).
panel_specs = [
    # Ground truth: explicit blue / orange pair.
    (y_true, ['#1f77b4', '#ff7f0e'], 'Ground Truth (0=NORMAL, 1=PNEUMONIA)'),
    (results['K-Means'], 'viridis', f'K-Means (K={K})'),
    (results['Hierarchical (Ward)'], 'viridis', f'Hierarchical (K={K})'),
    # 'deep' is a categorical palette, used here because DBSCAN labels
    # may include the noise label -1.
    (results['DBSCAN'], 'deep', f'DBSCAN (Clusters={n_clusters_dbscan})'),
]

for panel_ax, (hue_values, panel_palette, panel_title) in zip(axes, panel_specs):
    sns.scatterplot(
        ax=panel_ax,
        x=X_2D[:, 0],
        y=X_2D[:, 1],
        hue=hue_values,
        palette=panel_palette,
        legend='full',
    )
    panel_ax.set_title(panel_title)

# Leave room at the top for the figure-level title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

# 3. Calculate Intrinsic Metrics
# Silhouette and Davies-Bouldin scores for every labeling stored in `results`.
print("\nCalculating metrics...")
metric_results = []

for name, labels in results.items():
    # Both scores need at least two distinct labels.
    if len(set(labels)) < 2:
        print(f"Skipping metrics for {name}: only 1 cluster found.")
        continue

    # Default: score the labeling on all rows.
    X_eval, labels_eval = X_scaled, labels

    if name == 'DBSCAN':
        # Noise points (label -1) are dropped before scoring.
        mask = labels != -1
        if len(set(labels[mask])) < 2:
            print(f"Skipping metrics for {name}: not enough non-noise clusters.")
            continue
        X_eval, labels_eval = X_scaled[mask], labels[mask]

    metric_results.append({
        "Method": name,
        "Silhouette Score": silhouette_score(X_eval, labels_eval),
        "Davies-Bouldin Index": davies_bouldin_score(X_eval, labels_eval),
    })

# Display metrics table
metrics_df = pd.DataFrame(metric_results)
print("\n--- Final Metrics Table ---")
print(metrics_df.to_markdown(index=False))