# Hyperspectral Data Clustering Example

This notebook demonstrates how to use the hyperspectral convolutional autoencoder for clustering hyperspectral data, visualizing clusters, and interpreting the results.

In [None]:
# Import standard libraries
import numpy as np
import matplotlib.pyplot as plt
import torch
import time
from pathlib import Path

# Import our custom modules
from hyperspectral_dataset import HyperspectralDataset, load_hyperspectral_data
from hyperspectral_models import HyperspectralCAEVariable
from hyperspectral_clustering import (
    run_hyperspectral_clustering,
    visualize_clustering_results,
    extract_encoded_features,
    prepare_features_for_clustering,
    cluster_features,
    map_clusters_to_image,
    visualize_clusters_2d,
    visualize_cluster_maps,
    compare_cluster_maps
)

# Fix the NumPy and PyTorch random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Use CUDA when PyTorch reports it as available, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

## 1. Load Hyperspectral Data and Trained Model

First, let's load our hyperspectral data and the trained autoencoder model.

In [None]:
# Location of the pickled input file (relative to the notebook)
data_path = "../Data/Kiwi Experiment/pickles/masked_KiwiData.pkl"

# Read the file and wrap the result in a dataset object
data_dict = load_hyperspectral_data(data_path)
dataset = HyperspectralDataset(
    data_dict,
    normalize=True,  # Apply global normalization to [0,1]
    downscale_factor=1,  # Use full resolution (adjust based on memory constraints)
)

# Make sure the dataset carries an emission_wavelengths attribute;
# an empty dict is attached when it is missing
if not hasattr(dataset, 'emission_wavelengths'):
    dataset.emission_wavelengths = {}

# Fetch the processed data and the spatial extent reported by the dataset
all_data = dataset.get_all_data()
spatial_height, spatial_width = dataset.get_spatial_dimensions()

print(f"Processed data dimensions: {spatial_height}x{spatial_width}")

In [None]:
# Rebuild the autoencoder before loading weights into it.
# NOTE(review): these hyperparameters presumably have to match the ones used
# at training time for the checkpoint to load — confirm against the training script.
model = HyperspectralCAEVariable(
    excitations_data={ex: data.numpy() for ex, data in all_data.items()},
    k1=20,
    k3=20,
    filter_size=5,
    sparsity_target=0.1,
    sparsity_weight=1.0,
    dropout_rate=0.5
)

# Checkpoints to try, in order of preference, with the message printed on success
checkpoint_candidates = [
    ("best_hyperspectral_model.pth", "Loaded best model"),
    ("hyperspectral_cae_final_model.pth", "Loaded final model"),
]

# Only a *missing* file moves on to the next candidate. Any other failure
# (corrupt file, state-dict mismatch, ...) is raised instead of being
# swallowed and misreported as "no saved model".
# torch.load unpickles the file, so only load checkpoints you trust.
for checkpoint_path, loaded_message in checkpoint_candidates:
    if not Path(checkpoint_path).is_file():
        continue
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    print(loaded_message)
    break
else:
    # No checkpoint on disk: the model keeps its freshly initialized weights
    print("No saved model found. Please train the model first.")

# Set model to evaluation mode
model.eval()

## 2. Extract Encoded Features for Clustering

Now, let's extract the encoded features from the trained model to use for clustering.

In [None]:
# Obtain the encoded features (and their spatial shapes) from the model
encoded_features, spatial_shapes = extract_encoded_features(
    model, all_data, device
)

# Report the shape of the encoding for every excitation
print("Encoded feature shapes:")
for ex, features in encoded_features.items():
    shape_line = f"  Excitation {ex}nm: {features.shape}"
    print(shape_line)

## 3. K-means Clustering

Let's try K-means clustering on the encoded features.

In [None]:
# K-means settings passed to the clustering pipeline
kmeans_config = dict(
    method='kmeans',
    n_clusters=5,
    combine_excitations=True,
    reduction_method='pca',
    n_components=8,
)

# Run the pipeline with the K-means settings
kmeans_results = run_hyperspectral_clustering(
    model, dataset, clustering_config=kmeans_config, device=device
)

In [None]:
# Render the K-means results alongside the original data
kmeans_visualizations = visualize_clustering_results(kmeans_results, original_data_dict=all_data)

## 4. DBSCAN Clustering (Density-Based)

Now let's try DBSCAN clustering, which can find clusters of arbitrary shapes.

In [None]:
# DBSCAN settings passed to the clustering pipeline
dbscan_config = dict(
    method='dbscan',
    eps=0.5,  # The maximum distance between two samples to be considered in the same neighborhood
    min_samples=10,  # The number of samples in a neighborhood for a point to be considered a core point
    combine_excitations=True,
    reduction_method='pca',
    n_components=8,
)

# Run the pipeline with the DBSCAN settings
dbscan_results = run_hyperspectral_clustering(
    model, dataset, clustering_config=dbscan_config, device=device
)

In [None]:
# Render the DBSCAN results alongside the original data
dbscan_visualizations = visualize_clustering_results(dbscan_results, original_data_dict=all_data)

## 5. Gaussian Mixture Model Clustering (GMM)

GMM clustering assumes that the data points are generated from a mixture of several Gaussian distributions.

In [None]:
# GMM settings passed to the clustering pipeline
gmm_config = dict(
    method='gmm',
    n_clusters=6,  # Number of Gaussian components
    combine_excitations=True,
    reduction_method='pca',
    n_components=8,
)

# Run the pipeline with the GMM settings
gmm_results = run_hyperspectral_clustering(
    model, dataset, clustering_config=gmm_config, device=device
)

In [None]:
# Render the GMM results alongside the original data
gmm_visualizations = visualize_clustering_results(gmm_results, original_data_dict=all_data)

## 6. Separate Clustering for Each Excitation Wavelength

Instead of combining all excitations, we can also apply clustering separately to each excitation wavelength.

In [None]:
# K-means settings for the per-excitation variant
separate_config = dict(
    method='kmeans',
    n_clusters=4,
    combine_excitations=False,  # Process each excitation separately
    reduction_method='pca',
    n_components=5,
)

# Run the pipeline with the per-excitation settings
separate_results = run_hyperspectral_clustering(
    model, dataset, clustering_config=separate_config, device=device
)

In [None]:
# Render the per-excitation results alongside the original data
separate_visualizations = visualize_clustering_results(separate_results, original_data_dict=all_data)

## 7. Compare Clustering Methods

Let's compare the results of the different clustering methods.

In [None]:
# Function to print clustering metrics
def print_clustering_metrics(results, method_name):
    """Print the quality metrics stored in a clustering result.

    Args:
        results: Mapping with a 'quality_metrics' entry. That entry is
            expected to be either a flat mapping of metric name -> score,
            or a mapping of excitation -> {metric name -> score}.
        method_name: Label shown in the printed header.

    Scores that are NaN are skipped.
    """
    def _print_scores(scores):
        # One indented "name: value" line per non-NaN score
        for metric, value in scores.items():
            if not np.isnan(value):
                print(f"  {metric}: {value:.4f}")

    print(f"\n--- {method_name} Clustering Metrics ---")

    quality_metrics = results['quality_metrics']

    # Both layouts are dicts, so isinstance(quality_metrics, dict) cannot tell
    # them apart; the per-excitation layout is the one whose values are dicts.
    per_excitation = bool(quality_metrics) and all(
        isinstance(entry, dict) for entry in quality_metrics.values()
    )

    if per_excitation:
        # Print metrics for each excitation
        for ex, metrics in quality_metrics.items():
            print(f"\nExcitation {ex}nm:")
            _print_scores(metrics)
    else:
        # Print overall metrics
        _print_scores(quality_metrics)

# Report the metrics of every clustering run, one method after another
for method_results, method_label in (
    (kmeans_results, "K-means"),
    (dbscan_results, "DBSCAN"),
    (gmm_results, "GMM"),
    (separate_results, "Separate K-means"),
):
    print_clustering_metrics(method_results, method_label)

In [None]:
# Create a visual comparison of the different clustering methods
fig, axes = plt.subplots(2, 2, figsize=(20, 16))

# Choose an excitation wavelength to compare
ex_to_compare = list(all_data.keys())[0]  # First excitation

# Define colormap
cmap = 'tab10'

# One panel per method: (axes, label, results, configured cluster count).
# DBSCAN has no configured count, so None means "derive it from the labels".
comparison_panels = [
    (axes[0, 0], "K-means", kmeans_results, kmeans_config['n_clusters']),
    (axes[0, 1], "DBSCAN", dbscan_results, None),
    (axes[1, 0], "GMM", gmm_results, gmm_config['n_clusters']),
    (axes[1, 1], "Separate K-means", separate_results, separate_config['n_clusters']),
]

for panel_ax, panel_label, panel_results, panel_n_clusters in comparison_panels:
    panel_maps = panel_results['cluster_maps']
    if ex_to_compare not in panel_maps:
        # Nothing to draw for this method: hide the frame instead of
        # leaving an empty, untitled set of axes in the figure
        panel_ax.set_axis_off()
        continue

    panel_map = panel_maps[ex_to_compare]

    if panel_n_clusters is None:
        # NOTE(review): assumes negative labels mark noise rather than a
        # cluster (scikit-learn's DBSCAN uses -1 for noise) — confirm that
        # hyperspectral_clustering keeps that convention in its cluster maps.
        panel_labels = np.unique(panel_map)
        n_clusters_dbscan = int(np.sum(panel_labels >= 0))
        panel_n_clusters = n_clusters_dbscan

    panel_image = panel_ax.imshow(panel_map, cmap=cmap)
    panel_ax.set_title(f"{panel_label} ({panel_n_clusters} clusters)")
    plt.colorbar(panel_image, ax=panel_ax)

plt.suptitle(f"Comparison of Clustering Methods for Excitation {ex_to_compare}nm", fontsize=16)
plt.tight_layout()
plt.show()

## 8. Deep Dive into the Best Clustering Method

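Based on the metrics, let's perform a deeper analysis of the most promising clustering method. The cell below assumes K-means; point `best_results` and `best_method` at another method if it scored better.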

In [None]:
# Choose the best method based on the metrics (assuming K-means for this example)
best_method = "K-means"
best_results = kmeans_results

# Cluster maps of the chosen method, keyed by excitation
cluster_maps = best_results['cluster_maps']

# Per excitation: summarize each cluster by its size and mean spectrum
for ex in all_data.keys():
    # Excitations without a cluster map are skipped
    if ex not in cluster_maps:
        continue

    print(f"\n=== Analyzing Clusters for Excitation {ex}nm ===")

    # Original data and the matching cluster map
    original_data = all_data[ex].numpy()
    cluster_map = cluster_maps[ex]

    # Distinct labels present in the map
    unique_clusters = np.unique(cluster_map)
    print(f"Found {len(unique_clusters)} unique clusters")

    cluster_spectra = {}
    cluster_sizes = {}

    for cluster_id in unique_clusters:
        # Boolean selection of the pixels carrying this label
        mask = cluster_map == cluster_id

        # Number of selected pixels
        size = mask.sum()
        cluster_sizes[cluster_id] = size

        # Average over the selected pixels
        cluster_spectrum = original_data[mask].mean(axis=0)
        cluster_spectra[cluster_id] = cluster_spectrum

        print(f"Cluster {cluster_id}: {size} pixels ({size/mask.size*100:.1f}% of total)")

    # One line per cluster, plotted against the band index
    plt.figure(figsize=(12, 6))

    for cluster_id, spectrum in cluster_spectra.items():
        plt.plot(
            range(len(spectrum)),
            spectrum,
            label=f"Cluster {cluster_id} ({cluster_sizes[cluster_id]} pixels)",
        )

    plt.title(f"{best_method} Clustering: Average Spectra for Excitation {ex}nm")
    plt.xlabel("Emission Band Index")
    plt.ylabel("Mean Intensity (Normalized)")
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.show()

## 9. Finding Optimal Number of Clusters

Let's determine the optimal number of clusters for K-means.

In [None]:
# NOTE(review): evaluate_clustering was called here without being imported
# anywhere in the notebook (NameError on a fresh kernel). It is assumed to
# live in hyperspectral_clustering next to cluster_features — confirm, then
# move it into the import cell at the top of the notebook.
from hyperspectral_clustering import evaluate_clustering

# Prepare features for clustering
prepared_features, excitation_indices, excitation_wavelengths = prepare_features_for_clustering(
    encoded_features,
    combine_excitations=True,
    reduction_method='pca',
    n_components=8
)

# Try different numbers of clusters
k_range = range(2, 11)
silhouette_scores = []
davies_bouldin_scores = []
calinski_harabasz_scores = []

for k in k_range:
    print(f"Testing k={k}...")
    # Apply K-means
    cluster_labels, _ = cluster_features(prepared_features, method='kmeans', n_clusters=k)

    # Evaluate clustering
    metrics = evaluate_clustering(prepared_features, cluster_labels)

    # Save scores
    silhouette_scores.append(metrics['silhouette'])
    davies_bouldin_scores.append(metrics['davies_bouldin'])
    calinski_harabasz_scores.append(metrics['calinski_harabasz'])

# Plot the results: one panel per metric, all sharing the same layout
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 5))

score_panels = [
    (ax1, silhouette_scores, 'Silhouette Score (higher is better)'),
    (ax2, davies_bouldin_scores, 'Davies-Bouldin Score (lower is better)'),
    (ax3, calinski_harabasz_scores, 'Calinski-Harabasz Score (higher is better)'),
]

for score_ax, score_values, score_title in score_panels:
    score_ax.plot(k_range, score_values, 'o-')
    score_ax.set_title(score_title)
    score_ax.set_xlabel('Number of Clusters (k)')
    score_ax.set_ylabel('Score')
    score_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# The NaN-aware variants are used because a plain argmax/argmin would select
# a NaN score as the "best" one. They raise ValueError if every score is NaN,
# which means no k could be evaluated at all.

# Determine optimal k based on silhouette score
optimal_k = k_range[int(np.nanargmax(silhouette_scores))]
print(f"Optimal number of clusters based on Silhouette Score: {optimal_k}")

# Determine optimal k based on Davies-Bouldin score
optimal_k_db = k_range[int(np.nanargmin(davies_bouldin_scores))]
print(f"Optimal number of clusters based on Davies-Bouldin Score: {optimal_k_db}")

# Determine optimal k based on Calinski-Harabasz score
optimal_k_ch = k_range[int(np.nanargmax(calinski_harabasz_scores))]
print(f"Optimal number of clusters based on Calinski-Harabasz Score: {optimal_k_ch}")

## 10. Run Clustering with Optimal Parameters

Based on the analysis above, let's rerun K-means with the number of clusters that maximized the silhouette score (`optimal_k`).

In [None]:
# K-means settings using the cluster count selected above
optimal_config = dict(
    method='kmeans',
    n_clusters=optimal_k,  # Use the optimal k determined above
    combine_excitations=True,
    reduction_method='pca',
    n_components=8,
)

# Run the pipeline with the selected settings
optimal_results = run_hyperspectral_clustering(
    model, dataset, clustering_config=optimal_config, device=device
)

In [None]:
# Render the results of the selected configuration alongside the original data
optimal_visualizations = visualize_clustering_results(optimal_results, original_data_dict=all_data)

## 11. Export Cluster Maps for Further Analysis

Save the cluster maps for further analysis or integration with other tools.

In [None]:
# Destination folder for the exported arrays (created if it does not exist yet)
output_dir = Path("cluster_maps")
output_dir.mkdir(exist_ok=True)

# Write one .npy file per excitation
for ex, cluster_map in optimal_results['cluster_maps'].items():
    output_file = output_dir / f"cluster_map_{ex}nm.npy"
    np.save(output_file, cluster_map)
    print(f"Saved cluster map for excitation {ex}nm to {output_file}")

# Store the configuration next to the maps.
# NOTE(review): if the config is a plain dict, np.save stores it as a pickled
# object array, so reading it back needs np.load(..., allow_pickle=True) — confirm.
config_file = output_dir / "clustering_config.npy"
np.save(config_file, optimal_results['config'])
print(f"Saved clustering configuration to {config_file}")

## 12. Summary of Clustering Results

Let's summarize the findings from our clustering analysis.

In [None]:
# Print summary of clustering results
print("\n=== Summary of Hyperspectral Clustering Results ===")
print(f"\nOptimal Clustering Method: K-means with {optimal_k} clusters")
# Read the reduction settings from the config actually used, so the summary
# cannot drift from the run it describes
print(
    f"Dimensionality Reduction: {optimal_config['reduction_method'].upper()} "
    f"with {optimal_config['n_components']} components"
)

# Print cluster distributions
print("\nCluster Distributions:")
for ex in optimal_results['cluster_maps']:
    cluster_map = optimal_results['cluster_maps'][ex]
    unique, counts = np.unique(cluster_map, return_counts=True)
    percentages = counts / np.sum(counts) * 100

    print(f"\nExcitation {ex}nm:")
    for cluster_id, count, percentage in zip(unique, counts, percentages):
        print(f"  Cluster {cluster_id}: {count} pixels ({percentage:.1f}%)")

# Print quality metrics
print("\nClustering Quality Metrics:")
summary_metrics = optimal_results['quality_metrics']

# The metrics are a dict in both layouts (flat metric -> score, or
# excitation -> {metric -> score}), so isinstance(..., dict) cannot tell them
# apart; the per-excitation layout is the one whose values are dicts.
summary_per_excitation = bool(summary_metrics) and all(
    isinstance(entry, dict) for entry in summary_metrics.values()
)

if summary_per_excitation:
    for ex, metrics in summary_metrics.items():
        print(f"\nExcitation {ex}nm:")
        for metric, value in metrics.items():
            if not np.isnan(value):
                print(f"  {metric}: {value:.4f}")
else:
    for metric, value in summary_metrics.items():
        if not np.isnan(value):
            print(f"  {metric}: {value:.4f}")