In [1]:
# CNN-Autoencoder.ipynb
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, random_split
import time
from pathlib import Path
import pickle

# Import our modules - importing from the current directory
from HyperspectralDataset import HyperspectralDataset
from HyperspectralAutoencoder import PatchedHyperspectralAutoencoder
from training_utils import train_autoencoder, plot_training_history, evaluate_clustering
from visualization_utils import (
    visualize_reconstructions,
    visualize_cluster_map,
    visualize_feature_space,
    visualize_spectral_signatures
)

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Create output directories
output_dir = "output"
model_dir = os.path.join(output_dir, 'models')
results_dir = os.path.join(output_dir, 'results')
os.makedirs(model_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

Using device: cuda


In [2]:
# Set path to your data pickle file - update this to point to your data
data_path = "../Data/Kiwi Experiment/pickles/masked_KiwiData.pkl"

# For this example, we'll use patch-based training
use_patches = True
patch_size = 64

dataset = HyperspectralDataset(
    data_path,
    patch_size=patch_size if use_patches else None
)

n_excitations = len(dataset.excitation_wavelengths)
n_emissions = dataset.max_emissions
height, width = dataset.height, dataset.width

print(f"Dataset created with dimensions: {height}x{width}, {n_excitations} excitations, {n_emissions} emissions")

Loading data from ..\Data\Kiwi Experiment\pickles\masked_KiwiData.pkl...
Found 21 excitation wavelengths
Image dimensions: 1024 x 1392
Maximum emission bands: 31
Created 1302 patches of size 64x64
Dataset created with dimensions: 1024x1392, 21 excitations, 31 emissions


In [3]:
# Split dataset into train and validation sets
val_split = 0.2  # 20% for validation

if use_patches:
    train_size = int((1 - val_split) * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    print(f"Split into {train_size} training patches and {val_size} validation patches")
else:
    # For whole-image training, we don't split
    train_dataset, val_dataset = dataset, None
    print("Using whole image - no validation split applied")

# Create data loaders
batch_size = 32 if use_patches else 1

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True if use_patches else False,
    num_workers=0  # Changed from 4 to 0 to avoid the multiprocessing error
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0  # Changed from 4 to 0
) if val_dataset is not None else None

Split into 1041 training patches and 261 validation patches


In [4]:
# Model hyperparameters
latent_dim = 128
n_clusters = 10

# Create the model
model = PatchedHyperspectralAutoencoder(
    n_excitations=n_excitations,
    n_emissions=n_emissions,
    patch_size=patch_size,
    latent_dim=latent_dim,
    n_clusters=n_clusters
)
print(f"Created patched autoencoder with {patch_size}x{patch_size} patches")

# Print model summary
print(f"Model has {sum(p.numel() for p in model.parameters()):,} parameters")

Created patched autoencoder with 64x64 patches
Model has 5,710,827 parameters


In [5]:
# First, move the model to the same device as the data
model = model.to(device)
print(f"Model moved to {device}")

# Check the shapes of data from the DataLoader
for data in train_loader:
    print(f"Input data shape: {data.shape}")
    break

# Test a forward pass through the model
with torch.no_grad():
    # Now both model and data are on the same device
    data = data.to(device)
    reconstructed, latent, _ = model(data)
    print(f"Original shape: {data.shape}")
    print(f"Reconstructed shape: {reconstructed.shape}")
    print(f"Latent shape: {latent.shape}")

Model moved to cuda
Input data shape: torch.Size([32, 21, 31, 64, 64])
Original shape: torch.Size([32, 21, 31, 64, 64])
Reconstructed shape: torch.Size([32, 21, 31, 64, 64])
Latent shape: torch.Size([32, 128])


In [7]:
# Before training, check if input data already contains NaNs
for batch_idx, data in enumerate(train_loader):
    if torch.isnan(data).any():
        print(f"Input batch {batch_idx} contains {torch.isnan(data).sum().item()} NaN values!")
        # Print where they occur
        nan_indices = torch.where(torch.isnan(data))
        print(f"First few NaN positions: {[(i.item(), j.item()) for i, j in zip(nan_indices[0][:5], nan_indices[1][:5])]}")
    else:
        print(f"Batch {batch_idx} input is clean (no NaNs)")
    break

Input batch 0 contains 8714372 NaN values!
First few NaN positions: [(0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]


In [6]:
# Training parameters - use smaller epochs for initial testing
epochs = 10  # For a full run, use 50+ epochs
learning_rate = 0.001
update_interval = 5
patience = 10

model, history = train_autoencoder(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    n_epochs=epochs,
    n_clusters=n_clusters,
    learning_rate=learning_rate,
    device=device,
    model_save_path=model_dir,
    update_interval=update_interval,
    patience=patience
)

# Plot training history
history_path = os.path.join(results_dir, 'training_history.png')
plot_training_history(history, save_path=history_path)



Starting training for 10 epochs...
All samples contain NaN values - replacing NaNs with zeros


  return fit_method(estimator, *args, **kwargs)


EarlyStopping counter: 1 out of 10
Epoch 1/10, Time: 19.24s, Loss: nan, Recon: nan, Cluster: nan, Reg: nan, Val Loss: nan
EarlyStopping counter: 2 out of 10
Epoch 2/10, Time: 38.12s, Loss: nan, Recon: nan, Cluster: nan, Reg: nan, Val Loss: nan
EarlyStopping counter: 3 out of 10
Epoch 3/10, Time: 57.09s, Loss: nan, Recon: nan, Cluster: nan, Reg: nan, Val Loss: nan
EarlyStopping counter: 4 out of 10
Epoch 4/10, Time: 76.37s, Loss: nan, Recon: nan, Cluster: nan, Reg: nan, Val Loss: nan
EarlyStopping counter: 5 out of 10
Epoch 5/10, Time: 95.78s, Loss: nan, Recon: nan, Cluster: nan, Reg: nan, Val Loss: nan


ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
eval_dataset = HyperspectralDataset(data_path)

eval_loader = DataLoader(
    eval_dataset,
    batch_size=1,  # Process as a single batch for full image
    shuffle=False,
    num_workers=num_workers
)

# Perform clustering
result = evaluate_clustering(
    model=model,
    dataloader=eval_loader,
    n_clusters=n_clusters,
    device=device
)

print(f"Clustering completed with {n_clusters} clusters")
print(f"Metrics: {result['metrics']}")

In [None]:
# Get original and reconstructed data for visualization
model = model.to(device)
model.eval()

with torch.no_grad():
    for batch in eval_loader:
        batch = batch.to(device)
        reconstructed, _, _ = model(batch)
        original_data = batch.cpu().numpy()
        reconstructed_data = reconstructed.cpu().numpy()
        break  # Just need one batch for visualization

print("Data extracted for visualization")

In [None]:
# 1. Visualize Reconstructions
fig = visualize_reconstructions(
    original_data, reconstructed_data, n_samples=5,
    excitation_idx=0, emission_idx=None,
    save_path=os.path.join(results_dir, 'reconstructions.png')
)
plt.show()

# 2. Visualize Cluster Map
fig = visualize_cluster_map(
    result['cluster_labels'], height, width, n_clusters,
    save_path=os.path.join(results_dir, 'cluster_map.png')
)
plt.show()

# 3. Visualize Feature Space with PCA
fig = visualize_feature_space(
    result['features'], result['cluster_labels'], n_clusters, method='pca',
    save_path=os.path.join(results_dir, 'feature_space_pca.png')
)
plt.show()

# 4. Visualize Spectral Signatures for Each Cluster
fig = visualize_spectral_signatures(
    eval_dataset.data_dict, result['cluster_labels'],
    eval_dataset.data_dict, height, width, n_clusters,
    save_path=os.path.join(results_dir, 'spectral_signatures.png')
)
plt.show()

In [None]:
# Reshape labels to match image dimensions
cluster_image = result['cluster_labels'].reshape(height, width)

# Count pixels in each cluster
cluster_counts = np.bincount(result['cluster_labels'], minlength=n_clusters)
cluster_percentages = cluster_counts / len(result['cluster_labels']) * 100

# Print cluster statistics
print(f"\nCluster Statistics (n_clusters={n_clusters}):\n")
print(f"{'Cluster':<10} {'Pixels':<10} {'Percentage':<10}")
print("-" * 30)

for i in range(n_clusters):
    print(f"{i:<10} {cluster_counts[i]:<10} {cluster_percentages[i]:.2f}%")

# Save the clustering results
results_file = os.path.join(results_dir, 'clustering_results.pkl')
with open(results_file, 'wb') as f:
    pickle.dump(result, f)

print(f"Clustering results saved to {results_file}")

# Save model for later use
model_file = os.path.join(model_dir, 'final_model.pt')
torch.save({
    'model_state_dict': model.state_dict(),
    'n_excitations': n_excitations,
    'n_emissions': n_emissions,
    'patch_size': patch_size,
    'latent_dim': latent_dim,
    'n_clusters': n_clusters
}, model_file)

print(f"Model saved to {model_file}")
print(f"\nEntire pipeline completed successfully!")