In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.colors as mcolors
from matplotlib.colors import ListedColormap
from typing import Tuple, List, Optional, Dict, Union
import os
from pathlib import Path
import time
from sklearn.neighbors import NearestNeighbors

class HyperspectralDBSCAN:
    """
    Class for unsupervised DBSCAN clustering of hyperspectral data and visualizing results.
    """

    def __init__(self, df: pd.DataFrame, image_shape: Optional[Tuple[int, int]] = None):
        """
        Initialize with the flattened hyperspectral dataframe.

        Args:
            df: DataFrame containing flattened hyperspectral data with x, y coordinates
            image_shape: Tuple of (height, width) of the original image. If None, inferred from data.
        """
        self.df = df

        # Extract spatial coordinates and feature columns
        self.x_col = 'x'
        self.y_col = 'y'

        # Validate that x and y columns exist
        if self.x_col not in df.columns or self.y_col not in df.columns:
            raise ValueError(f"DataFrame must contain '{self.x_col}' and '{self.y_col}' columns")

        # Get feature columns (all columns except x and y)
        self.feature_cols = [col for col in df.columns if col not in [self.x_col, self.y_col]]

        if len(self.feature_cols) == 0:
            raise ValueError("No feature columns found in DataFrame")

        print(f"Found {len(self.feature_cols)} feature columns in the data")

        # Determine image shape if not provided
        if image_shape is None:
            self.height = int(df[self.y_col].max()) + 1
            self.width = int(df[self.x_col].max()) + 1
        else:
            self.height, self.width = image_shape

        print(f"Image shape: {self.height} × {self.width} pixels")

        # Initialize model and results
        self.model = None
        self.labels = None
        self.feature_scaler = None
        self.scaled_features = None

    def preprocess(self, handle_nan: str = 'drop_feature'):
        """
        Preprocess the data for clustering.

        Args:
            handle_nan: Strategy for handling NaN values ('drop_feature', 'fill_zero', 'fill_mean')

        Returns:
            self: For method chaining
        """
        print("Preprocessing data...")

        # Extract features
        features = self.df[self.feature_cols].copy()

        # Count missing values
        nan_counts = features.isna().sum()
        nan_features = nan_counts[nan_counts > 0]

        if len(nan_features) > 0:
            print(f"Found {len(nan_features)} features with missing values")
            print(f"Top 5 features with most NaNs: {nan_features.sort_values(ascending=False).head()}")

            # Handle missing values based on strategy
            if handle_nan == 'drop_feature':
                # Drop features with any NaN values
                good_features = [col for col in self.feature_cols if nan_counts[col] == 0]
                features = self.df[good_features].copy()
                print(f"Dropped {len(self.feature_cols) - len(good_features)} features with NaNs")
                self.feature_cols = good_features

            elif handle_nan == 'fill_zero':
                # Replace NaN with zeros
                features.fillna(0, inplace=True)
                print("Filled NaN values with zeros")

            elif handle_nan == 'fill_mean':
                # Replace NaN with feature means
                features.fillna(features.mean(), inplace=True)
                print("Filled NaN values with feature means")

            else:
                raise ValueError(f"Unknown handle_nan strategy: {handle_nan}")

        # Scale the features
        self.feature_scaler = StandardScaler()
        self.scaled_features = self.feature_scaler.fit_transform(features)
        print(f"Scaled {self.scaled_features.shape[1]} features")

        return self

    def estimate_dbscan_params(self, n_neighbors: int = 5, quantile: float = 0.95, n_samples: int = 10000):
        """
        Estimate good parameters for DBSCAN using nearest neighbors distances.

        Args:
            n_neighbors: Number of neighbors to consider for distance calculation
            quantile: Quantile of k-dist to use as eps estimate
            n_samples: Number of samples to use for estimation (to speed up computation)

        Returns:
            Tuple of (estimated_eps, estimated_min_samples)
        """
        if self.scaled_features is None:
            self.preprocess()

        print(f"Estimating DBSCAN parameters using {n_samples} sample points...")

        # Use a subset of points if data is large
        if n_samples and n_samples < len(self.scaled_features):
            indices = np.random.choice(len(self.scaled_features), n_samples, replace=False)
            sample_features = self.scaled_features[indices]
        else:
            sample_features = self.scaled_features

        # Compute nearest neighbor distances
        start_time = time.time()
        nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(sample_features)
        distances, _ = nbrs.kneighbors(sample_features)

        # Sort the k-distances
        k_distances = distances[:, n_neighbors-1]
        k_distances.sort()

        # Find the "elbow" point in the k-distance graph
        estimated_eps = k_distances[int(len(k_distances) * quantile)]

        # Recommend min_samples based on dimensionality
        # Common rule: min_samples = 2 * num_dimensions
        estimated_min_samples = max(5, 2 * sample_features.shape[1])

        duration = time.time() - start_time
        print(f"Parameter estimation completed in {duration:.2f}s")
        print(f"Estimated parameters: eps={estimated_eps:.4f}, min_samples={estimated_min_samples}")

        # Create a plot to visualize k-distances
        plt.figure(figsize=(10, 6))
        plt.plot(np.arange(len(k_distances)), k_distances, 'b-')
        plt.axhline(y=estimated_eps, color='r', linestyle='--',
                   label=f'Suggested eps: {estimated_eps:.4f}')
        plt.xlabel('Points sorted by distance')
        plt.ylabel(f'Distance to {n_neighbors}th nearest neighbor')
        plt.title('K-distance Graph for DBSCAN Parameter Estimation')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

        return estimated_eps, estimated_min_samples

    def fit_dbscan(self, eps: float = 0.5, min_samples: int = 5):
        """
        Run DBSCAN on the scaled features and report the resulting clusters.

        The labels are stored on ``self.labels`` and also written into the
        ``'cluster'`` column of ``self.df``. Preprocessing is triggered
        automatically when it has not been done yet.

        Args:
            eps: The maximum distance between two samples for one to be considered a neighbor of the other
            min_samples: The number of samples in a neighborhood for a point to be considered a core point

        Returns:
            self: For method chaining
        """
        print(f"Fitting DBSCAN with eps={eps}, min_samples={min_samples}...")

        # Lazily preprocess so callers may skip the explicit preprocess() step
        if self.scaled_features is None:
            self.preprocess()

        # Time only the clustering itself
        t0 = time.time()
        self.model = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1)
        self.labels = self.model.fit_predict(self.scaled_features)
        elapsed = time.time() - t0
        print(f"DBSCAN completed in {elapsed:.2f}s")

        # Keep the labels next to the pixel data
        self.df['cluster'] = self.labels

        # Summary numbers; the distinct-label count deliberately includes the
        # noise label (-1), as the message below says
        total_points = len(self.labels)
        noise_total = np.sum(self.labels == -1)
        distinct_count = len(np.unique(self.labels))

        print(f"Found {distinct_count} clusters including noise points")
        print(f"Number of noise points: {noise_total} ({noise_total/total_points*100:.2f}%)")

        # Per-cluster pixel counts, noise excluded
        print("Cluster sizes:")
        assigned = self.labels[self.labels >= 0]
        for cluster_id, pixel_count in enumerate(np.bincount(assigned)):
            share = pixel_count / total_points * 100
            print(f"  Cluster {cluster_id}: {pixel_count} pixels ({share:.2f}%)")

        return self

    def reconstruct_cluster_image(self) -> np.ndarray:
        """
        Reconstruct the cluster assignments into the original image shape.

        Returns:
            2D array of cluster assignments with shape (height, width)
        """
        if self.labels is None:
            raise ValueError("Must fit a model first using fit_dbscan()")

        # Create empty image array (default to -1 for unassigned pixels)
        cluster_image = np.full((self.height, self.width), -1, dtype=int)

        # Fill in cluster assignments
        for idx, row in self.df.iterrows():
            x, y = int(row[self.x_col]), int(row[self.y_col])
            if 0 <= x < self.width and 0 <= y < self.height:
                cluster_image[y, x] = row['cluster']

        return cluster_image

    def visualize_clusters(self,
                          figsize: Tuple[int, int] = (12, 10),
                          cmap: Optional[Union[str, ListedColormap]] = None,
                          noise_color: str = 'black',
                          save_path: Optional[str] = None) -> plt.Figure:
        """
        Visualize the clustering results by reconstructing the original image.

        Pixels that have no row in the DataFrame are drawn with the same colour as
        noise, because the reconstructed image marks both with -1.

        Args:
            figsize: Figure size
            cmap: Colormap to use for visualization. If None, a discrete colormap with
                one colour per cluster plus the noise colour is built.
            noise_color: Color to use for noise points (cluster -1); only used when
                cmap is None
            save_path: Path to save the visualization image

        Returns:
            Matplotlib figure

        Raises:
            ValueError: If no model has been fitted yet.
        """
        if self.labels is None:
            raise ValueError("Must fit a model first using fit_dbscan()")

        # Reconstruct cluster image
        cluster_image = self.reconstruct_cluster_image()

        # Count number of actual clusters (excluding noise points)
        n_clusters = len([label for label in np.unique(self.labels) if label >= 0])

        # Build a discrete colormap: entry 0 is the noise colour, entry k+1 is cluster k
        if cmap is None:
            if n_clusters <= 10:
                # For few clusters, use distinct colors
                cluster_colors = list(mcolors.TABLEAU_COLORS.values())[:n_clusters]
            else:
                # For many clusters, sample an existing colormap at n_clusters levels.
                # plt.get_cmap is used because matplotlib.cm.get_cmap has been removed
                # from recent matplotlib releases.
                base_cmap = plt.get_cmap('tab20' if n_clusters <= 20 else 'viridis', n_clusters)
                cluster_colors = [base_cmap(i) for i in range(n_clusters)]
            cmap = ListedColormap([noise_color] + cluster_colors)

        # Create figure
        fig, ax = plt.subplots(figsize=figsize)

        # Shift labels up by one (-1 -> 0, 0 -> 1, ...) so they index the colormap.
        # The value range is padded by half a unit on both sides so that every integer
        # sits in the middle of its own colour band; with a 0..n_clusters range the
        # bands are narrower than one unit and the colorbar ticks drift off-centre.
        shifted_image = cluster_image + 1
        im = ax.imshow(shifted_image, cmap=cmap, vmin=-0.5, vmax=n_clusters + 0.5)

        # Colorbar with one centred tick per label
        cbar = plt.colorbar(im, ax=ax)
        cbar.set_label('Cluster')
        cbar.set_ticks(np.arange(n_clusters + 1))
        cbar.set_ticklabels(['-1 (Noise)'] + [str(i) for i in range(n_clusters)])

        # Count noise from the labels themselves: the image also holds -1 for pixels
        # that are simply absent from the data and are not DBSCAN noise.
        n_noise = int(np.sum(self.labels == -1))

        # Set title and labels
        ax.set_title(f'DBSCAN Clustering Results ({n_clusters} clusters, {n_noise} noise points)')
        ax.set_xlabel('X Coordinate')
        ax.set_ylabel('Y Coordinate')

        # Save figure if requested (via the figure itself, not pyplot's "current" figure)
        if save_path:
            fig.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Saved cluster visualization to {save_path}")

        return fig

    def visualize_separate_clusters(self,
                                   figsize: Tuple[int, int] = (15, 10),
                                   cmap: str = 'viridis',
                                   n_cols: int = 3,
                                   include_noise: bool = True,
                                   save_path: Optional[str] = None) -> plt.Figure:
        """
        Visualize each cluster separately.

        Args:
            figsize: Figure size
            cmap: Colormap to use for visualization
            n_cols: Number of columns in the subplot grid
            include_noise: Whether to include noise points as a separate cluster
            save_path: Path to save the visualization image

        Returns:
            Matplotlib figure
        """
        if self.labels is None:
            raise ValueError("Must fit a model first using fit_dbscan()")

        # Reconstruct cluster image
        cluster_image = self.reconstruct_cluster_image()

        # Get unique cluster labels
        unique_labels = sorted(np.unique(self.labels))

        # Decide whether to include noise in visualization
        if not include_noise and -1 in unique_labels:
            unique_labels.remove(-1)

        n_clusters = len(unique_labels)

        # Set up subplot grid
        n_rows = (n_clusters + n_cols - 1) // n_cols
        fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
        axes = axes.flatten()

        # Create separate mask for each cluster
        for i, label in enumerate(unique_labels):
            ax = axes[i]

            # Create binary mask for this cluster
            mask = (cluster_image == label).astype(float)

            # Display the mask
            im = ax.imshow(mask, cmap=cmap)

            # Set title and turn off axis labels
            cluster_name = f'Noise Points' if label == -1 else f'Cluster {label}'
            ax.set_title(cluster_name)
            ax.set_xticks([])
            ax.set_yticks([])

            # Add colorbar
            plt.colorbar(im, ax=ax)

        # Turn off any unused subplots
        for i in range(n_clusters, len(axes)):
            axes[i].axis('off')

        # Add overall title
        fig.suptitle('Individual Clusters', fontsize=16)
        plt.tight_layout(rect=[0, 0, 1, 0.96])  # Make room for suptitle

        # Save figure if requested
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Saved individual cluster visualization to {save_path}")

        return fig

    def visualize_cluster_spectra(self,
                                 excitations: Optional[List[float]] = None,
                                 figsize: Tuple[int, int] = (15, 10),
                                 include_noise: bool = False,
                                 save_path: Optional[str] = None) -> plt.Figure:
        """
        Visualize the average spectra for each cluster.

        Args:
            excitations: List of excitation wavelengths to include (if None, use all)
            figsize: Figure size
            include_noise: Whether to include noise points
            save_path: Path to save the visualization image

        Returns:
            Matplotlib figure
        """
        if self.labels is None:
            raise ValueError("Must fit a model first using fit_dbscan()")

        # Get all feature columns that have the format "emission-excitation"
        spectral_cols = [col for col in self.feature_cols if '-' in col]

        # If no spectral columns, we can't visualize spectra
        if not spectral_cols:
            raise ValueError("No spectral columns found in DataFrame")

        # Parse excitation and emission wavelengths from column names
        wavelengths = []
        for col in spectral_cols:
            try:
                emission, excitation = map(float, col.split('-'))
                wavelengths.append((emission, excitation, col))
            except ValueError:
                print(f"Skipping column {col} - doesn't match expected format")

        # Filter by excitation wavelengths if requested
        if excitations:
            wavelengths = [(em, ex, col) for em, ex, col in wavelengths if ex in excitations]

        # If no wavelengths left, we can't visualize spectra
        if not wavelengths:
            raise ValueError("No valid spectral columns found after filtering")

        # Group by excitation wavelength
        excitations = sorted(set(ex for _, ex, _ in wavelengths))

        # Set up subplot grid
        n_rows = min(3, len(excitations))
        n_cols = (len(excitations) + n_rows - 1) // n_rows
        fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
        axes = axes.flatten()

        # Get unique cluster labels (sorted, excluding noise if specified)
        unique_labels = sorted(np.unique(self.labels))
        if not include_noise and -1 in unique_labels:
            unique_labels.remove(-1)

        n_clusters = len(unique_labels)

        # Set up colors for clusters - using a colormap that works well for many clusters
        if n_clusters <= 10:
            colors = plt.cm.tab10(np.linspace(0, 1, max(10, n_clusters)))
        else:
            colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))

        # Plot spectra for each excitation wavelength
        for ax_idx, excitation in enumerate(excitations):
            if ax_idx < len(axes):
                ax = axes[ax_idx]

                # Get emission wavelengths and column names for this excitation
                excitation_data = [(em, col) for em, ex, col in wavelengths if ex == excitation]

                if excitation_data:
                    # Sort by emission wavelength
                    excitation_data.sort()
                    emission_wavelengths = [em for em, _ in excitation_data]
                    columns = [col for _, col in excitation_data]

                    # Plot average spectrum for each cluster
                    for i, label in enumerate(unique_labels):
                        # Get cluster data
                        cluster_data = self.df[self.df['cluster'] == label]

                        # Skip if cluster is empty
                        if len(cluster_data) == 0:
                            continue

                        # Calculate mean spectrum for this cluster
                        mean_spectrum = cluster_data[columns].mean().values

                        # Determine label and style
                        if label == -1:
                            cluster_name = "Noise"
                            line_style = ":"  # dotted line for noise
                        else:
                            cluster_name = f'Cluster {label}'
                            line_style = "-"  # solid line for clusters

                        # Plot spectrum
                        color_idx = i if label != -1 else -1  # Use last color for noise
                        ax.plot(emission_wavelengths, mean_spectrum,
                                line_style, color=colors[color_idx],
                                linewidth=2, label=cluster_name)

                    # Set labels and title
                    ax.set_xlabel('Emission Wavelength (nm)')
                    ax.set_ylabel('Mean Intensity')
                    ax.set_title(f'Excitation {excitation} nm')
                    ax.grid(True, alpha=0.3)

                    # Add legend to first plot only
                    if ax_idx == 0:
                        ax.legend(loc='best')

        # Turn off any unused subplots
        for i in range(len(excitations), len(axes)):
            axes[i].axis('off')

        # Add overall title
        fig.suptitle('Average Spectra by Cluster', fontsize=16)
        plt.tight_layout(rect=[0, 0, 1, 0.96])  # Make room for suptitle

        # Save figure if requested
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Saved cluster spectra visualization to {save_path}")

        return fig

    def save_cluster_results(self, output_file: str):
        """
        Save clustering results to a file.

        Args:
            output_file: Path to save the results
        """
        if self.labels is None:
            raise ValueError("Must fit a model first using fit_dbscan()")

        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_file)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Get file extension
        _, ext = os.path.splitext(output_file)

        # Save based on file type
        if ext.lower() in ['.csv']:
            # Save to CSV (coordinates and cluster assignments only)
            result_df = self.df[[self.x_col, self.y_col, 'cluster']].copy()
            result_df.to_csv(output_file, index=False)
            print(f"Saved cluster assignments to {output_file}")

        elif ext.lower() in ['.pkl', '.pickle']:
            # Save full DataFrame with cluster assignments
            self.df.to_pickle(output_file)
            print(f"Saved full DataFrame with clusters to {output_file}")

        elif ext.lower() in ['.parquet']:
            # Save full DataFrame with cluster assignments
            self.df.to_parquet(output_file)
            print(f"Saved full DataFrame with clusters to {output_file}")

        else:
            # Default to CSV
            result_df = self.df[[self.x_col, self.y_col, 'cluster']].copy()
            result_df.to_csv(output_file, index=False)
            print(f"Saved cluster assignments to {output_file}")

In [None]:
def run_dbscan_clustering(
    input_file: str,
    eps: Optional[float] = None,
    min_samples: Optional[int] = None,
    auto_params: bool = True,
    output_dir: Optional[str] = None,
    sample_size: int = 10000
):
    """
    Run DBSCAN clustering on hyperspectral data and save results.

    Writes into output_dir, all prefixed with the input file's base name: the
    k-distance plot (only when auto_params is True), the cluster map, the
    per-cluster masks, the cluster spectra (when spectral columns exist), a CSV
    of cluster assignments and a text file with the parameters used.

    Args:
        input_file: Path to input CSV/parquet/pickle file with flattened hyperspectral data
        eps: DBSCAN eps parameter (max distance between points to be considered neighbors)
        min_samples: DBSCAN min_samples parameter (min points to form a dense region)
        auto_params: Whether to automatically estimate parameters using k-dist method.
            Explicitly passed eps / min_samples still take precedence over the estimates.
        output_dir: Directory to save outputs (if None, use same directory as input)
        sample_size: Number of samples to use for parameter estimation (if auto_params=True)

    Raises:
        ValueError: If the input file extension is not supported.
    """
    # Determine file type and load data
    _, ext = os.path.splitext(input_file)
    suffix = ext.lower()

    print(f"Loading data from {input_file}...")
    if suffix == '.csv':
        df = pd.read_csv(input_file)
    elif suffix in ('.pkl', '.pickle'):
        # NOTE: unpickling can execute arbitrary code; only load pickle files
        # that come from a trusted source.
        df = pd.read_pickle(input_file)
    elif suffix == '.parquet':
        df = pd.read_parquet(input_file)
    else:
        raise ValueError(f"Unsupported file type: {ext}")

    print(f"Loaded dataframe with {len(df)} rows and {len(df.columns)} columns")

    # Set up output directory (fall back to the input's directory, or the cwd)
    if output_dir is None:
        output_dir = os.path.dirname(input_file) or "."

    # exist_ok avoids the check-then-create race of testing for existence first
    os.makedirs(output_dir, exist_ok=True)

    # Get base filename without extension
    base_name = os.path.splitext(os.path.basename(input_file))[0]

    # Initialize clustering
    clustering = HyperspectralDBSCAN(df)

    # Preprocess the data
    clustering.preprocess()

    # If auto_params is True, estimate parameters
    if auto_params:
        estimated_eps, estimated_min_samples = clustering.estimate_dbscan_params(
            n_samples=sample_size, quantile=0.95
        )

        # Save k-distance plot.
        # NOTE(review): estimate_dbscan_params() has already called plt.show(); on
        # backends that discard a figure once shown (the Jupyter inline backend is
        # believed to) this may write an empty image — confirm and, if so, save the
        # plot before it is shown.
        plt.savefig(os.path.join(output_dir, f"{base_name}_kdist_plot.png"), dpi=300, bbox_inches='tight')
        plt.close()

        # Use estimated parameters if not explicitly provided
        if eps is None:
            eps = estimated_eps
        if min_samples is None:
            min_samples = estimated_min_samples

    # Use default parameters if still None
    if eps is None:
        eps = 0.5
    if min_samples is None:
        min_samples = 5

    # Fit DBSCAN with the parameters
    clustering.fit_dbscan(eps=eps, min_samples=min_samples)

    # Each figure is closed once it has been written so that repeated runs do not
    # accumulate open figures in memory.

    # Visualize clustering results
    fig = clustering.visualize_clusters()
    clusters_path = os.path.join(output_dir, f"{base_name}_dbscan_clusters.png")
    fig.savefig(clusters_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved cluster visualization to {clusters_path}")

    # Visualize individual clusters
    fig = clustering.visualize_separate_clusters()
    separate_path = os.path.join(output_dir, f"{base_name}_dbscan_separate_clusters.png")
    fig.savefig(separate_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved individual cluster visualization to {separate_path}")

    # Visualize cluster spectra (best effort: data without spectral columns is fine)
    try:
        fig = clustering.visualize_cluster_spectra()
    except ValueError as e:
        print(f"Could not visualize cluster spectra: {e}")
    else:
        spectra_path = os.path.join(output_dir, f"{base_name}_dbscan_cluster_spectra.png")
        fig.savefig(spectra_path, dpi=300, bbox_inches='tight')
        plt.close(fig)
        print(f"Saved cluster spectra visualization to {spectra_path}")

    # Save cluster results
    results_path = os.path.join(output_dir, f"{base_name}_dbscan_results.csv")
    clustering.save_cluster_results(results_path)

    # Save parameters used, together with a short summary of the outcome
    labels = clustering.labels
    n_noise = np.sum(labels == -1)
    n_found = len(np.unique(labels)) - (1 if -1 in labels else 0)
    with open(os.path.join(output_dir, f"{base_name}_dbscan_params.txt"), 'w', encoding='utf-8') as f:
        f.write("DBSCAN Parameters:\n")
        f.write(f"eps: {eps}\n")
        f.write(f"min_samples: {min_samples}\n")
        f.write(f"Number of clusters: {n_found}\n")
        f.write(f"Number of noise points: {n_noise}\n")
        f.write(f"Percentage of noise: {n_noise / len(labels) * 100:.2f}%\n")

    print("DBSCAN clustering complete!")

In [None]:
# Run the full DBSCAN pipeline on KiwiDataMasked.parquet with automatically
# estimated eps / min_samples; all outputs are written to DBScanResults/Masked.
# NOTE(review): "parquests" in the input path looks like a possible typo for
# "parquets" — confirm the directory name on disk before changing it.
run_dbscan_clustering(
    "../Data/Kiwi Experiment/parquests/KiwiDataMasked.parquet",
    auto_params=True,
    output_dir="DBScanResults/Masked",
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
import matplotlib.colors as mcolors
from matplotlib.colors import ListedColormap
from typing import Tuple, List, Optional, Dict, Union
import os
from pathlib import Path
import time
import gc  # Garbage collector
from joblib import Memory  # For caching

class MemoryEfficientDBSCAN:
    """
    Memory-optimized DBSCAN clustering for large flattened hyperspectral datasets.

    Typical workflow::

        model = MemoryEfficientDBSCAN(df)
        model.preprocess()
        eps, min_samples = model.estimate_dbscan_params()
        model.fit_dbscan(eps=eps, min_samples=min_samples)
        model.visualize_clusters()

    Notes:
        * The DataFrame passed to the constructor is stored by reference, and
          fit_dbscan() writes a 'cluster' column into it.
        * Matplotlib types in annotations are quoted so that defining the class
          does not evaluate them eagerly.
    """

    def __init__(self, df: pd.DataFrame, image_shape: Optional[Tuple[int, int]] = None):
        """
        Initialize with the flattened hyperspectral dataframe.

        Args:
            df: DataFrame containing flattened hyperspectral data with x, y coordinates
            image_shape: Tuple of (height, width) of the original image. If None, inferred from data.

        Raises:
            ValueError: If the 'x'/'y' columns are missing, if there are no feature
                columns, or if the shape must be inferred from an empty DataFrame.
        """
        self.df = df

        # Names of the spatial coordinate columns
        self.x_col = 'x'
        self.y_col = 'y'

        # Validate that x and y columns exist
        if self.x_col not in df.columns or self.y_col not in df.columns:
            raise ValueError(f"DataFrame must contain '{self.x_col}' and '{self.y_col}' columns")

        # Every column that is not a coordinate is treated as a feature
        self.feature_cols = [col for col in df.columns if col not in (self.x_col, self.y_col)]

        if not self.feature_cols:
            raise ValueError("No feature columns found in DataFrame")

        print(f"Found {len(self.feature_cols)} feature columns in the data")

        # Determine image shape if not provided
        if image_shape is None:
            if len(df) == 0:
                # max() of an empty column is NaN and int(NaN) fails with an
                # unhelpful message, so fail early with a clear one.
                raise ValueError("Cannot infer image shape from an empty DataFrame; pass image_shape")
            self.height = int(df[self.y_col].max()) + 1
            self.width = int(df[self.x_col].max()) + 1
        else:
            self.height, self.width = image_shape

        print(f"Image shape: {self.height} × {self.width} pixels")

        # Initialize model and results
        self.model = None
        self.labels = None
        self.feature_scaler = None
        self.scaled_features = None
        self.pca_model = None
        self.reduced_features = None
        # Row positions clustered by fit_dbscan() when max_samples is used, else None
        self.sampled_indices = None

    @staticmethod
    def _pca_settings(n_components: Optional[int], pca_variance: float) -> Dict[str, object]:
        """Return the PCA keyword arguments for a fixed count or a variance target."""
        if n_components is None:
            # A fractional n_components (variance target) is only accepted by
            # scikit-learn's 'full' solver; 'randomized' raises ValueError.
            return {'n_components': pca_variance, 'svd_solver': 'full'}
        return {'n_components': n_components, 'svd_solver': 'randomized'}

    @staticmethod
    def _quantile_index(n_values: int, quantile: float) -> int:
        """Return the index into a sorted array of n_values for the given quantile."""
        if not 0.0 <= quantile <= 1.0:
            raise ValueError(f"quantile must be between 0 and 1, got {quantile}")
        # Clamp so that quantile == 1.0 selects the last element instead of
        # running one past the end.
        return min(int(n_values * quantile), n_values - 1)

    @staticmethod
    def _parse_spectral_columns(columns) -> List[Tuple[float, float, object]]:
        """Parse "emission-excitation" column names into (emission, excitation, column) tuples."""
        parsed = []
        for col in columns:
            # Non-string labels cannot follow the naming scheme
            if not isinstance(col, str) or '-' not in col:
                continue
            try:
                emission, excitation = map(float, col.split('-'))
            except ValueError:
                print(f"Skipping column {col} - doesn't match expected format")
                continue
            parsed.append((emission, excitation, col))
        return parsed

    def preprocess(self, handle_nan: str = 'drop_feature', n_components: Optional[int] = None,
                  pca_variance: float = 0.95):
        """
        Preprocess the data for clustering with dimensionality reduction.

        Missing values are handled first, then the features are standardized and
        projected with PCA. The result is stored in ``self.reduced_features``.

        Args:
            handle_nan: Strategy for handling NaN values ('drop_feature', 'fill_zero', 'fill_mean')
            n_components: Number of PCA components to keep (if None, determined by variance)
            pca_variance: Proportion of variance to preserve if n_components is None

        Returns:
            self: For method chaining

        Raises:
            ValueError: If NaNs are present and handle_nan is not a known strategy,
                or if 'drop_feature' would remove every feature column.
        """
        print("Preprocessing data...")

        # Column selection already yields a new frame, so no extra copy is made
        features = self.df[self.feature_cols]

        # Count missing values
        nan_counts = features.isna().sum()
        nan_features = nan_counts[nan_counts > 0]

        if len(nan_features) > 0:
            print(f"Found {len(nan_features)} features with missing values")
            print(f"Top 5 features with most NaNs: {nan_features.sort_values(ascending=False).head()}")

            # Handle missing values based on strategy
            if handle_nan == 'drop_feature':
                # Keep only features without any NaN, preserving column order
                good_features = list(nan_counts.index[nan_counts == 0])
                if not good_features:
                    raise ValueError(
                        "Every feature column contains NaN values; "
                        "use 'fill_zero' or 'fill_mean' instead of 'drop_feature'"
                    )
                features = self.df[good_features]
                print(f"Dropped {len(self.feature_cols) - len(good_features)} features with NaNs")
                self.feature_cols = good_features

            elif handle_nan == 'fill_zero':
                # Replace NaN with zeros
                features = features.fillna(0)
                print("Filled NaN values with zeros")

            elif handle_nan == 'fill_mean':
                # Replace NaN with feature means
                features = features.fillna(features.mean())
                print("Filled NaN values with feature means")

            else:
                raise ValueError(f"Unknown handle_nan strategy: {handle_nan}")

        # Scale the features
        print("Scaling features...")
        self.feature_scaler = StandardScaler()
        self.scaled_features = self.feature_scaler.fit_transform(features)

        # Drop the intermediate frame before the PCA allocation
        del features
        gc.collect()

        # Apply PCA for dimensionality reduction
        print(f"Applying PCA dimensionality reduction...")
        self.pca_model = PCA(**self._pca_settings(n_components, pca_variance))
        self.reduced_features = self.pca_model.fit_transform(self.scaled_features)

        # Release memory from original scaled features
        self.scaled_features = None
        gc.collect()

        components_kept = self.pca_model.n_components_
        variance_explained = np.sum(self.pca_model.explained_variance_ratio_)

        print(f"Reduced features from {len(self.feature_cols)} to {components_kept} dimensions")
        print(f"Preserved {variance_explained:.2%} of the original variance")

        return self

    def estimate_dbscan_params(self, n_samples: int = 10000, n_neighbors: int = 5, quantile: float = 0.95):
        """
        Estimate good parameters for DBSCAN using nearest neighbors distances on a sample.

        A k-distance plot is also drawn on a new matplotlib figure.

        Args:
            n_samples: Number of samples to use for estimation
            n_neighbors: Number of neighbors to consider for distance calculation
            quantile: Quantile of k-dist to use as eps estimate (between 0 and 1)

        Returns:
            Tuple of (estimated_eps, estimated_min_samples)

        Raises:
            ValueError: If preprocess() has not been run, if there are no data
                points, or if quantile is outside [0, 1].
        """
        if self.reduced_features is None:
            raise ValueError("Run preprocess() before estimating parameters")

        total_points = len(self.reduced_features)
        if total_points == 0:
            raise ValueError("No data points available for parameter estimation")

        print(f"Estimating DBSCAN parameters using {n_samples} sample points...")

        # Use a subset of points to estimate parameters
        if n_samples >= total_points:
            sample_features = self.reduced_features
            n_samples = total_points
            print("Using all available data points for parameter estimation")
        else:
            # Take a random sample without replacement
            indices = np.random.choice(total_points, n_samples, replace=False)
            sample_features = self.reduced_features[indices]

        # Validate the quantile before doing the expensive neighbor search
        eps_index = self._quantile_index(n_samples, quantile)

        # Compute nearest neighbor distances on the sample
        start_time = time.time()
        print("Computing nearest neighbors... (this may take a while)")

        # Use batch processing for k-distance calculation to save memory
        batch_size = min(5000, n_samples)
        k_distances = np.zeros(n_samples)

        nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(sample_features)

        # Process in batches to avoid memory issues
        for i in range(0, n_samples, batch_size):
            end_idx = min(i + batch_size, n_samples)
            batch = sample_features[i:end_idx]
            # The queried points are the fitted points, so each point is
            # returned as its own nearest neighbor at distance 0.
            distances, _ = nbrs.kneighbors(batch)
            k_distances[i:end_idx] = distances[:, n_neighbors-1]

            # Report progress
            if (i // batch_size) % 5 == 0:
                print(f"  Processed {end_idx}/{n_samples} samples...")

        # Sort the k-distances
        k_distances.sort()

        # Use the requested quantile of the sorted k-distances as eps
        estimated_eps = k_distances[eps_index]

        # Recommend min_samples based on dimensionality
        # Common rule: min_samples = 2 * num_dimensions
        estimated_min_samples = max(5, 2 * self.reduced_features.shape[1])

        duration = time.time() - start_time
        print(f"Parameter estimation completed in {duration:.2f}s")
        print(f"Estimated parameters: eps={estimated_eps:.4f}, min_samples={estimated_min_samples}")

        # Create a plot to visualize k-distances
        plt.figure(figsize=(10, 6))
        plt.plot(np.arange(len(k_distances)), k_distances, 'b-')
        plt.axhline(y=estimated_eps, color='r', linestyle='--',
                   label=f'Suggested eps: {estimated_eps:.4f}')
        plt.xlabel('Points sorted by distance')
        plt.ylabel(f'Distance to {n_neighbors}th nearest neighbor')
        plt.title('K-distance Graph for DBSCAN Parameter Estimation')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()

        # Free memory
        del nbrs, sample_features, k_distances
        gc.collect()

        return estimated_eps, estimated_min_samples

    def fit_dbscan(self, eps: float = 0.5, min_samples: int = 5, max_samples: Optional[int] = None,
                  batch_size: int = 20000):
        """
        Fit DBSCAN clustering on the PCA-reduced hyperspectral data.

        When max_samples is smaller than the dataset, only a random sample is
        clustered; every unsampled point keeps the label -1 and the sampled row
        positions are stored in ``self.sampled_indices``. The labels are also
        written to the 'cluster' column of the DataFrame.

        Args:
            eps: The maximum distance between two samples for one to be considered a neighbor
            min_samples: The number of samples in a neighborhood for a point to be considered a core point
            max_samples: Maximum number of samples to use for clustering (None = all)
            batch_size: Currently unused; kept so existing callers keep working
                (batched DBSCAN is not implemented)

        Returns:
            self: For method chaining

        Raises:
            ValueError: If preprocess() has not been run.
        """
        if self.reduced_features is None:
            raise ValueError("Run preprocess() before fitting DBSCAN")

        print(f"Fitting DBSCAN with eps={eps}, min_samples={min_samples}...")

        # Determine if we need to sample data
        total_points = len(self.reduced_features)
        using_sample = max_samples is not None and max_samples < total_points
        if using_sample:
            print(f"Sampling {max_samples} points from {total_points} total points")
            # Keep track of the sampled indices for later
            self.sampled_indices = np.random.choice(total_points, max_samples, replace=False)
            features_to_cluster = self.reduced_features[self.sampled_indices]
        else:
            # Reset so a previous sampled run does not leave stale indices behind
            self.sampled_indices = None
            features_to_cluster = self.reduced_features

        # Track start time for performance monitoring
        start_time = time.time()

        print(f"Using standard DBSCAN on {len(features_to_cluster)} points")
        self.model = DBSCAN(eps=eps, min_samples=min_samples,
                          algorithm='kd_tree',
                          leaf_size=40,
                          n_jobs=-1)  # Use all cores

        labels = self.model.fit_predict(features_to_cluster)

        # Track end time
        duration = time.time() - start_time
        print(f"DBSCAN completed in {duration:.2f}s")

        if using_sample:
            print("Mapping sample labels back to the full dataset (unsampled points stay -1)...")
            # Start with everything marked -1, then fill in the clustered sample
            full_labels = np.full(total_points, -1)
            full_labels[self.sampled_indices] = labels
            self.labels = full_labels
        else:
            self.labels = labels

        # Add cluster labels to dataframe
        self.df['cluster'] = self.labels

        # Calculate cluster sizes (including noise points labeled as -1)
        unique_labels = np.unique(self.labels)
        n_clusters = len([l for l in unique_labels if l >= 0])
        n_noise = np.sum(self.labels == -1)

        print(f"Found {n_clusters} clusters")
        print(f"Number of noise points: {n_noise} ({n_noise/len(self.labels)*100:.2f}%)")
        if using_sample:
            # Make it explicit that the figure above is inflated by the sampling
            print(f"  (includes {total_points - len(labels)} unsampled points that were not clustered)")

        print("Cluster sizes:")
        cluster_sizes = np.bincount(self.labels[self.labels >= 0])
        for i, size in enumerate(cluster_sizes):
            percentage = size / len(self.labels) * 100
            print(f"  Cluster {i}: {size} pixels ({percentage:.2f}%)")

        # Free memory
        gc.collect()

        return self

    def reconstruct_cluster_image(self) -> np.ndarray:
        """
        Reconstruct the cluster assignments into the original image shape.

        Pixels that have no row in the DataFrame, or whose coordinates fall
        outside the image bounds, are left at -1.

        Returns:
            2D array of cluster assignments with shape (height, width)

        Raises:
            ValueError: If no labels are available yet.
        """
        if self.labels is None:
            raise ValueError("Must fit a model first using fit_dbscan()")

        # Create empty image array (default to -1 for unassigned pixels)
        cluster_image = np.full((self.height, self.width), -1, dtype=int)

        x_coords = self.df[self.x_col].astype(int).values
        y_coords = self.df[self.y_col].astype(int).values

        # Keep only coordinates that fall inside the image
        in_bounds = (
            (x_coords >= 0) & (x_coords < self.width) &
            (y_coords >= 0) & (y_coords < self.height)
        )

        if np.any(in_bounds):
            labels = np.asarray(self.labels)
            cluster_image[y_coords[in_bounds], x_coords[in_bounds]] = labels[in_bounds]

        return cluster_image

    def visualize_clusters(self,
                          figsize: Tuple[int, int] = (12, 10),
                          cmap: Optional[Union[str, "ListedColormap"]] = None,
                          noise_color: str = 'black',
                          save_path: Optional[str] = None) -> "plt.Figure":
        """
        Visualize the clustering results by reconstructing the original image.

        Args:
            figsize: Figure size passed to matplotlib
            cmap: Colormap to use; if None, one is built with noise_color as the
                first entry followed by one color per cluster
            noise_color: Color used for label -1 when cmap is None
            save_path: If given, the figure is saved to this path

        Returns:
            The matplotlib figure

        Raises:
            ValueError: If no labels are available yet.
        """
        if self.labels is None:
            raise ValueError("Must fit a model first using fit_dbscan()")

        # Reconstruct cluster image
        cluster_image = self.reconstruct_cluster_image()
        labels = np.asarray(self.labels)
        unique_labels = np.unique(labels)

        # Count number of actual clusters (excluding noise points)
        n_clusters = len([l for l in unique_labels if l >= 0])
        # Count noise from the labels, not the image: the image also holds -1
        # for pixels that have no row in the DataFrame.
        n_noise = int(np.sum(labels == -1))

        # Create a custom colormap that handles noise points specially
        if cmap is None:
            if n_clusters <= 10:
                # For few clusters, use distinct colors
                cluster_colors = list(mcolors.TABLEAU_COLORS.values())[:n_clusters]
            else:
                # plt.get_cmap replaces the deprecated plt.cm.get_cmap
                base_cmap = plt.get_cmap('tab20' if n_clusters <= 20 else 'viridis', n_clusters)
                cluster_colors = [base_cmap(i) for i in range(n_clusters)]
            # Noise color goes first so it lines up with shifted label 0
            cmap = ListedColormap([noise_color] + cluster_colors)

        # Create figure
        fig, ax = plt.subplots(figsize=figsize)

        # Add 1 to all labels to shift them up (making -1 → 0, 0 → 1, etc.)
        shifted_image = cluster_image + 1
        # Half-integer limits give every label a band of width 1 centered on
        # its integer value, so integer ticks sit in the middle of each color.
        im = ax.imshow(shifted_image, cmap=cmap, vmin=-0.5, vmax=n_clusters + 0.5)

        # Create a custom colorbar with proper labels
        cbar = fig.colorbar(im, ax=ax)
        cbar.set_label('Cluster')

        # One tick per shifted label, relabeled so that 0 reads as noise
        cbar.set_ticks(np.arange(n_clusters + 1))
        cbar.set_ticklabels(['-1 (Noise)'] + [str(i) for i in range(n_clusters)])

        # Set title and labels
        ax.set_title(f'DBSCAN Clustering Results ({n_clusters} clusters, {n_noise} noise points)')
        ax.set_xlabel('X Coordinate')
        ax.set_ylabel('Y Coordinate')

        # Save figure if requested
        if save_path:
            fig.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Saved cluster visualization to {save_path}")

        return fig

    def visualize_separate_clusters(self,
                                   figsize: Tuple[int, int] = (15, 10),
                                   cmap: str = 'viridis',
                                   n_cols: int = 3,
                                   include_noise: bool = True,
                                   save_path: Optional[str] = None) -> "plt.Figure":
        """
        Visualize each cluster separately as a binary mask.

        Args:
            figsize: Figure size passed to matplotlib
            cmap: Colormap used for the binary masks
            n_cols: Number of subplot columns
            include_noise: Whether to add a panel for label -1
            save_path: If given, the figure is saved to this path

        Returns:
            The matplotlib figure

        Raises:
            ValueError: If no labels are available yet.
        """
        if self.labels is None:
            raise ValueError("Must fit a model first using fit_dbscan()")

        # Reconstruct cluster image
        cluster_image = self.reconstruct_cluster_image()

        # Get unique cluster labels
        unique_labels = sorted(np.unique(self.labels))

        # Decide whether to include noise in visualization
        if not include_noise and -1 in unique_labels:
            unique_labels.remove(-1)

        n_panels = len(unique_labels)

        # At least one row, so an empty label list still yields a valid figure
        n_rows = max(1, (n_panels + n_cols - 1) // n_cols)
        # squeeze=False always returns a 2D array, even for a 1x1 grid
        fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
        axes = axes.flatten()

        # Create separate mask for each cluster
        for ax, label in zip(axes, unique_labels):
            # Create binary mask for this cluster
            mask = (cluster_image == label).astype(float)

            # Display the mask
            im = ax.imshow(mask, cmap=cmap)

            # Set title and turn off axis labels
            cluster_name = f'Noise Points' if label == -1 else f'Cluster {label}'
            ax.set_title(cluster_name)
            ax.set_xticks([])
            ax.set_yticks([])

            # Add colorbar
            fig.colorbar(im, ax=ax)

        # Turn off any unused subplots
        for ax in axes[n_panels:]:
            ax.axis('off')

        # Add overall title
        fig.suptitle('Individual Clusters', fontsize=16)
        fig.tight_layout(rect=[0, 0, 1, 0.96])  # Make room for suptitle

        # Save figure if requested
        if save_path:
            fig.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Saved individual cluster visualization to {save_path}")

        return fig

    def visualize_cluster_spectra(self,
                                 excitations: Optional[List[float]] = None,
                                 figsize: Tuple[int, int] = (15, 10),
                                 include_noise: bool = False,
                                 save_path: Optional[str] = None) -> "plt.Figure":
        """
        Visualize the average spectra for each cluster.

        Feature columns named "emission-excitation" (two numbers separated by a
        dash) are grouped by excitation; one subplot per excitation shows the
        mean intensity per cluster against emission wavelength.

        Args:
            excitations: If given, only these excitation wavelengths are plotted
            figsize: Figure size passed to matplotlib
            include_noise: Whether to plot the mean spectrum of label -1
            save_path: If given, the figure is saved to this path

        Returns:
            The matplotlib figure

        Raises:
            ValueError: If no labels are available yet, or if no usable spectral
                columns are found.
        """
        if self.labels is None:
            raise ValueError("Must fit a model first using fit_dbscan()")

        # Candidate columns: string labels containing a dash
        spectral_cols = [col for col in self.feature_cols if isinstance(col, str) and '-' in col]

        # If no spectral columns, we can't visualize spectra
        if not spectral_cols:
            raise ValueError("No spectral columns found in DataFrame")

        # Parse excitation and emission wavelengths from column names
        wavelengths = self._parse_spectral_columns(spectral_cols)

        # Filter by excitation wavelengths if requested
        if excitations:
            wavelengths = [(em, ex, col) for em, ex, col in wavelengths if ex in excitations]

        # If no wavelengths left, we can't visualize spectra
        if not wavelengths:
            raise ValueError("No valid spectral columns found after filtering")

        # Group by excitation wavelength
        excitations = sorted(set(ex for _, ex, _ in wavelengths))

        # Set up subplot grid
        n_rows = min(3, len(excitations))
        n_cols = (len(excitations) + n_rows - 1) // n_rows
        fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
        axes = axes.flatten()

        # Get unique cluster labels (sorted, excluding noise if specified)
        unique_labels = sorted(np.unique(self.labels))
        if not include_noise and -1 in unique_labels:
            unique_labels.remove(-1)

        # Colors are assigned by position among the real clusters only, so a
        # cluster keeps its color whether or not noise is shown; noise gets a
        # fixed color that cannot collide with a cluster color.
        real_clusters = [label for label in unique_labels if label != -1]
        if len(real_clusters) <= 10:
            palette = plt.cm.tab10(np.linspace(0, 1, 10))
        else:
            palette = plt.cm.viridis(np.linspace(0, 1, len(real_clusters)))
        color_for = {label: palette[i] for i, label in enumerate(real_clusters)}
        color_for[-1] = 'black'

        # Compute every per-cluster mean in one pass over the frame instead of
        # re-scanning it for each cluster and excitation.
        selected_cols = [col for _, _, col in wavelengths]
        cluster_means = self.df.groupby('cluster')[selected_cols].mean()

        for ax, excitation in zip(axes, excitations):
            # Emission wavelengths and column names for this excitation,
            # sorted by emission wavelength
            excitation_data = sorted((em, col) for em, ex, col in wavelengths if ex == excitation)
            emission_wavelengths = [em for em, _ in excitation_data]
            columns = [col for _, col in excitation_data]

            for label in unique_labels:
                # Skip labels with no rows in the DataFrame
                if label not in cluster_means.index:
                    continue

                mean_spectrum = cluster_means.loc[label, columns].to_numpy()

                # Determine label and style
                if label == -1:
                    cluster_name = "Noise"
                    line_style = ":"  # dotted line for noise
                else:
                    cluster_name = f'Cluster {label}'
                    line_style = "-"  # solid line for clusters

                ax.plot(emission_wavelengths, mean_spectrum,
                        line_style, color=color_for[label],
                        linewidth=2, label=cluster_name)

            # Set labels and title
            ax.set_xlabel('Emission Wavelength (nm)')
            ax.set_ylabel('Mean Intensity')
            ax.set_title(f'Excitation {excitation} nm')
            ax.grid(True, alpha=0.3)

        # Add legend to first plot only
        axes[0].legend(loc='best')

        # Turn off any unused subplots
        for ax in axes[len(excitations):]:
            ax.axis('off')

        # Add overall title
        fig.suptitle('Average Spectra by Cluster', fontsize=16)
        fig.tight_layout(rect=[0, 0, 1, 0.96])  # Make room for suptitle

        # Save figure if requested
        if save_path:
            fig.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Saved cluster spectra visualization to {save_path}")

        return fig

    def save_cluster_results(self, output_file: str):
        """
        Save clustering results to a file.

        '.pkl'/'.pickle' and '.parquet' store the full DataFrame including the
        'cluster' column; '.csv' and any other extension store only the
        coordinates and cluster assignments as CSV.

        Args:
            output_file: Path to save the results

        Raises:
            ValueError: If no labels are available yet.
        """
        if self.labels is None:
            raise ValueError("Must fit a model first using fit_dbscan()")

        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Get file extension
        ext = os.path.splitext(output_file)[1].lower()

        print(f"Saving results to {output_file}...")

        if ext in ('.pkl', '.pickle'):
            # Save full DataFrame with cluster assignments
            self.df.to_pickle(output_file)
            print(f"Saved full DataFrame with clusters to {output_file}")

        elif ext == '.parquet':
            # Save full DataFrame with cluster assignments
            self.df.to_parquet(output_file, index=False)
            print(f"Saved full DataFrame with clusters to {output_file}")

        else:
            # '.csv' and unknown extensions: coordinates and cluster assignments only
            result_df = self.df[[self.x_col, self.y_col, 'cluster']]
            result_df.to_csv(output_file, index=False)
            print(f"Saved cluster assignments to {output_file}")

In [None]:
def _load_hyperspectral_frame(input_file: str) -> pd.DataFrame:
    """Load a DataFrame from a CSV, pickle, or parquet file chosen by extension."""
    _, ext = os.path.splitext(input_file)
    kind = ext.lower()
    if kind == '.csv':
        return pd.read_csv(input_file)
    if kind in ('.pkl', '.pickle'):
        # NOTE(review): unpickling can execute arbitrary code; only load
        # pickle files from a trusted source.
        return pd.read_pickle(input_file)
    if kind == '.parquet':
        return pd.read_parquet(input_file)
    raise ValueError(f"Unsupported file type: {ext}")


def _write_pca_info(pca_model, output_path: str) -> None:
    """Write the PCA component count and explained-variance ratios to a text file."""
    variance_ratio = pca_model.explained_variance_ratio_
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("PCA Information:\n")
        f.write(f"Number of components: {pca_model.n_components_}\n")
        f.write(f"Total variance explained: {np.sum(variance_ratio):.4f}\n")
        f.write("Component-wise variance explained:\n")
        for i, var in enumerate(variance_ratio, start=1):
            f.write(f"  Component {i}: {var:.4f}\n")


def _write_dbscan_params(
    output_path: str,
    eps,
    min_samples,
    max_samples,
    n_pca_components,
    pca_variance,
    labels,
) -> None:
    """Write the DBSCAN/PCA settings and, if labels exist, cluster and noise counts."""
    lines = [
        "DBSCAN Parameters:\n",
        f"eps: {eps}\n",
        f"min_samples: {min_samples}\n",
    ]
    if max_samples is not None:
        lines.append(f"max_samples used: {max_samples}\n")
    if n_pca_components is not None:
        lines.append(f"PCA components: {n_pca_components}\n")
    else:
        lines.append(f"PCA variance threshold: {pca_variance}\n")

    if labels is not None:
        labels = np.asarray(labels)
        unique_labels = np.unique(labels)
        # Label -1 marks noise points and is not counted as a cluster
        n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
        n_noise = int(np.sum(labels == -1))
        # Guard against an empty label array instead of dividing by zero
        noise_pct = n_noise / labels.size * 100 if labels.size else 0.0
        lines.append(f"Number of clusters: {n_clusters}\n")
        lines.append(f"Number of noise points: {n_noise}\n")
        lines.append(f"Percentage of noise: {noise_pct:.2f}%\n")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)


def run_memory_efficient_dbscan(
    input_file: str,
    eps: Optional[float] = None,
    min_samples: Optional[int] = None,
    n_pca_components: Optional[int] = None,
    pca_variance: float = 0.95,
    max_samples: Optional[int] = None,
    output_dir: Optional[str] = None,
    sample_size_for_estimation: int = 5000
) -> None:
    """
    Run memory-efficient DBSCAN clustering on hyperspectral data and save results.

    Files written to ``output_dir``, all prefixed with the input file's base
    name: a k-distance plot (only when parameters are estimated), the cluster
    map, the per-cluster map, the cluster spectra plot (best effort), the
    clustering results as parquet, a PCA summary (only when a PCA model is
    present), and a text file with the parameters used.

    Args:
        input_file: Path to input CSV/parquet/pickle file with flattened hyperspectral data
        eps: DBSCAN eps parameter (if None, estimated automatically)
        min_samples: DBSCAN min_samples parameter (if None, estimated automatically)
        n_pca_components: Number of PCA components to keep (if None, based on pca_variance)
        pca_variance: Proportion of variance to preserve if n_pca_components is None
        max_samples: Maximum number of samples to use for clustering (None = all)
        output_dir: Directory to save outputs (if None or empty, use same
            directory as input, or the current directory)
        sample_size_for_estimation: Number of samples to use for parameter estimation

    Raises:
        ValueError: If the input file extension is not CSV, pickle, or parquet.
    """
    print(f"Loading data from {input_file}...")
    df = _load_hyperspectral_frame(input_file)
    print(f"Loaded dataframe with {len(df)} rows and {len(df.columns)} columns")

    # Fall back to the input file's directory (or the current directory)
    if not output_dir:
        output_dir = os.path.dirname(input_file) or "."

    # exist_ok=True avoids the race between an existence check and creation
    os.makedirs(output_dir, exist_ok=True)

    # Base filename without extension, used as the prefix of every output
    base_name = os.path.splitext(os.path.basename(input_file))[0]

    def out_path(suffix: str) -> str:
        return os.path.join(output_dir, f"{base_name}{suffix}")

    clustering = MemoryEfficientDBSCAN(df)

    # Preprocess the data with dimensionality reduction
    clustering.preprocess(handle_nan='fill_mean',
                          n_components=n_pca_components,
                          pca_variance=pca_variance)

    # Estimate whichever of eps / min_samples was not given explicitly
    if eps is None or min_samples is None:
        estimated_eps, estimated_min_samples = clustering.estimate_dbscan_params(
            n_samples=sample_size_for_estimation
        )

        # Saves whatever figure is current in pyplot.
        # NOTE(review): presumably estimate_dbscan_params leaves the k-distance
        # plot as the current figure - confirm against that method.
        plt.savefig(out_path("_kdist_plot.png"), dpi=300, bbox_inches='tight')
        plt.close()

        if eps is None:
            eps = estimated_eps
        if min_samples is None:
            min_samples = estimated_min_samples

    clustering.fit_dbscan(eps=eps, min_samples=min_samples, max_samples=max_samples)

    # Combined cluster map
    fig = clustering.visualize_clusters()
    clusters_path = out_path("_dbscan_clusters.png")
    fig.savefig(clusters_path, dpi=300, bbox_inches='tight')
    plt.close(fig)  # Close figure to free memory
    print(f"Saved cluster visualization to {clusters_path}")

    # One panel per cluster
    fig = clustering.visualize_separate_clusters()
    separate_path = out_path("_dbscan_separate_clusters.png")
    fig.savefig(separate_path, dpi=300, bbox_inches='tight')
    plt.close(fig)  # Close figure to free memory
    print(f"Saved individual cluster visualization to {separate_path}")

    # Cluster spectra are best effort: a failure here is reported but must
    # not prevent the results and parameter files from being written.
    try:
        fig = clustering.visualize_cluster_spectra()
        spectra_path = out_path("_dbscan_cluster_spectra.png")
        fig.savefig(spectra_path, dpi=300, bbox_inches='tight')
        plt.close(fig)  # Close figure to free memory
        print(f"Saved cluster spectra visualization to {spectra_path}")
    except Exception as e:
        print(f"Could not visualize cluster spectra: {e}")

    # Save cluster results
    clustering.save_cluster_results(out_path("_dbscan_results.parquet"))

    # Save PCA information for reference
    if clustering.pca_model is not None:
        _write_pca_info(clustering.pca_model, out_path("_pca_info.txt"))

    # Save parameters used, plus cluster/noise counts when labels exist
    _write_dbscan_params(
        out_path("_dbscan_params.txt"),
        eps,
        min_samples,
        max_samples,
        n_pca_components,
        pca_variance,
        clustering.labels,
    )

    print("Memory-efficient DBSCAN clustering complete!")

In [None]:
# Cluster the masked kiwi parquet file. eps and min_samples are not passed,
# so run_memory_efficient_dbscan estimates them before fitting.
run_memory_efficient_dbscan(
    input_file="../Data/Kiwi Experiment/parquests/KiwiDataMasked.parquet",
    output_dir="DBScanResults/Masked",
)