In [1]:
import numpy as np
import pandas as pd
from umap import UMAP
import math
from sklearn.base import BaseEstimator
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, DBSCAN, OPTICS
from sklearn.mixture import GaussianMixture


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
class UMAPClusteringEvaluator(BaseEstimator):
    def __init__(self, n_neighbors=15, min_dist=0.1, n_components=2, metric='euclidean'):
        """
        Initialize UMAP and store hyperparameters for later grid search.
        """
        self.n_neighbors = n_neighbors
        self.min_dist = min_dist
        self.n_components = n_components
        self.metric = metric
        self.umap_model = None

    def fit(self, X, y=None):
        """
        Fit the UMAP model on the data.
        """
        self.umap_model = UMAP(n_neighbors=self.n_neighbors,
                                    min_dist=self.min_dist,
                                    n_components=self.n_components,
                                    metric=self.metric,
                                    random_state=42)
        self.umap_model.fit(X)
        return self

    def transform(self, X):
        """
        Transform the data with the fitted UMAP model.
        """
        return self.umap_model.transform(X)

    def score(self, X, y=None):
        """
        Transform the data into the reduced space and then run a set of clustering algorithms.
        Compute the silhouette score for each (if valid) and return the average silhouette score.
        """
        X_reduced = self.transform(X)
        scores = []  # To collect silhouette scores
        
        # List of clustering algorithms to evaluate.
        # You can adjust these or add new ones as needed.
        clustering_methods = [
            ('KMeans', KMeans(n_clusters=3, random_state=42)),
            ('Agglomerative', AgglomerativeClustering(n_clusters=3)),
            ('Spectral', SpectralClustering(n_clusters=3, affinity='nearest_neighbors', random_state=42)),
            ('DBSCAN', DBSCAN(eps=0.5, min_samples=5)),
            ('GaussianMixture', GaussianMixture(n_components=3, random_state=42)),
            ('OPTICS', OPTICS(min_samples=5))
        ]
        
        for name, algorithm in clustering_methods:
            try:
                # Obtain cluster labels.
                # Some algorithms have fit_predict, others require separate fitting and predicting.
                if name in ['KMeans', 'Agglomerative', 'Spectral', 'DBSCAN', 'OPTICS']:
                    labels = algorithm.fit_predict(X_reduced)
                elif name == 'GaussianMixture':
                    algorithm.fit(X_reduced)
                    labels = algorithm.predict(X_reduced)
                
                # Check if we have at least two clusters.
                # For DBSCAN, we exclude noise labeled as -1.
                if name == 'DBSCAN':
                    valid_idx = labels != -1
                    if len(np.unique(labels[valid_idx])) < 2:
                        continue
                    score = silhouette_score(X_reduced[valid_idx], labels[valid_idx])
                else:
                    if len(np.unique(labels)) < 2:
                        continue  # Skip if only one cluster is produced.
                    score = silhouette_score(X_reduced, labels)
                
                scores.append(score)
            except Exception as e:
                # In a production system you might log errors; here we just print them.
                print(f"Error with {name}: {e}")
                continue

        # If none of the clustering methods produced a valid silhouette score,
        # return a default low score. Otherwise, return the average.
        if scores:
            return np.mean(scores)
        return -1.0  # or another value indicating failure




In [6]:
df_log = pd.read_csv('/Users/harrisonma/Documents/PhD Classes/2025 Spring/BNFO 285/BNFO285_Projects/project_1/Harrison/Harrison_Local_Data/TCGA.HNSC.expression_log_all.txt', sep = '\t')

np_log = df_log.to_numpy()
np_log = np.delete(np_log, [0, 1], axis=1)


In [7]:
X = np_log

# Define parameter grid for UMAP hyperparameters.
param_grid = {
    'n_neighbors': [10, 15, 20],
    'min_dist': [0.1, 0.5],
    'n_components': [2, 5, 10]
}

# Set up GridSearchCV. Note that we don't need to specify a separate scoring function,
# because the custom estimator's score() method is used by default.
grid_search = GridSearchCV(estimator=UMAPClusteringEvaluator(),
                           param_grid=param_grid,
                           cv=5,  # 3-fold cross-validation; adjust as needed.
                           n_jobs=-1,  # Use all available cores.
                           verbose=1)

# Run the grid search over the UMAP hyperparameters.
grid_search.fit(X)

# Report the best hyperparameters and corresponding silhouette score.
print("Best parameters:", grid_search.best_params_)
print("Best average silhouette score:", grid_search.best_score_)


Fitting 5 folds for each of 18 candidates, totalling 90 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best parameters: {'min_dist': 0.5, 'n_components': 2, 'n_neighbors': 20}
Best average silhouette score: 0.4487955093383789
