In [1]:
import numpy as np 
import pandas as pd 
from umap import UMAP 
import leidenalg
import igraph as ig
import math
from sklearn.base import BaseEstimator
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, DBSCAN, OPTICS
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def leiden_cluster(X, k=25):
    """
    Create a graph from nearest neighbors and find clusters using Leiden algorithm
    
    Parameters:
    -----------
    X : array-like
        The dimensionality reduced data
    k : int, default=25
        Number of nearest neighbors
        
    Returns:
    --------
    g : igraph.Graph
        The created graph
    partition : leidenalg.VertexPartition
        The partition result from Leiden algorithm
    """
    # Find nearest neighbors
    neighbors = NearestNeighbors(n_neighbors=k).fit(X)
    distances, indices = neighbors.kneighbors(X)

    # Build edge list with weighted edges
    edges = []
    weights = []
    num_points = X.shape[0]

    for i in range(num_points):
        for idx, j in enumerate(indices[i]):
            if i == j: 
                continue
            if (j, i) in edges:
                continue
            d = distances[i, idx]
            weight = math.exp(-d)
            edges.append((i, j))
            weights.append(weight)

    # Create an igraph Graph, add vertices and edges
    g = ig.Graph()
    g.add_vertices(num_points)
    g.add_edges(edges)

    # Set the edge attribute 'weight' for our weighted graph
    g.es['weight'] = weights

    # Find partition using Leiden algorithm
    partition = leidenalg.find_partition(g, leidenalg.ModularityVertexPartition)
    # print("Clusters:", partition)
    
    return g, partition


In [None]:
class UMAPClusteringEvaluator(BaseEstimator):
    def __init__(self, n_neighbors=15, min_dist=0.1, n_components=2, metric='euclidean'):
        """
        Initialize UMAP and store hyperparameters for later grid search.
        """
        self.n_neighbors = n_neighbors
        self.min_dist = min_dist
        self.n_components = n_components
        self.metric = metric
        self.umap_model = None

    def fit(self, X, y=None):
        """
        Fit the UMAP model on the data.
        """
        self.umap_model = UMAP(n_neighbors=self.n_neighbors,
                                    min_dist=self.min_dist,
                                    n_components=self.n_components,
                                    metric=self.metric,
                                    random_state=42)
        self.umap_model.fit(X)
        return self

    def transform(self, X):
        """
        Transform the data with the fitted UMAP model.
        """
        return self.umap_model.transform(X)

    def score(self, X, y=None):
        """
        Transform the data into the reduced space and then run a set of clustering algorithms.
        Compute the silhouette score for each (if valid) and return the average silhouette score.
        """
        X_reduced = self.transform(X)
        scores = []  # To collect silhouette scores
        
        # List of clustering algorithms to evaluate.
        # You can adjust these or add new ones as needed.
        clustering_methods = [
            ('KMeans', KMeans()), # 6 k means works the best 
            ('Agglomerative', AgglomerativeClustering()),
            ('Spectral', SpectralClustering()),
            ('DBSCAN', DBSCAN()),
            ('GaussianMixture', GaussianMixture()),
            ('Leiden', leiden_cluster(X_reduced)) # Custom Leiden clustering,
        ]
        
        for name, algorithm in clustering_methods:
            try:
                # Obtain cluster labels.
                # Some algorithms have fit_predict, others require separate fitting and predicting.
                if name in ['KMeans', 'Agglomerative', 'Spectral', 'DBSCAN']:
                    labels = algorithm.fit_predict(X_reduced)
                elif name == 'GaussianMixture':
                    algorithm.fit(X_reduced)
                    labels = algorithm.predict(X_reduced)
                elif name == 'Leiden':
                    g, partition = leiden_cluster(X_reduced)
                    labels = np.array(partition.membership)
                
                # Check if we have at least two clusters.
                # For DBSCAN, we exclude noise labeled as -1.
                if name == 'DBSCAN':
                    valid_idx = labels != -1
                    if len(np.unique(labels[valid_idx])) < 2:
                        continue
                    score = silhouette_score(X_reduced[valid_idx], labels[valid_idx])
                else:
                    if len(np.unique(labels)) < 2:
                        continue  # Skip if only one cluster is produced.
                    score = silhouette_score(X_reduced, labels)
                
                scores.append(score)
            except Exception as e:
                # In a production system you might log errors; here we just print them.
                print(f"Error with {name}: {e}")
                continue

        # If none of the clustering methods produced a valid silhouette score,
        # return a default low score. Otherwise, return the average.
        if scores:
            return np.mean(scores)
        return -1.0  # or another value indicating failure


In [48]:
from sklearn.base import BaseEstimator
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, DBSCAN
from sklearn.mixture import GaussianMixture
import numpy as np

class TSNEClusteringEvaluator(BaseEstimator):
    def __init__(
        self,
        perplexity=30,
        learning_rate=200,
        n_components=2,
        n_iter=1000,
        init='pca',
        metric='euclidean',
        angle=0.5,
        random_state=42
    ):
        # t‑SNE hyperparameters
        self.perplexity = perplexity
        self.learning_rate = learning_rate
        self.n_components = n_components
        self.n_iter = n_iter
        self.init = init
        self.metric = metric
        self.angle = angle
        self.random_state = random_state

        # placeholder for the computed embedding
        self.embedding_ = None

    def fit(self, X, y=None):
        # t‑SNE has no transform, so we compute embedding here once
        self.embedding_ = TSNE(
            perplexity=self.perplexity,
            learning_rate=self.learning_rate,
            n_components=self.n_components,
            n_iter=self.n_iter,
            init=self.init,
            metric=self.metric,
            angle=self.angle,
            random_state=self.random_state,
        ).fit_transform(X)
        return self

    def transform(self, X):
        # ignore X, just return the stored embedding
        return self.embedding_

    def score(self, X, y=None):
        X_emb = self.transform(X)
        scores = []
        clustering_methods = [
            ('KMeans', KMeans()), # 6 k means works the best 
            ('Agglomerative', AgglomerativeClustering()),
            ('Spectral', SpectralClustering()),
            ('DBSCAN', DBSCAN()),
            ('GaussianMixture', GaussianMixture()),
            ('Leiden', leiden_cluster(X_emb)) # Custom Leiden clustering,
        ]
        
        for name, algorithm in clustering_methods:
            try:
                # Obtain cluster labels.
                # Some algorithms have fit_predict, others require separate fitting and predicting.
                if name in ['KMeans', 'Agglomerative', 'Spectral', 'DBSCAN']:
                    labels = algorithm.fit_predict(X_emb)
                elif name == 'GaussianMixture':
                    algorithm.fit(X_emb)
                    labels = algorithm.predict(X_emb)
                elif name == 'Leiden':
                    g, partition = leiden_cluster(X_emb)
                    labels = np.array(partition.membership)
                
                # Check if we have at least two clusters.
                # For DBSCAN, we exclude noise labeled as -1.
                if name == 'DBSCAN':
                    valid_idx = labels != -1
                    if len(np.unique(labels[valid_idx])) < 2:
                        continue
                    score = silhouette_score(X_emb[valid_idx], labels[valid_idx])
                else:
                    if len(np.unique(labels)) < 2:
                        continue  # Skip if only one cluster is produced.
                    score = silhouette_score(X_emb, labels)
                
                scores.append(score)
            except Exception as e:
                # In a production system you might log errors; here we just print them.
                print(f"Error with {name}: {e}")
                continue

        return np.mean(scores) if scores else -1.0


In [16]:
df_log = pd.read_csv('/Users/harrisonma/Documents/PhD Classes/2025 Spring/BNFO 285/BNFO285_Projects/Data/processed/TCGA.HNSC.expression_log_tumor_top2000.txt', sep = '\t')

np_log = df_log.to_numpy()
np_log = np.delete(np_log, [0, 1], axis=1)
X = np_log# np.delete(np_log, [0, 1], axis=1)


In [18]:
# Define parameter grid for UMAP hyperparameters.
param_grid = {
    'n_neighbors': [10, 15, 20],
    'min_dist': [0.1, 0.5],
    'n_components': [2, 5, 10]
}

# Set up GridSearchCV. Note that we don't need to specify a separate scoring function,
# because the custom estimator's score() method is used by default.
grid_search = GridSearchCV(estimator=UMAPClusteringEvaluator(),
                           param_grid=param_grid,
                           cv=5,  # 3-fold cross-validation; adjust as needed.
                           n_jobs=-1,  # Use all available cores.
                           verbose=1)

# Run the grid search over the UMAP hyperparameters.
grid_search.fit(X)

# Report the best hyperparameters and corresponding silhouette score.
print("Best parameters:", grid_search.best_params_)
print("Best average silhouette score:", grid_search.best_score_)


Fitting 5 folds for each of 18 candidates, totalling 90 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best parameters: {'min_dist': 0.1, 'n_components': 2, 'n_neighbors': 10}
Best average silhouette score: 0.4900832772254944


In [49]:
param_grid = {
    'perplexity':      [5, 30, 50],
    'learning_rate':   [50, 100, 200],
    'n_iter':          [250, 500, 1000],
    'n_components':    [2, 5, 10],
    # you could also include:
    # 'angle':         [0.1, 0.5, 1.0],
    # 'init':          ['pca', 'random'],
}

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(
    estimator=TSNEClusteringEvaluator(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2
)
grid.fit(X)
print("Best params:", grid.best_params_)
print("Best silhouette:", grid.best_score_)


Fitting 5 folds for each of 81 candidates, totalling 405 fits


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=5; total time=   8.6s
[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=5; total time=   8.6s
[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=5; total time=   8.9s
[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=5; total time=   8.8s
[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=5; total time=   9.0s




[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=30; total time=  26.2s




[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=30; total time=  26.0s




[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=30; total time=  26.8s




[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=30; total time=  23.3s
[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=50; total time=  23.2s




[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=30; total time=  23.8s




[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=50; total time=  24.2s




[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=50; total time=  24.8s




[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=30; total time=   6.7s




[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=30; total time=   6.4s
[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=50; total time=  21.1s




[CV] END learning_rate=50, n_components=2, n_iter=250, perplexity=50; total time=  21.1s


[4.33342471e-14 2.40055060e-06 3.06309969e-06 2.24807133e-06
 2.59588696e-06 3.34184379e-06 2.19704386e-06 4.62729078e-06
 1.55760783e-05]
not reaching the requested tolerance 5.9604644775390625e-06.
Use iteration 1894 instead with accuracy 
2.970461684702403e-06.

  _, diffusion_map = lobpcg(
[1.46532851e-14 1.84492175e-06 2.82919067e-06 2.67643987e-06
 2.39241759e-06 4.02766195e-06 1.58333444e-06 2.95603261e-06
 8.28248297e-06]
not reaching the requested tolerance 5.9604644775390625e-06.
  _, diffusion_map = lobpcg(


[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=30; total time=   6.8s




[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=5; total time=  27.5s
[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=30; total time=   6.6s




[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=30; total time=   8.0s


[8.70381514e-15 7.97913684e-07 1.65395249e-06 2.58765990e-06
 2.91952148e-06 4.17855643e-06 3.45815466e-06 3.38266164e-06
 1.06519446e-05]
not reaching the requested tolerance 5.97536563873291e-06.
Use iteration 1991 instead with accuracy 
3.2055838247970877e-06.

  _, diffusion_map = lobpcg(
[4.80765334e-15 8.09795324e-07 1.72572163e-06 2.70862543e-06
 2.94154553e-06 4.35035907e-06 3.47647431e-06 3.34489079e-06
 9.49284322e-06]
not reaching the requested tolerance 5.97536563873291e-06.
  _, diffusion_map = lobpcg(
[1.00787916e-14 1.36656063e-06 2.04459660e-06 4.09348262e-06
 1.94581622e-06 2.41986044e-06 2.54977668e-06 3.76923215e-06
 6.07527183e-06]
not reaching the requested tolerance 5.97536563873291e-06.
Use iteration 2001 instead with accuracy 
2.6960663527800172e-06.

  _, diffusion_map = lobpcg(
[6.64804990e-15 1.36674102e-06 2.04471227e-06 4.09347602e-06
 1.94600115e-06 2.41999111e-06 2.54972576e-06 3.76894672e-06
 6.07528393e-06]
not reaching the requested tolerance 5.9753656

[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=5; total time=  27.3s




[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=5; total time=  27.6s




[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=5; total time=  27.4s




[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=5; total time=  27.0s




[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=50; total time=   6.5s
[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=50; total time=   6.2s




[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=50; total time=   6.5s




[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=50; total time=   6.4s




[CV] END learning_rate=50, n_components=2, n_iter=500, perplexity=50; total time=   5.5s




[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=30; total time=   6.3s




[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=30; total time=   6.9s




[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=30; total time=   7.1s




[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=30; total time=   6.3s




[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=30; total time=   6.3s




[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=50; total time=   6.6s


[3.95718873e-14 9.31434335e-07 1.43503462e-06 1.32771962e-06
 1.63059921e-06 6.55979292e-06 2.61467030e-06 2.75912654e-06
 1.10475070e-05]
not reaching the requested tolerance 5.97536563873291e-06.
Use iteration 2001 instead with accuracy 
3.1450982911783936e-06.

  _, diffusion_map = lobpcg(
[9.15410387e-15 9.30711497e-07 1.43424959e-06 1.32656523e-06
 1.62646435e-06 6.55749877e-06 2.61791629e-06 2.76357664e-06
 1.10478989e-05]
not reaching the requested tolerance 5.97536563873291e-06.
  _, diffusion_map = lobpcg(
  return fit_method(estimator, *args, **kwargs)
[8.33241687e-15 3.36755269e-06 2.08053381e-06 1.19473381e-06
 2.85034471e-06 2.11408703e-06 2.33476333e-06 7.23455596e-06
 1.20106593e-05]
not reaching the requested tolerance 5.9604644775390625e-06.
Use iteration 1857 instead with accuracy 
3.371160083825882e-06.

  _, diffusion_map = lobpcg(
[6.72356216e-15 3.36312848e-06 2.12709885e-06 1.21285891e-06
 2.88868895e-06 1.91790079e-06 2.09778033e-06 8.44010874e-06
 8.25853445e-0

[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=5; total time=  21.6s
[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=5; total time=  22.2s
[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=50; total time=   7.5s




[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=5; total time=   0.0s




[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=5; total time=   0.1s
[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=30; total time=   0.0s




[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=5; total time=  22.1s
[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=30; total time=   0.0s




[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=5; total time=  21.7s
[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=5; total time=  21.9s
[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=50; total time=   0.0s




[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=50; total time=   0.1s
[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=50; total time=   0.1s
[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=50; total time=   8.0s
[CV] END learning_rate=50, n_components=5, n_iter=250, perplexity=50; total time=   0.0s




[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=5; total time=   0.0s




[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=30; total time=   0.0s




[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=30; total time=   0.0s




[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=50; total time=   0.0s




[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s




[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s




[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s




[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s




[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=50; total time=   6.8s
[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s




[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=5; total time=   0.0s




[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=5; total time=   0.1s
[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=5; total time=   0.0s




[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=30; total time=   0.0s




[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=250, perplexity=50; total time=   0.0s




[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=5; total time=   0.0s




[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=2, n_iter=1000, perplexity=50; total time=   5.9s
[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=30; total time=   0.0s




[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=50; total time=   0.0s




[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s




[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s




[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=50; total time=   0.1s
[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s




[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=50, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=5; total time=   4.2s
[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=5; total time=   4.3s




[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=5; total time=   4.4s
[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=5; total time=   4.5s
[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=5; total time=   4.5s




[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=30; total time=   5.3s
[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=30; total time=   5.1s
[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=30; total time=   5.3s


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=30; total time=   4.9s




[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=30; total time=   5.0s




[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=50; total time=   5.2s
[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=50; total time=   5.2s




[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=50; total time=   5.3s




[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=50; total time=   5.2s




[CV] END learning_rate=100, n_components=2, n_iter=250, perplexity=50; total time=   5.3s




[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=30; total time=   4.8s




[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=30; total time=   5.0s
[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=30; total time=   4.8s




[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=30; total time=   4.7s




[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=30; total time=   5.0s
[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=50; total time=   4.9s


[1.85875925e-14 5.62931099e-07 2.37793318e-06 1.29508380e-06
 1.69722809e-06 3.75243981e-06 6.27747187e-06 3.34376103e-06
 1.14920074e-05]
not reaching the requested tolerance 5.9604644775390625e-06.
Use iteration 1997 instead with accuracy 
3.382318314666172e-06.

  _, diffusion_map = lobpcg(
[1.26399579e-14 5.63568172e-07 2.35667870e-06 1.30743838e-06
 1.69505833e-06 3.74823520e-06 6.28369089e-06 3.30791312e-06
 1.11781383e-05]
not reaching the requested tolerance 5.9604644775390625e-06.
  _, diffusion_map = lobpcg(


[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=50; total time=   5.5s


[8.27768511e-15 1.65972045e-06 1.60973281e-06 3.10565513e-06
 1.91442539e-06 2.58833575e-06 2.55242288e-06 2.76837610e-06
 8.38991416e-06]
not reaching the requested tolerance 5.97536563873291e-06.
Use iteration 1995 instead with accuracy 
2.6812051352228213e-06.

  _, diffusion_map = lobpcg(
[5.43791042e-15 1.67432551e-06 1.61235378e-06 3.08851297e-06
 1.93475231e-06 2.58700925e-06 2.56198978e-06 2.70076168e-06
 7.97113318e-06]
not reaching the requested tolerance 5.97536563873291e-06.
  _, diffusion_map = lobpcg(


[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=5; total time=  20.8s




[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=5; total time=  17.2s
[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=50; total time=   5.6s
[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=50; total time=   5.8s




[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=5; total time=  17.4s


[8.21239378e-15 1.58403795e-06 1.19639863e-06 1.66594523e-06
 2.57891411e-06 2.04301102e-06 2.90331263e-06 2.67303962e-06
 1.11708848e-05]
not reaching the requested tolerance 5.97536563873291e-06.
Use iteration 1636 instead with accuracy 
2.7965453537690067e-06.

  _, diffusion_map = lobpcg(
[5.06913457e-15 1.91331040e-06 1.16552279e-06 1.72945271e-06
 2.79922807e-06 2.29367341e-06 3.15597869e-06 2.98083488e-06
 9.13074210e-06]
not reaching the requested tolerance 5.97536563873291e-06.
  _, diffusion_map = lobpcg(
[1.33109520e-14 1.31857896e-06 1.29462031e-06 3.41262356e-06
 3.48903056e-06 3.21455144e-06 5.43307458e-06 1.54628086e-05
 3.79683678e-06]
not reaching the requested tolerance 5.97536563873291e-06.
Use iteration 1656 instead with accuracy 
2.728871532596809e-06.

  _, diffusion_map = lobpcg(
[6.48279079e-15 1.48759540e-06 1.54247050e-06 2.65293707e-06
 3.34316091e-06 3.85945826e-06 1.12567961e-06 7.30726218e-06
 3.24132676e-06]
not reaching the requested tolerance 5.97536563

[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=5; total time=  20.3s
[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=5; total time=  20.2s




[CV] END learning_rate=100, n_components=2, n_iter=500, perplexity=50; total time=   5.0s




[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=30; total time=   6.7s




[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=30; total time=   6.9s




[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=30; total time=   6.7s




[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=30; total time=   7.1s




[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=30; total time=   6.9s




[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=50; total time=   7.3s


[2.85052622e-14 1.51689518e-06 3.20403997e-06 2.37107258e-06
 2.15250043e-06 8.62847201e-06 1.56922827e-05 1.13054545e-05
 3.01554600e-06]
not reaching the requested tolerance 5.97536563873291e-06.
Use iteration 963 instead with accuracy 
4.4560463063657635e-06.

  _, diffusion_map = lobpcg(
[6.22847559e-15 1.15471699e-06 2.28792001e-06 2.18198637e-06
 2.03987215e-06 7.88739694e-06 1.48153401e-05 2.90713341e-06
 6.83005252e-06]
not reaching the requested tolerance 5.97536563873291e-06.
  _, diffusion_map = lobpcg(
[8.59802744e-15 1.89113437e-06 1.03272385e-06 3.07018864e-06
 2.76353508e-06 2.03517298e-06 1.78061541e-06 4.00187542e-06
 1.19607978e-05]
not reaching the requested tolerance 5.97536563873291e-06.
Use iteration 1795 instead with accuracy 
2.606007928325288e-06.

  _, diffusion_map = lobpcg(
[5.64463674e-15 1.85733712e-06 1.11699534e-06 2.71244719e-06
 2.15959856e-06 2.25205726e-06 1.61356068e-06 3.15092139e-06
 8.59115349e-06]
not reaching the requested tolerance 5.975365638

[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=5; total time=  23.3s
[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=5; total time=  23.5s




[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=5; total time=   0.0s




[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=50; total time=   7.5s
[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=5; total time=  23.8s




[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=5; total time=   0.1s
[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=5; total time=  24.2s
[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=50; total time=   7.6s
[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=5; total time=   0.1s
[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=30; total time=   0.0s




[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=5; total time=  24.6s
[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=50; total time=   7.0s




[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=50; total time=   0.0s




[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=5; total time=   0.0s




[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=30; total time=   0.0s




[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=50; total time=   0.0s




[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s




[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s




[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s




[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=5; total time=   0.0s




[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=30; total time=   0.0s




[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=30; total time=   0.0s




[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=250, perplexity=50; total time=   0.0s




[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=5; total time=   0.0s




[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=30; total time=   0.0s




[CV] END learning_rate=100, n_components=2, n_iter=1000, perplexity=50; total time=   4.7s
[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=50; total time=   0.0s




[CV] END learning_rate=100, n_components=10, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s




[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s




[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s




[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=100, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=5; total time=   4.6s
[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=5; total time=   4.5s
[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=5; total time=   4.6s




[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=5; total time=   5.0s
[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=5; total time=   5.0s




[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=30; total time=   5.1s
[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=30; total time=   5.5s




[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=30; total time=   5.3s


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=30; total time=   4.7s
[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=30; total time=   4.5s




[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=50; total time=   4.8s




[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=50; total time=   4.8s




[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=50; total time=   5.2s
[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=50; total time=   4.9s




[CV] END learning_rate=200, n_components=2, n_iter=250, perplexity=50; total time=   5.1s




[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=30; total time=   5.8s




[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=30; total time=   6.2s




[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=30; total time=   6.2s




[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=30; total time=   5.5s




[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=30; total time=   6.0s




[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=50; total time=   6.0s


[1.35116366e-14 1.50010901e-06 1.69455800e-06 1.18398692e-06
 2.10505953e-06 3.09672392e-06 2.10022935e-06 3.61433040e-06
 1.07098790e-05]
not reaching the requested tolerance 5.9604644775390625e-06.
Use iteration 1930 instead with accuracy 
2.5331340493848004e-06.

  _, diffusion_map = lobpcg(
[1.14404775e-14 1.55397994e-06 1.70768983e-06 1.07658775e-06
 2.25424545e-06 2.83087585e-06 2.06227220e-06 3.90930308e-06
 7.40299152e-06]
not reaching the requested tolerance 5.9604644775390625e-06.
  _, diffusion_map = lobpcg(


[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=5; total time=  22.0s




[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=50; total time=   6.0s




[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=50; total time=   6.2s




[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=50; total time=   6.5s


[7.23571668e-15 6.75317655e-07 1.12318756e-06 2.10926586e-06
 1.34115534e-06 1.19828100e-06 4.12208416e-06 2.93713476e-06
 1.66536894e-05]
not reaching the requested tolerance 5.97536563873291e-06.
Use iteration 1854 instead with accuracy 
2.3477008932639526e-06.

  _, diffusion_map = lobpcg(
[5.15113719e-15 6.89134086e-07 1.10697987e-06 1.82151729e-06
 1.19463403e-06 1.28412114e-06 2.83274214e-06 3.05673389e-06
 9.14353781e-06]
not reaching the requested tolerance 5.97536563873291e-06.
  _, diffusion_map = lobpcg(
[9.24180253e-15 9.37456330e-07 5.71343092e-07 1.67237032e-06
 4.22766434e-06 2.14142599e-06 2.56118914e-06 3.11198831e-06
 1.61036239e-05]
not reaching the requested tolerance 5.97536563873291e-06.
Use iteration 1978 instead with accuracy 
3.213456815269296e-06.

  _, diffusion_map = lobpcg(
[5.15095818e-15 9.28123738e-07 5.72383847e-07 1.69581872e-06
 4.21870823e-06 2.14033226e-06 2.56084258e-06 3.08054929e-06
 1.37243551e-05]
not reaching the requested tolerance 5.97536563

[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=5; total time=  22.4s




[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=5; total time=  23.2s




[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=5; total time=  23.7s




[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=50; total time=   5.3s




[CV] END learning_rate=200, n_components=2, n_iter=500, perplexity=5; total time=  23.8s
[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=30; total time=   6.0s




[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=30; total time=   6.3s




[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=30; total time=   6.9s




[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=30; total time=   6.1s




[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=30; total time=   6.4s




[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=50; total time=   6.3s


[6.88517506e-15 1.60869715e-06 1.11787413e-06 1.31962878e-06
 3.08078734e-06 1.42119346e-06 1.87252936e-06 3.65574982e-06
 2.11231320e-05]
not reaching the requested tolerance 5.9604644775390625e-06.
Use iteration 1837 instead with accuracy 
2.527104423837219e-06.

  _, diffusion_map = lobpcg(
[5.46954713e-15 1.62590254e-06 1.18549612e-06 1.30197681e-06
 1.60253141e-06 1.45448793e-06 1.64113487e-06 3.80182617e-06
 1.01303512e-05]
not reaching the requested tolerance 5.9604644775390625e-06.
  _, diffusion_map = lobpcg(


[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=5; total time=  22.3s


[5.11244158e-14 1.03990488e-06 1.26005774e-06 1.26240438e-06
 6.31041919e-06 8.10115029e-07 3.21178963e-06 6.60866459e-06
 2.45839395e-05]
not reaching the requested tolerance 5.97536563873291e-06.
Use iteration 1897 instead with accuracy 
2.5304130379160332e-06.

  _, diffusion_map = lobpcg(
[1.22735188e-14 8.45719899e-07 1.17618683e-06 1.30407178e-06
 2.61744408e-06 9.05179340e-07 2.09730280e-06 2.94971788e-06
 1.08780449e-05]
not reaching the requested tolerance 5.97536563873291e-06.
  _, diffusion_map = lobpcg(


[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=5; total time=  23.3s




[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=5; total time=   0.1s
[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=5; total time=   0.0s


[1.08216353e-14 8.52783320e-07 1.31454782e-06 1.09304554e-06
 3.40232322e-06 1.49782719e-06 3.01996749e-06 1.98603365e-06
 7.13887172e-06]
not reaching the requested tolerance 5.97536563873291e-06.
Use iteration 1965 instead with accuracy 
2.1551289178885326e-06.

  _, diffusion_map = lobpcg(
[1.05528292e-14 8.65775842e-07 1.26123667e-06 1.07930997e-06
 3.40876364e-06 1.50571289e-06 2.99667570e-06 1.98689333e-06
 6.29177253e-06]
not reaching the requested tolerance 5.97536563873291e-06.
  _, diffusion_map = lobpcg(


[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=5; total time=   0.1s
[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=5; total time=   0.0s




[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=50; total time=   8.2s
[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=5; total time=   0.0s




[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=30; total time=   0.1s
[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=30; total time=   0.1s
[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=30; total time=   0.1s
[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=30; total time=   0.1s
[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=5; total time=  24.3s




[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=50; total time=   0.1s




[CV] END learning_rate=200, n_components=5, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=50; total time=   8.7s




[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=5; total time=   0.0s




[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=5; total time=  23.6s




[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=5; total time=   0.1s
[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=30; total time=   0.1s




[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=30; total time=   0.1s
[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=5; total time=  24.1s




[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=50; total time=   9.4s
[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=50; total time=   0.0s




[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=50; total time=   0.0s




[CV] END learning_rate=200, n_components=5, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=5; total time=   0.1s
[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s




[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s




[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s




[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=5, n_iter=1000, perplexity=50; total time=   0.0s




[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=2, n_iter=1000, perplexity=50; total time=   8.1s
[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=5; total time=   0.0s




[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=30; total time=   0.0s




[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=50; total time=   0.0s




[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=250, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=5; total time=   0.0s




[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=5; total time=   0.0s




[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=30; total time=   0.0s




[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=50; total time=   0.0s




[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=500, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s




[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=5; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s




[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=30; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s




[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s
[CV] END learning_rate=200, n_components=10, n_iter=1000, perplexity=50; total time=   0.0s


270 fits failed out of a total of 405.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/harrisonma/opt/anaconda3/envs/ai_cellchat/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 864, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "/var/folders/dr/_r18syxx1yd9jgt5prvtlwcr0000gn/T/ipykernel_36683/564412395.py", line 44, in fit
  File "/Users/harrisonma/opt/anaconda3/envs/ai_cellchat/lib/python3.11/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/harrisonma/opt/anaconda3/envs/ai_cel

Best params: {'learning_rate': 100, 'n_components': 2, 'n_iter': 250, 'perplexity': 30}
Best silhouette: 0.6320655107498169


In [55]:
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from dtne import *
import leidenalg
import math
import igraph as ig
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances


# 1) Define the sklearn‐compatible wrapper.
class DTNEClusteringEvaluator(BaseEstimator):
    def __init__(
        self,
        n_neighbors=15,
        include_self=True,
        delta=1.0,
        alpha=1.0,
        epsilon=1e-2,
        beta=0.1,
        kernel='box',
        solver='mds',
        random_state=0
    ):
        self.n_neighbors   = n_neighbors
        self.include_self  = include_self
        self.delta         = delta
        self.alpha         = alpha
        self.epsilon       = epsilon
        self.beta          = beta
        self.kernel        = kernel
        self.solver        = solver
        self.random_state  = random_state
        self.embedding_    = None

    def fit(self, X, y=None):
        # instantiate your DTNE operator with current hyperparameters
        dtne = DTNE(
            n_neighbors=self.n_neighbors,
            include_self=self.include_self,
            delta=self.delta,
            alpha=self.alpha,
            epsilon=self.epsilon,
            beta=self.beta,
            kernel=self.kernel,
            solver=self.solver,
            random_state=self.random_state
        )
        # compute embedding once
        self.embedding_ = dtne.fit_transform(X)
        return self

    def transform(self, X):
        # scikit‐learn requires a transform method, but DTNE is non‐incremental,
        # so we just return the embedding computed in fit()
        return self.embedding_

    def score(self, X, y=None):
        X_emb = self.embedding_
        scores = []
        clustering_methods = [
            KMeans(random_state=self.random_state),
            AgglomerativeClustering(),
            SpectralClustering(random_state=self.random_state),
            DBSCAN(),
            GaussianMixture(random_state=self.random_state),
            leidenalg.find_partition(ig.Graph(), leidenalg.ModularityVertexPartition),
        ]
        for algo in clustering_methods:
            try:
                if hasattr(algo, 'fit_predict'):
                    labels = algo.fit_predict(X_emb)
                else:
                    algo.fit(X_emb)
                    labels = algo.predict(X_emb)

                # handle DBSCAN noise
                if isinstance(algo, DBSCAN):
                    mask = labels != -1
                    if len(np.unique(labels[mask])) < 2:
                        continue
                    sc = silhouette_score(X_emb[mask], labels[mask])
                else:
                    if len(np.unique(labels)) < 2:
                        continue
                    sc = silhouette_score(X_emb, labels)
                scores.append(sc)
            except Exception:
                continue

        return np.mean(scores) if scores else -1.0

# 2) Choose a param_grid.
param_grid = {
    'n_neighbors':   [5, 10, 15],
    #'include_self':  [True, False],
    'delta':         [0.5, 1.0, 2.0],
    'alpha':         [0.5, 1.0, 2.0],
    'epsilon':       [1e-3, 1e-2, 1e-1],
    'beta':          [0.01, 0.1, 1.0],
    # 'kernel':        ['box', 'gaussian'],
    #'solver':        ['mds', 'sgd', 'umap'],
    # leave random_state fixed for reproducibility
}

# 3) Run GridSearchCV.
grid = GridSearchCV(
    estimator=DTNEClusteringEvaluator(),
    param_grid=param_grid,
    cv=5,           # 5‐fold cross‐validation
    n_jobs=-1,      # use all cores
    verbose=1
)

# assuming `np_log` is your data matrix:
grid.fit(np_log)

print("Best parameters:", grid.best_params_)
print("Best silhouette score:", grid.best_score_)


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters: {'alpha': 0.5, 'beta': 1.0, 'delta': 0.5, 'epsilon': 0.001, 'n_neighbors': 5}
Best silhouette score: 0.389358870846531


UMAP: {'min_dist': 0.1, 'n_components': 2, 'n_neighbors': 10}  
DTNE: {'alpha': 0.5, 'beta': 1.0, 'delta': 0.5, 'epsilon': 0.001, 'kernel': 'box', 'n_neighbors': 5}  
T-SNE: learning_rate': 100, 'n_components': 2, 'n_iter': 250, 'perplexity': 30

# Run UMAP, DTNE, and T-SNE based on the above parameters

In [70]:
X = np_log

dim_red_umap = UMAP(n_neighbors=10, min_dist=0.1, n_components=2,).fit_transform(X)
dim_red_dtne = DTNE(n_neighbors=5, delta=0.5, alpha=0.5, epsilon=0.001, beta=1.0).fit_transform(X)
dim_red_umap = pd.DataFrame(dim_red_umap)
dim_red_dtne = pd.DataFrame(dim_red_dtne)
dim_red_umap.index = df_log['sample_id']
dim_red_dtne.index = df_log['sample_id']
dim_red_umap.to_csv('/Users/harrisonma/Documents/PhD Classes/2025 Spring/BNFO 285/BNFO285_Projects/project_1/Harrison/dim_red_umap.txt', sep = '\t')
dim_red_dtne.to_csv('/Users/harrisonma/Documents/PhD Classes/2025 Spring/BNFO 285/BNFO285_Projects/project_1/Harrison/dim_red_dtne.txt', sep = '\t')




In [76]:
df_dim_umap = pd.DataFrame(dim_red_umap, columns=['UMAP1', 'UMAP2'])
df_dim_dtne = pd.DataFrame(dim_red_dtne, columns=['DTNE1', 'DTNE2'])


In [78]:
clustering_methods = [
            KMeans(),
            AgglomerativeClustering(),
            SpectralClustering(),
            DBSCAN(),
            GaussianMixture(),
            leidenalg.find_partition(ig.Graph(), leidenalg.ModularityVertexPartition),
        ]
# run each of the clustering methods, write the results to a column in df_log
for method in clustering_methods:
    if hasattr(method, 'fit_predict'):
        labels = method.fit_predict(dim_red_umap)
    elif hasattr(method, 'fit'):
        method.fit(dim_red_umap)
        labels = method.predict(dim_red_umap)
    else:
        g, partition = leiden_cluster(dim_red_umap)
        labels = np.array(partition.membership)

    # Add the labels to the DataFrame
    df_log[f'umap {method}'] = labels


In [80]:
clustering_methods = [
            KMeans(),
            AgglomerativeClustering(),
            SpectralClustering(),
            DBSCAN(),
            GaussianMixture(),
            leidenalg.find_partition(ig.Graph(), leidenalg.ModularityVertexPartition),
        ]
# run each of the clustering methods, write the results to a column in df_log
for method in clustering_methods:
    if hasattr(method, 'fit_predict'):
        labels = method.fit_predict(dim_red_dtne)
    elif hasattr(method, 'fit'):
        method.fit(dim_red_dtne)
        labels = method.predict(dim_red_dtne)
    else:
        g, partition = leiden_cluster(dim_red_dtne)
        labels = np.array(partition.membership)

    # Add the labels to the DataFrame
    df_log[f'umap {method}'] = labels


In [65]:
df_log_old = pd.read_csv('/Users/harrisonma/Documents/PhD Classes/2025 Spring/BNFO 285/BNFO285_Projects/Data/processed/TCGA.HNSC.expression_log_tumor_top2000.txt', sep = '\t')
df_log['sample_id'] = df_log_old['sample_id']
df_log['patient_id'] = df_log_old['patient_id']


In [66]:
df_log.to_csv('/Users/harrisonma/Documents/PhD Classes/2025 Spring/BNFO 285/BNFO285_Projects/project_1/Harrison/Harrison_Local_Data/TCGA.HNSC.expression_log_tumor_top2000_clustering_results.txt', sep = '\t', index = False)


In [71]:
# do DEG analysis using leiden clusters, one versus all. The genes are the column names to df_log
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import numpy as np


def run_DEG_analysis(df, cluster_col, alpha=0.05):
    """
    Run differential expression analysis for each cluster against all other clusters.
    
    Parameters:
    -----------
    df : DataFrame
        The input DataFrame containing gene expression data and cluster labels.
    cluster_col : str
        The name of the column containing cluster labels.
    alpha : float, default=0.05
        Significance level for multiple testing correction.
        
    Returns:
    --------
    results : DataFrame
        A DataFrame containing the results of the differential expression analysis.
    """
    
    # Get unique clusters
    clusters = df[cluster_col].unique()
    
    # Initialize a list to store results
    results = []
    
    # Loop through each cluster
    for cluster in clusters:
        # Get the genes in the current cluster
        genes_in_cluster = df[df[cluster_col] == cluster].drop(columns=[cluster_col])
        
        # Get the genes in all other clusters
        genes_in_others = df[df[cluster_col] != cluster].drop(columns=[cluster_col])
        
        # Perform t-test for each gene
        for gene in genes_in_cluster.columns:
            # Calculate means for log2fold change
            mean_cluster = np.mean(genes_in_cluster[gene])
            mean_others = np.mean(genes_in_others[gene])
            log2fc = mean_cluster - mean_others  # Data is already log-transformed
            
            # Perform t-test
            t_stat, p_val = ttest_ind(genes_in_cluster[gene], genes_in_others[gene], equal_var=False)
            
            results.append({
                'gene': gene, 
                'cluster': cluster, 
                't_stat': t_stat, 
                'p_val': p_val,
                'log2fc': log2fc,
                'mean_cluster': mean_cluster,
                'mean_others': mean_others
            })
    
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    
    # Adjust p-values for multiple testing using Benjamini-Hochberg method
    _, corrected_p_vals, _, _ = multipletests(results_df['p_val'], alpha=alpha, method='fdr_bh')
    
    # Add corrected p-values to results DataFrame
    results_df['corrected_p_val'] = corrected_p_vals
    
    return results_df


In [None]:
df_labels_dtne = pd.read_csv('/Users/harrisonma/Documents/PhD Classes/2025 Spring/BNFO 285/BNFO285_Projects/project_1/Harrison/dim_red/DTNE_clustering_labels_df.tsv', sep = '\t')
df_labels_umap = pd.read_csv('/Users/harrisonma/Documents/PhD Classes/2025 Spring/BNFO 285/BNFO285_Projects/project_1/Harrison/dim_red/UMAP_clustering_labels_df.tsv', sep = '\t')
df_labels_pca = pd.read_csv('/Users/harrisonma/Documents/PhD Classes/2025 Spring/BNFO 285/BNFO285_Projects/project_1/Harrison/DEGs/2k_PCA_clustering_labels_df.tsv', sep = '\t')


In [None]:
# take all columns starting with 'umap' and 'dtne' and store them in a list
umap_cols = [col for col in df_log.columns if col.startswith('umap')]
dtne_cols = [col for col in df_log.columns if col.startswith('dtne')]
df_cluster = df_log[umap_cols + dtne_cols]
df_log.drop(columns=umap_cols, inplace=True)
df_log.drop(columns=dtne_cols, inplace=True)


In [None]:
df_labels_pca


Unnamed: 0,sample_id,kmeans_cluster_label,agg_cluster_label,gmm_cluster_label,spectral_cluster_label,dbscan_cluster_label,leiden_cluster_label
0,TCGA-4P-AA8J-01A-11R-A39I-07,1,2,1,0,-1,3
1,TCGA-BA-4074-01A-01R-1436-07,3,0,1,1,-1,2
2,TCGA-BA-4075-01A-01R-1436-07,3,0,1,1,-1,2
3,TCGA-BA-4076-01A-01R-1436-07,0,1,1,0,0,1
4,TCGA-BA-4077-01B-01R-1436-07,0,0,1,1,0,4
...,...,...,...,...,...,...,...
496,TCGA-UF-A7JT-01A-11R-A34R-07,0,0,1,1,0,2
497,TCGA-UF-A7JV-01A-11R-A34R-07,3,0,1,1,0,2
498,TCGA-UP-A6WW-01A-12R-A34R-07,4,2,0,4,0,4
499,TCGA-WA-A7GZ-01A-11R-A34R-07,2,0,1,0,0,0


In [107]:
df_log['cluster'] = df_labels_pca['kmeans_cluster_label']


In [108]:
res = run_DEG_analysis(df_log, 'cluster', alpha=0.05) 
res.head() 


Unnamed: 0,gene,cluster,t_stat,p_val,log2fc,mean_cluster,mean_others,corrected_p_val
0,KRT14|3861,1,3.29418,0.001142,0.733997,18.125472,17.391474,0.002299
1,KRT13|3860,1,-1.03152,0.303986,-0.454128,12.221956,12.676084,0.367355
2,KRT6A|3853,1,2.639719,0.008855,0.397693,18.089158,17.691465,0.015222
3,KRT16|3868,1,3.191031,0.001634,0.716902,16.773139,16.056237,0.003198
4,SMR3B|10879,1,0.01282,0.989787,0.00184,0.407145,0.405305,0.991473


In [109]:
# filter corrected_p_val < 0.05 

res_pca = res[res['corrected_p_val'] < 0.05]
res_pca = res_pca[res_pca['log2fc'] > 1]
# split gene by | and the first part
res_pca['gene'] = res_pca['gene'].str.split('|').str[1]
res_pca


Unnamed: 0,gene,cluster,t_stat,p_val,log2fc,mean_cluster,mean_others,corrected_p_val
8,1277,1,10.785488,4.907863e-21,1.844462,17.267604,15.423141,1.085810e-19
16,1281,1,11.640505,1.559151e-23,1.999929,16.662623,14.662693,4.585738e-22
18,2335,1,10.947164,7.673115e-22,2.004439,15.904636,13.900196,1.894596e-20
19,1278,1,11.406428,6.633849e-23,1.887532,16.449347,14.561815,1.858221e-21
27,4620,1,20.663137,1.173114e-46,8.388799,11.760768,3.371969,1.988330e-44
...,...,...,...,...,...,...,...,...
9847,131177,4,3.353185,1.149335e-03,1.122560,6.677189,5.554629,2.311150e-03
9862,83699,4,6.053656,2.883652e-08,1.221188,8.447160,7.225971,1.228131e-07
9895,21,4,10.222007,6.776102e-16,3.199642,9.638605,6.438963,8.095701e-15
9961,3965,4,11.208469,1.193069e-18,1.711203,11.136147,9.424944,1.975279e-17


In [103]:
# filter corrected_p_val < 0.05 

res_dtne = res[res['corrected_p_val'] < 0.05]
res_dtne = res_dtne[res_dtne['log2fc'] > 1]
# split gene by | and the first part
res_dtne['gene'] = res_dtne['gene'].str.split('|').str[1]
res_dtne


Unnamed: 0,gene,cluster,t_stat,p_val,log2fc,mean_cluster,mean_others,corrected_p_val
8,1277,1,10.252107,1.228109e-21,1.616075,16.874221,15.258145,1.779868e-20
16,1281,1,10.307481,7.450059e-22,1.658850,16.173042,14.514191,1.106444e-20
18,2335,1,11.812637,2.836673e-27,2.009791,15.648189,13.638398,7.464930e-26
19,1278,1,10.263805,1.087704e-21,1.596135,16.007867,14.411732,1.591762e-20
27,4620,1,28.534627,1.030365e-82,8.702338,10.883930,2.181592,2.060731e-80
...,...,...,...,...,...,...,...,...
5842,10723,2,9.429153,2.865549e-17,1.262506,10.685586,9.423080,2.746532e-16
5847,131177,2,4.389549,1.877163e-05,1.274084,6.683535,5.409452,4.963852e-05
5862,83699,2,7.514051,2.027418e-12,1.269658,8.364872,7.095215,1.236231e-11
5895,21,2,10.150840,1.090729e-18,2.375279,8.693819,6.318540,1.185575e-17


In [100]:
# filter corrected_p_val < 0.05 

res_umap = res[res['corrected_p_val'] < 0.05]
res_umap = res_umap[res_umap['log2fc'] > 1]
# split gene by | and the first part
res_umap['gene'] = res_umap['gene'].str.split('|').str[1]
res_umap


Unnamed: 0,gene,cluster,t_stat,p_val,log2fc,mean_cluster,mean_others,corrected_p_val
8,1277,3,7.156271,1.519563e-11,1.289257,16.774681,15.485424,1.200050e-10
16,1281,3,7.037570,3.187015e-11,1.318586,16.067182,14.748596,2.414405e-10
18,2335,3,10.734287,9.594042e-22,2.001857,15.830636,13.828779,2.091344e-20
19,1278,3,6.844242,9.395784e-11,1.233221,15.878718,14.645497,6.675512e-10
27,4620,3,16.568431,1.707420e-39,6.928350,10.337073,3.408723,1.501029e-37
...,...,...,...,...,...,...,...,...
7842,10723,2,8.707909,1.540911e-14,1.310767,10.782751,9.471984,1.674904e-13
7847,131177,2,3.785668,2.239052e-04,1.154236,6.644665,5.490429,6.424826e-04
7862,83699,2,8.397111,2.229371e-14,1.383602,8.515725,7.132123,2.377996e-13
7895,21,2,10.946957,1.734777e-19,2.752359,9.109194,6.356835,3.097816e-18


In [None]:
res_umap.to_csv('/Users/harrisonma/Documents/PhD Classes/2025 Spring/BNFO 285/BNFO285_Projects/project_1/Harrison/DEGs/res_umap_DEGs.csv')
res_dtne.to_csv('/Users/harrisonma/Documents/PhD Classes/2025 Spring/BNFO 285/BNFO285_Projects/project_1/Harrison/DEGs/res_dtne_DEGs.csv')
res_pca.to_csv('/Users/harrisonma/Documents/PhD Classes/2025 Spring/BNFO 285/BNFO285_Projects/project_1/Harrison/DEGs/res_pca_DEGs.csv')
