# Parallel Hyperparameter Optimization with Optuna & MLflow

This notebook uses **true parallelization** by running different (dataset, algorithm) combinations in parallel.

**Strategy**: Instead of parallelizing trials within one study (limited by TPE coordination),
we parallelize the 15 independent optimization tasks (3 datasets × 5 algorithms).

This is **much faster** for small datasets!

## Configuration

In [None]:
# Configuration
N_TRIALS = 2000  # Trials per (dataset, algorithm) combination
EXPERIMENT_NAME = "HPO_3_parallel"
ALGORITHMS = ['DTSCAN', 'DBSCAN', 'KMeans', 'Spectral', 'GMM', 'ASCDT', 'DTC']
N_JOBS = 6

## Imports

In [19]:
import sys
sys.path.insert(0, '../code')

import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from multiprocessing import Pool, cpu_count
from functools import partial
import time

In [20]:
import optuna
import mlflow

In [21]:
from sklearn.cluster import DBSCAN, KMeans, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In [None]:
from dtscan import DTSCAN, psr
from dtc import DTC
from ascdt import ASCDT
from cluster_sets import clusters_paper

import io
from contextlib import redirect_stdout

print("✓ All imports loaded")

✓ All imports loaded


## MLflow Setup

In [23]:
# Set MLflow tracking URI
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment(EXPERIMENT_NAME)

print(f"✓ MLflow tracking: {mlflow.get_tracking_uri()}")
print(f"✓ Experiment: {EXPERIMENT_NAME}")

2025/11/21 20:02:52 INFO mlflow.tracking.fluent: Experiment with name 'HPO_2_parallel' does not exist. Creating a new experiment.


✓ MLflow tracking: file:../mlruns
✓ Experiment: HPO_2_parallel


## Load Datasets

In [24]:
# Load datasets
S1, S2, S3 = clusters_paper()

datasets = {
    'S1': (S1[['x', 'y']].values, S1['label'].values),
    'S2': (S2[['x', 'y']].values, S2['label'].values),
    'S3': (S3[['x', 'y']].values, S3['label'].values)
}

print("✓ Datasets loaded")

✓ Datasets loaded


In [25]:
# Display dataset information
for name, (X, labels) in datasets.items():
    n_clusters = len(np.unique(labels[labels != -1]))
    print(f"{name}: {X.shape[0]} points, {n_clusters} clusters")

S1: 788 points, 7 clusters
S2: 300 points, 3 clusters
S3: 373 points, 2 clusters


## Optimizer Functions

These are standalone functions that can run in separate processes.

In [None]:
def objective_dtscan(trial, X, true_labels):
    """DTSCAN objective - optimizes for PSR score."""
    min_pts = trial.suggest_int('MinPts', 2, 20)
    area_threshold = trial.suggest_float('area_threshold', -40.0, 40.0)
    length_threshold = trial.suggest_float('length_threshold', -40.0, 40.0)
    
    try:
        dtscan = DTSCAN(MinPts=min_pts, area_threshold=area_threshold,
                       length_threshold=length_threshold)
        f = io.StringIO()
        with redirect_stdout(f):
            labels = dtscan.fit_predict(X)
        
        psr_score = psr(true_labels, labels)
        n_clusters = len(np.unique(labels[labels != -1]))
        
        trial.set_user_attr('n_clusters', n_clusters)
        return psr_score
    except:
        return 0.0

def objective_dbscan(trial, X_scaled, true_labels, eps_min, eps_max):
    """DBSCAN objective - optimizes for PSR score."""
    eps = trial.suggest_float('eps', eps_min, eps_max)
    min_samples = trial.suggest_int('min_samples', 3, 12)
    
    try:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X_scaled)
        
        psr_score = psr(true_labels, labels)
        n_clusters = len(np.unique(labels[labels != -1]))
        
        trial.set_user_attr('n_clusters', n_clusters)
        return psr_score
    except:
        return 0.0

def objective_kmeans(trial, X_scaled, true_labels, expected_clusters):
    """K-means objective - optimizes for PSR score."""
    n_clusters = trial.suggest_int('n_clusters',
                                   max(2, expected_clusters - 2),
                                   expected_clusters + 5)
    n_init = trial.suggest_categorical('n_init', [5, 10, 20, 50])
    
    try:
        kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=42)
        labels = kmeans.fit_predict(X_scaled)
        return psr(true_labels, labels)
    except:
        return 0.0

def objective_spectral(trial, X_scaled, true_labels, expected_clusters):
    """Spectral objective - optimizes for PSR score."""
    if len(X_scaled) > 1000:
        return 0.0
    
    n_clusters = trial.suggest_int('n_clusters',
                                   max(2, expected_clusters - 2),
                                   expected_clusters + 5)
    n_neighbors = trial.suggest_int('n_neighbors', 5, 30)
    
    try:
        spectral = SpectralClustering(n_clusters=n_clusters,
                                     n_neighbors=n_neighbors,
                                     affinity='nearest_neighbors',
                                     random_state=42)
        labels = spectral.fit_predict(X_scaled)
        return psr(true_labels, labels)
    except:
        return 0.0

def objective_gmm(trial, X_scaled, true_labels, expected_clusters):
    """GMM objective - optimizes for PSR score."""
    n_components = trial.suggest_int('n_components',
                                    max(2, expected_clusters - 2),
                                    expected_clusters + 5)
    covariance_type = trial.suggest_categorical('covariance_type',
                                                ['full', 'tied', 'diag', 'spherical'])
    n_init = trial.suggest_categorical('n_init', [1, 5, 10, 20])
    
    try:
        gmm = GaussianMixture(n_components=n_components,
                             covariance_type=covariance_type,
                             n_init=n_init,
                             random_state=42)
        labels = gmm.fit_predict(X_scaled)
        return psr(true_labels, labels)
    except:
        return 0.0

def objective_ascdt(trial, X, true_labels):
    """ASCDT objective - optimizes for PSR score."""
    min_cluster_size = trial.suggest_int('min_cluster_size', 2, 20)
    beta = trial.suggest_float('beta', 1.0, 1.5)
    
    try:
        ascdt = ASCDT(min_cluster_size=min_cluster_size, beta=beta)
        labels = ascdt.fit_predict(X)
        
        psr_score = psr(true_labels, labels)
        n_clusters = ascdt.n_clusters_
        
        trial.set_user_attr('n_clusters', n_clusters)
        return psr_score
    except Exception:
        return 0.0

def objective_dtc(trial, X, true_labels):
    """DTC objective - optimizes for PSR score."""
    minPts = trial.suggest_int('minPts', 2, 20)
    local_std = trial.suggest_float('local_std', 1.0, 5.0)
    
    try:
        df = pd.DataFrame(X, columns=['x', 'y'])
        
        tdbscan = DTC(data=df, minPts=minPts, local_std=local_std, kde=False)
        
        f = io.StringIO()
        with redirect_stdout(f):
            result_df = tdbscan.tri_dbscan()

        if result_df is None or 'est_clust' not in result_df.columns:
            return 0.0

        labels = result_df['est_clust'].values
        
        psr_score = psr(true_labels, labels)
        n_clusters = len(np.unique(labels[labels != 0]))
        
        trial.set_user_attr('n_clusters', n_clusters)
        return psr_score
    except Exception:
        return 0.0

print("✓ Objective functions defined")

✓ Objective functions defined


## Parallel Optimization Function

This function will be executed in parallel for each (dataset, algorithm) combination.

In [27]:
def optimize_single_task(task_params):
    """Optimize a single (dataset, algorithm) combination.
    
    This function runs in a separate process.
    """
    dataset_name, algorithm, X, true_labels, n_trials, experiment_name = task_params
    
    # Prepare data
    expected_clusters = len(np.unique(true_labels[true_labels != -1]))
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # For DBSCAN, estimate eps range
    if algorithm == 'DBSCAN':
        nbrs = NearestNeighbors(n_neighbors=6).fit(X_scaled)
        distances, _ = nbrs.kneighbors(X_scaled)
        distances_sorted = np.sort(distances[:, 5])
        eps_min, eps_max = np.percentile(distances_sorted, [10, 90])
    
    # Setup study
    study_name = f"{dataset_name}_{algorithm}"
    
    # Use in-memory storage (each process has its own study)
    study = optuna.create_study(
        study_name=study_name,
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42)
    )
    
    # Select objective function
    if algorithm == 'DTSCAN':
        objective = lambda trial: objective_dtscan(trial, X, true_labels)
    elif algorithm == 'DBSCAN':
        objective = lambda trial: objective_dbscan(trial, X_scaled, true_labels, eps_min, eps_max)
    elif algorithm == 'KMeans':
        objective = lambda trial: objective_kmeans(trial, X_scaled, true_labels, expected_clusters)
    elif algorithm == 'Spectral':
        objective = lambda trial: objective_spectral(trial, X_scaled, true_labels, expected_clusters)
    elif algorithm == 'GMM':
        objective = lambda trial: objective_gmm(trial, X_scaled, true_labels, expected_clusters)
    elif algorithm == 'ASCDT':
        objective = lambda trial: objective_ascdt(trial, X, true_labels)
    elif algorithm == 'DTC':
        objective = lambda trial: objective_dtc(trial, X, true_labels)
    else:
        raise ValueError(f"Unknown algorithm: {algorithm}")
    
    # Optimize
    start_time = time.time()
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    duration = time.time() - start_time
    
    # Log to MLflow
    mlflow.set_tracking_uri("file:../mlruns")
    mlflow.set_experiment(experiment_name)
    
    with mlflow.start_run(run_name=study_name):
        mlflow.log_param("dataset", dataset_name)
        mlflow.log_param("algorithm", algorithm)
        mlflow.log_param("n_samples", len(X))
        mlflow.log_param("expected_clusters", expected_clusters)
        mlflow.log_param("n_trials", n_trials)
        mlflow.log_params(study.best_params)
        mlflow.log_metric("best_psr_score", study.best_value)
        mlflow.log_metric("optimization_time_seconds", duration)
        
        if 'n_clusters' in study.best_trial.user_attrs:
            mlflow.log_metric("best_n_clusters",
                            study.best_trial.user_attrs['n_clusters'])
    
    # Return results
    result = {
        'dataset': dataset_name,
        'algorithm': algorithm,
        'best_params': study.best_params,
        'best_psr': study.best_value,
        'expected_clusters': expected_clusters,
        'n_clusters': study.best_trial.user_attrs.get('n_clusters', None),
        'duration': duration
    }
    
    print(f"✓ {study_name}: PSR={study.best_value:.4f} ({duration:.1f}s)")
    
    return result

print("✓ Parallel optimization function defined")

✓ Parallel optimization function defined


## Run Parallel Optimization

This creates all tasks and runs them in parallel using multiprocessing.

In [28]:
# Check available CPU cores
n_cores = cpu_count()
n_jobs = n_cores if N_JOBS == -1 else N_JOBS

print(f"CPU cores available: {n_cores}")
print(f"Will use: {n_jobs} parallel processes")
print(f"Total tasks: {len(datasets)} datasets × {len(ALGORITHMS)} algorithms = {len(datasets) * len(ALGORITHMS)} tasks")

CPU cores available: 8
Will use: 8 parallel processes
Total tasks: 3 datasets × 5 algorithms = 15 tasks


In [29]:
# Create all tasks
tasks = []
for dataset_name, (X, labels) in datasets.items():
    for algorithm in ALGORITHMS:
        task = (dataset_name, algorithm, X, labels, N_TRIALS, EXPERIMENT_NAME)
        tasks.append(task)

print(f"✓ Created {len(tasks)} optimization tasks")

✓ Created 15 optimization tasks


In [30]:
# Run all tasks in parallel
print("\n" + "="*80)
print("STARTING PARALLEL OPTIMIZATION")
print("="*80)
print(f"Datasets: {list(datasets.keys())}")
print(f"Algorithms: {ALGORITHMS}")
print(f"Trials per task: {N_TRIALS}")
print(f"Parallel processes: {n_jobs}")
print("="*80 + "\n")

start_time = time.time()

if n_jobs == 1:
    # Sequential execution (for debugging)
    results = [optimize_single_task(task) for task in tasks]
else:
    # Parallel execution
    with Pool(processes=n_jobs) as pool:
        results = pool.map(optimize_single_task, tasks)

total_time = time.time() - start_time

print("\n" + "="*80)
print("PARALLEL OPTIMIZATION COMPLETE")
print("="*80)
print(f"Total time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
print(f"Average time per task: {total_time/len(tasks):.1f} seconds")
print("="*80)


STARTING PARALLEL OPTIMIZATION
Datasets: ['S1', 'S2', 'S3']
Algorithms: ['DTSCAN', 'DBSCAN', 'KMeans', 'Spectral', 'GMM']
Trials per task: 2000
Parallel processes: 8



[I 2025-11-21 20:02:52,737] A new study created in memory with name: S1_DTSCAN
[I 2025-11-21 20:02:52,739] A new study created in memory with name: S1_KMeans
[I 2025-11-21 20:02:52,742] A new study created in memory with name: S1_DBSCAN
[I 2025-11-21 20:02:52,741] A new study created in memory with name: S1_GMM
[I 2025-11-21 20:02:52,742] A new study created in memory with name: S2_DTSCAN
[I 2025-11-21 20:02:52,744] A new study created in memory with name: S1_Spectral
[I 2025-11-21 20:02:52,750] Trial 0 finished with value: 0.07563025210084033 and parameters: {'eps': 0.11473042557039725, 'min_samples': 12}. Best is trial 0 with value: 0.07563025210084033.
[I 2025-11-21 20:02:52,754] Trial 1 finished with value: 0.8110238967381825 and parameters: {'eps': 0.13406283036564298, 'min_samples': 8}. Best is trial 1 with value: 0.8110238967381825.
[I 2025-11-21 20:02:52,751] A new study created in memory with name: S2_DBSCAN
[I 2025-11-21 20:02:52,748] A new study created in memory with name: 

✓ S2_DBSCAN: PSR=0.6830 (48.8s)

[I 2025-11-21 20:03:41,923] Trial 635 finished with value: 0.6866078591749505 and parameters: {'MinPts': 6, 'area_threshold': -19.064295932369077, 'length_threshold': -1.064617332945592}. Best is trial 494 with value: 0.9904115492350787.





[I 2025-11-21 20:03:41,930] A new study created in memory with name: S2_Spectral
[I 2025-11-21 20:03:41,937] Trial 1886 finished with value: 0.9648361908866112 and parameters: {'eps': 0.1232725170278983, 'min_samples': 6}. Best is trial 41 with value: 0.9801979840977965.
[I 2025-11-21 20:03:41,938] Trial 1153 finished with value: 0.0 and parameters: {'MinPts': 16, 'area_threshold': -35.06851633538041, 'length_threshold': 0.2080461921636161}. Best is trial 956 with value: 0.7345604717823925.
[I 2025-11-21 20:03:41,964] Trial 1154 finished with value: 0.0 and parameters: {'MinPts': 3, 'area_threshold': -38.50357588039628, 'length_threshold': 4.22696474798625}. Best is trial 956 with value: 0.7345604717823925.
[I 2025-11-21 20:03:41,959] Trial 1887 finished with value: 0.9739321469190116 and parameters: {'eps': 0.1402763098554561, 'min_samples': 7}. Best is trial 41 with value: 0.9801979840977965.
[I 2025-11-21 20:03:41,981] Trial 1888 finished with value: 0.9793576479633427 and parameter

✓ S1_DBSCAN: PSR=0.9802 (53.5s)

[I 2025-11-21 20:03:46,563] Trial 678 finished with value: 0.38218056453350574 and parameters: {'MinPts': 7, 'area_threshold': -16.154883467188665, 'length_threshold': -1.932966228667083}. Best is trial 494 with value: 0.9904115492350787.





[I 2025-11-21 20:03:46,571] Trial 1230 finished with value: 0.0 and parameters: {'MinPts': 8, 'area_threshold': -39.99588459773342, 'length_threshold': 5.996682888493353}. Best is trial 956 with value: 0.7345604717823925.
[I 2025-11-21 20:03:46,588] A new study created in memory with name: S2_GMM
[I 2025-11-21 20:03:46,628] Trial 1231 finished with value: 0.332991452991453 and parameters: {'MinPts': 3, 'area_threshold': -37.155763556700556, 'length_threshold': -1.4726840705596964}. Best is trial 956 with value: 0.7345604717823925.
[I 2025-11-21 20:03:46,641] Trial 679 finished with value: 0.0 and parameters: {'MinPts': 6, 'area_threshold': -22.357095017615624, 'length_threshold': 1.001884336640656}. Best is trial 494 with value: 0.9904115492350787.
[I 2025-11-21 20:03:46,659] Trial 1232 finished with value: 0.0 and parameters: {'MinPts': 20, 'area_threshold': -34.61492103125514, 'length_threshold': -5.7617974174955116}. Best is trial 956 with value: 0.7345604717823925.
[I 2025-11-21 20

✓ S2_DTSCAN: PSR=0.7346 (105.9s)

[I 2025-11-21 20:04:38,898] Trial 51 finished with value: 0.7278941728559287 and parameters: {'n_components': 8, 'covariance_type': 'diag', 'n_init': 5}. Best is trial 10 with value: 0.7278941728559287.





[I 2025-11-21 20:04:38,923] A new study created in memory with name: S3_DTSCAN
[I 2025-11-21 20:04:38,937] Trial 81 finished with value: 0.5987179487179487 and parameters: {'n_clusters': 3, 'n_init': 10}. Best is trial 1 with value: 0.5987179487179487.
[I 2025-11-21 20:04:38,970] Trial 0 finished with value: 0.0 and parameters: {'MinPts': 9, 'area_threshold': 36.057144512793286, 'length_threshold': 18.559515344912406}. Best is trial 0 with value: 0.0.
[I 2025-11-21 20:04:38,976] Trial 1184 finished with value: 0.9896096900298581 and parameters: {'MinPts': 5, 'area_threshold': -22.568832374624616, 'length_threshold': -0.25920480680453817}. Best is trial 978 with value: 0.9917751716071044.
[I 2025-11-21 20:04:39,017] Trial 1 finished with value: 0.03654873164218958 and parameters: {'MinPts': 13, 'area_threshold': -27.51850876460508, 'length_threshold': -27.520438373103786}. Best is trial 1 with value: 0.03654873164218958.
[I 2025-11-21 20:04:39,019] Trial 69 finished with value: 0.555851

✓ S1_DTSCAN: PSR=0.9918 (201.6s)


[I 2025-11-21 20:06:14,744] Trial 152 finished with value: 0.7278941728559287 and parameters: {'n_components': 8, 'covariance_type': 'diag', 'n_init': 5}. Best is trial 10 with value: 0.7278941728559287.
[I 2025-11-21 20:06:14,763] A new study created in memory with name: S3_DBSCAN
[I 2025-11-21 20:06:14,779] Trial 0 finished with value: 0.08152173913043478 and parameters: {'eps': 0.16131169161587927, 'min_samples': 12}. Best is trial 0 with value: 0.08152173913043478.
[I 2025-11-21 20:06:14,795] Trial 1 finished with value: 0.5721649484536082 and parameters: {'eps': 0.22829192232597995, 'min_samples': 8}. Best is trial 1 with value: 0.5721649484536082.
[I 2025-11-21 20:06:14,810] Trial 1570 finished with value: 0.0 and parameters: {'MinPts': 4, 'area_threshold': -10.140075670951054, 'length_threshold': 5.809740262248675}. Best is trial 1242 with value: 0.9930863649807748.
[I 2025-11-21 20:06:14,819] Trial 2 finished with value: 0.2870536381293889 and parameters: {'eps': 0.120364810932

✓ S3_DTSCAN: PSR=0.9931 (129.5s)

[I 2025-11-21 20:06:48,628] Trial 1354 finished with value: 0.8505154639175257 and parameters: {'eps': 0.2695018663236479, 'min_samples': 3}. Best is trial 49 with value: 0.8505154639175257.





[I 2025-11-21 20:06:48,659] A new study created in memory with name: S3_KMeans
[I 2025-11-21 20:06:48,657] Trial 1355 finished with value: 0.5111310324219334 and parameters: {'eps': 0.1486674966845939, 'min_samples': 3}. Best is trial 49 with value: 0.8505154639175257.
[I 2025-11-21 20:06:48,716] Trial 1356 finished with value: 0.8505154639175257 and parameters: {'eps': 0.2737635340553593, 'min_samples': 3}. Best is trial 49 with value: 0.8505154639175257.
[I 2025-11-21 20:06:48,745] Trial 1357 finished with value: 0.8350515463917525 and parameters: {'eps': 0.2612344648126904, 'min_samples': 3}. Best is trial 49 with value: 0.8505154639175257.
[I 2025-11-21 20:06:48,806] Trial 1358 finished with value: 0.8092783505154639 and parameters: {'eps': 0.2666252942688203, 'min_samples': 4}. Best is trial 49 with value: 0.8505154639175257.
[I 2025-11-21 20:06:48,862] Trial 1359 finished with value: 0.8505154639175257 and parameters: {'eps': 0.270189382220753, 'min_samples': 3}. Best is trial 49

✓ S3_DBSCAN: PSR=0.8505 (58.7s)


[I 2025-11-21 20:07:13,776] A new study created in memory with name: S3_Spectral
[I 2025-11-21 20:07:14,232] Trial 212 finished with value: 0.7278941728559287 and parameters: {'n_components': 8, 'covariance_type': 'diag', 'n_init': 5}. Best is trial 10 with value: 0.7278941728559287.
[I 2025-11-21 20:07:14,475] Trial 121 finished with value: 0.9929092174584547 and parameters: {'n_components': 7, 'covariance_type': 'spherical', 'n_init': 5}. Best is trial 7 with value: 0.9929092174584547.
[I 2025-11-21 20:07:14,484] Trial 170 finished with value: 0.9885310559374361 and parameters: {'n_clusters': 7, 'n_neighbors': 19}. Best is trial 147 with value: 0.9885310559374361.
[I 2025-11-21 20:07:14,734] Trial 261 finished with value: 0.6380399342259807 and parameters: {'n_clusters': 3, 'n_neighbors': 7}. Best is trial 18 with value: 0.6380399342259807.
[I 2025-11-21 20:07:14,904] Trial 0 finished with value: 0.5700335807705903 and parameters: {'n_clusters': 4, 'n_neighbors': 29}. Best is trial 0

✓ S2_Spectral: PSR=0.6594 (2175.1s)


[I 2025-11-21 20:39:57,339] A new study created in memory with name: S3_GMM
[I 2025-11-21 20:39:57,517] Trial 1199 finished with value: 0.750826790537982 and parameters: {'n_clusters': 2, 'n_init': 5}. Best is trial 1 with value: 0.750826790537982.
[I 2025-11-21 20:39:57,578] Trial 1853 finished with value: 1.0 and parameters: {'n_clusters': 2, 'n_neighbors': 8}. Best is trial 2 with value: 1.0.
[I 2025-11-21 20:39:57,745] Trial 1120 finished with value: 0.6980221945679544 and parameters: {'n_clusters': 7, 'n_neighbors': 18}. Best is trial 147 with value: 0.9885310559374361.
[I 2025-11-21 20:39:57,877] Trial 1026 finished with value: 0.5987179487179487 and parameters: {'n_clusters': 3, 'n_init': 10}. Best is trial 1 with value: 0.5987179487179487.
[I 2025-11-21 20:39:57,976] Trial 1200 finished with value: 0.750826790537982 and parameters: {'n_clusters': 2, 'n_init': 5}. Best is trial 1 with value: 0.750826790537982.
[I 2025-11-21 20:39:58,433] Trial 1854 finished with value: 1.0 and p

✓ S3_Spectral: PSR=1.0000 (2114.8s)


[I 2025-11-21 20:42:28,860] Trial 1192 finished with value: 0.9885310559374361 and parameters: {'n_clusters': 7, 'n_neighbors': 19}. Best is trial 147 with value: 0.9885310559374361.
[I 2025-11-21 20:42:28,967] Trial 1960 finished with value: 0.7278941728559287 and parameters: {'n_components': 8, 'covariance_type': 'diag', 'n_init': 5}. Best is trial 10 with value: 0.7278941728559287.
[I 2025-11-21 20:42:29,070] Trial 107 finished with value: 0.7907528483544374 and parameters: {'n_components': 2, 'covariance_type': 'tied', 'n_init': 5}. Best is trial 19 with value: 0.7953180604982206.
[I 2025-11-21 20:42:29,684] Trial 1298 finished with value: 0.750826790537982 and parameters: {'n_clusters': 2, 'n_init': 20}. Best is trial 1 with value: 0.750826790537982.
[I 2025-11-21 20:42:30,025] Trial 1299 finished with value: 0.750826790537982 and parameters: {'n_clusters': 2, 'n_init': 5}. Best is trial 1 with value: 0.750826790537982.
[I 2025-11-21 20:42:30,088] Trial 1961 finished with value: 0

✓ S2_GMM: PSR=0.7279 (2364.3s)


[I 2025-11-21 20:43:11,550] Trial 1331 finished with value: 0.750826790537982 and parameters: {'n_clusters': 2, 'n_init': 20}. Best is trial 1 with value: 0.750826790537982.
[I 2025-11-21 20:43:11,960] Trial 1332 finished with value: 0.750826790537982 and parameters: {'n_clusters': 2, 'n_init': 5}. Best is trial 1 with value: 0.750826790537982.
[I 2025-11-21 20:43:12,548] Trial 1117 finished with value: 0.5987179487179487 and parameters: {'n_clusters': 3, 'n_init': 20}. Best is trial 1 with value: 0.5987179487179487.
[I 2025-11-21 20:43:12,609] Trial 1333 finished with value: 0.750826790537982 and parameters: {'n_clusters': 2, 'n_init': 5}. Best is trial 1 with value: 0.750826790537982.
[I 2025-11-21 20:43:12,711] Trial 156 finished with value: 0.7907528483544374 and parameters: {'n_components': 2, 'covariance_type': 'tied', 'n_init': 20}. Best is trial 19 with value: 0.7953180604982206.
[I 2025-11-21 20:43:12,791] Trial 1218 finished with value: 0.6980221945679544 and parameters: {'n_

✓ S3_KMeans: PSR=0.7508 (2869.0s)


[I 2025-11-21 20:54:38,145] Trial 1145 finished with value: 0.4192439862542955 and parameters: {'n_components': 7, 'covariance_type': 'diag', 'n_init': 5}. Best is trial 19 with value: 0.7953180604982206.
[I 2025-11-21 20:54:38,228] Trial 1146 finished with value: 0.7907528483544374 and parameters: {'n_components': 2, 'covariance_type': 'diag', 'n_init': 1}. Best is trial 19 with value: 0.7953180604982206.
[I 2025-11-21 20:54:38,272] Trial 224 finished with value: 0.7560104309282243 and parameters: {'n_clusters': 7, 'n_init': 50}. Best is trial 19 with value: 0.7560104309282243.
[I 2025-11-21 20:54:38,453] Trial 1379 finished with value: 0.9929092174584547 and parameters: {'n_components': 7, 'covariance_type': 'spherical', 'n_init': 5}. Best is trial 7 with value: 0.9929092174584547.
[I 2025-11-21 20:54:38,549] Trial 1668 finished with value: 0.9860718646432932 and parameters: {'n_clusters': 7, 'n_neighbors': 20}. Best is trial 147 with value: 0.9885310559374361.
[I 2025-11-21 20:54:38

✓ S1_Spectral: PSR=0.9885 (3508.3s)

[I 2025-11-21 21:01:21,308] Trial 1974 finished with value: 0.5987179487179487 and parameters: {'n_clusters': 3, 'n_init': 10}. Best is trial 1 with value: 0.5987179487179487.





[I 2025-11-21 21:01:21,376] Trial 1920 finished with value: 0.7953180604982206 and parameters: {'n_components': 2, 'covariance_type': 'diag', 'n_init': 5}. Best is trial 19 with value: 0.7953180604982206.
[I 2025-11-21 21:01:21,486] Trial 1660 finished with value: 0.9929092174584547 and parameters: {'n_components': 7, 'covariance_type': 'spherical', 'n_init': 5}. Best is trial 7 with value: 0.9929092174584547.
[I 2025-11-21 21:01:21,665] Trial 1661 finished with value: 0.929172953722191 and parameters: {'n_components': 8, 'covariance_type': 'spherical', 'n_init': 1}. Best is trial 7 with value: 0.9929092174584547.
[I 2025-11-21 21:01:21,754] Trial 1921 finished with value: 0.5695242001640689 and parameters: {'n_components': 3, 'covariance_type': 'diag', 'n_init': 10}. Best is trial 19 with value: 0.7953180604982206.
[I 2025-11-21 21:01:22,053] Trial 1922 finished with value: 0.7907528483544374 and parameters: {'n_components': 2, 'covariance_type': 'tied', 'n_init': 5}. Best is trial 19

✓ S2_KMeans: PSR=0.5987 (3528.9s)


[I 2025-11-21 21:01:42,046] Trial 1680 finished with value: 0.9929092174584547 and parameters: {'n_components': 7, 'covariance_type': 'spherical', 'n_init': 1}. Best is trial 7 with value: 0.9929092174584547.
[I 2025-11-21 21:01:42,489] Trial 1982 finished with value: 0.40909090909090906 and parameters: {'n_components': 2, 'covariance_type': 'full', 'n_init': 5}. Best is trial 19 with value: 0.7953180604982206.
[I 2025-11-21 21:01:42,503] Trial 1681 finished with value: 0.9929092174584547 and parameters: {'n_components': 7, 'covariance_type': 'spherical', 'n_init': 5}. Best is trial 7 with value: 0.9929092174584547.
[I 2025-11-21 21:01:42,622] Trial 1983 finished with value: 0.7953180604982206 and parameters: {'n_components': 2, 'covariance_type': 'diag', 'n_init': 5}. Best is trial 19 with value: 0.7953180604982206.
[I 2025-11-21 21:01:42,775] Trial 270 finished with value: 0.7560104309282243 and parameters: {'n_clusters': 7, 'n_init': 50}. Best is trial 19 with value: 0.7560104309282

✓ S3_GMM: PSR=0.7953 (1309.0s)


[I 2025-11-21 21:01:46,514] Trial 272 finished with value: 0.7560104309282243 and parameters: {'n_clusters': 7, 'n_init': 50}. Best is trial 19 with value: 0.7560104309282243.
[I 2025-11-21 21:01:46,727] Trial 1688 finished with value: 0.9929092174584547 and parameters: {'n_components': 7, 'covariance_type': 'spherical', 'n_init': 20}. Best is trial 7 with value: 0.9929092174584547.
[I 2025-11-21 21:01:46,946] Trial 273 finished with value: 0.7560104309282243 and parameters: {'n_clusters': 7, 'n_init': 20}. Best is trial 19 with value: 0.7560104309282243.
[I 2025-11-21 21:01:46,950] Trial 1689 finished with value: 0.9929092174584547 and parameters: {'n_components': 7, 'covariance_type': 'spherical', 'n_init': 5}. Best is trial 7 with value: 0.9929092174584547.
[I 2025-11-21 21:01:47,082] Trial 1690 finished with value: 0.9929092174584547 and parameters: {'n_components': 7, 'covariance_type': 'spherical', 'n_init': 5}. Best is trial 7 with value: 0.9929092174584547.
[I 2025-11-21 21:01:

✓ S1_GMM: PSR=0.9929 (3598.9s)


[I 2025-11-21 21:02:51,710] Trial 355 finished with value: 0.7560104309282243 and parameters: {'n_clusters': 7, 'n_init': 20}. Best is trial 19 with value: 0.7560104309282243.
[I 2025-11-21 21:02:51,986] Trial 356 finished with value: 0.7560104309282243 and parameters: {'n_clusters': 7, 'n_init': 50}. Best is trial 19 with value: 0.7560104309282243.
[I 2025-11-21 21:02:52,050] Trial 357 finished with value: 0.7560104309282243 and parameters: {'n_clusters': 7, 'n_init': 50}. Best is trial 19 with value: 0.7560104309282243.
[I 2025-11-21 21:02:52,075] Trial 358 finished with value: 0.7560104309282243 and parameters: {'n_clusters': 7, 'n_init': 10}. Best is trial 19 with value: 0.7560104309282243.
[I 2025-11-21 21:02:52,144] Trial 359 finished with value: 0.7560104309282243 and parameters: {'n_clusters': 7, 'n_init': 50}. Best is trial 19 with value: 0.7560104309282243.
[I 2025-11-21 21:02:52,207] Trial 360 finished with value: 0.7560104309282243 and parameters: {'n_clusters': 7, 'n_init'

✓ S1_KMeans: PSR=0.7560 (3677.0s)

PARALLEL OPTIMIZATION COMPLETE
Total time: 3677.2 seconds (61.3 minutes)
Average time per task: 245.1 seconds


## Organize Results

In [31]:
# Organize results by dataset and algorithm
all_results = {}
for result in results:
    dataset = result['dataset']
    algorithm = result['algorithm']
    
    if dataset not in all_results:
        all_results[dataset] = {}
    
    all_results[dataset][algorithm] = result

print("✓ Results organized by dataset and algorithm")

✓ Results organized by dataset and algorithm


## Results Summary

In [32]:
# Create results DataFrame
results_data = []
for dataset_name, algo_results in all_results.items():
    for algo_name, result in algo_results.items():
        row = {
            'Dataset': dataset_name,
            'Algorithm': algo_name,
            'PSR Score': result['best_psr'],
            'N Clusters': result.get('n_clusters', 'N/A'),
            'Expected': result['expected_clusters'],
            'Time (s)': f"{result['duration']:.1f}"
        }
        results_data.append(row)

results_df = pd.DataFrame(results_data)
results_df = results_df.sort_values(['Dataset', 'PSR Score'], ascending=[True, False])

print("\n" + "="*80)
print("RESULTS SUMMARY")
print("="*80)
print(results_df.to_string(index=False))


RESULTS SUMMARY
Dataset Algorithm  PSR Score  N Clusters  Expected Time (s)
     S1       GMM   0.992909         NaN         7   3598.9
     S1    DTSCAN   0.991775         7.0         7    201.6
     S1  Spectral   0.988531         NaN         7   3508.3
     S1    DBSCAN   0.980198         7.0         7     53.5
     S1    KMeans   0.756010         NaN         7   3677.0
     S2    DTSCAN   0.734560         6.0         3    105.9
     S2       GMM   0.727894         NaN         3   2364.3
     S2    DBSCAN   0.682973        14.0         3     48.8
     S2  Spectral   0.659432         NaN         3   2175.1
     S2    KMeans   0.598718         NaN         3   3528.9
     S3  Spectral   1.000000         NaN         2   2114.8
     S3    DTSCAN   0.993086         2.0         2    129.5
     S3    DBSCAN   0.850515         4.0         2     58.7
     S3       GMM   0.795318         NaN         2   1309.0
     S3    KMeans   0.750827         NaN         2   2869.0


In [33]:
# Find best algorithm per dataset
print("\n" + "="*80)
print("BEST ALGORITHM PER DATASET")
print("="*80)

for dataset in ['S1', 'S2', 'S3']:
    dataset_df = results_df[results_df['Dataset'] == dataset]
    if len(dataset_df) > 0:
        best = dataset_df.loc[dataset_df['PSR Score'].idxmax()]
        print(f"\n{dataset}: {best['Algorithm']} (PSR: {best['PSR Score']:.4f})")


BEST ALGORITHM PER DATASET

S1: GMM (PSR: 0.9929)

S2: DTSCAN (PSR: 0.7346)

S3: Spectral (PSR: 1.0000)


## Export Results

In [34]:
import json

# Format for JSON export
export_data = {}
for dataset_name, algo_results in all_results.items():
    export_data[dataset_name] = {}
    for algo_name, result in algo_results.items():
        export_data[dataset_name][algo_name] = {
            'best_params': result['best_params'],
            'psr_score': float(result['best_psr']),
            'n_clusters': int(result['n_clusters']) if result['n_clusters'] else None,
            'expected_clusters': int(result['expected_clusters']),
            'optimization_time_seconds': float(result['duration'])
        }

# Save to JSON
output_file = 'best_params_parallel.json'
with open(output_file, 'w') as f:
    json.dump(export_data, f, indent=2)

print(f"✓ Results exported to {output_file}")

✓ Results exported to best_params_parallel.json


## Next Steps

1. **View in MLflow UI**: `mlflow ui --backend-store-uri file:../mlruns`
2. **Load best parameters**: Use the `best_params_parallel.json` file
3. **Compare with sequential version**: Check time savings!

### Performance Notes

This parallel approach is faster because:
- **Independent tasks**: Each (dataset, algorithm) runs completely independently
- **No coordination overhead**: No shared TPE sampler state
- **Better CPU utilization**: For small datasets, trial computation is fast, so we benefit from parallelizing at the task level

Typical speedup: ~Linear with number of cores (e.g., 8 cores ≈ 8× faster)