In [7]:
import numpy as np
import time
from sklearn.datasets import make_blobs
import concurrent.futures
import os

In [8]:
# Set n_jobs to the number of available CPU cores
n_jobs = os.cpu_count()

In [9]:
# Non-parallel K-means implementation
def kmeans_non_parallel(X, k, max_iter=300):
    # Random initialization of centroids
    centroids = X[np.random.choice(X.shape[0], k, replace=False)]
    for _ in range(max_iter):
        # Assignment step
        distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
        labels = np.argmin(distances, axis=1)
        
        # Update step
        for i in range(k):
            centroids[i] = X[labels == i].mean(axis=0)
    return centroids, labels


In [10]:
# Parallel assignment function with concurrent.futures
def parallel_assignment_future(X_chunk, centroids):
    distances = np.linalg.norm(X_chunk[:, np.newaxis] - centroids, axis=2)
    return np.argmin(distances, axis=1)

# Parallel k-means using concurrent.futures
def kmeans_parallel_optimized(X, k, max_iter=300, n_jobs=None):
    centroids = X[np.random.choice(X.shape[0], k, replace=False)]
    for _ in range(max_iter):
        # Split X into chunks for parallel processing
        chunk_size = X.shape[0] // 8  # Using 8 chunks to balance the load
        chunks = [X[i:i + chunk_size] for i in range(0, X.shape[0], chunk_size)]

        # Use concurrent.futures for parallel assignment
        with concurrent.futures.ThreadPoolExecutor(max_workers=n_jobs) as executor:
            results = list(executor.map(lambda chunk: parallel_assignment_future(chunk, centroids), chunks))
        
        # Combine the results
        labels = np.concatenate(results)
        
        # Update centroids
        for i in range(k):
            centroids[i] = X[labels == i].mean(axis=0)
    
    return centroids, labels



In [11]:
# Generate synthetic data for benchmarking
def generate_synthetic_data(n_samples=70000, n_features=10, n_clusters=3):
    X, _ = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters, cluster_std=1.0, random_state=42)
    return X

# Now set n_jobs to an appropriate value
def benchmark_kmeans(X, n_clusters, n_jobs=None):
    # Benchmark for Gaussian Dataset
    start = time.time()
    centroids, labels = kmeans_non_parallel(X, n_clusters)  # non-parallel version
    end = time.time()
    print(f"Non-parallel K-means (Gaussian) took {end - start:.2f} seconds")

    start = time.time()
    centroids, labels = kmeans_parallel_optimized(X, n_clusters, n_jobs=n_jobs)  # parallel version
    end = time.time()
    print(f"Parallel K-means (Gaussian) took {end - start:.2f} seconds")


# Generate a synthetic dataset with 70,000 samples, 10 features, and 3 clusters
X = generate_synthetic_data(n_samples=70000, n_features=10, n_clusters=3)
print(f"Generated dataset with shape: {X.shape}")

benchmark_kmeans(X, n_clusters=3, n_jobs=n_jobs)

# Generate a synthetic random dataset with higher dimensions for benchmarking
X_random = np.random.randn(70000, 100)
print(f"Generated random dataset with shape: {X_random.shape}")

benchmark_kmeans(X_random, n_clusters=5, n_jobs=n_jobs)


Generated dataset with shape: (70000, 10)


  centroids[i] = X[labels == i].mean(axis=0)
  ret = um.true_divide(


Non-parallel K-means (Gaussian) took 7.44 seconds
Parallel K-means (Gaussian) took 5.38 seconds
Generated random dataset with shape: (70000, 100)
Non-parallel K-means (Gaussian) took 129.47 seconds
Parallel K-means (Gaussian) took 41.95 seconds
