In [11]:
pip install ucimlrepo



#Fetch data from UCI Machine Learning repository

In [1]:
from ucimlrepo import fetch_ucirepo

Mines = fetch_ucirepo(id=763)
pdf = Mines.data.features

In [2]:
pdf

Unnamed: 0,V,H,S
0,0.338157,0.000000,0.0
1,0.320241,0.181818,0.0
2,0.287009,0.272727,0.0
3,0.256284,0.454545,0.0
4,0.262840,0.545455,0.0
...,...,...,...
333,0.323262,0.909091,0.4
334,0.444108,0.181818,1.0
335,0.353474,0.454545,1.0
336,0.362537,0.727273,1.0


In [3]:
pdf.dtypes

V    float64
H    float64
S    float64
dtype: object

In [4]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=6ab4ee470bb390dd56f5877c424ad074e4d5f87d67db39b2896c0d4c6298719c
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


#local

In [5]:
import numpy as np
from pyspark.sql import SparkSession

class KMeans:
    def __init__(self, K):
        self.K = K

    def init_centroids(self, X):
        n_features = X.shape[1]
        centroids = np.zeros((self.K, n_features))
        for i in range(self.K):
            idx = np.random.choice(X.shape[0])
            centroids[i] = X[idx]
        self.centroids = centroids
        return centroids

    def get_closest_centroid(self, x):
        distances = np.linalg.norm(self.centroids - x, axis=1)
        closest_cluster = np.argmin(distances)
        return closest_cluster

    def fit(self, X, n_iters=10, stopping_criterion=1e-6):
        self.init_centroids(X)
        self.clustered = np.zeros(X.shape[0])
        for iter in range(n_iters):
            loss = self.fit_one_step(X)
            print(f"iter: {iter} loss: {loss}")
            if loss < stopping_criterion:
                break

    def fit_one_step(self, X):
        self.prev_centroids = self.centroids.copy()
        for i, x in enumerate(X):
            closest_centroid = self.get_closest_centroid(x)
            self.clustered[i] = closest_centroid
        for i in range(self.K):
            idx = np.where(self.clustered == i)[0]
            if idx.shape[0] > 0:
                self.centroids[i] = np.mean(X[idx], axis=0)
        return np.linalg.norm(self.centroids - self.prev_centroids)

    def transform(self, X_test):
        clustered = np.zeros(X_test.shape[0])
        for i, x in enumerate(X_test):
            closest_centroid = self.get_closest_centroid(x)
            clustered[i] = closest_centroid
        return clustered

def partition_to_numpy_array(instances):
    array = None
    for row in instances:
        row_values = list(row.asDict().values())
        if array is None:
            array = np.array([row_values], dtype=float)
        else:
            array_row = np.array([row_values], dtype=float)
            array = np.vstack((array, array_row))
    if array is not None:
        yield array

def get_optimal_kmeans_centroid(X, K=3):
    kmeans = KMeans(K)
    kmeans.fit(X)
    return kmeans.centroids

def get_optimal_kmeans_centroid_with_dummy_key(X, K=3):
    centroid = get_optimal_kmeans_centroid(X, K)
    return (1, centroid)

In [6]:
spark = SparkSession.builder.master("local[*]").appName("K-Means-local").getOrCreate()
sc = spark.sparkContext

# Load your data into a Spark DataFrame
df = spark.createDataFrame(pdf)
df.show(5)

+-----------+-----------+---+
|          V|          H|  S|
+-----------+-----------+---+
|0.338156758|        0.0|0.0|
|0.320241334|0.181818182|0.0|
| 0.28700875|0.272727273|0.0|
|0.256283622|0.454545455|0.0|
|0.262839599|0.545454545|0.0|
+-----------+-----------+---+
only showing top 5 rows



In [7]:
df.cache()
rdd = df.rdd.repartition(10)
rdd.cache()
rdd = rdd.mapPartitions(partition_to_numpy_array)
rdd = rdd.map(get_optimal_kmeans_centroid)
all_centroids = rdd.reduce(lambda x, y: np.vstack((x, y)))
K = 3
final_kmeans = KMeans(K)
final_kmeans.fit(all_centroids)
print(final_kmeans.centroids)

iter: 0 loss: 0.5145551000338641
iter: 1 loss: 0.06823420993605749
iter: 2 loss: 0.0
[[0.4613617  0.42578644 0.77072856]
 [0.3592772  0.79154423 0.47360269]
 [0.4655792  0.40761734 0.168165  ]]


In [8]:
import time

# Number of clusters
K = 3

# List of fraction sizes
fraction_sizes = [0.25, 0.5, 0.75, 1.0]

# List of partition numbers
partition_numbers = [10, 50, 100]

for n_partitions in partition_numbers:
    print(f"Number of partitions: {n_partitions}")
    for frac in fraction_sizes:
        print(f"-------------------------------{frac}------------------------------")
        start_time = time.time()
        sample_df = df.sample(withReplacement=False, fraction=frac)
        rdd = sample_df.rdd.repartition(n_partitions)
        rdd.cache()
        rdd = rdd.mapPartitions(partition_to_numpy_array)

        # Get centroids from partitioned data
        rdd = rdd.map(get_optimal_kmeans_centroid_with_dummy_key)
        all_centroids = rdd.reduceByKey(lambda x, y: np.vstack((x, y))).map(lambda x: x[1]).collect()

        # Perform final K-Means clustering
        final_kmeans = KMeans(K)
        final_kmeans.fit(np.vstack(all_centroids))

        # Print iterations and time
        for iter in range(len(final_kmeans.prev_centroids)):
            distance = np.linalg.norm(final_kmeans.centroids - final_kmeans.prev_centroids[iter])
            print(f"iteration : {iter+1} euclidean distance between new and previous centroid: {distance}")

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"--- {elapsed_time:.9f} seconds ---")
        print()

Number of partitions: 10
-------------------------------0.25------------------------------
iter: 0 loss: 0.35776381186832457
iter: 1 loss: 0.0654442438612809
iter: 2 loss: 0.0
iteration : 1 euclidean distance between new and previous centroid: 0.8561437829245986
iteration : 2 euclidean distance between new and previous centroid: 0.8994372348388141
iteration : 3 euclidean distance between new and previous centroid: 0.7666792011071124
--- 6.611148119 seconds ---

-------------------------------0.5------------------------------
iter: 0 loss: 0.5810937980268489
iter: 1 loss: 0.05269886077515044
iter: 2 loss: 0.0
iteration : 1 euclidean distance between new and previous centroid: 0.9271970472708747
iteration : 2 euclidean distance between new and previous centroid: 0.9897794022736356
iteration : 3 euclidean distance between new and previous centroid: 0.6695360607201257
--- 5.191709042 seconds ---

-------------------------------0.75------------------------------
iter: 0 loss: 0.309216382664