In [None]:
from ucimlrepo import fetch_ucirepo
import time
from math import e
import random
import numpy as np
import gc
from collections import Counter

def to_numpy(row):
  """Turn a row object into a 1-D NumPy array of strings.

  ``row`` must provide ``asDict()``; the values of that dictionary are taken
  in iteration order and stored with the fixed-width dtype ``"<U22"``
  (at most 22 characters per element).
  """
  field_values = [value for value in row.asDict().values()]
  return np.array(field_values, dtype="<U22")

def hamming_distance(x1, x2):
  """Count the positions at which ``x1`` and ``x2`` differ.

  The comparison is ``x1 != x2``; the number of non-zero entries of that
  result is returned.
  """
  differs = x1 != x2
  return np.count_nonzero(differs)

def get_closest_cluster(x, centroid):
  """Pair ``x`` with the index of its nearest mode.

  Every entry of ``centroid`` is compared to ``x`` with ``hamming_distance``.
  The index of the smallest distance is returned together with ``x`` as
  ``(index, x)``; on ties the lowest index wins, and an empty ``centroid``
  yields index 0.
  """
  distances = [hamming_distance(x, mode) for mode in centroid]
  closest_cluster = 0
  if distances:
    # min() keeps the first minimum, matching a strict "<" scan from the left.
    closest_cluster = min(range(len(distances)), key=distances.__getitem__)
  return (closest_cluster, x)

def get_mode_from_vec(vec):
  """Return the most frequent element of ``vec``.

  When several elements share the highest count, the one encountered first
  is returned. Raises ``IndexError`` if ``vec`` is empty.
  """
  counted = Counter(vec)
  return counted.most_common(1)[0][0]

def get_mode_from_arr(arr):
  """Return the mode of every column of ``arr`` (modes taken along axis 0).

  ``arr`` holds one record per row. A 1-D input is treated as a single
  record, so its mode vector is the record itself; this is what a cluster
  with exactly one member looks like after ``reduceByKey``, because the
  stacking reducer is never invoked for a key with a single value.

  The result has the same dtype as the input. ``np.apply_along_axis`` is
  deliberately avoided: it allocates its output from the first column's
  result, so with string data every later mode would be truncated to the
  length of the first column's mode.
  """
  rows = np.atleast_2d(arr)
  # Flatten any trailing axes so each "column" is one vector along axis 0.
  flat = rows.reshape(rows.shape[0], -1)
  modes = [get_mode_from_vec(flat[:, j]) for j in range(flat.shape[1])]
  return np.array(modes, dtype=rows.dtype).reshape(rows.shape[1:])

In [None]:
K = 3 #number of clusters

# Convergence threshold: the iteration loop stops as soon as the Hamming
# distance between the previous and the new centroids is <= this value.
stop_distance = 1

# Upper bound on the number of iterations performed per run.
N_iters = 10

### UCI Data

Uncomment the following code to fetch UCI data <br/>
use id = 73 for xxx data <br/>
use id = 76 for mushroom data

In [None]:
### Uncomment the following code to fetch UCI data
### use id = 73 for xxx data
### use id = 76 for mushroom data

#ucl_data = fetch_ucirepo(id=73) 
#pdf = ucl_data.data.features

### Data from Kaggle

In [None]:
import pandas as pd

# Load the Kaggle training table and keep only the feature columns
# (the 'id' and 'target' columns are dropped).
pdf = pd.read_csv('experiments_notebook/data/train.csv').drop(columns=['id', 'target'])

### init spark

In [None]:
from pyspark.sql import SparkSession

# Obtain (or reuse) a Spark session running in local mode under the
# application name "K-Mode-global".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("K-Mode-global")
    .getOrCreate()
)

# SparkContext handle; used later to create broadcast variables.
sc = spark.sparkContext

In [None]:
# Ship the pandas frame to Spark and mark it for caching, since every column
# below runs its own distinct() job over the same data.
df = spark.createDataFrame(pdf)
df.cache()

columns = df.columns

# Map each column name to the list of distinct values observed in it.
# The lists are kept alive by the dictionary, so deleting a temporary name and
# forcing a garbage collection after every column would not release anything.
unique_values_dict = {
    col: [row[col] for row in df.select(col).distinct().collect()]
    for col in columns
}

In [None]:
# Wall-clock runtimes in seconds, keyed by the sampled fraction of the data;
# every repetition of the outer loop appends one measurement per fraction.
runtime_dict = {
    0.25: [],
    0.5: [],
    0.75: [],
    1: []
}

for _ in range(1):

  # Initial modes: K values per column, drawn with replacement from the values
  # observed in that column, stacked into a (K, n_columns) array. The dtype is
  # pinned to "<U22" (the dtype to_numpy gives every record) because writing a
  # longer string into a narrower fixed-width array silently truncates it.
  ini_centroid = np.column_stack([
      np.array(random.choices(unique_values_dict[col], k=K)).astype("<U22")
      for col in columns
  ])

  BASE_PARTITION = 400

  for frac in [0.25, 0.5, 0.75, 1]:
    n_partitions = int(BASE_PARTITION * frac)
    print(f'-------------------------------{frac}------------------------------')
    print(f'num partition: {n_partitions}')
    df = spark.createDataFrame(pdf.sample(frac=frac).reset_index(drop=True))
    df.cache()
    # Keep a handle on the cached row RDD so it can be released afterwards.
    rows_rdd = df.rdd.repartition(n_partitions)
    rows_rdd.cache()

    centroid = sc.broadcast(ini_centroid)

    rdd = rows_rdd.map(lambda row: to_numpy(row))

    # NOTE(review): no Spark action has run yet at this point, so the first
    # iteration's time presumably also includes building the sampled,
    # repartitioned data — confirm that this is the intended measurement.
    t_start = time.time()

    for iteration in range(N_iters):
      clustered = rdd.map(lambda x: get_closest_cluster(x, centroid.value)) #-> (k, v) = (cluster_i, X)
      # Stack all records of a cluster into one array, then take column modes.
      group_by_clustered = clustered.reduceByKey(lambda x, y: np.vstack((x, y)))
      centroid_rdd = group_by_clustered.map(lambda x : (x[0], get_mode_from_arr(x[1])))
      centroid_list = centroid_rdd.collect()

      # Clusters that received no record keep their previous mode.
      old_centroid = centroid.value
      new_centroid = old_centroid.copy()
      for (i, arr) in centroid_list:
        new_centroid[i] = arr

      # The job that used the old broadcast has finished; drop its executor
      # copies before publishing the updated modes.
      centroid.unpersist()
      centroid = sc.broadcast(new_centroid)

      distance = hamming_distance(old_centroid, new_centroid)

      print(f'iteration : {iteration + 1}, hamming distance between new and previous centroid: {distance}')

      if distance <= stop_distance:
        break
    t_end = time.time()
    print(f"--- {t_end-t_start} seconds ---")
    runtime_dict[frac].append(t_end-t_start)

    # Release what this fraction pinned so that cached data and broadcasts do
    # not pile up across fractions.
    centroid.unpersist()
    rows_rdd.unpersist()
    df.unpersist()

In [None]:
import pandas as pd
# One column per sampled fraction, one row per repetition; the cell output is
# the mean runtime of each column.
df_results = pd.DataFrame(data=runtime_dict)
df_results.mean()