In [None]:
from ucimlrepo import fetch_ucirepo

# Download UCI ML Repository dataset #73 and keep only its feature table.
# NOTE(review): id=73 is presumably the Mushroom dataset — confirm against the UCI catalogue.
data = fetch_ucirepo(id=73)
# `pdf` is handed to spark.createDataFrame in a later cell, so it is presumably
# a pandas DataFrame — verify against the ucimlrepo documentation.
pdf = data.data.features

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session on the local master and keep a handle on
# its SparkContext, which later cells use for broadcast variables.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("K-Mode-global")
    .getOrCreate()
)

sc = spark.sparkContext

In [None]:
# Load the feature table into Spark and mark it for caching; later cells scan
# it once per column and once more for the row count.
df = spark.createDataFrame(pdf).cache()

# Row-level view of the same data, redistributed over 10 partitions and cached.
rdd = df.rdd.repartition(10)
rdd.cache()

### Get the unique values of each column

In [None]:
import gc

columns = df.columns

# Map every column name to the list of distinct values found in that column.
# Each column runs its own distinct() job and collect() brings the values back
# to the driver. The lists stay referenced by the dict, so there is nothing to
# free between columns.
unique_values_dict = {
  name: [row[name] for row in df.select(name).distinct().collect()]
  for name in columns
}

### Randomly initialize the centroids (modes)

In [None]:
from math import e
import random
import numpy as np

K = 5  # number of clusters
SEED = 42  # fixed so that Restart & Run All reproduces the same starting modes

# Total number of rows (triggers a Spark job).
n_data = df.count()

random.seed(SEED)

# For every column draw K values (with replacement) from that column's distinct
# values and convert them to strings, then place the per-column draws side by
# side: the result has one row per cluster and one column per feature.
initial_modes = np.column_stack([
  np.array(random.choices(unique_values_dict[name], k=K)).astype('str')
  for name in columns
])

# Ship the starting modes to the executors; `centroid` is the Broadcast handle
# and `centroid.value` is the underlying array.
centroid = sc.broadcast(initial_modes)
centroid.value

### Row to numpy

In [None]:
def to_numpy(row):
  """Convert a Spark Row into a 1-D NumPy array of strings.

  Args:
    row: object exposing ``asDict()`` (e.g. a ``pyspark.sql.Row``). The values
      of that dict, in dict order, become the array elements.

  Returns:
    np.ndarray: unicode array with one element per field. NumPy picks the
    string width from the longest value, so long category labels are kept
    whole rather than cut to a fixed width (which would make them compare
    unequal to the full-width centroid values).
  """
  values = list(row.asDict().values())
  return np.array(values, dtype=str)

# Turn every Row of the RDD into a NumPy string vector.
# NOTE: this rebinds `rdd`, so the cell is not idempotent — running it a second
# time would apply to_numpy to arrays, which have no asDict().
rdd = rdd.map(to_numpy)

### Clustering

In [None]:
from collections import Counter

def hamming_distance(x1, x2):
  """Count the positions at which ``x1`` and ``x2`` hold different values.

  The operands are compared element-wise with ``!=`` and the number of
  ``True`` entries in the result is returned.
  """
  differs = x1 != x2
  return np.count_nonzero(differs)

def get_closest_cluster(x, centroid):
  """Pair ``x`` with the index of the mode nearest to it.

  Args:
    x: one record, compared against each mode with ``hamming_distance``.
    centroid: iterable of modes, one per cluster.

  Returns:
    tuple: ``(cluster_index, x)``. When several modes are equally close the
    lowest index wins; an empty ``centroid`` yields index 0.
  """
  best_index, _ = min(
    enumerate(centroid),
    key=lambda indexed_mode: hamming_distance(x, indexed_mode[1]),
    default=(0, None),
  )
  return (best_index, x)

def get_mode_from_vec(vec):
  """Return the most frequent value in ``vec``.

  Ties are resolved by ``Counter.most_common``. An empty ``vec`` raises
  ``IndexError``.
  """
  counted = Counter(vec)
  return counted.most_common(1)[0][0]

def get_mode_from_arr(arr):
  """Return the column-wise mode of a cluster's records.

  Args:
    arr: either a 2-D array with one record per row, or a single 1-D record
      (what a cluster with exactly one member looks like when records are
      stacked pairwise).

  Returns:
    np.ndarray: 1-D array with one mode per column, using ``arr``'s dtype.
    A single 1-D record is returned as its own mode.
  """
  # A lone record must be treated as one row, not as one column of values.
  rows = np.atleast_2d(np.asarray(arr))
  modes = [get_mode_from_vec(column) for column in rows.T]
  # Reusing the input dtype keeps every mode at full string width; sizing the
  # output from the first mode alone would truncate longer ones.
  return np.array(modes, dtype=rows.dtype)

### Parallelized training phase

In [None]:
stop_distance = 1  # converged once at most this many centroid cells changed

N_iters = 10  # upper bound on the number of refinement passes

for iteration in range(1, N_iters + 1):
  # Assign every record to its nearest mode -> (cluster_index, record).
  clustered = rdd.map(lambda x: get_closest_cluster(x, centroid.value))
  # Stack each cluster's records into one 2-D array. Records are promoted to
  # shape (1, n_features) first, so a cluster with a single member is still
  # 2-D when its column-wise mode is taken.
  group_by_clustered = (
    clustered
    .mapValues(np.atleast_2d)
    .reduceByKey(lambda x, y: np.vstack((x, y)))
  )
  centroid_rdd = group_by_clustered.mapValues(get_mode_from_arr)
  centroid_list = centroid_rdd.collect()

  old_centroid = centroid.value.copy()
  # Clusters that received no record keep their previous mode. Rebuilding the
  # matrix with vstack (rather than assigning into a fixed-width copy) lets
  # NumPy widen the string dtype, so a longer new mode is not truncated.
  updated_modes = dict(centroid_list)
  new_centroid = np.vstack([
    updated_modes.get(k, old_centroid[k]) for k in range(len(old_centroid))
  ])

  # collect() has returned, so the job reading the previous broadcast is done;
  # drop its executor copies before publishing the new modes.
  centroid.unpersist()
  centroid = sc.broadcast(new_centroid)

  # Number of centroid cells that changed in this pass.
  distance = hamming_distance(old_centroid, new_centroid)

  print(f"iteration: {iteration}  hamming distance between new and previous centroid: {distance}")

  if distance <= stop_distance:
    break

In [None]:
# Display the modes held by the current broadcast: one row per cluster, one
# column per feature.
centroid.value