In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6


In [2]:
!sudo apt update

!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz

!pip install -q findspark
!pip install pyspark
!pip install py4j

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
[33m0% [Waiting for headers] [Waiting for headers] [1 InRelease 0 B/3,626 B 0%] [Wa[0m[33m0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Waiting f[0m                                                                               Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
[33m0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [2 InRelea[0m[33m                                                                               0% [Waiting for headers] [Waiting for headers] [Waiting for headers][0m                                                                    Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
[33m0% [Waiting for headers] [3 InRelease 12.7 kB/110 kB 12%] [Waiting for headers][0m                                     

In [5]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset id 848 through the ucimlrepo client.
data = fetch_ucirepo(id=848)

# Start from the feature table, remove the three listed columns, and request
# the '<U22' string dtype for everything that remains, in one chained expression.
pdf = (
    data.data.features
    .drop(columns=['cap-diameter', 'stem-height', 'stem-width'])
    .astype('<U22')
)

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Build (or reuse) a Spark session with master "local[*]", named "K-Mode-global".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("K-Mode-global")
    .getOrCreate()
)

# Handle to the underlying SparkContext.
sc = spark.sparkContext

In [7]:
import random
import numpy as np
from collections import Counter

class KMode:
    """K-modes clustering for categorical data using Hamming distance.

    Every cluster is represented by a "mode" vector holding one value per
    column. A row belongs to the mode it differs from in the fewest columns.

    Attributes:
        K: Number of clusters.
        centroid: Array of modes, one row per cluster; ``None`` until it is
            supplied to the constructor or created by ``init_centroid``.
        clustered: Cluster index of each training row (set by ``fit``).
        prev_centroid: Copy of ``centroid`` taken at the start of the most
            recent ``fit_one_step`` call.
        losses: Loss of each iteration of the most recent ``fit`` call.
    """

    def __init__(self, K, centroid=None) -> None:
        """Store the cluster count and an optional initial set of modes."""
        self.K = K
        self.centroid = centroid

    def init_centroid(self, X):
        """Draw K random modes, picking each column's value from X's unique values.

        Args:
            X: 2-D array of categorical values (rows are observations).

        Returns:
            Array of shape ``(K, n_columns)`` with dtype ``<U22``. The result
            is always 2-D, including for ``K == 1``, so iterating over it
            yields whole modes rather than single values.
        """
        n_cols = X.shape[1]
        # The unique values of a column do not depend on the cluster index,
        # so compute them once instead of once per cluster.
        uniques = [np.unique(X[:, c]) for c in range(n_cols)]
        modes = [
            [random.choice(values) for values in uniques]
            for _ in range(self.K)
        ]
        # reshape keeps the (K, n_columns) layout even when K is 0 or 1.
        centroid = np.array(modes, dtype="<U22").reshape(self.K, n_cols)
        self.centroid = centroid
        return centroid

    def get_closest_centroid(self, x):
        """Return the index of the mode with the smallest Hamming distance to x.

        Ties go to the lowest index because only a strictly smaller distance
        replaces the current best.
        """
        min_hamming_distance = np.inf
        closest_cluster = 0
        for i, mode in enumerate(self.centroid):
            distance = self.hamming_distance(x, mode)
            if distance < min_hamming_distance:
                min_hamming_distance = distance
                closest_cluster = i
        return closest_cluster

    def fit(self, X, n_iters=100, stopping_criterion=1) -> None:
        """Iterate assignment/update steps until the modes stop changing.

        Args:
            X: 2-D array of categorical values to cluster.
            n_iters: Maximum number of iterations.
            stopping_criterion: Stop as soon as an iteration's loss (number of
                mode entries that changed) is below this value.

        The loss of every iteration is printed and appended to ``self.losses``.
        """
        self.losses = []
        if self.centroid is None:
            self.init_centroid(X)
        self.clustered = np.zeros(X.shape[0])
        for iteration in range(n_iters):
            loss = self.fit_one_step(X)
            self.losses.append(loss)
            print(f"iter: {iteration} loss: {loss}")

            if loss < stopping_criterion:
                break

    def fit_one_step(self, X) -> int:
        """Run one assignment + mode-update pass.

        Returns:
            The number of mode entries that differ from the previous modes.
        """
        self.prev_centroid = self.centroid.copy()
        # Assignment step: label every row with its nearest mode.
        for i, x in enumerate(X):
            self.clustered[i] = self.get_closest_centroid(x)
        # Update step: recompute each mode from its members; a cluster with
        # no members keeps its previous mode.
        for i in range(self.K):
            idx = np.where(self.clustered == i)[0]
            if idx.shape[0] > 0:
                self.centroid[i] = self.get_mode_from_arr(X[idx])
        return self.hamming_distance(self.centroid, self.prev_centroid)

    def transform(self, X_test):
        """Return the nearest-mode index (as ints) for every row of X_test."""
        clustered = np.zeros(X_test.shape[0])
        for i, x in enumerate(X_test):
            clustered[i] = self.get_closest_centroid(x)
        return clustered.astype('int')

    @staticmethod
    def hamming_distance(a, b) -> int:
        """Count the positions at which a and b differ."""
        return np.count_nonzero(a != b)

    @staticmethod
    def get_mode_from_arr(arr):
        """Return the most common value of each column of a 2-D array.

        Ties are resolved by ``Counter.most_common``, i.e. in favour of the
        value encountered first in the column.
        """
        n_cols = arr.shape[1]
        mode_arr = np.full((n_cols, ), "", dtype="<U22")
        for p in range(n_cols):
            mode_arr[p] = Counter(arr[:, p]).most_common(1)[0][0]
        return mode_arr



In [8]:
def parition_to_numpy_array(instances):
  """Collect one partition's rows into a single 2-D string array.

  Args:
    instances: Iterable of row objects exposing ``asDict()``; the dict's
      values, in order, become one array row.

  Yields:
    One array of shape ``(n_rows, n_columns)`` with dtype ``<U22``. The
    array is 2-D even for a single row. Nothing is yielded for an empty
    partition, so downstream steps never receive ``None``.
  """
  # Gather plain lists first and build the array once; stacking row by row
  # would copy the whole array on every iteration.
  rows = [list(row.asDict().values()) for row in instances]
  if not rows:
    return
  yield np.array(rows, dtype="<U22")

In [9]:
def get_optimal_kmode_centroid(X, K=3):
  """Fit a fresh KMode with K clusters on X and return its fitted modes."""
  model = KMode(K)
  model.fit(X)
  fitted_modes = model.centroid
  return fitted_modes

In [10]:
# Turn the pandas frame into a Spark DataFrame and mark it for caching.
df = spark.createDataFrame(pdf).cache()
# Spread the rows over 10 partitions and mark the resulting RDD for caching too.
rdd = df.rdd.repartition(10).cache()
# Leave the RDD as the cell's value so its repr is still displayed.
rdd

MapPartitionsRDD[18] at coalesce at NativeMethodAccessorImpl.java:0

In [11]:
K = 3

# Stage 1: cluster every partition independently and keep only its K modes.
# The result gets its own name instead of overwriting `rdd`, so this cell can
# be re-run without feeding centroid arrays back into mapPartitions.
partition_centroids = (
    rdd
    .mapPartitions(parition_to_numpy_array)
    .map(lambda x: get_optimal_kmode_centroid(x, K))
)

# Stage 2: stack the per-partition modes on the driver and cluster them again
# to obtain the K global modes.
all_centroids = partition_centroids.reduce(lambda x, y: np.vstack((x, y)))
final_kmode = KMode(K)
final_kmode.fit(all_centroids)

iter: 0 loss: 23
iter: 1 loss: 4
iter: 2 loss: 0


In [12]:
# Display the modes held by the final K-modes model (one row per cluster).
final_kmode.centroid

array([['x', 'nan', 'n', 'f', 'd', 'c', 'w', 'nan', 'nan', 'w', 'nan',
        'nan', 'f', 'f', 'nan', 'd', 'a'],
       ['x', 'nan', 'n', 'f', 'a', 'nan', 'w', 'nan', 'nan', 'w', 'nan',
        'nan', 'f', 'f', 'nan', 'd', 'a'],
       ['x', 'h', 'n', 'f', 'e', 'nan', 'w', 'nan', 'nan', 'w', 'nan',
        'nan', 't', 'e', 'nan', 'd', 'u']], dtype='<U22')