In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6


In [2]:
!sudo apt update

!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz

!pip install -q findspark
!pip install pyspark
!pip install py4j

[33m0% [Working][0m            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
[33m0% [Connecting to archive.ubuntu.com] [1 InRelease 14.2 kB/110 kB 13%] [Connect[0m[33m0% [Waiting for headers] [Connecting to cloud.r-project.org] [Connected to ppa.[0m                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [Waiting for headers] [Connecting to cloud.r-project.org] [Connected to ppa.[0m                                                                               Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Get:6 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy 

In [1]:
import time
from math import e
import random
import numpy as np
import gc

def to_numpy(row):
  """Convert a row object into a 1-D NumPy array of strings.

  The values of `row.asDict()` are taken in dictionary order and stored
  with dtype "<U22" (fixed-width unicode, 22 characters per element).
  """
  values = [value for value in row.asDict().values()]
  return np.array(values, dtype="<U22")

def hamming_distance(x1, x2):
  """Count the positions at which x1 and x2 differ.

  Both arguments are passed through np.asarray first, so plain lists and
  tuples are compared element by element. Without the conversion, `!=` on
  two lists yields a single bool and the result could never exceed 1.

  Returns the number of unequal elements as an int; for multi-dimensional
  inputs the count runs over every cell.
  """
  return np.count_nonzero(np.asarray(x1) != np.asarray(x2))

def get_closest_cluster_and_count(x, centroid):
  """Assign one record to its nearest mode and emit its per-position counts.

  Args:
    x: the record, indexable by position.
    centroid: iterable of modes, one per cluster, each comparable to `x`
      via hamming_distance.

  Returns:
    A tuple (closest_cluster, count_elem_hash) where closest_cluster is the
    index of the mode with the smallest Hamming distance (the lowest index
    wins ties; 0 when `centroid` is empty) and count_elem_hash maps every
    position i of `x` to the one-entry dict {x[i]: 1}.
  """
  distances = [hamming_distance(x, mode) for mode in centroid]
  # min() returns the first index holding the smallest distance.
  closest_cluster = min(range(len(distances)), key=distances.__getitem__, default=0)

  count_elem_hash = {position: {value: 1} for position, value in enumerate(x)}

  return (closest_cluster, count_elem_hash)

def merge_count_elem_hash(count_elem_hash_A, count_elem_hash_B):
  """Add the per-position value counts of B into A and return A.

  Both arguments map position -> {value: count}. A is updated in place;
  B and its inner dicts are left untouched.

  The merge walks B's positions, so a position that only B contains is
  added to A, and a position that only A contains is kept as is (no
  KeyError on the missing B entry).
  """
  for idx, counts_B in count_elem_hash_B.items():
    # A fresh dict is created for unseen positions so A never aliases B's data.
    counts_A = count_elem_hash_A.setdefault(idx, {})
    for key, count in counts_B.items():
      counts_A[key] = counts_A.get(key, 0) + count
  return count_elem_hash_A


def get_centroid(count_elem_hash):
  """Build a centroid from per-position value counts.

  `count_elem_hash` maps position -> {value: count}. The result is a 1-D
  "<U22" array with one slot per entry of `count_elem_hash`; each slot is
  filled with get_mode() of that position's counts. The keys are used
  directly as array indices.
  """
  centroid = np.full((len(count_elem_hash),), "", dtype="<U22")
  for position in count_elem_hash:
    centroid[position] = get_mode(count_elem_hash[position])
  return centroid

def get_mode(count_hash):
  """Return the key of `count_hash` with the largest count.

  Only a strictly larger count replaces the current best, so the first
  key encountered wins ties. Returns "" when `count_hash` is empty or no
  count is greater than 0.
  """
  best_value, best_count = "", 0
  for candidate, occurrences in count_hash.items():
    if occurrences > best_count:
      best_value, best_count = candidate, occurrences
  return best_value

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Read the training CSV from the mounted Drive, drop the 'id' and 'target'
# columns, and cast every remaining column with astype('<U22').
pdf = (
  pd.read_csv('/content/drive/MyDrive/BigData/K-Mode/train.csv')
  .drop(columns=['id', 'target'])
  .astype('<U22')
)

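# Alternative input: the mushroom dataset from the UCI repository (id=848).
# Uncomment the lines below to use it instead of the CSV loaded above.
# The three dropped columns are presumably the numeric ones; verify before use.
#from ucimlrepo import fetch_ucirepo
#
#
#data = fetch_ucirepo(id=848)
#pdf = data.data.features
#
#pdf = pdf.drop(columns = [
#    'cap-diameter',
#    'stem-height',
#    'stem-width',
#])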

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create (or reuse) a Spark session running locally with master "local[*]".
spark = (
  SparkSession.builder
  .master("local[*]")
  .appName("K-Mode-global")
  .getOrCreate()
)

# SparkContext handle; used later to broadcast the centroids.
sc = spark.sparkContext

In [4]:
K = 3  # number of clusters

N_iters = 10  # maximum number of iterations per run

N_partitions = 100  # number of partitions the row RDD is repartitioned into

P = pdf.shape[1]  # number of columns (attributes) in the input frame

stop_distance = P // 10  # stop once at most this many centroid cells changed

In [5]:
# Benchmark: run K-modes on growing fractions of the data and time each run.
for frac in [0.25, 0.5, 0.75, 1]:
  print(f'-------------------------------{frac}------------------------------')
  # Fixed random_state so every run benchmarks the same subsample per fraction.
  df = spark.createDataFrame(pdf.sample(frac=frac, random_state=42).reset_index(drop=True))
  df.cache()
  row_rdd = df.rdd.repartition(N_partitions)
  row_rdd.cache()

  # K sampled rows serve as the initial modes: one row per cluster.
  init_centroid = np.array(
    [to_numpy(row) for row in row_rdd.takeSample(withReplacement=False, num=K, seed=42)]
  )

  centroid = sc.broadcast(init_centroid)

  rdd = row_rdd.map(lambda row: to_numpy(row))

  t_start = time.time()

  # `iteration` instead of `iter` so the builtin is not shadowed.
  for iteration in range(N_iters):

    clustered_and_hash_count_rdd = rdd.map(lambda x: get_closest_cluster_and_count(x, centroid.value)) #-> (k, v) = (cluster_i, count_hash)
    counted_elem_rdd = clustered_and_hash_count_rdd.reduceByKey(lambda x, y: merge_count_elem_hash(x, y))
    cluster_and_new_centroid_rdd =  counted_elem_rdd.map(lambda x: (x[0], get_centroid(x[1])))
    centroid_hash_form = cluster_and_new_centroid_rdd.collect()

    old_broadcast = centroid
    old_centroid = old_broadcast.value

    # collect() returns the (cluster_id, mode) pairs in no guaranteed order and
    # omits clusters that received no records. Writing each mode into the row
    # of its own cluster id keeps row i == cluster i across iterations, and an
    # empty cluster simply keeps its previous mode instead of raising.
    new_centroid = old_centroid.copy()
    for cluster_id, mode in centroid_hash_form:
      new_centroid[cluster_id] = mode

    centroid = sc.broadcast(new_centroid)
    # The job that used the previous broadcast has finished; release its
    # executor-side copies.
    old_broadcast.unpersist()

    # Number of centroid cells that changed in this iteration.
    distance = hamming_distance(old_centroid, new_centroid)

    print('iteration : ', iteration + 1, " hamming distance between new and previous centroid:  ", distance)

    if distance <= stop_distance:
      break

  t_end = time.time()
  print(f"--- {t_end-t_start} seconds ---")

  # Free the cached data of this fraction before the next, larger one is loaded.
  row_rdd.unpersist()
  df.unpersist()

-------------------------------0.25------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   26
iteration :  {2}  hamming distance between new and previous centroid:   0
--- 93.15848207473755 seconds ---
-------------------------------0.5------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   26
iteration :  {2}  hamming distance between new and previous centroid:   0
--- 112.63911437988281 seconds ---
-------------------------------0.75------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   33
iteration :  {2}  hamming distance between new and previous centroid:   0
--- 145.37082600593567 seconds ---
-------------------------------1------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   31
iteration :  {2}  hamming distance between new and previous centroid:   0
--- 154.88309502601624 seconds ---


In [6]:
# Recorded benchmark output: mushroom dataset (UCI id 848), N_partitions = 10
#-------------------------------0.25------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   9
#iteration :  {2}  hamming distance between new and previous centroid:   1
#--- 10.779891729354858 seconds ---
#-------------------------------0.5------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   12
#iteration :  {2}  hamming distance between new and previous centroid:   2
#iteration :  {3}  hamming distance between new and previous centroid:   2
#iteration :  {4}  hamming distance between new and previous centroid:   1
#--- 20.987311601638794 seconds ---
#-------------------------------0.75------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   8
#iteration :  {2}  hamming distance between new and previous centroid:   1
#--- 15.095524549484253 seconds ---
#-------------------------------1------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   19
#iteration :  {2}  hamming distance between new and previous centroid:   0
#--- 12.326220035552979 seconds ---


# Recorded benchmark output: mushroom dataset (UCI id 848), N_partitions = 50
#-------------------------------0.25------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   16
#iteration :  {2}  hamming distance between new and previous centroid:   2
#iteration :  {3}  hamming distance between new and previous centroid:   0
#--- 65.96836447715759 seconds ---
#-------------------------------0.5------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   11
#iteration :  {2}  hamming distance between new and previous centroid:   1
#--- 43.24885559082031 seconds ---
#-------------------------------0.75------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   9
#iteration :  {2}  hamming distance between new and previous centroid:   1
#--- 38.82074046134949 seconds ---
#-------------------------------1------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   9
#iteration :  {2}  hamming distance between new and previous centroid:   1
#--- 44.693320751190186 seconds ---


# Recorded benchmark output: mushroom dataset (UCI id 848), N_partitions = 100
#-------------------------------0.25------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   8
#iteration :  {2}  hamming distance between new and previous centroid:   0
#--- 67.94282054901123 seconds ---
#-------------------------------0.5------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   7
#iteration :  {2}  hamming distance between new and previous centroid:   0
#--- 71.87999939918518 seconds ---
#-------------------------------0.75------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   15
#iteration :  {2}  hamming distance between new and previous centroid:   1
#--- 66.75218081474304 seconds ---
#-------------------------------1------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   18
#iteration :  {2}  hamming distance between new and previous centroid:   1
#--- 69.71056056022644 seconds ---

In [7]:
# Recorded benchmark output: Kaggle dataset, N_partitions = 10

#-------------------------------0.25------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   31
#iteration :  {2}  hamming distance between new and previous centroid:   1
#--- 27.516228437423706 seconds ---
#-------------------------------0.5------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   27
#iteration :  {2}  hamming distance between new and previous centroid:   1
#--- 50.537758111953735 seconds ---
#-------------------------------0.75------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   33
#iteration :  {2}  hamming distance between new and previous centroid:   2
#--- 62.72692847251892 seconds ---
#-------------------------------1------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   29
#iteration :  {2}  hamming distance between new and previous centroid:   1
#--- 81.62160468101501 seconds ---

# Recorded benchmark output: Kaggle dataset, N_partitions = 50

#-------------------------------0.25------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   33
#iteration :  {2}  hamming distance between new and previous centroid:   0
#--- 59.55855393409729 seconds ---
#-------------------------------0.5------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   30
#iteration :  {2}  hamming distance between new and previous centroid:   1
#--- 79.15811371803284 seconds ---
#-------------------------------0.75------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   29
#iteration :  {2}  hamming distance between new and previous centroid:   0
#--- 98.91301417350769 seconds ---
#-------------------------------1------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   31
#iteration :  {2}  hamming distance between new and previous centroid:   0
#--- 113.1372766494751 seconds ---


# Recorded benchmark output: Kaggle dataset, N_partitions = 100
#-------------------------------0.25------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   26
#iteration :  {2}  hamming distance between new and previous centroid:   0
#--- 93.15848207473755 seconds ---
#-------------------------------0.5------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   26
#iteration :  {2}  hamming distance between new and previous centroid:   0
#--- 112.63911437988281 seconds ---
#-------------------------------0.75------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   33
#iteration :  {2}  hamming distance between new and previous centroid:   0
#--- 145.37082600593567 seconds ---
#-------------------------------1------------------------------
#iteration :  {1}  hamming distance between new and previous centroid:   31
#iteration :  {2}  hamming distance between new and previous centroid:   0
#--- 154.88309502601624 seconds ---