In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6


In [2]:
!sudo apt update

!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz

!pip install -q findspark
!pip install pyspark
!pip install py4j

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [830 kB]
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Pack

In [1]:
import time
from math import e
import random
import numpy as np
import gc

def to_numpy(row):
  a = list(row.asDict().values())

  return np.array(a, dtype="<U22")

def hamming_distance(x1, x2):
  return np.count_nonzero(x1!=x2)

def get_closest_cluster_and_count(x, centroid):
  min_hamming_distance = np.inf
  closest_cluster = 0
  for i, mode in enumerate(centroid):
     distance = hamming_distance(x, mode)
     if distance < min_hamming_distance:
        min_hamming_distance = distance
        closest_cluster = i

  P = len(x)

  count_elem_hash = {}
  for i in range(P):
    count_elem_hash[i] = {x[i] : 1}

  return (closest_cluster, count_elem_hash)

def merge_count_elem_hash(count_elem_hash_A, count_elem_hash_B):
  for idx in count_elem_hash_A.keys():
    for key in count_elem_hash_B[idx].keys():
      if key in count_elem_hash_A[idx]:
        count_elem_hash_A[idx][key] += count_elem_hash_B[idx][key]
      else:
        count_elem_hash_A[idx][key] =  count_elem_hash_B[idx][key]
  return count_elem_hash_A


def get_centroid(count_elem_hash):
  P = len(count_elem_hash)
  centroid = np.full((P,), "", dtype="<U22")
  for idx, count_hash in count_elem_hash.items():
    mode = get_mode(count_hash)
    centroid[idx] = mode
  return centroid

def get_mode(count_hash):
  mode = ""
  highest_count = 0
  for value, count in count_hash.items():
    if count > highest_count:
      mode = value
      highest_count = count
  return mode

def merge_counts_within_partition(iterator):
    combined_counts = {}
    for x in iterator:
        cluster, count_hash = get_closest_cluster_and_count(x, centroid.value)
        if cluster in combined_counts:
            combined_counts[cluster] = merge_count_elem_hash(combined_counts[cluster], count_hash)
        else:
            combined_counts[cluster] = count_hash
    yield from combined_counts.items()

In [2]:
from ucimlrepo import fetch_ucirepo


data = fetch_ucirepo(id=848)
pdf = data.data.features

pdf = pdf.drop(columns = [
    'cap-diameter',
    'stem-height',
    'stem-width',
])

#from google.colab import drive
#drive.mount('/content/drive')
#
#import pandas as pd
#
#pdf = pd.read_csv('/content/drive/MyDrive/BigData/K-Mode/train.csv')
#pdf = pdf.drop(columns=['id', 'target'])
#pdf = pdf.astype('<U22')

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("K-Mode-global") \
    .getOrCreate()

sc = spark.sparkContext

In [4]:
K = 3 #number of cluseters

N_iters = 10

N_partitions = 100

P = len(pdf.columns)

stop_distance = P//10

df = spark.createDataFrame(pdf)
df.cache()
rdd = df.rdd.repartition(N_partitions)
rdd.cache()

for (idx, row) in enumerate(rdd.takeSample(withReplacement=False, num=K, seed=42)):
  if idx == 0:
    init_centroid = np.array([to_numpy(row)]) #shape (1, P)
  else:
    c_ = to_numpy(row)
    c_ = np.array([c_]) #shape (1, P)
    init_centroid = np.concatenate([init_centroid, c_], axis=0)

In [5]:
for frac in [0.25, 0.5, 0.75, 1]:
  print(f'-------------------------------{frac}------------------------------')
  df = spark.createDataFrame(pdf.sample(frac=frac).reset_index(drop=True))
  df.cache()
  rdd = df.rdd.repartition(N_partitions)
  rdd.cache()

  centroid = sc.broadcast(init_centroid)

  rdd = rdd.map(lambda row: to_numpy(row))

  t_start = time.time()

  for iter in range(N_iters):

    combined_counts_rdd = rdd.mapPartitions(merge_counts_within_partition)
    counted_elem_rdd = combined_counts_rdd.reduceByKey(lambda x, y: merge_count_elem_hash(x, y))
    cluster_and_new_centroid_rdd =  counted_elem_rdd.map(lambda x: (x[0], get_centroid(x[1])))
    centroid_hash_form = cluster_and_new_centroid_rdd.collect()

    for idx in range(K):
      if idx == 0:
        new_centroid = np.array([centroid_hash_form[idx][1]])
      else:
        mode = np.array([centroid_hash_form[idx][1]])
        new_centroid = np.concatenate([new_centroid, mode])

    old_centroid = centroid.value.copy()
    centroid = sc.broadcast(new_centroid)

    distance = hamming_distance(old_centroid, new_centroid)

    print('iteration : ', {iter+1}, " hamming distance between new and previous centroid:  ", distance)

    if distance <= stop_distance:
      break

  t_end = time.time()
  print(f"--- {t_end-t_start} seconds ---")

-------------------------------0.25------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   15
iteration :  {2}  hamming distance between new and previous centroid:   3
iteration :  {3}  hamming distance between new and previous centroid:   0
--- 101.0346782207489 seconds ---
-------------------------------0.5------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   15
iteration :  {2}  hamming distance between new and previous centroid:   3
iteration :  {3}  hamming distance between new and previous centroid:   0
--- 102.58198928833008 seconds ---
-------------------------------0.75------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   15
iteration :  {2}  hamming distance between new and previous centroid:   3
iteration :  {3}  hamming distance between new and previous centroid:   0
--- 101.47393536567688 seconds ---
------------------------------

In [6]:
#mushroom 848 - N_Partition = 10

-------------------------------0.25------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   14
iteration :  {2}  hamming distance between new and previous centroid:   2
iteration :  {3}  hamming distance between new and previous centroid:   0
--- 16.036723375320435 seconds ---
-------------------------------0.5------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   14
iteration :  {2}  hamming distance between new and previous centroid:   2
iteration :  {3}  hamming distance between new and previous centroid:   0
--- 17.576813220977783 seconds ---
-------------------------------0.75------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   14
iteration :  {2}  hamming distance between new and previous centroid:   2
iteration :  {3}  hamming distance between new and previous centroid:   0
--- 19.701108694076538 seconds ---
-------------------------------1------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   14
iteration :  {2}  hamming distance between new and previous centroid:   2
iteration :  {3}  hamming distance between new and previous centroid:   0
--- 24.15873408317566 seconds ---

#mushroom 848 - N_Partition = 50
#mushroom 848 - N_Partition = 50
-------------------------------0.25------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   21
iteration :  {2}  hamming distance between new and previous centroid:   4
iteration :  {3}  hamming distance between new and previous centroid:   1
--- 56.45051574707031 seconds ---
-------------------------------0.5------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   20
iteration :  {2}  hamming distance between new and previous centroid:   2
iteration :  {3}  hamming distance between new and previous centroid:   1
--- 52.93846321105957 seconds ---
-------------------------------0.75------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   20
iteration :  {2}  hamming distance between new and previous centroid:   2
iteration :  {3}  hamming distance between new and previous centroid:   1
--- 59.03074359893799 seconds ---
-------------------------------1------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   19
iteration :  {2}  hamming distance between new and previous centroid:   3
iteration :  {3}  hamming distance between new and previous centroid:   1
--- 59.19886612892151 seconds ---


#mushroom 848 - N_Partition = 100
-------------------------------0.25------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   15
iteration :  {2}  hamming distance between new and previous centroid:   3
iteration :  {3}  hamming distance between new and previous centroid:   0
--- 101.0346782207489 seconds ---
-------------------------------0.5------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   15
iteration :  {2}  hamming distance between new and previous centroid:   3
iteration :  {3}  hamming distance between new and previous centroid:   0
--- 102.58198928833008 seconds ---
-------------------------------0.75------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   15
iteration :  {2}  hamming distance between new and previous centroid:   3
iteration :  {3}  hamming distance between new and previous centroid:   0
--- 101.47393536567688 seconds ---
-------------------------------1------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   15
iteration :  {2}  hamming distance between new and previous centroid:   3
iteration :  {3}  hamming distance between new and previous centroid:   0
--- 107.78599643707275 seconds ---

SyntaxError: invalid syntax (<ipython-input-6-d5a1bddbd3ed>, line 3)

In [None]:
#kaggle  N_Partition = 10

-------------------------------0.25------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   27
iteration :  {2}  hamming distance between new and previous centroid:   1
--- 40.02137732505798 seconds ---
-------------------------------0.5------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   27
iteration :  {2}  hamming distance between new and previous centroid:   1
--- 70.80644822120667 seconds ---
-------------------------------0.75------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   27
iteration :  {2}  hamming distance between new and previous centroid:   1
--- 95.41008138656616 seconds ---
-------------------------------1------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   27
iteration :  {2}  hamming distance between new and previous centroid:   1
--- 141.9313485622406 seconds ---

#kaggle  N_Partition = 50
-------------------------------0.25------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   36
iteration :  {2}  hamming distance between new and previous centroid:   0
--- 76.29413771629333 seconds ---
-------------------------------0.5------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   36
iteration :  {2}  hamming distance between new and previous centroid:   0
--- 105.43889665603638 seconds ---
-------------------------------0.75------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   36
iteration :  {2}  hamming distance between new and previous centroid:   0
--- 133.53474926948547 seconds ---
-------------------------------1------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   36
iteration :  {2}  hamming distance between new and previous centroid:   0
--- 184.66259908676147 seconds ---

#kaggle N_partition = 100
-------------------------------0.25------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   30
iteration :  {2}  hamming distance between new and previous centroid:   2
--- 111.51788449287415 seconds ---
-------------------------------0.5------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   30
iteration :  {2}  hamming distance between new and previous centroid:   2
--- 143.34579515457153 seconds ---
-------------------------------0.75------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   30
iteration :  {2}  hamming distance between new and previous centroid:   2
--- 173.9554238319397 seconds ---
-------------------------------1------------------------------
iteration :  {1}  hamming distance between new and previous centroid:   30
iteration :  {2}  hamming distance between new and previous centroid:   2
--- 238.2033395767212 seconds ---