In [145]:
!pip install pyspark



In [146]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pandas as pd
import numpy as np

# Create the Spark session (or reuse an already running one) with a local master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("k-prototype-global")
    .getOrCreate()
)

# Spark context handle, used later to broadcast the centroids.
sc = spark.sparkContext

In [251]:
# Load the raw dataset into a pandas DataFrame.
df = pd.read_csv("data.csv")


# Columns used for clustering: numerical columns first, categorical columns last.
labels = ["Age", "Height", "Weight","family_history_with_overweight","Gender"]

# Position in `labels` of the first categorical column; everything before it is numerical.
categorical_labels_start_index = 3

# Show the categorical column names.
labels[categorical_labels_start_index:]

['family_history_with_overweight', 'Gender']

In [252]:
# Keep only the clustering columns, in the order given by `labels`.
df = df[labels]
df.head()

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,Gender
0,21.0,1.62,64.0,yes,Female
1,21.0,1.52,56.0,yes,Female
2,23.0,1.8,77.0,yes,Male
3,27.0,1.8,87.0,no,Male
4,22.0,1.78,89.8,no,Male


In [253]:
# Convert the pandas frame into a Spark DataFrame.
# NOTE: `df` is rebound from a pandas object to a Spark object here, so this cell is not re-runnable on its own.
df = spark.createDataFrame(df)
df.show(5)

+----+------+------+------------------------------+------+
| Age|Height|Weight|family_history_with_overweight|Gender|
+----+------+------+------------------------------+------+
|21.0|  1.62|  64.0|                           yes|Female|
|21.0|  1.52|  56.0|                           yes|Female|
|23.0|   1.8|  77.0|                           yes|  Male|
|27.0|   1.8|  87.0|                            no|  Male|
|22.0|  1.78|  89.8|                            no|  Male|
+----+------+------+------------------------------+------+
only showing top 5 rows



In [254]:
# Check the column types of the Spark DataFrame (numerical columns must be numeric for the normalisation below).
df.printSchema()

root
 |-- Age: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Weight: double (nullable = true)
 |-- family_history_with_overweight: string (nullable = true)
 |-- Gender: string (nullable = true)



## Normalising the data

In [255]:
# Min-max normalisation of the numerical columns: x_norm = (x - min) / (max - min).
numerical_columns = df.columns[:categorical_labels_start_index]

# A single aggregation job returns the min and max of every numerical column
# (instead of launching two separate jobs per column).
bounds = None
if numerical_columns:
  bounds = df.agg(
      *[F.min(name).alias(name + "_min") for name in numerical_columns],
      *[F.max(name).alias(name + "_max") for name in numerical_columns],
  ).first()

for name in numerical_columns:
  minimum = bounds[name + "_min"]
  maximum = bounds[name + "_max"]
  value_range = maximum - minimum

  if value_range == 0:
    # Constant column: every value equals the minimum, so avoid dividing by zero and map it to 0.
    normalised = F.lit(0.0)
  else:
    normalised = (df[name] - minimum) / value_range

  df = df.withColumn(name + '_norm', normalised)

In [152]:
# Preview the original columns next to their `_norm` counterparts.
df.show(10)

+----+------+------+------------------------------+------+-------------------+-------------------+-------------------+
| Age|Height|Weight|family_history_with_overweight|Gender|           Age_norm|        Height_norm|        Weight_norm|
+----+------+------+------------------------------+------+-------------------+-------------------+-------------------+
|21.0|  1.62|  64.0|                           yes|Female|0.14893617021276595|0.32075471698113234| 0.1865671641791045|
|21.0|  1.52|  56.0|                           yes|Female|0.14893617021276595|0.13207547169811332|0.12686567164179105|
|23.0|   1.8|  77.0|                           yes|  Male|0.19148936170212766| 0.6603773584905662| 0.2835820895522388|
|27.0|   1.8|  87.0|                            no|  Male| 0.2765957446808511| 0.6603773584905662| 0.3582089552238806|
|22.0|  1.78|  89.8|                            no|  Male| 0.1702127659574468| 0.6226415094339623|0.37910447761194027|
|29.0|  1.62|  53.0|                            

In [256]:
# Drop the raw numerical columns so only the categorical and normalised columns remain.
df_norm = df.drop("Age", "Height", "Weight")
df_norm.show(5)

+------------------------------+------+-------------------+-------------------+-------------------+
|family_history_with_overweight|Gender|           Age_norm|        Height_norm|        Weight_norm|
+------------------------------+------+-------------------+-------------------+-------------------+
|                           yes|Female|0.14893617021276595|0.32075471698113234| 0.1865671641791045|
|                           yes|Female|0.14893617021276595|0.13207547169811332|0.12686567164179105|
|                           yes|  Male|0.19148936170212766| 0.6603773584905662| 0.2835820895522388|
|                            no|  Male| 0.2765957446808511| 0.6603773584905662| 0.3582089552238806|
|                            no|  Male| 0.1702127659574468| 0.6226415094339623|0.37910447761194027|
+------------------------------+------+-------------------+-------------------+-------------------+
only showing top 5 rows



## K-prototypes clustering

In [257]:
# Redistribute the rows over 4 partitions and mark the RDD as cached;
# numerical_rdd and categorical_rdd below are both derived from it.
rdd = df_norm.rdd.repartition(4)
rdd.cache()

MapPartitionsRDD[716] at coalesce at NativeMethodAccessorImpl.java:0

In [258]:
def to_numpy_numerical(row, index):
  """Return the row's values from position `index` onward as a NumPy array.

  `row` must provide `asDict()`; values are taken in the dict's order.
  """
  values = list(row.asDict().values())
  tail = values[index:]
  return np.array(tail)

In [259]:
def to_numpy_categorical(row, index):
  """Return the row's first `index` values as a NumPy array.

  `row` must provide `asDict()`; values are taken in the dict's order.
  """
  values = list(row.asDict().values())
  head = values[:index]
  return np.array(head)

## Initializing centroids

In [260]:
# Number of categorical columns; in df_norm they come first, followed by the normalised numerical columns,
# so `index` is the split position between the two groups.
index = len(labels) - categorical_labels_start_index
numerical_rdd = rdd.map(lambda row: to_numpy_numerical(row, index))

# initialize k clusters
k = 3
seed = 42
# Sample k rows without replacement as the starting numerical centroids.
numerical_centres = numerical_rdd.takeSample(False, k, seed)

In [261]:
# Inspect the sampled numerical starting centroids.
numerical_centres

[array([0.1360443 , 0.56172453, 0.66528439]),
 array([0.11021132, 0.3538434 , 0.23134328]),
 array([0.18676362, 0.39889623, 0.61006129])]

In [262]:
# Categorical part of every row (the first `index` columns of df_norm).
categorical_rdd = rdd.map(lambda row: to_numpy_categorical(row, index))
# Sample k rows without replacement, with the same seed as the numerical sample.
# NOTE(review): this is a separate takeSample call from the numerical one; the two halves of each
# centroid describe the same row only if both calls pick the same positions — confirm, or sample
# once from numerical_rdd.zip(categorical_rdd).
categorical_centres = categorical_rdd.takeSample(False, k, seed)

In [263]:
# Inspect the sampled categorical starting centroids.
categorical_centres

[array(['yes', 'Female'], dtype='<U6'),
 array(['yes', 'Male'], dtype='<U4'),
 array(['yes', 'Female'], dtype='<U6')]

In [264]:
# Broadcast the centroids so the distance functions applied inside RDD maps can read them via `.value`.
numerical_centres_bc = sc.broadcast(numerical_centres)
categorical_centres_bc = sc.broadcast(categorical_centres)

## Distance measures

In [265]:
def euclidean_distance(vect):
  """Return the Euclidean distance from `vect` to each broadcast numerical centroid.

  The result has one entry per centroid in `numerical_centres_bc.value`.
  """
  offsets = numerical_centres_bc.value - vect
  squared_sums = (offsets ** 2).sum(axis=1)
  return np.sqrt(squared_sums)

def hamming_distance(vect, gamma=0.2):
  """Return the weighted mismatch count between `vect` and each broadcast categorical centroid.

  Parameters:
    vect: array of categorical values for one point.
    gamma: weight applied to the number of mismatching categories; the default
      0.2 is the value that was previously hard-coded.

  Returns:
    Array with one entry per centroid in `categorical_centres_bc.value`:
    gamma * (number of positions where `vect` differs from that centroid).
  """
  mismatches = np.sum(vect != categorical_centres_bc.value, axis=1)
  return gamma * mismatches


In [266]:
# find the distance to the centroid for numerical data
# take(10) fetches only the previewed rows instead of collecting the whole RDD to the driver
numerical_rdd.map(lambda vect: euclidean_distance(vect)).take(10)


[array([0.38576506, 0.33989987, 0.37263552]),
 array([0.59374894, 0.27432781, 0.45733534]),
 array([0.39887808, 0.27383837, 0.39264238]),
 array([0.49409115, 0.53184803, 0.48025454]),
 array([0.52805429, 0.08784874, 0.43926013]),
 array([0.66384915, 0.2650172 , 0.5169795 ]),
 array([0.68955449, 0.22262703, 0.56883953]),
 array([0.65999305, 0.18294946, 0.55641329]),
 array([0.49527901, 0.08752317, 0.40171169]),
 array([0.29992099, 0.25410659, 0.29848534])]

In [267]:
# find the distance to the centroid for categorical data
# take(10) fetches only the previewed rows instead of collecting the whole RDD to the driver
categorical_rdd.map(lambda vect: hamming_distance(vect)).take(10)

[array([0.4, 0.2, 0.4]),
 array([0. , 0.2, 0. ]),
 array([0.2, 0.4, 0.2]),
 array([0.4, 0.2, 0.4]),
 array([0.4, 0.2, 0.4]),
 array([0. , 0.2, 0. ]),
 array([0.2, 0.4, 0.2]),
 array([0.2, 0.4, 0.2]),
 array([0.2, 0. , 0.2]),
 array([0. , 0.2, 0. ])]

In [268]:
def get_total_distance(numerical_vect, categorical_vect):
  """Return the combined distance from one point to every centroid.

  The numerical part is measured with `euclidean_distance`, the categorical
  part with `hamming_distance`; the two per-centroid arrays are added.
  """
  return euclidean_distance(numerical_vect) + hamming_distance(categorical_vect)

In [269]:
# combined (numerical + categorical) distance from every point to every centroid
total_distance_rdd = numerical_rdd.zip(categorical_rdd).map(lambda x: get_total_distance(x[0], x[1]))

# find the centroid which has the closest distance
closest_centroid_rdd = total_distance_rdd.map(lambda x: np.argmin(x))
# take(10) fetches only the previewed rows instead of collecting the whole RDD to the driver
closest_centroid_rdd.take(10)

[1, 2, 2, 1, 1, 1, 1, 1, 1, 2]

In [270]:
# Pair every point's assigned cluster index with its categorical vector.
centroid_categorical_value = (
    closest_centroid_rdd
    .zip(categorical_rdd)
    .map(lambda pair: (pair[0], pair[1]))
)


In [272]:
# closest_centroid_rdd.zip(categorical_rdd).map((lambda x: (x[0],x[1]))).collect()[:10]

[(1, array(['no', 'Male'], dtype='<U4')),
 (2, array(['yes', 'Female'], dtype='<U6')),
 (2, array(['no', 'Female'], dtype='<U6')),
 (1, array(['no', 'Male'], dtype='<U4')),
 (1, array(['no', 'Male'], dtype='<U4')),
 (1, array(['yes', 'Female'], dtype='<U6')),
 (1, array(['no', 'Female'], dtype='<U6')),
 (1, array(['no', 'Female'], dtype='<U6')),
 (1, array(['yes', 'Male'], dtype='<U4')),
 (2, array(['yes', 'Female'], dtype='<U6'))]

In [273]:
# closest = numerical_rdd.zip(categorical_rdd).map(lambda x: nearest_centroid(x[0], x[1]))

# take(10) fetches only the previewed rows instead of collecting the whole RDD to the driver
closest_centroid_rdd.zip(categorical_rdd).map((lambda x: (x[0],x[1]))).take(10)

[(1, array(['no', 'Male'], dtype='<U4')),
 (2, array(['yes', 'Female'], dtype='<U6')),
 (2, array(['no', 'Female'], dtype='<U6')),
 (1, array(['no', 'Male'], dtype='<U4')),
 (1, array(['no', 'Male'], dtype='<U4')),
 (1, array(['yes', 'Female'], dtype='<U6')),
 (1, array(['no', 'Female'], dtype='<U6')),
 (1, array(['no', 'Female'], dtype='<U6')),
 (1, array(['yes', 'Male'], dtype='<U4')),
 (2, array(['yes', 'Female'], dtype='<U6'))]

## Find the new centroids for the categorical part

In [274]:
# Pair every point's assigned cluster index with its categorical vector ...
centroid_categorical_value = (
    closest_centroid_rdd
    .zip(categorical_rdd)
    .map(lambda pair: (pair[0], pair[1]))
)

# ... then gather, per cluster index, the list of categorical vectors assigned to it.
grouped_rdd = (
    centroid_categorical_value
    .groupByKey()
    .mapValues(list)
)

from collections import Counter
import numpy as np

def find_mode(values):
    """Return the most frequent element of `values` wrapped in a one-element array.

    When several elements share the highest count, the one encountered first
    wins. An empty `values` raises ValueError.
    """
    tally = Counter(values)
    most_frequent, _ = max(tally.items(), key=lambda item: item[1])
    return np.array([most_frequent])

# Per cluster, the mode of every categorical column. The cluster index is kept as the key:
# the order in which a shuffled RDD returns its groups is determined by partitioning, not by
# cluster index, so dropping the key could attach a centroid to the wrong cluster.
modes = grouped_rdd.mapValues(lambda vects: np.array([find_mode(column)[0] for column in zip(*vects)]))
result = modes.collectAsMap()
# One array per non-empty cluster, ordered by cluster index.
numpy_arrays = [result[cluster] for cluster in sorted(result)]
numpy_arrays

# categorical_centres_bc = sc.broadcast(numpy_arrays)
# categorical_centres_bc.value

[array(['yes', 'Male'], dtype='<U6'),
 array(['yes', 'Male'], dtype='<U6'),
 array(['yes', 'Female'], dtype='<U6')]

## Find the new centroids for the numerical part

In [276]:
# Pair every point's assigned cluster index with its numerical vector.
centroid_numerical_value = closest_centroid_rdd.zip(numerical_rdd).map((lambda x: (x[0],x[1])))

# Per cluster: (number of points, element-wise sum of their vectors).
new_centres = centroid_numerical_value.map(lambda x: ((x[0]),(1,x[1])) ).reduceByKey(lambda x, y: (x[0]+y[0], x[1]+y[1]))
# Keep the cluster index as key while averaging: the order in which a shuffled RDD returns its
# keys is determined by partitioning, not by cluster index.
means_by_cluster = new_centres.mapValues(lambda acc: acc[1] / acc[0]).collectAsMap()
# One mean vector per non-empty cluster, ordered by cluster index.
new_c = [means_by_cluster[cluster] for cluster in sorted(means_by_cluster)]
new_c
# numerical_centres_bc = sc.broadcast(new_c)
# numerical_centres_bc.value

[array([0.21251834, 0.64924743, 0.62601173]),
 array([0.18510456, 0.45994437, 0.24617052]),
 array([0.31044582, 0.36504586, 0.39755838])]

## Training loop

In [277]:
max_iter = 10
tolerance = 0.001  # stop once the centroids move less than this between two iterations

for j in range(max_iter):
  # assign every point to its nearest centroid using the combined distance
  total_distance_rdd = numerical_rdd.zip(categorical_rdd).map(lambda x: get_total_distance(x[0], x[1]))

  # plain int cluster indices, so the driver-side lookups by range(k) below are straightforward
  closest_centroid_rdd = total_distance_rdd.map(lambda x: int(np.argmin(x)))

  # new categorical centroids: per-column mode of each cluster, kept keyed by cluster index
  centroid_categorical_value = closest_centroid_rdd.zip(categorical_rdd)
  grouped_rdd = centroid_categorical_value.groupByKey().mapValues(list)
  modes = grouped_rdd.mapValues(lambda vects: np.array([find_mode(column)[0] for column in zip(*vects)]))
  modes_by_cluster = modes.collectAsMap()

  # new numerical centroids: mean of each cluster, kept keyed by cluster index
  centroid_numerical_value = closest_centroid_rdd.zip(numerical_rdd)
  new_centres = centroid_numerical_value.mapValues(lambda vect: (1, vect)).reduceByKey(lambda x, y: (x[0]+y[0], x[1]+y[1]))
  means_by_cluster = new_centres.mapValues(lambda acc: acc[1] / acc[0]).collectAsMap()

  # Rebuild both centroid lists in cluster-index order. The order in which a shuffled RDD returns
  # its keys depends on partitioning, so the results are looked up by key rather than by position.
  # A cluster that received no points keeps its previous centroid instead of shifting the others.
  previous_categorical = categorical_centres_bc.value
  previous_numerical = numerical_centres_bc.value
  numpy_arrays = [modes_by_cluster.get(i, previous_categorical[i]) for i in range(k)]
  new_c = [means_by_cluster.get(i, previous_numerical[i]) for i in range(k)]

  # compute the difference between new clusters and previous
  # (Euclidean shift of the numerical part + 0.2 per changed categorical value)
  temp_dist = 0
  for i in range(k):
    temp_dist += np.sqrt( np.sum( (new_c[i] - previous_numerical[i])**2 ) ) + np.sum( (previous_categorical[i] != numpy_arrays[i]) ) * 0.2

  print("iteration", j+1, "centroid difference", temp_dist)

  if temp_dist < tolerance:
    print("centroids remain the same, breaking")
    break
  else:
    # re-broadcast so the next iteration's distance functions see the updated centroids
    categorical_centres_bc = sc.broadcast(numpy_arrays)
    numerical_centres_bc = sc.broadcast(new_c)

iteration 0 centroid difference 0.7015911793180523
iteration 1 centroid difference 0.2733756728394019
iteration 2 centroid difference 0.04831452287418763
iteration 3 centroid difference 0.010274610127814538
iteration 4 centroid difference 0.003983484660497624
iteration 5 centroid difference 0.002060422696450608
iteration 6 centroid difference 0.004406775086807856
iteration 7 centroid difference 0.0029287570805392785
iteration 8 centroid difference 0.0012839698014596255
iteration 9 centroid difference 0.0012661694351033806
