# Spark-ml Clustering

In [1]:
import os

from pyspark.ml.clustering import KMeans, BisectingKMeans
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, StandardScaler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

In [2]:
# Obtain the notebook's Spark session through the builder; getOrCreate()
# returns an already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Location of the input CSV. The CLUSTERING_CSV environment variable overrides
# the default, so the notebook is not tied to one machine's directory layout;
# when the variable is unset the original path is used unchanged.
path = os.environ.get(
    "CLUSTERING_CSV",
    "/home/lorenzo/spark-repo/0_data/clustering.csv",
)
# header=True takes the column names from the first row; inferSchema=True asks
# Spark to detect column types instead of loading every column as a string.
df = spark.read.csv(path, header=True, inferSchema=True)
df.show(10)

+----+----+----+
|col1|col2|col3|
+----+----+----+
|   7|   4|   1|
|   7|   7|   9|
|   7|   9|   6|
|   1|   6|   5|
|   6|   7|   7|
|   7|   9|   4|
|   7|  10|   6|
|   7|   8|   2|
|   8|   3|   8|
|   4|  10|   5|
+----+----+----+
only showing top 10 rows



### Features vector column

In [4]:
# Pack the three numeric columns into a single vector column named 'features',
# which is the default input column of the clustering estimators used below.
va = VectorAssembler(
    inputCols=["col1", "col2", "col3"],
    outputCol="features",
)
vect_df = va.transform(df)
vect_df.show(n=10)

+----+----+----+--------------+
|col1|col2|col3|      features|
+----+----+----+--------------+
|   7|   4|   1| [7.0,4.0,1.0]|
|   7|   7|   9| [7.0,7.0,9.0]|
|   7|   9|   6| [7.0,9.0,6.0]|
|   1|   6|   5| [1.0,6.0,5.0]|
|   6|   7|   7| [6.0,7.0,7.0]|
|   7|   9|   4| [7.0,9.0,4.0]|
|   7|  10|   6|[7.0,10.0,6.0]|
|   7|   8|   2| [7.0,8.0,2.0]|
|   8|   3|   8| [8.0,3.0,8.0]|
|   4|  10|   5|[4.0,10.0,5.0]|
+----+----+----+--------------+
only showing top 10 rows



### K-Means

In [5]:
# K-Means with three clusters on the raw (unscaled) feature vectors.
# The seed is pinned so that a re-run starts from the same initialisation.
kmeans = KMeans(k=3, seed=324)
kmodel = kmeans.fit(vect_df)

In [6]:
# Centroids of the fitted K-Means model: one array per cluster, with one
# coordinate per assembled input column (col1, col2, col3).
centers = kmodel.clusterCenters()
# Bare expression so the notebook displays the list as the cell output.
centers

[array([35.88461538, 31.46153846, 34.42307692]),
 array([80.        , 79.20833333, 78.29166667]),
 array([5.12, 5.84, 4.84])]

### Bisecting K-Means

In [7]:
# Bisecting K-Means on the same raw feature vectors, using the same k and seed
# as the K-Means fit so the two sets of centers can be compared directly.
bkmeans = BisectingKMeans(k=3, seed=324)
bkmodel = bkmeans.fit(vect_df)

In [8]:
# Centroids of the fitted Bisecting K-Means model: one array per cluster.
# The order of the clusters in this list need not match the K-Means list.
bkcenters = bkmodel.clusterCenters()
# Bare expression so the notebook displays the list as the cell output.
bkcenters

[array([5.12, 5.84, 4.84]),
 array([35.88461538, 31.46153846, 34.42307692]),
 array([80.        , 79.20833333, 78.29166667])]

### Clustering on scaled features

In [9]:
# Standardize the assembled vectors: withMean=True subtracts each feature's
# mean and withStd=True divides by its standard deviation. The result is
# written to a new 'scl_features' column next to the original 'features'.
std_scaler = StandardScaler(
    inputCol="features",
    outputCol="scl_features",
    withMean=True,
    withStd=True,
)
# fit() computes the per-feature statistics; transform() applies them.
std_model = std_scaler.fit(vect_df)
std_output = std_model.transform(vect_df)
std_output.show(n=5)

+----+----+----+-------------+--------------------+
|col1|col2|col3|     features|        scl_features|
+----+----+----+-------------+--------------------+
|   7|   4|   1|[7.0,4.0,1.0]|[-1.0277905724046...|
|   7|   7|   9|[7.0,7.0,9.0]|[-1.0277905724046...|
|   7|   9|   6|[7.0,9.0,6.0]|[-1.0277905724046...|
|   1|   6|   5|[1.0,6.0,5.0]|[-1.2161072489445...|
|   6|   7|   7|[6.0,7.0,7.0]|[-1.0591766851612...|
+----+----+----+-------------+--------------------+
only showing top 5 rows



In [10]:
kmeans = KMeans(featuresCol='scl_features').setK(3).setSeed(324)
kmodel = kmeans.fit(std_output)

In [11]:
scl_centers = kmodel.clusterCenters()
scl_centers

[array([-0.12121478, -0.21427061, -0.13242995]),
 array([1.26339566, 1.30398914, 1.25843007]),
 array([-1.08679646, -1.02898814, -1.07036572])]