In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('kmeans II').getOrCreate()

In [3]:
df = spark.read.csv('/home/ubuntu/Course_Notes/Spark_for_Machine_Learning/Clustering/seeds_dataset.csv',header= True, inferSchema=True)

In [4]:
df.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|             5.175|
|14.38|    14.21|     0.8951|             5.386|             3.312|   2.4619999999999997|             4.956|
|14.69|    14.49|  

In [5]:
df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [6]:
from pyspark.ml.clustering import KMeans 

In [16]:
from pyspark.ml.feature import VectorAssembler,StandardScaler

In [8]:
df.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [10]:
assembler = VectorAssembler(inputCols=['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove'],outputCol = 'Features')

In [14]:
final_data = assembler.transform(df).select('Features')

In [15]:
final_data.show()

+--------------------+
|            Features|
+--------------------+
|[15.26,14.84,0.87...|
|[14.88,14.57,0.88...|
|[14.29,14.09,0.90...|
|[13.84,13.94,0.89...|
|[16.14,14.99,0.90...|
|[14.38,14.21,0.89...|
|[14.69,14.49,0.87...|
|[14.11,14.1,0.891...|
|[16.63,15.46,0.87...|
|[16.44,15.25,0.88...|
|[15.26,14.85,0.86...|
|[14.03,14.16,0.87...|
|[13.89,14.02,0.88...|
|[13.78,14.06,0.87...|
|[13.74,14.05,0.87...|
|[14.59,14.28,0.89...|
|[13.99,13.83,0.91...|
|[15.69,14.75,0.90...|
|[14.7,14.21,0.915...|
|[12.72,13.57,0.86...|
+--------------------+
only showing top 20 rows



In [17]:
scaler  = StandardScaler(inputCol='Features', outputCol='scaledFeatures')

In [19]:
final_data = scaler.fit(final_data).transform(final_data)

In [21]:
final_data.head(1)

[Row(Features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [22]:
kmeans = KMeans(k=6,featuresCol='scaledFeatures')

In [23]:
model = kmeans.fit(final_data)

In [24]:
wsse = model.computeCost(final_data)

In [25]:
wsse

302.0078269305932

In [32]:
model.clusterCenters()

[array([  5.64454472,  11.76065389,  37.00336206,  13.32242188,
          9.12537437,   2.58125654,  11.6730398 ]),
 array([  4.87341485,  10.88419569,  37.26499116,  12.34228066,
          8.55946741,   1.52143615,  10.30987691]),
 array([  3.98633561,  10.08204203,  35.56421021,  11.79397768,
          7.36947641,   3.12173425,  10.42529039]),
 array([  6.75808635,  12.80744916,  37.36162212,  14.41478927,
         10.09457467,   3.30965846,  12.67897033]),
 array([  4.48585853,  10.45783433,  37.16483351,  11.93574269,
          8.21111921,   3.47841362,  10.22573634]),
 array([  6.50166082,  12.52139437,  37.6236222 ,  14.02801933,
          9.94081092,   1.84322264,  12.37053922])]

In [34]:
model.transform(final_data).show()

+--------------------+--------------------+----------+
|            Features|      scaledFeatures|prediction|
+--------------------+--------------------+----------+
|[15.26,14.84,0.87...|[5.24452795332028...|         1|
|[14.88,14.57,0.88...|[5.11393027165175...|         1|
|[14.29,14.09,0.90...|[4.91116018695588...|         1|
|[13.84,13.94,0.89...|[4.75650503761158...|         1|
|[16.14,14.99,0.90...|[5.54696468981581...|         1|
|[14.38,14.21,0.89...|[4.94209121682475...|         1|
|[14.69,14.49,0.87...|[5.04863143081749...|         1|
|[14.11,14.1,0.891...|[4.84929812721816...|         1|
|[16.63,15.46,0.87...|[5.71536696354628...|         0|
|[16.44,15.25,0.88...|[5.65006812271202...|         0|
|[15.26,14.85,0.86...|[5.24452795332028...|         0|
|[14.03,14.16,0.87...|[4.82180387844584...|         1|
|[13.89,14.02,0.88...|[4.77368894309428...|         4|
|[13.78,14.06,0.87...|[4.73588435103234...|         1|
|[13.74,14.05,0.87...|[4.72213722664617...|         1|
|[14.59,14