In [1]:
import findspark

In [2]:
import os

# Locate the Spark installation. SPARK_HOME from the environment takes priority so
# the notebook can run on machines other than the author's; the original local
# install path is kept as the fallback.
findspark.init(os.environ.get('SPARK_HOME', '/home/i-sip_iot/spark-3.0.1-bin-hadoop2.7'))

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Reuse the active Spark session if one exists, otherwise start one named 'cluster'.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [8]:
# Load the sample points, which are stored in LIBSVM format.
data = spark.read.load('sample_kmeans_data.txt', format='libsvm')

In [9]:
# Preview the loaded rows (a `label` column and a sparse `features` vector column).
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [10]:
# Keep only the feature vectors; the model below is fit on this features-only frame.
df = data.select(data['features'])

In [19]:
# Confirm that only the `features` column remains after the select.
df.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [20]:
from pyspark.ml.clustering import KMeans

In [33]:
# Configure K-Means with five clusters and a fixed seed.
# NOTE(review): the input shown above has only six rows, so k=5 leaves almost
# every point in its own cluster -- confirm this k is intentional.
kmeans = KMeans(featuresCol='features', k=5, seed=1)

In [34]:
# Fit the K-Means estimator on the features-only DataFrame to obtain the trained model.
model = kmeans.fit(df)

In [35]:
# Inspect the learned centroids: a list with one array per cluster.
model.clusterCenters()

[array([9., 9., 9.]),
 array([0.05, 0.05, 0.05]),
 array([0.2, 0.2, 0.2]),
 array([9.2, 9.2, 9.2]),
 array([9.1, 9.1, 9.1])]

In [36]:
# Score the same rows the model was trained on; this adds a `prediction` column.
results = model.transform(df)

In [37]:
# Show each feature vector next to its assigned cluster index.
results.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         2|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         4|
|(3,[0,1,2],[9.2,9...|         3|
+--------------------+----------+



### More Practice -- K-Means Clustering with Data Scaling

In [1]:
import findspark

In [2]:
import os

# Locate the Spark installation. SPARK_HOME from the environment takes priority so
# the notebook can run on machines other than the author's; the original local
# install path is kept as the fallback.
findspark.init(os.environ.get('SPARK_HOME', '/home/i-sip_iot/spark-3.0.1-bin-hadoop2.7'))

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Reuse the active Spark session if one exists, otherwise start one named 'cluster'.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [22]:
# Read the seeds CSV: the first line supplies the column names and the column
# types are inferred from the data.
data = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('seeds_dataset.csv')
)

In [23]:
# Preview the seed measurements read from the CSV.
data.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|             5.175|
|14.38|    14.21|     0.8951|             5.386|             3.312|   2.4619999999999997|             4.956|
|14.69|    14.49|  

In [24]:
# Check the inferred column types (every column comes back as double here).
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [25]:
# List the column names; all of them are used as inputs when assembling the feature vector.
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [26]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [27]:
# Combine every column of `data` into a single vector column named `features`.
# NOTE: the variable is called `indexer`, but it holds a VectorAssembler.
indexer = VectorAssembler(
    inputCols=data.columns,
    outputCol='features',
)

In [28]:
# Apply the assembler to the raw data, appending the `features` vector column.
df = indexer.transform(data)

In [29]:
# Verify that the `features` vector column was added alongside the original columns.
df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [30]:
# Standardize the assembled features. withStd / withMean are spelled out with
# their default values: scale to unit standard deviation, do not subtract the mean.
scaler = StandardScaler(
    inputCol='features',
    outputCol='ScaledFeatures',
    withStd=True,
    withMean=False,
)

In [31]:
# Fit the scaler on the assembled features.
# NOTE: despite its name, `df_fitted` is the fitted scaler model (it is used via
# .transform() in the next cell), not a DataFrame.
df_fitted = scaler.fit(df)

In [32]:
# Add the `ScaledFeatures` column produced by the fitted scaler model.
# `df` is overwritten here, so re-running this cell would otherwise fail because
# the output column already exists. Dropping any previous `ScaledFeatures` column
# first keeps the cell re-runnable; drop() does nothing when the column is absent.
df = df_fitted.transform(df.drop('ScaledFeatures'))

In [35]:
# Preview the scaled feature vectors.
df.select('ScaledFeatures').show()

+--------------------+
|      ScaledFeatures|
+--------------------+
|[5.24452795332028...|
|[5.11393027165175...|
|[4.91116018695588...|
|[4.75650503761158...|
|[5.54696468981581...|
|[4.94209121682475...|
|[5.04863143081749...|
|[4.84929812721816...|
|[5.71536696354628...|
|[5.65006812271202...|
|[5.24452795332028...|
|[4.82180387844584...|
|[4.77368894309428...|
|[4.73588435103234...|
|[4.72213722664617...|
|[5.01426361985209...|
|[4.80805675405968...|
|[5.39230954047151...|
|[5.05206821191403...|
|[4.37158555479908...|
+--------------------+
only showing top 20 rows



In [36]:
from pyspark.ml.clustering import KMeans

In [37]:
# K-Means with three clusters on the standardized features.
# An explicit seed is set (the same value used by the first K-Means example in
# this notebook) so that centroid initialization, and therefore the cluster
# numbering in the outputs, is repeatable across kernel restarts. Without it the
# default seed may differ from one Python session to the next.
kms = KMeans(featuresCol='ScaledFeatures', k=3, seed=1)

In [38]:
# Train K-Means on the DataFrame; the estimator reads its configured `ScaledFeatures` column.
modeled = kms.fit(df)

In [39]:
# Score the same DataFrame; transform() adds a `prediction` column holding each row's cluster index.
final_df = modeled.transform(df)

In [41]:
# Inspect the learned centroids: one array per cluster, one coordinate per input
# column, expressed in scaled units because the model was fit on `ScaledFeatures`.
modeled.clusterCenters()

[array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
         2.41585013, 12.29286107]),
 array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742,  7.54416916,
         3.15410901, 10.38031464]),
 array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267,  8.62880781,
         1.80061978, 10.41913733])]

In [43]:
# Show the cluster assignment for the first rows.
final_df.select('prediction').show()

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
+----------+
only showing top 20 rows

