#### Data Summary:
* The examined group comprised kernels belonging to three different varieties of wheat: Kama, Rosa and Canadian, 70 elements each, randomly selected for
the experiment. High quality visualization of the internal kernel structure was detected using a soft X-ray technique. It is non-destructive and considerably cheaper than other more sophisticated imaging techniques like scanning microscopy or laser technology. The images were recorded on 13x18 cm X-ray KODAK plates. Studies were conducted using combine harvested wheat grain originating from experimental fields, explored at the Institute of Agrophysics of the Polish Academy of Sciences in Lublin.

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('cluster').getOrCreate()

In [4]:
df = spark.read.csv('seeds_dataset.csv', inferSchema = True, header = True)

In [5]:
df.show(5)

+-----+---------+-----------+------------------+------------------+---------------------+----------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|            5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|           4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|           4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|           4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|           5.175|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+
only showing top 5 rows



In [6]:
df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [7]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator

In [11]:
# Since this unsupervised ml this doesn't have any label col

assembler = VectorAssembler(inputCols = df.columns, outputCol = 'features')

In [12]:
output = assembler.transform(df)

In [13]:
output.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [14]:
from pyspark.ml.feature import StandardScaler

In [15]:
scaler = StandardScaler(inputCol = 'features', outputCol = 'scaledFeatures')

In [16]:
scaled_model = scaler.fit(output)

In [17]:
final_data = scaled_model.transform(output)

In [18]:
final_data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [19]:
kmeans = KMeans(featuresCol = 'scaledFeatures', k = 3)

In [20]:
model = kmeans.fit(final_data)

In [21]:
evaluator = ClusteringEvaluator()
wssse = model.summary.trainingCost
print(f"Within set Sum of Squared Errors: {wssse}")

Within set Sum of Squared Errors: 428.6082011872446


In [22]:
centers = model.clusterCenters()

In [23]:
print(centers)

[array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742,  7.54416916,
        3.15410901, 10.38031464]), array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
        2.41585013, 12.29286107]), array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267,  8.62880781,
        1.80061978, 10.41913733])]


In [25]:
results = model.transform(final_data)
results.show(3)

+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+--------------------+----------+
| area|perimeter|compactness| length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|            features|      scaledFeatures|prediction|
+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+--------------------+----------+
|15.26|    14.84|      0.871|            5.763|             3.312|                2.221|            5.22|[15.26,14.84,0.87...|[5.24452795332028...|         2|
|14.88|    14.57|     0.8811|5.553999999999999|             3.333|                1.018|           4.956|[14.88,14.57,0.88...|[5.11393027165175...|         2|
|14.29|    14.09|      0.905|            5.291|3.3369999999999997|                2.699|           4.825|[14.29,14.09,0.90...|[4.91116018695588...|         2|
+-----+---------+-----------+-----------------

In [26]:
predictions = results.select('prediction')

In [31]:
silhouette = evaluator.evaluate(results)
print(f"Silhouette with squared euclidean distance: {silhouette}")

Silhouette with squared euclidean distance: 0.630000103338996
