In [1]:
# !pip install pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from sklearn.metrics import accuracy_score
from pyspark.ml.feature import StandardScaler

def get_path(dataset_name,env_name='colab'):
    prefix = 'https://raw.githubusercontent.com/John-Ghaly88/Big_Data_and_NoSQL/main/Datasets/Assessment/'
    if env_name == 'colab':
        return prefix+dataset_name
    else:
        return f'../Datasets/{dataset_name}'

In [2]:
#Creating Spark session
spark = SparkSession.builder.appName('iris').getOrCreate()

#Reading the dataset
df = pd.read_csv(get_path('iris.csv'))

df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
#Encoding the labels column
df['species'] = pd.factorize(df['species'])[0]

# Shuffling the dataset
df = df.sample(frac = 1)

sdf = spark.createDataFrame(df)

sdf.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         6.5|        3.0|         5.2|        2.0|      2|
|         6.5|        3.2|         5.1|        2.0|      2|
|         5.6|        2.7|         4.2|        1.3|      1|
|         5.0|        3.6|         1.4|        0.2|      0|
|         6.7|        3.3|         5.7|        2.5|      2|
|         6.0|        2.2|         4.0|        1.0|      1|
|         4.3|        3.0|         1.1|        0.1|      0|
|         4.9|        3.1|         1.5|        0.1|      0|
|         5.6|        2.8|         4.9|        2.0|      2|
|         4.7|        3.2|         1.6|        0.2|      0|
|         5.4|        3.0|         4.5|        1.5|      1|
|         6.9|        3.1|         5.4|        2.1|      2|
|         6.2|        2.9|         4.3|        1.3|      1|
|         6.7|        3.1|         5.6| 

In [4]:
#Defining the features columns
numericCols  = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

#Defining the pipeline
featurizationPipeline = Pipeline(stages = [VectorAssembler(inputCols = numericCols, outputCol = "feature_vector")])
featurizationPipelineModel = featurizationPipeline.fit(sdf)
sdf = featurizationPipelineModel.transform(sdf)

#Standardizing the data
scaler = StandardScaler(inputCol="feature_vector", outputCol="scaledFeatures", withStd=True, withMean=False)
scalerModel = scaler.fit(sdf)
sdf = scalerModel.transform(sdf)

#Splitting the data into train and test
train, test = sdf.randomSplit([0.8, 0.2], seed = 2018)

In [5]:
#Training and applying the model
kmeans = KMeans(featuresCol='scaledFeatures',k=3)
model = kmeans.fit(train)
predictions = model.transform(test)

# centers=model.clusterCenters()

#Extracting the labels and predictions
labels = predictions.select('species')
predict = predictions.select('prediction')

#Calculating accuracy
accuracy = accuracy_score(labels.toPandas(), predict.toPandas())
print("Accuracy =", accuracy*100, "%")

Accuracy = 87.87878787878788 %
