In [1]:
# Initializing the SparkSession (reuses an already-running session if there is one)
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('KMeans')
    .getOrCreate()
)

In [2]:
# Read the CSV with a header row and inferred column types, then discard
# the 'Location' column before any feature engineering
df = (
    spark.read
    .csv('/FileStore/tables/hack_data.csv', inferSchema=True, header=True)
    .drop('Location')
)

In [3]:
# Pack every remaining column of the dataset into a single 'features' vector column
from pyspark.ml.feature import VectorAssembler

input_columns = df.columns
assembler = VectorAssembler(inputCols=input_columns, outputCol='features')
df = assembler.transform(df)

In [4]:
# Scale the assembled feature vectors into a new 'scaled_features' column
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
scaler_model = scaler.fit(df)      # learn the scaling statistics from the data
df = scaler_model.transform(df)    # append the scaled vectors

In [5]:
# Creating the model
from pyspark.ml.clustering import KMeans

# K-means starts from randomly chosen centroids. Pin the seed so that the
# cluster ids, centers and cost reported by the later cells are repeatable
# when the notebook is re-run.
KMEANS_SEED = 42

# Keep the (unfitted) estimator under its own name ...
kmeans_estimator = KMeans(featuresCol='scaled_features', k=2, seed=KMEANS_SEED)
# ... and bind `kmeans` to the fitted model, which is what the later cells expect.
kmeans = kmeans_estimator.fit(df)

In [6]:
# Apply the fitted model and preview three rows of the result
predictions = kmeans.transform(df)
predictions.show(3)

In [7]:
# Evaluating the model (k = 2)
# `KMeansModel.computeCost` was deprecated in Spark 2.4 and removed in Spark 3.0,
# so calling it unconditionally fails on current Spark versions.
# `df` is the very dataset the model was fitted on, so the training summary's
# `trainingCost` reports the same within-set sum of squared errors.
if hasattr(kmeans, 'computeCost'):
    wssse = kmeans.computeCost(df)          # Spark releases that still ship computeCost
else:
    wssse = kmeans.summary.trainingCost     # Spark 3.x and later
centers = kmeans.clusterCenters()
print(wssse)
print(centers)

In [8]:
# Tally how many rows were assigned to each predicted cluster (k = 2)
df_pred = (
    kmeans.transform(df)
    .groupby('prediction')
    .count()
)
df_pred.show()

In [9]:
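# Evaluating the model (k = 3) -- currently disabled.
# NOTE(review): `kmeans` is fitted with k=2 earlier in the notebook; presumably the
# model cell must be re-run with k=3 before enabling this -- confirm.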
# wssse = kmeans.computeCost(df)
# centers = kmeans.clusterCenters()
# print(wssse)
# print(centers)

In [10]:
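# Counting the number of samples in each predicted cluster (k = 3) -- currently disabled.
# NOTE(review): presumably requires `kmeans` to be re-fitted with k=3 first -- confirm.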
# df_pred = kmeans.transform(df)
# df_pred = df_pred.groupby('prediction').count()
# df_pred.show()