In [27]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, Imputer, StandardScaler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('EndSem7')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/27 00:57:37 WARN Utils: Your hostname, Karthikeya, resolves to a loopback address: 127.0.1.1; using 172.25.191.235 instead (on interface zt4homnczt)
25/10/27 00:57:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/27 00:57:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the transactions CSV: the first row supplies column names and
# column types are inferred from the data.
df = spark.read.csv(
    'file:///home/karthikeya/Desktop/sem5/MIT_SEM5_BDA/Week7/datasets/transactions_kmeans.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first five rows of the loaded data.
df.show(n=5)

+-------------+------+--------+----------+
|transactionId|amount|duration|locationId|
+-------------+------+--------+----------+
|        T0001|599.34|    NULL|         3|
|        T0002|472.35|   35.61|         1|
|        T0003|629.54|   40.83|        16|
|        T0004|804.61|   40.54|         6|
|        T0005|453.17|   16.22|        17|
+-------------+------+--------+----------+
only showing top 5 rows


In [30]:
# Imputers: fill missing values in place (each output column overwrites its input).
# `amount` and `duration` are filled with the column mean, `locationId` with the median.
imputer1 = Imputer(inputCols=['amount', 'duration'], outputCols=['amount', 'duration'], strategy='mean')
imputer2 = Imputer(inputCol='locationId', outputCol='locationId', strategy='median')

# Assembler: pack the three numeric columns into a single vector column.
assembler = VectorAssembler(inputCols=['amount', 'duration', 'locationId'], outputCol='features')

# Scaler: centre and scale the assembled vector (withMean / withStd both enabled).
scaler = StandardScaler(inputCol='features', outputCol='scaled_features', withStd=True, withMean=True)

# KMeans: created BEFORE the grid so the grid can reference this instance's params.
kmenas = KMeans(featuresCol='scaled_features', predictionCol='prediction', maxIter=20, seed=819)

# Param grid: candidate cluster counts.
# The grid must be keyed on the instance param `kmenas.k`. The class-level
# `KMeans.k` is not bound to any estimator, so a grid built on it is never
# applied to the KMeans stage and every candidate silently trains with the
# default k.
grid = ParamGridBuilder().addGrid(kmenas.k, [2, 3, 4]).build()

# Evaluation: silhouette score computed on the scaled features.
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='scaled_features', metricName='silhouette')

In [31]:
# Chain the stages in execution order: impute -> assemble -> scale -> cluster.
pipe = Pipeline(
    stages=[
        imputer1,
        imputer2,
        assembler,
        scaler,
        kmenas,
    ]
)

In [32]:
# Cross-validated search over `grid`, scoring each candidate pipeline with `evaluator`.
# numFolds is spelled out (3 is the library default) and the fold assignment is
# seeded so the selected model is reproducible across kernel restarts.
cv = CrossValidator(
    estimator=pipe,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=3,
    seed=819,
)

In [33]:
# Run the cross-validated fit on the full DataFrame.
model = cv.fit(dataset=df)

In [35]:
# Take the best pipeline found by cross-validation and use it to assign
# a cluster to every row of the input data.
best_model = model.bestModel
pred = best_model.transform(dataset=df)

In [36]:
# Score the final cluster assignments with the silhouette metric,
# measured on the scaled feature vectors.
evaluator = ClusteringEvaluator(
    metricName='silhouette',
    featuresCol='scaled_features',
    predictionCol='prediction',
)

silhouette = evaluator.evaluate(dataset=pred)

In [37]:
# Display the silhouette score computed for the best model's predictions.
silhouette

0.33772473976857703

In [38]:
# Show how many rows were assigned to each cluster.
(
    pred
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  108|
|         0|   92|
+----------+-----+



In [39]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()