In [29]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [2]:
spark = SparkSession.builder.appName('Lab7').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/22 14:18:53 WARN Utils: Your hostname, dbl-23, resolves to a loopback address: 127.0.1.1; using 172.16.58.182 instead (on interface eno1)
25/09/22 14:18:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/22 14:18:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [18]:
df = spark.read.csv('datasets/transactions_kmeans.csv', inferSchema=True, header=True)

In [19]:
df.show(10)

+-------------+------+--------+----------+
|transactionId|amount|duration|locationId|
+-------------+------+--------+----------+
|        T0001|599.34|    NULL|         3|
|        T0002|472.35|   35.61|         1|
|        T0003|629.54|   40.83|        16|
|        T0004|804.61|   40.54|         6|
|        T0005|453.17|   16.22|        17|
|        T0006|453.17|    NULL|         5|
|        T0007|815.84|   35.15|         5|
|        T0008|  NULL|   35.14|         6|
|        T0009|406.11|   35.15|         3|
|        T0010|608.51|   68.53|         5|
+-------------+------+--------+----------+
only showing top 10 rows


In [20]:
df.printSchema()

root
 |-- transactionId: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- duration: double (nullable = true)
 |-- locationId: integer (nullable = true)



In [21]:
df = df.na.drop()

In [22]:
assembler = VectorAssembler(inputCols=['amount', 'duration', 'locationId'], outputCol='features')
df = assembler.transform(df)

In [23]:
normalizer = MinMaxScaler(inputCol='features', outputCol='scaled_features')
model = normalizer.fit(df)
df = model.transform(df)

In [26]:
kmeans = KMeans(featuresCol='scaled_features', predictionCol='prediction', k=2, seed=819)
model_kmean = kmeans.fit(df)

pred = model_kmean.transform(df)

In [28]:
pred.show()

+-------------+------+--------+----------+-------------------+--------------------+----------+
|transactionId|amount|duration|locationId|           features|     scaled_features|prediction|
+-------------+------+--------+----------+-------------------+--------------------+----------+
|        T0002|472.35|   35.61|         1| [472.35,35.61,1.0]|[0.46470907694900...|         1|
|        T0003|629.54|   40.83|        16|[629.54,40.83,16.0]|[0.61189348115133...|         0|
|        T0004|804.61|   40.54|         6| [804.61,40.54,6.0]|[0.77581977190584...|         1|
|        T0005|453.17|   16.22|        17|[453.17,16.22,17.0]|[0.44674993913743...|         0|
|        T0007|815.84|   35.15|         5| [815.84,35.15,5.0]|[0.78633495009269...|         1|
|        T0009|406.11|   35.15|         3| [406.11,35.15,3.0]|[0.40268544354763...|         1|
|        T0010|608.51|   68.53|         5| [608.51,68.53,5.0]|[0.59220210116294...|         1|
|        T0011|407.32|   35.71|         5| [407.32

In [30]:
evaluator = ClusteringEvaluator(predictionCol='prediction')
silhouette = evaluator.evaluate(pred)
print(f"Silhouette Score = {silhouette}")

Silhouette Score = 0.019679457165916434
