In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.sql.types import DoubleType
import numpy as np
import findspark
from pyspark.ml.evaluation import ClusteringEvaluator

In [2]:
findspark.init()

In [3]:
spark = SparkSession.builder.appName("Anomaly").getOrCreate()

In [4]:
spark

In [5]:
df = spark.read.csv(r"C:\Users\Robyi\Documents\Data Science Dataset\boston.csv", header = True, inferSchema = True)

In [6]:
df.show()
df.printSchema()

+--------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|                CRIM|  ZN|INDUS|CHAS|   NX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|
+--------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|             0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296.0|   15.3| 396.9| 4.98|24.0|
|             0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242.0|   17.8| 396.9| 9.14|21.6|
|             0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242.0|   17.8|392.83| 4.03|34.7|
|0.032369999999999996| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222.0|   18.7|394.63| 2.94|33.4|
|             0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222.0|   18.7| 396.9| 5.33|36.2|
|             0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222.0|   18.7|394.12| 5.21|28.7|
|             0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311.0|   15.2| 395.6|12.43|22.9|


In [7]:
df = df.dropDuplicates()

In [8]:
df = df.dropna(how="any")

In [9]:
assembler = VectorAssembler(inputCols=[
    "CRIM", "ZN", "INDUS","CHAS","NX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT","MEDV"], 
    outputCol="features")
df = assembler.transform(df)

In [10]:
Standardization = StandardScaler(inputCol="features", outputCol="features_scaled")
df = Standardization.fit(df).transform(df)

In [11]:
evaluator = ClusteringEvaluator(featuresCol="features_scaled", metricName="silhouette")

In [12]:
for k in range(2, 10):
    kmeans = KMeans(k=k, seed=42)
    model = kmeans.fit(df)
    predictions = model.transform(df)
    
    silhouette = evaluator.evaluate(predictions)
    print(f"K: {k}, Silhouette Score: {silhouette}")

K: 2, Silhouette Score: 0.5151967998528848
K: 3, Silhouette Score: 0.36919743583644216
K: 4, Silhouette Score: 0.39123672306991647
K: 5, Silhouette Score: 0.20347085037697069
K: 6, Silhouette Score: 0.18386484393085248
K: 7, Silhouette Score: 0.15932073867970248
K: 8, Silhouette Score: 0.14568010857772443
K: 9, Silhouette Score: 0.007299754244652135


In [13]:
kmeans = KMeans(k = 2,featuresCol="features_scaled", seed=42)
kmeans_model = kmeans.fit(df)

df_kmeans = kmeans_model.transform(df)
df_kmeans.show()

+--------------------+-----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+--------------------+--------------------+----------+
|                CRIM|   ZN|INDUS|CHAS|   NX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|            features|     features_scaled|prediction|
+--------------------+-----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+--------------------+--------------------+----------+
|              0.7842|  0.0| 8.14|   0|0.538| 5.99| 81.7|4.2579|  4|307.0|   21.0|386.75|14.67|17.5|[0.7842,0.0,8.14,...|[0.09116966665835...|         0|
|             0.08187|  0.0| 2.89|   0|0.445| 7.82| 36.9|3.4952|  2|276.0|   18.0|393.53| 3.57|43.8|[0.08187,0.0,2.89...|[0.00951805739520...|         0|
|             0.16211| 20.0| 6.96|   0|0.464| 6.24| 16.3| 4.429|  3|223.0|   18.6| 396.9| 6.59|25.2|[0.16211,20.0,6.9...|[0.01884661395305...|         0|
|0.035019999999999996| 80.0| 4.95|   0|0.411|6.861| 27.9|5.1167|  4|245.0|  

In [14]:
df_kmeans.select("features_scaled", "prediction").show(10, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|features_scaled                                                                                                                                                                                                                                            |prediction|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|[0.09116966665835873,0.0,1.1865278754790987,0.0,4.642826988897158,8.52526878419654,2.902426454093195,2.0220731933408223,0.4593867970948732,1.8215572165118161,9.700013127337211,4.236273339238476,2.05431643

In [16]:
centroids = kmeans_model.clusterCenters()

def euclidean_distance(point, center):
    return float(np.linalg.norm(np.array(point) - np.array(center)))

distance_udf = udf(lambda features, cluster: euclidean_distance(features, centroids[cluster]), DoubleType())

df_anomaly = df_kmeans.withColumn("distance_to_centroid", distance_udf(col("features_scaled"), col("prediction")))

threshold = df_anomaly.approxQuantile("distance_to_centroid", [0.95], 0.01)[0]

df_anomaly = df_anomaly.withColumn("anomaly", (col("distance_to_centroid") > threshold).cast("integer"))

df_anomaly.select("features", "prediction", "distance_to_centroid", "anomaly").show(10, truncate=False)

+-------------------------------------------------------------------------------------------+----------+--------------------+-------+
|features                                                                                   |prediction|distance_to_centroid|anomaly|
+-------------------------------------------------------------------------------------------+----------+--------------------+-------+
|[0.7842,0.0,8.14,0.0,0.538,5.99,81.7,4.2579,4.0,307.0,21.0,386.75,14.67,17.5]              |0         |2.381195457127339   |0      |
|[0.08187,0.0,2.89,0.0,0.445,7.82,36.9,3.4952,2.0,276.0,18.0,393.53,3.57,43.8]              |0         |3.221636467643319   |0      |
|[0.16211,20.0,6.96,0.0,0.464,6.24,16.3,4.429,3.0,223.0,18.6,396.9,6.59,25.2]               |0         |1.69015367957354    |0      |
|[0.035019999999999996,80.0,4.95,0.0,0.411,6.861,27.9,5.1167,4.0,245.0,19.2,396.9,3.33,28.5]|0         |3.2546521922967435  |0      |
|[0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2