In [47]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.feature import Imputer

# Start Spark session (getOrCreate reuses an already-running session if there is one)
spark = (
    SparkSession.builder
    .appName("DataPreprocessing")
    .getOrCreate()
)

In [48]:
# Load data: the file is read with header=False, so every line is treated as a
# record and Spark infers each column's type from the values.
data = spark.read.csv(
    "kddcup.data_10_percent_corrected",
    header=False,
    inferSchema=True,
)

# The columns arrive unnamed, so rename them positionally:
# "col1" .. "col41" for the first 41 columns, "label" for the last one.
column_names = [f"col{i}" for i in range(1, 42)] + ["label"]
data = data.toDF(*column_names)


In [49]:
# Numerical feature columns used for imputation and, later, for scaling:
# "col5" through "col41" (37 columns).
# NOTE(review): col1 shows a numeric value in the sample output but is not part
# of this list, while col2-col4 hold strings -- confirm col1's exclusion is intended.
feature_cols = [f"col{i}" for i in range(5, 42)]

# Handle missing values
# Imputer fills missing values with the mean of each column; the output columns
# are the input columns, so the values are replaced in place.
imputer = Imputer(inputCols=feature_cols, outputCols=feature_cols).setStrategy("mean")

data = imputer.fit(data).transform(data)

In [50]:
# This cell reassigns `data`, so on a second execution `data` already carries the
# vector columns produced below and the assembler rejects an output column that
# already exists. Dropping them first keeps the cell re-runnable; drop() is a
# no-op for names that are not present, so the first run is unaffected.
data = data.drop("features", "scaled_features")

# Pack the numerical columns into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)

# Standardise the assembled vector: withMean centres each feature,
# withStd scales it to unit standard deviation
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)
data = scaler.fit(data).transform(data)

data.show(3)

+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-------+--------------------+--------------------+
|col1|col2|col3|col4|col5|col6|col7|col8|col9|col10|col11|col12|col13|col14|col15|col16|col17|col18|col19|col20|col21|col22|col23|col24|col25|col26|col27|col28|col29|col30|col31|col32|col33|col34|col35|col36|col37|col38|col39|col40|col41|  label|            features|     scaled_features|
+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-------+--------------------+--------------------+
|   0| tcp|http|  SF| 181|5450|   0|   0|   0|    0|    0|    1|    0|    0|    0|    0|    0|    0|    0|    0|    0|    0|    8|   

In [51]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# K-means clustering on the standardised vectors; each row's assigned cluster
# index is written to the "cluster" column
kmeans = (
    KMeans()
    .setK(80)
    .setSeed(1)
    .setFeaturesCol("scaled_features")
    .setPredictionCol("cluster")
)
model = kmeans.fit(data)
clusters = model.transform(data)

clusters.show(3)

                                                                                

+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-------+--------------------+--------------------+-------+
|col1|col2|col3|col4|col5|col6|col7|col8|col9|col10|col11|col12|col13|col14|col15|col16|col17|col18|col19|col20|col21|col22|col23|col24|col25|col26|col27|col28|col29|col30|col31|col32|col33|col34|col35|col36|col37|col38|col39|col40|col41|  label|            features|     scaled_features|cluster|
+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-------+--------------------+--------------------+-------+
|   0| tcp|http|  SF| 181|5450|   0|   0|   0|    0|    0|    1|    0|    0|    0|    0|    0|    0|    0|   

In [52]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType
import numpy as np
from pyspark.sql.types import FloatType

# Get cluster centers from the fitted model and stack them into one NumPy array
fitted_centers = model.clusterCenters()
centers = np.array(fitted_centers)

def distance_to_center(features, center):
    """Return the Euclidean distance between ``features`` and ``center``.

    Both arguments are converted with ``np.array`` before subtraction, and the
    result is returned as a plain Python ``float``.
    """
    offset = np.array(features) - np.array(center)
    return float(np.sqrt(np.square(offset).sum()))

def _nearest_center_distance(features):
    """Return the smallest Euclidean distance from ``features`` to any row of ``centers``.

    The row vector is subtracted from every center in a single NumPy operation
    instead of looping over the centers in Python and re-converting each one
    for every row.
    """
    offsets = centers - features.toArray()
    # float(): the UDF is declared FloatType, so hand back a plain Python float
    # rather than a NumPy scalar
    return float(np.sqrt(np.sum(offsets ** 2, axis=1)).min())


distance_udf = udf(_nearest_center_distance, FloatType())

clusters = clusters.withColumn("distance_to_center", distance_udf(col("scaled_features")))

# Define anomaly if distance is greater than a threshold
threshold = 1.0  # hand-picked cut-off, measured in the scaled feature space; tune as needed
clusters = clusters.withColumn("is_anomaly", col("distance_to_center") > threshold)

clusters.show(3)

+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-------+--------------------+--------------------+-------+------------------+----------+
|col1|col2|col3|col4|col5|col6|col7|col8|col9|col10|col11|col12|col13|col14|col15|col16|col17|col18|col19|col20|col21|col22|col23|col24|col25|col26|col27|col28|col29|col30|col31|col32|col33|col34|col35|col36|col37|col38|col39|col40|col41|  label|            features|     scaled_features|cluster|distance_to_center|is_anomaly|
+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-------+--------------------+--------------------+-------+------------------+----------+
|   0| tcp|http|  S



In [53]:
# Score the clustering with the evaluator's default metric, reading the same
# feature and prediction columns that K-means used
evaluator = (
    ClusteringEvaluator()
    .setFeaturesCol("scaled_features")
    .setPredictionCol("cluster")
)
silhouette = evaluator.evaluate(clusters)
print(f"Silhouette with squared Euclidean distance = {silhouette}")


[Stage 224:>                                                      (0 + 18) / 18]

Silhouette with squared Euclidean distance = 0.8907618402151812


                                                                                

In [55]:

# Print the number of anomalies and normal points.
# One grouped count replaces three separate count() actions: each action
# recomputes the DataFrame's lineage (no cache() is applied in the cells shown),
# including the Python distance UDF, so a single pass is considerably cheaper.
# row["count"] is indexed by name because Row.count is the tuple method.
flag_counts = {
    row["is_anomaly"]: row["count"]
    for row in clusters.groupBy("is_anomaly").count().collect()
}

num_anomalies = flag_counts.get(True, 0)
num_normal = flag_counts.get(False, 0)
# Summing every group (including a null group, if any) equals clusters.count()
total_count = sum(flag_counts.values())

print(f"Number of anomalies: {num_anomalies}")
print(f"Number of normal points: {num_normal}")
print(f"Total number of points: {total_count}")



Number of anomalies: 41775
Number of normal points: 452246
Total number of points: 494021


                                                                                