In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler,StringIndexer,StandardScaler
from pyspark.ml import Pipeline
from math import log
import numpy as np 

In [4]:
# Create the notebook's Spark session, or reuse one that is already running.
session_builder = SparkSession.builder.appName('kmeans')
spark = session_builder.getOrCreate()
spark

In [5]:
# Load the header-less CSV; Spark infers column types and assigns the
# positional names _c0 .. _c41.
df = spark.read.csv(
    'Datasets/kddcup.data_10_percent_corrected',
    header=False,
    inferSchema=True,
)
df.show(5)
df.printSchema()

+---+---+----+---+---+----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|_c0|_c1| _c2|_c3|_c4| _c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|   _c41|
+---+---+----+---+---+----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|  0|tcp|http| SF|181|5450|  0|  0|  0|  0|   0|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   8|   8| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0|   9|   9| 1.0| 0.0|0.11| 0.0| 0.0| 0.0| 0.0| 0.0|normal.|
|  0|tcp|http| SF|239| 486|  0|  0|  0|  0|   0|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   8|   8| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0|  19|  19| 1.0| 0.0|0.05

In [6]:
# Names for the 42 positional columns (_c0 .. _c41) of the header-less file,
# listed in file order; the final column is the label.
col_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files",
    "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label",
]

# Replace the positional names with the descriptive ones.
df = df.toDF(*col_names)
df.show(5)
df.printSchema()

+--------+-------------+-------+----+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+-------+
|duration|protocol_type|service|flag|src_bytes|dst_bytes|land|wrong_fragment|urgent|hot|num_failed_logins|logged_in|num_compromised|root_shell|su_attempted|num_root|num_file_creations|num_shells|num_access_files|num_outbound_cmds|is_host_login|is_guest_login|count|srv_count|serror_rate|srv_serror_rate|rerror_rate|srv_rerror_rate|same_srv_rate|diff_srv_rate|srv_diff_host_rate|dst_hos

In [7]:
# Build one predicate that is true for any row with a null in at least one
# column, by OR-ing the per-column null checks together in column order.
null_cond = None
for col_name in col_names:
    column_is_null = col(col_name).isNull()
    if null_cond is None:
        null_cond = column_is_null
    else:
        null_cond = null_cond | column_is_null

# Show the rows (if any) that contain a null.
df_filtered = df.filter(null_cond)
df_filtered.show()

+--------+-------------+-------+----+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+-----+
|duration|protocol_type|service|flag|src_bytes|dst_bytes|land|wrong_fragment|urgent|hot|num_failed_logins|logged_in|num_compromised|root_shell|su_attempted|num_root|num_file_creations|num_shells|num_access_files|num_outbound_cmds|is_host_login|is_guest_login|count|srv_count|serror_rate|srv_serror_rate|rerror_rate|srv_rerror_rate|same_srv_rate|diff_srv_rate|srv_diff_host_rate|dst_host_

In [8]:
# Separate the string-valued columns (including the label) from the rest.
cat_features = df.select('protocol_type', 'service', 'flag', 'label')
num_features = df.drop(*cat_features.columns)

In [9]:
# Map every categorical column (including the label) to a numeric
# "<name>_indexed" column. The StringIndexer stages are handed to the Pipeline
# unfitted so that pipeline.fit() is the single place where fitting happens.
indexer = [
    StringIndexer(inputCol=col_name, outputCol=col_name + '_indexed')
    for col_name in cat_features.columns
]
pipeline = Pipeline(stages=indexer)
df_transformed = pipeline.fit(df).transform(df)

# Keep only the indexed versions of the categorical columns; the list of
# columns to drop is derived from cat_features so the two cannot drift apart.
df_transformed = df_transformed.drop(*cat_features.columns)
df_transformed.show(5)

+--------+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+---------------------+---------------+------------+-------------+
|duration|src_bytes|dst_bytes|land|wrong_fragment|urgent|hot|num_failed_logins|logged_in|num_compromised|root_shell|su_attempted|num_root|num_file_creations|num_shells|num_access_files|num_outbound_cmds|is_host_login|is_guest_login|count|srv_count|serror_rate|srv_serror_rate|rerror_rate|srv_rerror_rate|same_srv_rate|diff_srv_rate|srv_diff_host_rate|dst_

In [10]:
# Assemble every column except the indexed label into a single vector column.
# The label is excluded by name rather than by position ([:-1]) so that a
# change in column order cannot silently leak the label into the features.
feature_cols = [name for name in df_transformed.columns if name != 'label_indexed']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
df_transformed = assembler.transform(df_transformed)
df_transformed.show(5)


+--------+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+---------------------+---------------+------------+-------------+--------------------+
|duration|src_bytes|dst_bytes|land|wrong_fragment|urgent|hot|num_failed_logins|logged_in|num_compromised|root_shell|su_attempted|num_root|num_file_creations|num_shells|num_access_files|num_outbound_cmds|is_host_login|is_guest_login|count|srv_count|serror_rate|srv_serror_rate|rerror_rate|srv_rerror_rate|same_srv_rate|diff_srv_rate|sr

In [11]:
# Standardise the assembled features (centre on the mean, scale to unit
# standard deviation) into a new 'scaled_features' column.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol='features',
    outputCol='scaled_features',
)
scaler_model = scaler.fit(df_transformed)
df_transformed = scaler_model.transform(df_transformed)
df_transformed.show(5)

+--------+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+---------------------+---------------+------------+-------------+--------------------+--------------------+
|duration|src_bytes|dst_bytes|land|wrong_fragment|urgent|hot|num_failed_logins|logged_in|num_compromised|root_shell|su_attempted|num_root|num_file_creations|num_shells|num_access_files|num_outbound_cmds|is_host_login|is_guest_login|count|srv_count|serror_rate|srv_serror_rate|rerror_rate|srv_rerror_rate|same_srv_

In [12]:
# Sweep k = 2..10 and record the silhouette score of each clustering of the
# scaled features. scores[i] holds the score for k = i + 2.
evaluator = ClusteringEvaluator(
    metricName='silhouette',
    distanceMeasure='squaredEuclidean',
    featuresCol='scaled_features',
    predictionCol='prediction',
)
scores = []
for k in range(2, 11):
    kmeans = KMeans(
        k=k,
        seed=0,
        featuresCol='scaled_features',
        predictionCol='prediction',
    )
    model = kmeans.fit(df_transformed)
    preds = model.transform(df_transformed)
    score = evaluator.evaluate(preds)
    scores.append(score)
    print(f"silhouette score for k = {k} : {score}")

silhouette score for k = 2 : 0.4843266754253085
silhouette score for k = 3 : 0.5263100769630508
silhouette score for k = 4 : 0.6210325518787834
silhouette score for k = 5 : 0.6253914691676761
silhouette score for k = 6 : 0.6779405894792122
silhouette score for k = 7 : 0.691887839050431
silhouette score for k = 8 : 0.6904379505920174
silhouette score for k = 9 : 0.6733519893329023
silhouette score for k = 10 : 0.693750789381801


In [13]:
# Fit the final model with k = 6. The seed is pinned to the same value used in
# the silhouette sweep so this is the same clustering that was scored for
# k = 6 and the result is reproducible across kernel restarts.
# NOTE(review): the sweep scored k = 7 and k = 10 slightly higher than k = 6 —
# confirm k = 6 is the intended choice.
kmeans = KMeans(featuresCol='scaled_features', predictionCol='prediction', k=6, seed=0)
model = kmeans.fit(df_transformed)
predictions = model.transform(df_transformed)

In [14]:
# Number of rows assigned to each cluster.
cluster_sizes = predictions.groupBy('prediction').count()
cluster_sizes.show()

+----------+------+
|prediction| count|
+----------+------+
|         1|280869|
|         3|     9|
|         5| 28331|
|         4| 97779|
|         0| 87028|
|         2|     5|
+----------+------+



Anomaly detection

In [15]:
from pyspark.sql import functions as F, Window
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from random import randint

def one_hot_pipeline(input_col):
    """Build an index-then-one-hot-encode pipeline for one categorical column.

    Args:
        input_col: Name of the column to encode.

    Returns:
        A ``(pipeline, output_col)`` tuple: an unfitted ``Pipeline`` holding a
        ``StringIndexer`` followed by a ``OneHotEncoder``, and the name of the
        encoded output column (``input_col + "_vec"``).
    """
    indexed_col = input_col + "_indexed"
    encoded_col = input_col + "_vec"
    stages = [
        StringIndexer(inputCol=input_col, outputCol=indexed_col),
        OneHotEncoder(inputCol=indexed_col, outputCol=encoded_col),
    ]
    return Pipeline(stages=stages), encoded_col

def create_pipeline(data_columns, label_col='label', k=180, seed=None):
    """Build the unfitted encode -> assemble -> scale -> KMeans pipeline.

    The categorical columns 'protocol_type', 'service' and 'flag' are one-hot
    encoded; every other column except ``label_col`` is used as-is.

    Args:
        data_columns: Column names of the input DataFrame.
        label_col: Name of the label column, which is kept out of the features.
        k: Number of clusters for KMeans.
        seed: KMeans seed. When None a random seed is drawn, so pass an
            explicit value for a reproducible model.

    Returns:
        An unfitted ``Pipeline`` whose fitted model adds 'featureVector',
        'scaledFeatureVector' and 'cluster' columns.
    """
    categorical_cols = ['protocol_type', 'service', 'flag']
    one_hot_pipelines, vector_cols = zip(*(one_hot_pipeline(name) for name in categorical_cols))

    # Build the feature list in input-column order, with the encoded vectors
    # last. A set would give a different order from run to run, which changes
    # the layout of the assembled vector. dict.fromkeys removes duplicates
    # while keeping that order.
    excluded = {label_col, *categorical_cols}
    plain_cols = [name for name in data_columns if name not in excluded]
    features_to_assemble = list(dict.fromkeys(plain_cols + list(vector_cols)))

    if seed is None:
        seed = randint(100, 100000)

    assembler = VectorAssembler(inputCols=features_to_assemble, outputCol="featureVector")
    scaler = StandardScaler(inputCol="featureVector", outputCol="scaledFeatureVector", withStd=True, withMean=False)
    kmeans = KMeans(seed=seed, k=k, featuresCol="scaledFeatureVector", predictionCol="cluster", maxIter=40, tol=1.0e-5)
    return Pipeline(stages=list(one_hot_pipelines) + [assembler, scaler, kmeans])

def calculate_weighted_entropy(data):
    """Return the size-weighted average label entropy of the clusters.

    For each cluster the entropy (base 2) of its label distribution is
    computed; the result is the sum of ``entropy * cluster_size`` over all
    clusters divided by the total row count. A value of 0 means every cluster
    contains a single label.

    Args:
        data: DataFrame with a "cluster" column and a "label" column.

    Returns:
        The weighted entropy as a float.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    label_counts = data.groupBy("cluster", "label").count()
    per_cluster = Window.partitionBy("cluster")
    cluster_entropy = (
        label_counts
        # Share of each label within its cluster.
        .withColumn("p_col", F.col("count") / F.sum("count").over(per_cluster))
        .groupBy("cluster")
        .agg(
            (-F.sum(F.col("p_col") * F.log2(F.col("p_col")))).alias("entropy"),
            F.sum(F.col("count")).alias("cluster_size"),
        )
    )
    # The cluster sizes add up to the number of input rows, so the total is
    # taken from the same aggregation instead of a separate data.count() job.
    totals = cluster_entropy.agg(
        F.sum(F.col("entropy") * F.col("cluster_size")).alias("weighted_entropy"),
        F.sum("cluster_size").alias("total"),
    ).first()
    if not totals["total"]:
        raise ValueError("calculate_weighted_entropy() requires at least one row")
    return totals["weighted_entropy"] / totals["total"]

# Fit the clustering pipeline on the full frame and assign each row a cluster.
clustering_pipeline = create_pipeline(df.columns, 'label')
pipeline_model = clustering_pipeline.fit(df)
transformed_data = pipeline_model.transform(df)

# Label-purity summary: 0 would mean every cluster holds a single label.
entropy_score = calculate_weighted_entropy(transformed_data)
print("Entropy score:", entropy_score)

# Per-cluster label breakdown, ordered for readability.
cluster_label_counts = (
    transformed_data
    .select("cluster", "label")
    .groupBy("cluster", "label")
    .count()
    .orderBy("cluster", "label")
)
cluster_label_counts.show()


Entropy score: 0.022087605436228622
+-------+-------------+-----+
|cluster|        label|count|
+-------+-------------+-----+
|      0|     neptune.|35533|
|      1|guess_passwd.|   49|
|      1|      normal.|    5|
|      1|     rootkit.|    1|
|      2|     neptune.|  105|
|      3|      normal.|    9|
|      3|       satan.|    2|
|      4|     neptune.|  113|
|      4|   portsweep.|    2|
|      5|     ipsweep.|    1|
|      5|     neptune.|  118|
|      5|   portsweep.|    1|
|      6|        back.|   66|
|      6|      normal.| 4436|
|      7|     neptune.|   20|
|      8|     rootkit.|    1|
|      9|     ipsweep.|    2|
|      9|     neptune.|  103|
|      9|      normal.|   51|
|     10|      normal.|    1|
+-------+-------------+-----+
only showing top 20 rows

