In [45]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [46]:
spark = SparkSession.builder.appName('L7Q2').getOrCreate()

In [47]:
df = spark.read.csv('./datasets/kddcup_data_10_percent_corrected.csv', inferSchema= True)

In [48]:
df.show(5)

+---+---+----+---+---+----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|_c0|_c1| _c2|_c3|_c4| _c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|   _c41|
+---+---+----+---+---+----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|  0|tcp|http| SF|181|5450|  0|  0|  0|  0|   0|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   8|   8| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0|   9|   9| 1.0| 0.0|0.11| 0.0| 0.0| 0.0| 0.0| 0.0|normal.|
|  0|tcp|http| SF|239| 486|  0|  0|  0|  0|   0|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   8|   8| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0|  19|  19| 1.0| 0.0|0.05

In [49]:
df.na.drop().count()

494021

In [50]:
df.count()

494021

In [51]:
k = df.select('_c41').distinct().count()
k

23

In [52]:
string_indexer = StringIndexer(inputCols=['_c1', '_c2', '_c3', '_c41'], outputCols=['c1', 'c2', 'c3', 'c41'])
model = string_indexer.fit(df)
df = model.transform(df)

In [53]:
columns = df.columns
columns.remove('_c1')
columns.remove('_c2')
columns.remove('_c3')
columns.remove('_c41')

In [54]:
vector_assembler = VectorAssembler(inputCols= columns, outputCol='features')
df = vector_assembler.transform(df)

In [55]:
df.show(5)

+---+---+----+---+---+----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+---+---+---+---+--------------------+
|_c0|_c1| _c2|_c3|_c4| _c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|   _c41| c1| c2| c3|c41|            features|
+---+---+----+---+---+----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+---+---+---+---+--------------------+
|  0|tcp|http| SF|181|5450|  0|  0|  0|  0|   0|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   8|   8| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0|   9|   9| 1.0| 0.0|0.11| 0.0| 0.0| 0.0| 0.0| 0.0|normal.|1.0|2.0|0.0|2.0|(42,[1,2,8,19,20,...|
|  0|tcp|http| SF|239| 4

In [56]:
kmenas = KMeans(featuresCol='features', predictionCol='prediction', k=k, seed=819)
model2 = kmenas.fit(df)

pred = model2.transform(df)

                                                                                

In [57]:
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='features', metricName='silhouette')
silhouette = evaluator.evaluate(pred)

print(f'silhouette score is {silhouette}')



silhouette score is 0.9994108497468185


                                                                                

In [58]:
spark.stop()