1. Implement a PySpark script to handle any missing values and scale numerical features

In [47]:
import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler,StandardScaler,StringIndexer,OneHotEncoder
from pyspark.ml import Pipeline 


spark = SparkSession.builder.appName('kmeans').getOrCreate()
df = spark.read.csv('Datasets/kddcup.data_10_percent_corrected')
#df.show(5)
col_names = ["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent",
                "hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root",
                "num_file_creations","num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login",
                "count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
                "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate",
                "dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate",
                "dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]
df = df.toDF(*col_names)
#df.show(5)
null_condition = col(col_names[0]).isNull()
for col_name in col_names[1:]:
    null_condition = null_condition | col(col_name).isNull()

df_nulls = df.filter(null_condition)
#df_nulls.show()

num_features = df.drop('protocol_type','service','flag','label') 
cat_features = df.select('protocol_type','service','flag','label')
# num_features.show(5)
# cat_features.show(5)

cols = cat_features.columns
indexers = [StringIndexer(inputCol=col_name,outputCol=col_name + '_index').fit(df) for col_name in cols]
pipeline = Pipeline(stages=indexers) 
df_transformed = pipeline.fit(df).transform(df)
# df_transformed.show(5)
df_transformed = df_transformed.drop('protocol_type','service','flag','label')
df_transformed.show(5)

col_names = 'duration','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot','num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations','num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count','dst_host_count','dst_host_srv_count'
for col_name in col_names:
    df_transformed = df_transformed.withColumn(col_name,col(col_name).cast('int'))
#df_transformed

col_names = 'serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate'
for col_name in col_names:
    df_transformed = df_transformed.withColumn(col_name,col(col_name).cast('double'))
df_transformed.printSchema()

assembler = VectorAssembler(inputCols=df_transformed.columns,outputCol='features')
df_transformed = assembler.transform(df_transformed)
df_transformed.select('features').show()
scaler = StandardScaler(inputCol='features',outputCol='scaled_features')
scalerModel = scaler.fit(df_transformed)
df_transformed = scalerModel.transform(df_transformed)
df_transformed.select('scaled_features').show(5)

+--------+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+-------------------+-------------+----------+-----------+
|duration|src_bytes|dst_bytes|land|wrong_fragment|urgent|hot|num_failed_logins|logged_in|num_compromised|root_shell|su_attempted|num_root|num_file_creations|num_shells|num_access_files|num_outbound_cmds|is_host_login|is_guest_login|count|srv_count|serror_rate|srv_serror_rate|rerror_rate|srv_rerror_rate|same_srv_rate|diff_srv_rate|srv_diff_host_rate|dst_host_cou

2. Develop a PySpark script that uses the K-means algorithm to cluster data points
4. Implement code to evaluate the effectiveness of the K-means clustering model in detecting anomalies.

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

silhouette_scores = []
evaluator = ClusteringEvaluator(predictionCol='prediction',featuresCol='scaled_features',metricName='silhouette',distanceMeasure='squaredEuclidean')
for k in range(2,10):
    kmeans = KMeans(featuresCol='scaled_features',k=k)
    model = kmeans.fit(df_transformed)
    predictions = model.transform(df_transformed)
    score = evaluator.evaluate(predictions)
    silhouette_scores.append(score)
    #print(f"Silhouette score for k = {k} is {score}")
kmeans = KMeans(featuresCol='scaled_features',predictionCol='cluster',k=2)
model = kmeans.fit(df_transformed)
predictions = model.transform(df_transformed)
predictions.groupBy('cluster').count().show()

+-------+------+
|cluster| count|
+-------+------+
|      1|123672|
|      0|370349|
+-------+------+

