In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook is not tied to one machine's directory layout; fall back to the
# original local install path when the variable is unset.
findspark.init(os.environ.get('SPARK_HOME', '/home/mysparkub/spark-3.0.0-bin-hadoop2.7'))

In [22]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator

In [3]:
# Start (or reuse) the Spark session that every later cell reads through.
spark = (
    SparkSession.builder
    .appName('hacking_cluster')
    .getOrCreate()
)

In [4]:
# Load the session log; the first row holds column names and column types are
# inferred from the values rather than read as strings.
data = spark.read.csv(
    'files/hack_data.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Print the column names and the types inferred while reading the CSV.
data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [6]:
# List the column names of the loaded DataFrame.
data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [16]:
# Pack the numeric columns into a single 'features' vector column.
# 'Location' is left out of the input list.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Session_Connection_Time',
        'Bytes Transferred',
        'Kali_Trace_Used',
        'Servers_Corrupted',
        'Pages_Corrupted',
        'WPM_Typing_Speed',
    ],
)

In [17]:
# Append the assembled 'features' vector column to the raw data.
final_data = assembler.transform(dataset=data)

In [18]:
# Rescale the assembled 'features' vector into 'scaled_features'.
# Any existing 'scaled_features' column is dropped first (a no-op on the first
# run) so this cell can be re-executed: without the drop, a second run fails
# because the scaler's output column already exists on `final_data`.
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
unscaled_data = final_data.drop('scaled_features')
final_data = scaler.fit(unscaled_data).transform(unscaled_data)

In [40]:
# Fit one model per candidate cluster count (k=3 and k=2) on the scaled
# features. The seed is fixed so that centroid initialisation, and therefore
# the resulting cluster assignments, are reproducible across kernel restarts.
kmeans_seed = 42
kmeans3 = KMeans(k=3, featuresCol='scaled_features', seed=kmeans_seed).fit(final_data)
kmeans2 = KMeans(k=2, featuresCol='scaled_features', seed=kmeans_seed).fit(final_data)

In [41]:
# Run both fitted models over the same data to get their predictions
# (3-cluster model first, then 2-cluster model).
preds3, preds2 = (
    fitted.transform(final_data) for fitted in (kmeans3, kmeans2)
)

In [42]:
# Evaluator that scores a clustering against the scaled feature vectors; all
# other settings are left at the library defaults.
evaluator = ClusteringEvaluator(
    featuresCol='scaled_features',
)

In [35]:
# Score the k=2 clustering. The original cell evaluated `preds`, a name that
# no cell in this notebook defines, so it raised NameError on a fresh kernel.
# NOTE(review): any recorded score shown under this cell came from that stale
# `preds`; confirm whether it corresponded to the k=2 or the k=3 model.
result = evaluator.evaluate(preds2)

In [36]:
# Show the evaluation score as an exact (numerator, denominator) integer pair.
result.as_integer_ratio()

(6853087607351469, 9007199254740992)

In [37]:
# Display the score returned by evaluator.evaluate(...).
result

0.7608455651454925

In [47]:
# Number of rows assigned to each cluster by the 2-cluster model.
(
    preds2
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [48]:
# Number of rows assigned to each cluster by the 3-cluster model.
(
    preds3
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   79|
|         2|   88|
|         0|  167|
+----------+-----+

