# Clustering Consulting Project

In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session used by every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('ClusterProject')
    .getOrCreate()
)

In [2]:
# Load the hack-session log: the first row is the header, and column types
# are inferred from the data.
hacks = spark.read.csv(
    'hack_data.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Show the inferred column names and types.
hacks.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [4]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

In [5]:
# Column names available for feature selection.
hacks.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [6]:
# Every numeric column goes into the feature vector; 'Location' is a string
# column and is therefore left out.
numeric_cols = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]
assembler = VectorAssembler(inputCols=numeric_cols, outputCol='features')


In [7]:
# Add the assembled 'features' vector column to the raw data.
hacks_cleaned = assembler.transform(hacks)

In [8]:
from pyspark.ml.feature import StandardScaler

In [9]:
# Standardise the assembled 'features' vector into 'scaledFeatures'
# (StandardScaler is left at its default settings).
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
)

In [12]:
# Fit the scaler on the assembled data to learn its scaling statistics.
scaler_model = scaler.fit(hacks_cleaned)

In [13]:
# Apply the fitted scaler; the result carries both 'features' and 'scaledFeatures'.
hacks_scaled = scaler_model.transform(hacks_cleaned)

In [17]:
# Candidate models: k=2 and k=3, each on the raw and on the scaled features.
# K-means starts from randomly initialised centroids, so pin the seed; without
# it the centers and cluster counts below may differ between kernel restarts.
KMEANS_SEED = 1

kmeans2 = KMeans(k=2, featuresCol='features', seed=KMEANS_SEED)
kmeans2s = KMeans(k=2, featuresCol='scaledFeatures', seed=KMEANS_SEED)
kmeans3 = KMeans(k=3, featuresCol='features', seed=KMEANS_SEED)
kmeans3s = KMeans(k=3, featuresCol='scaledFeatures', seed=KMEANS_SEED)

In [18]:
# All four estimators are fit on the same DataFrame; each one reads the
# feature column it was configured with.

# Raw (unscaled) 'features': k=2 and k=3.
model1 = kmeans2.fit(hacks_scaled)
model3 = kmeans3.fit(hacks_scaled)

# Standardised 'scaledFeatures': k=2 and k=3.
model2 = kmeans2s.fit(hacks_scaled)
model4 = kmeans3s.fit(hacks_scaled)

In [19]:
# Centers for k=2 on the unscaled features (values are in original units).
model1.clusterCenters()

[array([4.00060976e+01, 8.59157866e+02, 5.36585366e-01, 6.92634146e+00,
        1.32134146e+01, 4.68612195e+01]),
 array([ 20.36470588, 364.22370588,   0.48823529,   3.64952941,
          8.54705882,  67.45364706])]

In [20]:
# Centers for k=2 on the scaled features.
model2.clusterCenters()

[array([1.26023837, 1.31829808, 0.99280765, 1.36491885, 2.5625043 ,
        5.26676612]),
 array([2.99991988, 2.92319035, 1.05261534, 3.20390443, 4.51321315,
        3.28474   ])]

In [21]:
# Centers for k=3 on the unscaled features (values are in original units).
model3.clusterCenters()

[array([3.12282609e+01, 6.19318913e+02, 5.54347826e-01, 5.53467391e+00,
        1.10108696e+01, 5.62103261e+01]),
 array([ 17.96774194, 299.77129032,   0.4516129 ,   3.16096774,
          7.97580645,  70.12048387]),
 array([4.17118644e+01, 9.20940169e+02, 5.42372881e-01, 7.24737288e+00,
        1.37118644e+01, 4.47972034e+01])]

In [23]:
# Centers for k=3 on the scaled features.
model4.clusterCenters()

[array([1.21780112, 1.37901802, 1.99757683, 1.37198977, 2.55237797,
        5.29152222]),
 array([2.99991988, 2.92319035, 1.05261534, 3.20390443, 4.51321315,
        3.28474   ]),
 array([1.30217042, 1.25830099, 0.        , 1.35793211, 2.57251009,
        5.24230473])]

In [27]:
# Cluster sizes for k=2 on the scaled features.
(
    model2.transform(hacks_scaled)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [28]:
# Cluster sizes for k=3 on the scaled features.
(
    model4.transform(hacks_scaled)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         2|   84|
|         0|   83|
+----------+-----+



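# Therefore we can conclude that there were only 2 hackers

With k=2 on the scaled features the sessions split evenly (167 / 167). With k=3, one cluster of 167 is left unchanged and the other is merely divided into 84 and 83 sessions whose centers differ essentially only in `Kali_Trace_Used` (about 2.0 vs 0.0 after scaling). That column appears to be a 0/1 flag, so the third cluster reflects a split on that flag rather than evidence of a third attacker.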