In [None]:
# findspark.init() is called before the pyspark import on purpose; keep this
# order. Presumably it is what makes the local Spark installation importable
# in this environment - confirm if pyspark is already pip-installed.
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) the Spark session for this notebook, using the
# 'local[4]' master and the application name 'hackerclustering'.
spark = (
    SparkSession.builder
    .appName('hackerclustering')
    .master('local[4]')
    .getOrCreate()
)

In [None]:
# Load the hack log CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
data = spark.read.csv(
    '../data/hack_data.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# is the third suspect involved in the attack?
# can we use clustering to try to identify this?
# forensic engineer knows hackers trade off attacks,
# i.e. they should have the same number of hacks each

In [None]:
# Show the column names and inferred types so we can see which attributes
# are available for clustering.
data.printSchema()

In [None]:
# we have a categorical attribute, Location
# we can drop this because hackers are using VPNs and location is not reliable

In [None]:
# Remove the 'Location' column. `data` is rebound here, so every later cell
# sees the frame without it.
data = data.drop('Location')

In [None]:
# everything is numeric, no need to encode
# we should perform feature scaling so no single feature dominates distance metric

### Create Feature Set

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# Build the feature list explicitly rather than relying on an earlier cell
# having already removed the categorical 'Location' column from `data`.
# This keeps the cell correct on its own: the result is the same whether or
# not the drop cell has run.
feature_cols = [col for col in data.columns if col != 'Location']

# Pack the remaining columns into a single vector column named 'features'
# and keep only that column for the downstream scaler.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
with_features = assembler.transform(data).select('features')

### Scale Data

In [None]:
from pyspark.ml.feature import StandardScaler

In [None]:
# Scale the assembled feature vectors so no single feature dominates the
# distance metric. The output column name is spelled 'scaled_featurs'
# throughout this notebook; it is kept as-is because the KMeans cells look
# the column up by that exact string.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_featurs',
)
scaler_model = scaler.fit(with_features)
scaled_data = scaler_model.transform(with_features)

### Train KMeans Model

In [None]:
from pyspark.ml.clustering import KMeans

In [None]:
# First, check how the attacks cluster under the 2-hacker hypothesis.
# KMeans starts from randomly initialised centroids; an explicit seed pins
# that initialisation so the cluster sizes reported below are the same on
# every Restart & Run All instead of varying between kernel sessions.
kmeans = KMeans(featuresCol='scaled_featurs', k=2, seed=42)
model_k2 = kmeans.fit(scaled_data)

In [None]:
# Apply the k=2 model to the scaled data; the counts cell below groups the
# result by its 'prediction' column.
results = model_k2.transform(scaled_data)

In [None]:
# Count how many rows were assigned to each cluster.
k2_cluster_sizes = results.groupBy('prediction').count()
k2_cluster_sizes.show()

In [None]:
# Now try the 3-hacker hypothesis.
# As with any KMeans fit, the initial centroids are random; fixing the seed
# makes the resulting cluster sizes reproducible across kernel restarts so
# the conclusion drawn from them does not change from run to run.
kmeans = KMeans(featuresCol='scaled_featurs', k=3, seed=42)
model_k3 = kmeans.fit(scaled_data)

In [None]:
# Apply the k=3 model to the scaled data, then count the rows per cluster.
# Note: this rebinds `results`, replacing the k=2 assignments.
results = model_k3.transform(scaled_data)
k3_cluster_sizes = results.groupBy('prediction').count()
k3_cluster_sizes.show()

In [None]:
# since the forensics team told us the hackers would have an equal number of attacks
# we can conclude that there are only 2 hackers