In [None]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
!tar -xvf spark-2.4.0-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"
import findspark
findspark.init()

In [2]:
# Colab-only step: mount Google Drive so the data folder is reachable from the runtime.
from google.colab import drive
# Mount point is /content/gdrive; force_remount=True is passed so a remount is requested on every run.
drive.mount("/content/gdrive", force_remount=True)
# Make the data folder the working directory; the CSV is later read by a relative file name.
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_4/data_day_7'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_4/data_day_7


In [9]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
import pandas as pd

%matplotlib inline

In [4]:
# Create the Spark entry points, or reuse them if they already exist.
# Constructing SparkContext directly fails with "Cannot run multiple SparkContexts
# at once" when this cell is executed a second time; getOrCreate() makes the cell
# safe to re-run while keeping the same master and application name.
spark = (
    SparkSession.builder
    .master("local")
    .appName("New Spark Context")
    .getOrCreate()
)
# Keep `sc` available under its original name for any cell that uses the context.
sc = spark.sparkContext

In [5]:
# Read the CSV from the working directory: the first row supplies the column
# names and the column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("hack_data.csv")
)
# Preview the first five rows.
df.show(5)

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

In [6]:
# Number of rows in the loaded DataFrame (shown as the cell output).
df.count()

334

In [7]:
# Print every column with its inferred type and nullability.
df.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [10]:
# Columns fed to the clustering step; 'Location' is not included in this list.
input_cols = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]

# Stage 1 packs the input columns into one vector column;
# stage 2 scales that vector and writes it to 'features'.
assembler = VectorAssembler(outputCol='non_scale_features', inputCols=input_cols)
scaler = StandardScaler(outputCol='features', inputCol='non_scale_features')
pipeline_pre = Pipeline(stages=[assembler, scaler])

# Fit the preprocessing pipeline on the data and apply it to the same rows.
pipeline_pre_fitted = pipeline_pre.fit(df)
final_df = pipeline_pre_fitted.transform(df)

# Show the first five scaled feature vectors in full width.
final_df.select('features').show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------+
|features                                                                                                          |
+------------------------------------------------------------------------------------------------------------------+
|[0.5678510846650524,1.3658432518957642,1.9975768336483841,1.2858903881191532,2.2849485348398866,5.396290958577967]|
|[1.419627711662631,2.517986463945197,0.0,1.320644182392644,2.9377909733655687,5.150971112595909]                  |
|[2.2004229530770782,1.2444124562517545,1.9975768336483841,1.611707209433128,2.6113697541027276,5.262819066691072] |
|[0.1419627711662631,0.7965469045293562,1.9975768336483841,1.0773676224782096,2.6113697541027276,5.279223433291696]|
|[1.419627711662631,1.4266459597520256,0.0,1.5508880694545193,2.6113697541027276,5.315014778602148]                |
+---------------------------------------------------------------

In [12]:
# Fit K-Means for k = 2 and k = 3 and compare, for each k, the within-cluster
# sum of squared errors, the silhouette score and the cluster sizes.

# The evaluator does not depend on k, so it is created once outside the loop.
evaluator = ClusteringEvaluator()

# Per-k metrics are collected in a plain list; the summary DataFrame is built once at the end.
k_means_rows = []
for k in [2, 3]:
    # Fit the model (fixed seed for reproducible initialisation).
    k_means = KMeans(featuresCol='features', k=k, seed=42)
    model = k_means.fit(final_df)

    # Assign clusters once and reuse the result for both the size table and the silhouette.
    predictions = model.transform(final_df)
    # Sorted by cluster id so the displayed table has a stable row order.
    result = predictions.groupBy('prediction').count().orderBy('prediction').toPandas()

    # Cost on the training data. `computeCost` is deprecated since Spark 2.4 and
    # removed in 3.0; the model is evaluated on the same data it was fitted on,
    # so the training summary provides the same quantity.
    wsse = model.summary.trainingCost

    # Silhouette score using the evaluator's default settings.
    silhouette = evaluator.evaluate(predictions)

    k_means_rows.append((k, wsse, silhouette))
    print('With k = {}, Sum of Squared Error = {:.4f}, silhouette = {:.4f}'.format(k, wsse, silhouette))
    display(result)
k_means_result = pd.DataFrame(k_means_rows, columns=['k', 'wsse', 'silhouette'])

With k = 2, Sum of Squared Error = 601.7708, silhouette = 0.8176


Unnamed: 0,prediction,count
0,1,167
1,0,167


With k = 3, Sum of Squared Error = 434.1493, silhouette = 0.7552


Unnamed: 0,prediction,count
0,1,83
1,2,84
2,0,167


So sánh số lượng phần tử trong mỗi cụm giữa k=2 và k=3, ta thấy:
- Với k=3, số lượng không đồng đều giữa 3 cụm (83, 84 và 167).
- Với k=2, số lượng rất đồng đều giữa 2 cụm (167 và 167); silhouette cũng cao hơn (0.8176 so với 0.7552).
- Kết luận: Không có hacker thứ 3, chỉ có 2 hacker.
