In [None]:
# Environment bootstrap: install Java 8 and a local Spark 2.4.0 distribution,
# then expose them to Python.
# Refresh the apt package index and install a headless OpenJDK 8 (output discarded).
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 2.4.0 / Hadoop 2.7 binary archive and unpack it in the
# current working directory.
!wget -q http://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
!tar -xvf spark-2.4.0-bin-hadoop2.7.tgz
!pip install -q findspark
import os
# Hard-coded install locations.
# NOTE(review): SPARK_HOME assumes the archive above was unpacked under /content — confirm.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"
import findspark
# Must run before the first `import pyspark` (presumably uses SPARK_HOME set above
# to locate the Spark Python bindings).
findspark.init()

In [2]:
# Mount Google Drive and make the course data folder the working directory,
# so the relative "kdd_data/..." paths used by later cells resolve.
from google.colab import drive
drive.mount("/content/gdrive")
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_3/data_day_6'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_3/data_day_6


In [27]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderEstimator
from pyspark.ml.feature import MinMaxScaler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [4]:
# Create (or reuse) the local Spark context and wrap it in a SparkSession.
# `getOrCreate` makes this cell safe to re-run: constructing a second
# SparkContext while one is still active raises an error, whereas this returns
# the already-running context (the conf is only applied when a new one is created).
spark_conf = SparkConf().setMaster("local").setAppName("New Spark Context")
sc = SparkContext.getOrCreate(spark_conf)
spark = SparkSession(sc)

In [5]:
# Load the gzipped 10% KDD Cup sample. The file has no header row, so Spark
# names the columns _c0.._c41; column types are inferred from the data.
df = spark.read.csv("kdd_data/kddcup.data_10_percent.gz", header=False, inferSchema=True)
# Preview the first five rows.
df.show(5)

+---+---+----+---+---+----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|_c0|_c1| _c2|_c3|_c4| _c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|   _c41|
+---+---+----+---+---+----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|  0|tcp|http| SF|181|5450|  0|  0|  0|  0|   0|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   8|   8| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0|   9|   9| 1.0| 0.0|0.11| 0.0| 0.0| 0.0| 0.0| 0.0|normal.|
|  0|tcp|http| SF|239| 486|  0|  0|  0|  0|   0|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   8|   8| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0|  19|  19| 1.0| 0.0|0.05

In [15]:
# Number of rows in the loaded sample.
df.count()

494021

In [6]:
# Show the inferred column types (which columns are strings vs. numeric).
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: integer (nullable = true)
 |-- _c5: integer (nullable = true)
 |-- _c6: integer (nullable = true)
 |-- _c7: integer (nullable = true)
 |-- _c8: integer (nullable = true)
 |-- _c9: integer (nullable = true)
 |-- _c10: integer (nullable = true)
 |-- _c11: integer (nullable = true)
 |-- _c12: integer (nullable = true)
 |-- _c13: integer (nullable = true)
 |-- _c14: integer (nullable = true)
 |-- _c15: integer (nullable = true)
 |-- _c16: integer (nullable = true)
 |-- _c17: integer (nullable = true)
 |-- _c18: integer (nullable = true)
 |-- _c19: integer (nullable = true)
 |-- _c20: integer (nullable = true)
 |-- _c21: integer (nullable = true)
 |-- _c22: integer (nullable = true)
 |-- _c23: integer (nullable = true)
 |-- _c24: double (nullable = true)
 |-- _c25: double (nullable = true)
 |-- _c26: double (nullable = true)
 |-- _c27: d

In [7]:
def result_null_data(df):
    """Summarise null values per column of a Spark DataFrame.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with two columns: ``count_na`` (number of null
        entries in that column) and ``percent_na`` (``count_na`` divided by the
        total row count, i.e. a fraction of rows rather than a 0-100 value).
    """
    # One aggregate per column: `when` without an `otherwise` yields null for
    # rows that do not match, and `count` skips nulls, so only null cells are tallied.
    null_counters = []
    for column_name in df.columns:
        is_missing = col(column_name).isNull()
        null_counters.append(count(when(is_missing, column_name)).alias(column_name))

    n_rows = df.count()

    # The aggregation returns a single row; transposing gives one row per column,
    # with the lone data column labelled 0 until it is renamed.
    summary = df.select(null_counters).toPandas().T
    summary = summary.rename(columns={0: 'count_na'})
    summary['percent_na'] = summary['count_na'] / n_rows
    return summary
# Display the per-column null summary for the loaded data.
result_null_data(df)

Unnamed: 0,count_na,percent_na
_c0,0,0.0
_c1,0,0.0
_c2,0,0.0
_c3,0,0.0
_c4,0,0.0
_c5,0,0.0
_c6,0,0.0
_c7,0,0.0
_c8,0,0.0
_c9,0,0.0


In [13]:
# Row count per value of the last column (_c41, the class name), most frequent
# first; show up to 30 rows.
df.groupBy('_c41').count().sort(col('count').desc()).show(30)

+----------------+------+
|            _c41| count|
+----------------+------+
|          smurf.|280790|
|        neptune.|107201|
|         normal.| 97278|
|           back.|  2203|
|          satan.|  1589|
|        ipsweep.|  1247|
|      portsweep.|  1040|
|    warezclient.|  1020|
|       teardrop.|   979|
|            pod.|   264|
|           nmap.|   231|
|   guess_passwd.|    53|
|buffer_overflow.|    30|
|           land.|    21|
|    warezmaster.|    20|
|           imap.|    12|
|        rootkit.|    10|
|     loadmodule.|     9|
|      ftp_write.|     8|
|       multihop.|     7|
|            phf.|     4|
|           perl.|     3|
|            spy.|     2|
+----------------+------+



In [29]:
# Index the three string-typed feature columns; each output is named '<col>_idx'.
indexer_c1, indexer_c2, indexer_c3 = (
    StringIndexer(inputCol=name, outputCol=name + '_idx')
    for name in ('_c1', '_c2', '_c3')
)

# The last column holds the class name; its index becomes the 'label' column.
indexer_c41 = StringIndexer(inputCol='_c41', outputCol='label')

# One-hot encode the indexed categorical columns.
onehot = OneHotEncoderEstimator(
    inputCols=['_c1_idx', '_c2_idx', '_c3_idx'],
    outputCols=['_c1_dummy', '_c2_dummy', '_c3_dummy'],
)

# Feature order: columns 4 through the second-to-last (the final column is the
# class name), then _c0, then the three one-hot blocks.
numeric_cols = df.columns[4:-1]
encoded_cols = ['_c1_dummy', '_c2_dummy', '_c3_dummy']
input_cols = numeric_cols + ['_c0'] + encoded_cols

assembler = VectorAssembler(inputCols=input_cols, outputCol='features_not_scale')
scaler = MinMaxScaler(inputCol="features_not_scale", outputCol="features")

# Stage order matters: indexers must run before the encoder, the encoder before
# the assembler, and the assembler before the scaler.
preprocessing_stages = [
    indexer_c1,
    indexer_c2,
    indexer_c3,
    indexer_c41,
    onehot,
    assembler,
    scaler,
]
pipeline_preprocessing = Pipeline(stages=preprocessing_stages)

# NOTE(review): the pipeline (indexers and scaler included) is fitted on the
# whole of `df`; the train/test split in the following cell is taken from the
# already-transformed frame, so scaling statistics include the test rows.
pipe_preprocessing = pipeline_preprocessing.fit(df)
final_df = pipe_preprocessing.transform(df)

In [30]:
# Random 80/20 split of the preprocessed frame; the fixed seed keeps the split
# repeatable across runs.
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

In [32]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Candidate classifiers, each with default hyper-parameters (they read the
# default 'features' / 'label' columns produced by the preprocessing pipeline).
# GBTClassifier is intentionally left out: in Spark 2.4 it only supports binary
# labels, and this target has more than two classes.
list_model = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
]

# Built once and reused for every model. 'f1' is this evaluator's default
# metric; it is spelled out so the printed label matches what is measured
# (the score was previously reported under the name "AUC").
evaluator = MulticlassClassificationEvaluator(metricName='f1')

for model_name, model in list_model:
    trained_model = model.fit(train_df)
    predictions = trained_model.transform(test_df)
    # MulticlassMetrics consumes (prediction, label) pairs; make sure the label
    # is a double like the prediction column.
    predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))

    prediction_and_label = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(prediction_and_label.rdd)

    print('-'*30)
    print(model_name)
    print('  Accuracy \t\t: {:.4f}'.format(metrics.accuracy))
    print('  F1 \t\t\t: {:.4f}'.format(evaluator.evaluate(predictions)))

------------------------------
Logistic Regression
  Accuracy 		: 0.9992
  AUC 			: 0.9992
------------------------------
Decision Tree
  Accuracy 		: 0.9925
  AUC 			: 0.9903
------------------------------
Random Forest
  Accuracy 		: 0.9911
  AUC 			: 0.9883


In [33]:
# Fit a Random Forest on its own so that `trained_model` holds this classifier
# for the cells that follow, then report its test-set metrics and confusion matrix.
model = RandomForestClassifier()
trained_model = model.fit(train_df)
predictions = trained_model.transform(test_df)
# MulticlassMetrics consumes (prediction, label) pairs; make sure the label is
# a double like the prediction column.
predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))

prediction_and_label = predictions.select(['prediction', 'label'])
metrics = MulticlassMetrics(prediction_and_label.rdd)
# 'f1' is the evaluator's default metric; stated explicitly so the printed
# label matches what is measured (it was previously reported as "AUC").
evaluator = MulticlassClassificationEvaluator(metricName='f1')
print('-'*30)
# Literal name: this cell must not depend on the `model_name` loop variable
# left behind by the model-comparison cell.
print('Random Forest')
print('  Accuracy \t\t: {:.4f}'.format(metrics.accuracy))
print('  F1 \t\t\t: {:.4f}'.format(evaluator.evaluate(predictions)))
# Rows are actual classes and columns are predicted classes, ordered by
# ascending label index.
result_confusion_matrix = pd.DataFrame(metrics.confusionMatrix().toArray())
display(result_confusion_matrix)

------------------------------
Random Forest
  Accuracy 		: 0.9911
  AUC 			: 0.9883


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,55690.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,21450.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,19620.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,155.0,277.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,20.0,38.0,0.0,236.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,37.0,0.0,0.0,207.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,90.0,17.0,0.0,0.0,0.0,92.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,221.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,162.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Load the unlabeled 10% test file with the same reader options as the
# labelled sample (no header row, inferred types).
df_new = spark.read.csv("kdd_data/kddcup.testdata.unlabeled_10_percent.gz", header=False, inferSchema=True)
# Preview the first five rows.
df_new.show(5)

+---+---+-------+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|_c0|_c1|    _c2|_c3|_c4|_c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|
+---+---+-------+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|  0|udp|private| SF|105|146|  0|  0|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   1|   1| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0| 255| 254| 1.0|0.01| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0|
|  0|udp|private| SF|105|146|  0|  0|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   1|   1| 0.0| 0.0| 0.0| 0.0| 1.0| 0.0| 0.0| 255| 254| 1.0|0.01| 0.0| 0.0| 0.0| 0.0| 0.0| 

In [35]:
# Number of rows in the unlabeled file.
df_new.count()

311029

In [36]:
# Apply the preprocessing model that was fitted on the labelled data, so the new
# rows get the same indexing, encoding and scaling.
# NOTE(review): the fitted pipeline includes the `_c41` label indexer, but this
# file has no `_c41` column — that stage is presumably skipped when its input
# column is absent; confirm.
final_df_test = pipe_preprocessing.transform(df_new)

In [37]:
# Score the unlabeled rows with the model currently held in `trained_model`
# and show the first 20 feature vectors with their predicted label index.
predictions_new = trained_model.transform(final_df_test)
predictions_new.select('features', 'prediction').show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[1.51433067363024...|       2.0|
|[1.51433067363024...|       2.0|
|[1.51433067363024...|       2.0|
|[1.51433067363024...|       2.0|
|[1.51433067363024...|       2.0|
|[1.51433067363024...|       2.0|
|[4.18243709859781...|       2.0|
|[1.51433067363024...|       2.0|
|[1.51433067363024...|       2.0|
|[3.21614990685279...|       2.0|
|[1.51433067363024...|       2.0|
|[3.31710528509481...|       2.0|
|[1.51433067363024...|       2.0|
|[1.51433067363024...|       2.0|
|[4.57183641467415...|       2.0|
|[4.28339247683982...|       2.0|
|[4.19685929548952...|       2.0|
|[4.25454808305639...|       2.0|
|[1.51433067363024...|       2.0|
|[1.51433067363024...|       2.0|
+--------------------+----------+
only showing top 20 rows

