In [1]:
try:
    from pyspark import SparkContext, SparkConf
    from pyspark.mllib.clustering import KMeans
    from pyspark.mllib.feature import StandardScaler
    print ("Successfully imported Spark Modules")
except ImportError as e:
    print ("Can not import Spark Modules", e)
    sys.exit(1)
import sys
import os

# Load 10 percent of train data
import os.path
baseDir = os.path.join('data')
inputPath = os.path.join('kddcup.data_10_percent_corrected')
test_inputPath = os.path.join('kddcup_test.data')
fileName = os.path.join(baseDir, inputPath)
testFileName = os.path.join(baseDir, test_inputPath)
# Load 10 percent of the entire KDD data set, from the Hadoop file share
raw_data = sc.textFile(fileName)
test_data = sc.textFile(testFileName)
raw_data.take(5)

print "Train data size is {}".format(raw_data.count())
print "Test data size is {}".format(test_data.count())

Successfully imported Spark Modules
Train data size is 494021
Test data size is 311029


In [2]:
# Analyze the categorical features and display the distribution of "labels"

from numpy import array
from math import sqrt
from time import time
from collections import OrderedDict
#Counting all different labels
protocol_type = raw_data.map(lambda line: line.strip().split(",")[1])
services = raw_data.map(lambda line: line.strip().split(",")[2])
labels = raw_data.map(lambda line: line.strip().split(",")[-1])

t0 = time()
protocol_counts = protocol_type.countByValue()
services_counts = services.countByValue()
labels_counts = labels.countByValue()
tt = time()-t0

protocol_sorted_labels = OrderedDict(sorted(protocol_counts.items(), key=lambda t: t[1], reverse=True))
services_sorted_labels = OrderedDict(sorted(services_counts.items(), key=lambda t: t[1], reverse=True))
labels_sorted_labels = OrderedDict(sorted(labels_counts.items(), key=lambda t: t[1], reverse=True))
for label, count in labels_sorted_labels.items():
    print(label, count)

print("Counted in {} seconds",format(round(tt,3)))

(u'smurf.', 280790)
(u'neptune.', 107201)
(u'normal.', 97278)
(u'back.', 2203)
(u'satan.', 1589)
(u'ipsweep.', 1247)
(u'portsweep.', 1040)
(u'warezclient.', 1020)
(u'teardrop.', 979)
(u'pod.', 264)
(u'nmap.', 231)
(u'guess_passwd.', 53)
(u'buffer_overflow.', 30)
(u'land.', 21)
(u'warezmaster.', 20)
(u'imap.', 12)
(u'rootkit.', 10)
(u'loadmodule.', 9)
(u'ftp_write.', 8)
(u'multihop.', 7)
(u'phf.', 4)
(u'perl.', 3)
(u'spy.', 2)
('Counted in {} seconds', '7.548')


In [3]:
# To prepare categorical features 'protocol', 'service' and 'flag' for input to Decision Tree
from pyspark.mllib.regression import LabeledPoint
from numpy import array

# convert comma separated string to list of features
csv_data = raw_data.map(lambda x: x.split(","))
test_csv_data = test_data.map(lambda x: x.split(","))

# Collect the different categories of the categorical features
protocols = csv_data.map(lambda x: x[1]).distinct().collect()
services = csv_data.map(lambda x: x[2]).distinct().collect()
flags = csv_data.map(lambda x: x[3]).distinct().collect()

In [4]:
def parsePointToLabelledPoint(point):
    # Capture the features and omit the label
    features = point[0:41]

    # convert categorical variable 'protocol' to numeric  by replacing it with index of the collection of protocols
    try: 
         features[1] = protocols.index(features[1])
    except:
         features[1] = len(protocols)

    # convert categorical variable 'service' to numeric  variable by replacing it with index of the collection of services
    try:
        features[2] = services.index(features[2])
    except:
        features[2] = len(services)
        # convert categorical variable 'flag' to numeric  variable by replacing it with index of the collection of flags
    try:
        features[3] = flags.index(features[3])
    except:
        features[3] = len(flags)

    # Convert label to binary label, normal or attack
    attack = 1.0
    if point[41]=='normal.':
        attack = 0.0
    
    # features = feature vector with categorical variables converted to numerical
    # attack = label with binary classification
    return LabeledPoint(attack, array([float(x) for x in features]))

training_data = csv_data.map(parsePointToLabelledPoint)
training_data.take(5)
test_data = test_csv_data.map(parsePointToLabelledPoint)

In [5]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from time import time

# Build the model
t0 = time()
tree_model = DecisionTree.trainClassifier(training_data, numClasses=2, 
                                          categoricalFeaturesInfo={1: len(protocols), 2: len(services), 3: len(flags)},
                                          impurity='gini', maxDepth=4, maxBins=100)
tt = time() - t0

print("Classifier trained in {} seconds",format(round(tt,3)))

print "Learned classification tree model:"
print tree_model.toDebugString()

('Classifier trained in {} seconds', '19.697')
Learned classification tree model:
DecisionTreeModel classifier of depth 4 with 31 nodes
  If (feature 22 <= 50.0)
   If (feature 3 in {0.0,1.0,2.0,7.0,9.0,10.0})
    If (feature 9 <= 1.0)
     If (feature 36 <= 0.45)
      Predict: 0.0
     Else (feature 36 > 0.45)
      Predict: 1.0
    Else (feature 9 > 1.0)
     If (feature 4 <= 1098.0)
      Predict: 0.0
     Else (feature 4 > 1098.0)
      Predict: 1.0
   Else (feature 3 not in {0.0,1.0,2.0,7.0,9.0,10.0})
    If (feature 2 in {23.0,47.0,62.0})
     If (feature 24 <= 0.5)
      Predict: 0.0
     Else (feature 24 > 0.5)
      Predict: 1.0
    Else (feature 2 not in {23.0,47.0,62.0})
     If (feature 36 <= 0.0)
      Predict: 1.0
     Else (feature 36 > 0.0)
      Predict: 1.0
  Else (feature 22 > 50.0)
   If (feature 5 <= 0.0)
    If (feature 11 <= 0.0)
     If (feature 2 in {21.0})
      Predict: 0.0
     Else (feature 2 not in {21.0})
      Predict: 1.0
    Else (feature 11 > 0.0)
  

In [6]:
#Based on above structure of Decision Tree, 
# we conclude that features 22, 3 and 5 are the maximum entropy features
# Hence we now extract only these 3 features
def create_labeled_point_minimal(line_split):
    # leave_out = [41]
    clean_line_split = line_split[3:4] + line_split[5:6] + line_split[22:23]

    # convert flag to numeric categorical variable
    try:
        clean_line_split[0] = flags.index(clean_line_split[0])
    except:
        clean_line_split[0] = len(flags)

    # convert label to binary label
    attack = 1.0
    if line_split[41]=='normal.':
        attack = 0.0

    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

training_data_minimal = csv_data.map(create_labeled_point_minimal)
print(training_data_minimal.take(10))

training_data_minimal.values().cache()
test_data_minimal = test_csv_data.map(create_labeled_point_minimal)

[LabeledPoint(0.0, [10.0,5450.0,8.0]), LabeledPoint(0.0, [10.0,486.0,8.0]), LabeledPoint(0.0, [10.0,1337.0,8.0]), LabeledPoint(0.0, [10.0,1337.0,6.0]), LabeledPoint(0.0, [10.0,2032.0,6.0]), LabeledPoint(0.0, [10.0,2032.0,6.0]), LabeledPoint(0.0, [10.0,1940.0,1.0]), LabeledPoint(0.0, [10.0,4087.0,5.0]), LabeledPoint(0.0, [10.0,151.0,8.0]), LabeledPoint(0.0, [10.0,786.0,8.0])]


In [7]:
# Build the model
t0 = time()
tree_model_minimal = DecisionTree.trainClassifier(
    training_data_minimal, numClasses=2, 
    categoricalFeaturesInfo={0: len(flags)},
    impurity='gini', maxDepth=3, maxBins=32)
tt = time() - t0

print("Classifier trained in {} seconds",format(round(tt,3)))

print "Learned classification tree model with minimal fetures:"
print tree_model_minimal.toDebugString()

('Classifier trained in {} seconds', '9.2')
Learned classification tree model with minimal fetures:
DecisionTreeModel classifier of depth 3 with 15 nodes
  If (feature 2 <= 87.0)
   If (feature 0 in {0.0,1.0,2.0,7.0,9.0,10.0})
    If (feature 1 <= 0.0)
     Predict: 0.0
    Else (feature 1 > 0.0)
     Predict: 0.0
   Else (feature 0 not in {0.0,1.0,2.0,7.0,9.0,10.0})
    If (feature 1 <= 1644.0)
     Predict: 1.0
    Else (feature 1 > 1644.0)
     Predict: 1.0
  Else (feature 2 > 87.0)
   If (feature 1 <= 0.0)
    If (feature 2 <= 157.0)
     Predict: 1.0
    Else (feature 2 > 157.0)
     Predict: 1.0
   Else (feature 1 > 0.0)
    If (feature 2 <= 509.0)
     Predict: 0.0
    Else (feature 2 > 509.0)
     Predict: 1.0



In [8]:
predictions = tree_model.predict(test_data.map(lambda p: p.features))
labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)

t0 = time()
test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(test_data.count())
tt = time() - t0

print "Prediction made in {} seconds. Test accuracy for decision tree  is {}".format(round(tt,3), round(test_accuracy,4))

Prediction made in 23.574 seconds. Test accuracy for decision tree  is 0.9208


In [9]:
predictions_minimal = tree_model_minimal.predict(test_data_minimal.map(lambda p: p.features))
labels_and_preds_minimal = test_data_minimal.map(lambda p: p.label).zip(predictions_minimal)


t0 = time()
test_accuracy = labels_and_preds_minimal.filter(lambda (v, p): v == p).count() / float(test_data_minimal.count())
tt = time() - t0

print "Prediction made in {} seconds. Test accuracy for minimal decision tree is {}".format(round(tt,3), round(test_accuracy,4))

Prediction made in 11.198 seconds. Test accuracy for minimal decision tree is 0.9167


In [10]:
# Build the model using Logistic Regression
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
t0 = time()
logisticR_model = LogisticRegressionWithLBFGS.train(training_data)
tt = time()-t0
print("Classifier trained in {} seconds",format(round(tt,3)))

('Classifier trained in {} seconds', '26.166')


In [12]:
# Evaluating the model on training data
labelsAndPreds_LogisticR = test_data.map(lambda p: (p.label, logisticR_model.predict(p.features)))
logisticR_accuracy = labelsAndPreds_LogisticR.filter(lambda (v, p): v == p).count() / float(test_data.count())
print "Prediction made in {} seconds. Test accuracy for Logistic Regression model is {}".format(round(tt,3), round(logisticR_accuracy,4))

Prediction made in 26.118 seconds. Test accuracy for Logistic Regression model is 0.9193


In [11]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel
t0 = time()
SVM_model = SVMWithSGD.train(training_data, iterations=100)
tt = time()-t0
print("Classifier SVM trained in {} seconds",format(round(tt,3)))

('Classifier SVM trained in {} seconds', '28.105')


In [12]:
# Evaluating the model on training data
labelsAndPreds_SVM= test_data.map(lambda p: (p.label, SVM_model.predict(p.features)))
SVM_accuracy = labelsAndPreds_SVM.filter(lambda (v, p): v == p).count() / float(test_data.count())
print "Prediction made in {} seconds. Test accuracy for SVM model is {}".format(round(tt,3), round(SVM_accuracy,4))



Prediction made in 28.105 seconds. Test accuracy for SVM model is 0.8259


In [None]:
# Exctracting the anomalies out of the predictions made
data_and_preds = test_data_minimal.zip(predictions_minimal)
anomalies = data_and_preds.filter(lambda (v,p): p==1.0)
print "Number of Anomalies detected is",anomalies.count()
#anomalies.map(lambda (p): print "Anomaly Detected for {}")

In [14]:
from lightning import Lightning
from numpy import random, asarray, sqrt, arctan2, pi, clip
from seaborn import color_palette
#from sklearn import datasets
from colorsys import hsv_to_rgb

lgn = Lightning()

Lightning initialized


ValueError: Timeout value connect was (10.0, 10.0), but it must be an int or float.

In [None]:
#d, g = datasets.make_blobs(n_features=2, n_samples=200, centers=5, cluster_std=2.0, random_state=100)
plot_data = (test_data_minimal.map(lambda p: (p.features))).collect()
x = [(i[0]) for i in plot_data]
y = [(i[1]) for i in plot_data]

lgn.scatter(plot_data, group=test_data.label, alpha=0.8, size=12)