In [1]:
from keras.models import Sequential
import pandas as pd
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import BatchNormalization
from keras.optimizers import Adam, RMSprop, SGD
import sys, re, glob
import numpy as np
from keras.layers import Input, Dense, Flatten
import sys, re, glob
import pandas as pd
import numpy as np
from lxml import etree
from sklearn.model_selection import StratifiedKFold 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Data

In [2]:
def load_data(path, rng):
    routes = list()
    flows = list()

    for i in range(rng):

        for routes_file in glob.glob('{}/output_{}/*.routes'.format(path, i)):

            info = re.findall('(\w+)/output_(\d+)', routes_file)

            handle = open(routes_file, 'r')
            data = handle.read()
            handle.close()

            nodes = re.split('\n\n', data)
            nodes.pop()

            for node in nodes:

                header = re.findall('Node:\s+(\d+)\s+Time:\s+(\d+)', node)
                lines = re.findall('(\d{1,3}(?:\.\d{1,3}){3})\s+(\d{1,3}(?:\.\d{1,3}){3})\s+(\d{1,3}(?:\.\d{1,3}){3})\s+(\w+)\s+(-?\d+\.\d+)\s+(\d+)', node)

                for line in lines:
                    l = list(line)
                    l[4] = float(line[4])
                    l[5] = int(line[5])
                    routes.append(header[0] + tuple(l) + info[0])

        for flowmon_file in glob.glob('{}/output_{}/*.flowmon'.format(path, i)):
 
            info = re.findall('(\w+)/output_(\d+)', flowmon_file)

            with open(flowmon_file) as fobj:
                xml = fobj.read()
    
            root = etree.fromstring(xml)


            for flow in root.xpath('/FlowMonitor/FlowStats/Flow'):

                attributes = list()

                for attrib in flow.attrib:

                    attr = flow.attrib[attrib]
                    if 'ns' in attr:
                        attr = re.findall('(\d+)', attr)[0]

                    attributes.append(int(attr))

                flows.append(tuple(attributes) + info[0])

    routes_table = pd.DataFrame(routes, columns=['Node', 'Time', 'Destination', 'Gateway', 'Interface', 'Flag', 'Expire', 'Hops', 'Type', 'Test'])

    flag_agg = routes_table.groupby(['Type', 'Test', 'Time', 'Flag']).agg({'Flag' : ['count']})
    hops_agg = routes_table.groupby(['Type', 'Test', 'Node', 'Destination']).agg({'Hops' : ['min', 'max', 'mean', 'median', 'prod', 'sum', 'std', 'var']})
    
    unstack_flag_agg = flag_agg.unstack(['Time','Flag']).fillna(0)
    unstack_hops_agg = hops_agg.unstack(['Node', 'Destination']).fillna(0)
    
 
    flows_table = pd.DataFrame(flows, columns=['flowId', 'timeFirstTxPacket', 'timeFirstRxPacket', 'timeLastTxPacket', 'timeLastRxPacket', 'delaySum', 'jitterSum', 'lastDelay', 'txBytes', 'rxBytes', 'txPackets', 'rxPackets', 'lostPackets', 'timesForwarded', 'Type', 'Test'])

    lost_agg = flows_table.groupby(['Type', 'Test']).agg({'lostPackets' : ['sum', 'mean']})
    forwarded_agg = flows_table.groupby(['Type', 'Test']).agg({'timesForwarded' : ['sum', 'max', 'mean', 'var']})
    
    return lost_agg.join(forwarded_agg).join(unstack_flag_agg).join(unstack_hops_agg)


In [3]:
malicious = load_data("../data/malicious", 100)
normal = load_data("../data/normal", 100)



In [4]:
X = np.concatenate((malicious, normal))
y = np.concatenate((np.ones((100, 1)), np.zeros((100, 1))))

In [5]:
indexes = np.array(range(len(y)))
np.random.shuffle(indexes)
indexes
X = X[indexes]
y = y[indexes]

# Model

In [6]:
class GAN:
    def __init__(self, shape):
        self.SHAPE = shape
        self.OPTIMIZER = Adam()
        self.compile_models()
        
    def __generator(self):
        model = Sequential()
        model.add(Dense(2048, input_shape=(1000,)))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(self.SHAPE, activation='sigmoid'))
        return model
    
    def __discriminator(self):
        model = Sequential()
        model.add(Dense(10, input_shape=(self.SHAPE,)))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(1, activation='sigmoid'))
        return model
    
    def __stacked(self, generator, discriminator):
        discriminator.trainable = False
        model = Sequential()
        model.add(generator)
        model.add(discriminator)
        return model
    
    def compile_models(self):
        self.generator = self.__generator()
        self.discriminator = self.__discriminator()
        self.stacked = self.__stacked(self.generator, self.discriminator)
        
        self.generator.compile(loss='binary_crossentropy', optimizer=self.OPTIMIZER)
        self.discriminator.compile(loss='binary_crossentropy', optimizer=self.OPTIMIZER, metrics=['accuracy'] )
        self.stacked.compile(loss='binary_crossentropy', optimizer=self.OPTIMIZER)
        
    
    def train(self,X, y, epochs=200, batch = 100, debug=False):
        for cnt in range(epochs):

            ## train discriminator
            random_index =  np.random.randint(0, len(y) - batch)
            X_batch = X[random_index : random_index + batch]
            y_batch = y[random_index : random_index + batch]

            gen_noise = np.random.normal(0, 1, (batch,1000))
            syntetic = self.generator.predict(gen_noise)
                
            x_combined_batch = np.concatenate((X_batch, syntetic))
            y_combined_batch = np.concatenate((y_batch, np.zeros((batch, 1))))

            d_loss = self.discriminator.train_on_batch(x_combined_batch, y_combined_batch)

            # train generator
            noise = np.random.normal(0, 1, (batch,1000))
            y_mislabled = np.ones((batch, 1))
            g_loss = self.stacked.train_on_batch(noise, y_mislabled)
            if debug:
                print ('epoch: %d, [Discriminator :: d_loss: %f], [ Generator :: loss: %f]' % (cnt, d_loss[0], g_loss))

# Train

In [7]:
gan = GAN(malicious.shape[1])
gan.train(X, y, debug=True, epochs=100, batch=199)

epoch: 0, [Discriminator :: d_loss: 4.564472], [ Generator :: loss: 0.879590]
epoch: 1, [Discriminator :: d_loss: 4.517925], [ Generator :: loss: 0.998926]
epoch: 2, [Discriminator :: d_loss: 4.490653], [ Generator :: loss: 1.124255]
epoch: 3, [Discriminator :: d_loss: 4.501699], [ Generator :: loss: 1.206633]
epoch: 4, [Discriminator :: d_loss: 4.498006], [ Generator :: loss: 1.189893]
epoch: 5, [Discriminator :: d_loss: 4.496871], [ Generator :: loss: 1.244746]
epoch: 6, [Discriminator :: d_loss: 4.504513], [ Generator :: loss: 1.250917]
epoch: 7, [Discriminator :: d_loss: 4.504346], [ Generator :: loss: 1.284298]
epoch: 8, [Discriminator :: d_loss: 4.507780], [ Generator :: loss: 1.329013]
epoch: 9, [Discriminator :: d_loss: 4.506391], [ Generator :: loss: 1.267786]
epoch: 10, [Discriminator :: d_loss: 4.481155], [ Generator :: loss: 1.284520]
epoch: 11, [Discriminator :: d_loss: 4.536175], [ Generator :: loss: 1.306473]
epoch: 12, [Discriminator :: d_loss: 4.522892], [ Generator ::

# Preidiction

In [9]:
predict = gan.discriminator.predict(X)

In [10]:
np.mean(y == (predict > 0.5))

0.47

# Metrics

In [11]:
from sklearn import metrics 
print(metrics.confusion_matrix(y, (predict > 0.5)))

[[78 22]
 [84 16]]


In [12]:
metrics.accuracy_score(y, (predict > 0.5))

0.47

In [13]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=17) 
cvscores = [] 
for train, test in kfold.split(X, y): 
    gan = GAN(malicious.shape[1]) 
    gan.train(X[train], y[train], epochs=100, batch=100, debug=False) 
    predict = gan.discriminator.predict(X[test]) 
    acc = metrics.accuracy_score(y[test], (predict > 0.5)) 
    cvscores.append(acc) 
    print(acc) 
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

0.4
0.55
0.4
0.65
0.5
0.35
0.45
0.35
0.45
0.35
0.44% (+/- 0.09%)
