In [1]:
# Company: IBM Italy & Politecnico di Milano
# Last update: 30 May 2019
# Author: Niccolo Howard Minetti
# Version: 2.0 
# Notes: This script reshuffles the sampled logs until the training split covers every category, trains and evaluates the model, and saves the Model, Tokenizer and label Encoder.
# ID: net_trainer

In [2]:
import pandas as pd     #Pandas Library for data manipulation
import numpy as np     #Numpy Library for array operations
from sklearn.preprocessing import LabelBinarizer, LabelEncoder #SKLearn Library for creating Dictionary and Encoder

#ML framework on top of Tensorflow
from keras.preprocessing import text, sequence
from keras.models import Sequential, model_from_json, load_model
from keras.layers import Dense, Activation, Dropout
from keras import utils

import pickle

Using TensorFlow backend.


In [3]:
# Target column to classify: use 'categoryname_category' or 'categoryname_highlevelcategory'.
label = 'categoryname_category'
# Load the sampled log extraction; replace the bracketed placeholder with the real file name.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
data = pd.read_pickle("logs/[INSERT_SAMPLED_LOGS_FILE_NAME]")

In [9]:
EXPECTED_CATEGORIES = 32    #Number of distinct categories the training split must contain
MAX_SHUFFLES = 1000         #Upper bound on reshuffles so this cell can never loop forever

labelNum = 0
collect_acc = []

#Fail fast: if the whole extraction has fewer categories than required,
#no shuffle of it can ever produce a valid training split.
available_categories = data[label].nunique()
if available_categories < EXPECTED_CATEGORIES:
    raise ValueError(
        "Log extraction contains only {} distinct '{}' values; {} are required.".format(
            available_categories, label, EXPECTED_CATEGORIES))

#Reshuffle until the first 80% of the rows contains exactly EXPECTED_CATEGORIES labels
for _ in range(MAX_SHUFFLES):
    data = data.sample(frac=1).reset_index(drop=True)  #Shuffle elements
    train_size = int(len(data) * .8)                   #80-20 division for training-testing
    train_rLogs = data['utf8_payload'][:train_size]
    train_cat = data[label][:train_size]

    labelNum = train_cat.nunique()                     #Number of distinct labels in the training subset
    if labelNum == EXPECTED_CATEGORIES:
        break
else:
    raise RuntimeError(
        "No training split with {} categories found after {} shuffles (last split had {}).".format(
            EXPECTED_CATEGORIES, MAX_SHUFFLES, labelNum))

#The remaining 20% of the accepted shuffle is the test set
test_rLogs = data['utf8_payload'][train_size:]
test_cat = data[label][train_size:]

print("Found:", labelNum, "Categories \n")

#Text side: build a dictionary limited to vocab_size words from the training logs only
vocab_size = 500
tokenize = text.Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(train_rLogs)

#Turn each log into one fixed-width row (one column per dictionary slot)
x_train, x_test = (tokenize.texts_to_matrix(logs) for logs in (train_rLogs, test_rLogs))

#Label side: learn the classes from the training labels and binarize them in one step,
#then apply the same fitted encoder to the test labels
encoder = LabelBinarizer()
y_train = encoder.fit_transform(train_cat)
y_test = encoder.transform(test_cat)

#Linear stack of layers: dictionary-sized input -> 512 ReLU units -> softmax over the categories
model = Sequential([
    Dense(512, input_shape=(vocab_size,)),   #First layer: y = ax + b with 512 neurons, input the size of our dictionary
    Activation('relu'),                      #ReLU: Rectified Linear Unit (activation function)
    Dense(labelNum),                         #Second layer: one output unit per category
    Activation('softmax'),                   #Softmax turns the outputs into a distribution over categories
])

model.compile(loss='categorical_crossentropy',      #Cross-entropy loss for multi-class targets
              optimizer='adam',                     #Adaptive moment estimation for gradient optimisation
              metrics=['accuracy'])

#Training settings: 32 logs per batch, 2 passes over the data,
#with 10% of the training rows held out for validation
fit_options = dict(batch_size=32,
                   epochs=2,
                   verbose=1,
                   validation_split=0.1)
history = model.fit(x_train, y_train, **fit_options)

#Evaluate on the held-out test split; score is [loss, accuracy]
score = model.evaluate(x_test, y_test, batch_size=32, verbose=1)
test_loss, test_accuracy = score
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)

#Keep the accuracy of this run
collect_acc.append(test_accuracy)

Found: 32 Categories 

Train on 3600 samples, validate on 400 samples
Epoch 1/2
Epoch 2/2
Test score: 0.1392168325781822
Test accuracy: 0.98


In [5]:
# Save the trained model to disk so it can be reloaded outside this notebook.
# NOTE(review): this is an absolute path, whereas the tokenizer and encoder pickles in the
# following cells are written relative to the working directory — confirm the target
# directory exists and that the differing locations are intended.
path = "/NLP2.0/model/logModel.h5"
model.save(path)

In [6]:
# Persist the fitted tokenizer next to the notebook using the newest pickle protocol.
with open('tokenize.pickle', 'wb') as handle:
    pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(tokenize)

In [7]:
# Persist the fitted label encoder next to the notebook using the newest pickle protocol.
with open('encoder.pickle', 'wb') as handle:
    pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(encoder)

In [11]:
# Display the category labels the encoder learned when it was fitted on the training labels.
encoder.classes_

array(['Access Denied', 'Access Permitted', 'Auth Server Session Closed',
       'Compliance Policy Violation', 'Error', 'File Transfer',
       'Firewall Deny', 'Firewall Permit',
       'General Authentication Failed', 'Host Login Succeeded',
       'IRC Policy Violation', 'Information', 'Kerberos Session Denied',
       'Kerberos Session Opened', 'Messages', 'Misc Login Succeeded',
       'Misc Logout', 'Object Cached', 'Object Not Cached',
       'Privilege Escalation Succeeded', 'RADIUS Session Ended',
       'RADIUS Session Status', 'SSH Closed', 'Service Stopped',
       'Session Closed', 'Session Opened', 'System Action Allow',
       'System Status', 'User Login Failure', 'User Login Success',