In [1]:
import pandas as pd
import numpy as np
import sys
import sklearn
import io
import random
import pickle

**Add data**

In [2]:
train_url = 'https://raw.githubusercontent.com/merteroglu/NSL-KDD-Network-Instrusion-Detection/master/NSL_KDD_Train.csv'
test_url = 'https://raw.githubusercontent.com/merteroglu/NSL-KDD-Network-Instrusion-Detection/master/NSL_KDD_Test.csv'

In [49]:
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]


df = pd.read_csv(train_url,header=None, names = col_names)

df_test = pd.read_csv(test_url, header=None, names = col_names)

print('Dimensions of the Training set:',df)
print('Dimensions of the Test set:',df_test)

Dimensions of the Training set:         duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0              0           tcp  ftp_data   SF        491          0     0   
1              0           udp     other   SF        146          0     0   
2              0           tcp   private   S0          0          0     0   
3              0           tcp      http   SF        232       8153     0   
4              0           tcp      http   SF        199        420     0   
...          ...           ...       ...  ...        ...        ...   ...   
125968         0           tcp   private   S0          0          0     0   
125969         8           udp   private   SF        105        145     0   
125970         0           tcp      smtp   SF       2231        384     0   
125971         0           tcp    klogin   S0          0          0     0   
125972         0           tcp  ftp_data   SF        151          0     0   

        wrong_fragment  urgent  hot  num_fa

In [50]:
print('Label distribution Test set:')
print(df_test['label'].value_counts())

Label distribution Test set:
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178
portsweep           157
ipsweep             141
httptunnel          133
nmap                 73
pod                  41
buffer_overflow      20
multihop             18
named                17
ps                   15
sendmail             14
rootkit              13
xterm                13
teardrop             12
xlock                 9
land                  7
xsnoop                4
ftp_write             3
worm                  2
loadmodule            2
perl                  2
sqlattack             2
udpstorm              2
phf                   2
imap                  1
Name: label, dtype: int64


**Step 1: Data preprocessing:**

One-Hot-Encoding

In [5]:
#for col_name in df.columns:
    #if df[col_name].dtypes == 'object' :
       # unique_cat = len(df[col_name].unique())
        #print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

In [6]:
# Test set
print('Test set:')
for col_name in df_test.columns:
    if df_test[col_name].dtypes == 'object' :
        unique_cat = len(df_test[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))


Test set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 64 categories
Feature 'flag' has 11 categories
Feature 'label' has 38 categories


**LabelEncoder**

**Insert categorical features into a 2D numpy array**

In [7]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
categorical_columns=['protocol_type', 'service', 'flag']

df_categorical_values = df[categorical_columns]
testdf_categorical_values = df_test[categorical_columns]

df_categorical_values.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,ftp_data,SF
1,udp,other,SF
2,tcp,private,S0
3,tcp,http,SF
4,tcp,http,SF


In [8]:
# protocol type
unique_protocol=sorted(df.protocol_type.unique())
string1 = 'Protocol_type_'
unique_protocol2=[string1 + x for x in unique_protocol]
print(unique_protocol2)

# service
unique_service=sorted(df.service.unique())
string2 = 'service_'
unique_service2=[string2 + x for x in unique_service]
print(unique_service2)


# flag
unique_flag=sorted(df.flag.unique())
string3 = 'flag_'
unique_flag2=[string3 + x for x in unique_flag]
print(unique_flag2)


# put together
dumcols=unique_protocol2 + unique_service2 + unique_flag2


#do it for test set
unique_service_test=sorted(df_test.service.unique())
unique_service2_test=[string2 + x for x in unique_service_test]
testdumcols=unique_protocol2 + unique_service2_test + unique_flag2


['Protocol_type_icmp', 'Protocol_type_tcp', 'Protocol_type_udp']
['service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_harvest', 'service_hostnames', 'service_http', 'service_http_2784', 'service_http_443', 'service_http_8001', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private', 'service_red_i', 'service_remote_job', 'ser

**Transform categorical features into numbers using LabelEncoder()**

In [9]:
#df_categorical_values_enc=df_categorical_values.apply(LabelEncoder().fit_transform)
# test set
testdf_categorical_values_enc=testdf_categorical_values.apply(LabelEncoder().fit_transform)

**One-Hot-Encoding**

In [10]:
enc = OneHotEncoder(categories='auto')
#df_categorical_values_encenc = enc.fit_transform(df_categorical_values_enc)
#df_cat_data = pd.DataFrame(df_categorical_values_encenc.toarray(),columns=dumcols)


# test set
testdf_categorical_values_encenc = enc.fit_transform(testdf_categorical_values_enc)
testdf_cat_data = pd.DataFrame(testdf_categorical_values_encenc.toarray(),columns=testdumcols)

**Test set **

In [12]:
testservice= df_test['service'].tolist()
#difference=list(set(trainservice) - set(testservice))
#string = 'service_'
#difference=[string + x for x in difference]
#difference

**New numeric columns are added to the main dataframe**

In [47]:
# test data
newdf_test=df_test.join(testdf_cat_data)
print (newdf_test)
newdf_test.drop('flag', axis=1, inplace=True)
newdf_test.drop('protocol_type', axis=1, inplace=True)
newdf_test.drop('service', axis=1, inplace=True)

print(newdf_test.shape)

       duration protocol_type   service  flag  src_bytes  dst_bytes  land  \
0             0           tcp   private   REJ          0          0     0   
1             0           tcp   private   REJ          0          0     0   
2             2           tcp  ftp_data    SF      12983          0     0   
3             0          icmp     eco_i    SF         20          0     0   
4             1           tcp    telnet  RSTO          0         15     0   
...         ...           ...       ...   ...        ...        ...   ...   
22539         0           tcp      smtp    SF        794        333     0   
22540         0           tcp      http    SF        317        938     0   
22541         0           tcp      http    SF      54540       8314     0   
22542         0           udp  domain_u    SF         42         42     0   
22543         0           tcp    sunrpc   REJ          0          0     0   

       wrong_fragment  urgent  hot  num_failed_logins  logged_in  \
0      

 Dataset 0=Normal, 1=DoS, 2=Probe, 3=R2L, 4=U2R.
 DoS : 
 
 Probe : 
 
 R2L :
 
 U2R :

In [46]:
labeldf_test=newdf_test['label']
print (newdf_test['label'])

# change the label column
newlabeldf_test=labeldf_test.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})



# put the new label column back
newdf_test['label'] = newlabeldf_test

0        1
1        1
2        0
3        2
4        2
        ..
22539    0
22540    0
22541    1
22542    0
22543    2
Name: label, Length: 22544, dtype: int64


In [16]:
to_drop_DoS = [0,1]
to_drop_Probe = [0,2]
to_drop_R2L = [0,3]
to_drop_U2R = [0,4]

#test
DoS_df_test=newdf_test[newdf_test['label'].isin(to_drop_DoS)];
Probe_df_test=newdf_test[newdf_test['label'].isin(to_drop_Probe)];
R2L_df_test=newdf_test[newdf_test['label'].isin(to_drop_R2L)];
U2R_df_test=newdf_test[newdf_test['label'].isin(to_drop_U2R)];

print('Test:')
print('Dimensions of DoS:' ,DoS_df_test.shape)
print('Dimensions of Probe:' ,Probe_df_test.shape)
print('Dimensions of R2L:' ,R2L_df_test.shape)
print('Dimensions of U2R:' ,U2R_df_test.shape)

Test:
Dimensions of DoS: (17171, 117)
Dimensions of Probe: (12132, 117)
Dimensions of R2L: (12596, 117)
Dimensions of U2R: (9778, 117)


**Step 2: Feature Scaling**

In [52]:
# Split dataframes into X & Y
# test set
X_DoS_test = DoS_df_test.drop('label',axis=1)
Y_DoS_test = DoS_df_test.label

print(X_DoS_test)

X_Probe_test = Probe_df_test.drop('label',axis=1)
Y_Probe_test = Probe_df_test.label

X_R2L_test = R2L_df_test.drop('label',axis=1)
Y_R2L_test = R2L_df_test.label

X_U2R_test = U2R_df_test.drop('label',axis=1)
Y_U2R_test = U2R_df_test.label


       duration  src_bytes  dst_bytes  land  wrong_fragment  urgent  hot  \
0             0          0          0     0               0       0    0   
1             0          0          0     0               0       0    0   
2             2      12983          0     0               0       0    0   
5             0        267      14515     0               0       0    0   
6             0       1022        387     0               0       0    0   
...         ...        ...        ...   ...             ...     ...  ...   
22538         0       1032          0     0               0       0    0   
22539         0        794        333     0               0       0    0   
22540         0        317        938     0               0       0    0   
22541         0      54540       8314     0               0       0    2   
22542         0         42         42     0               0       0    0   

       num_failed_logins  logged_in  num_compromised  root_shell  \
0                  

**Since the column names will be deleted at this stage, we save the column names for later use.**

In [18]:
colNames_test=list(X_DoS_test)

In [19]:
from sklearn import preprocessing

# test data
scaler5 = preprocessing.StandardScaler().fit(X_DoS_test)
X_DoS_test=scaler5.transform(X_DoS_test) 

scaler6 = preprocessing.StandardScaler().fit(X_Probe_test)
X_Probe_test=scaler6.transform(X_Probe_test) 

scaler7 = preprocessing.StandardScaler().fit(X_R2L_test)
X_R2L_test=scaler7.transform(X_R2L_test) 

scaler8 = preprocessing.StandardScaler().fit(X_U2R_test)
X_U2R_test=scaler8.transform(X_U2R_test)

**Step 3: Feature Selection:**

---

**Recursive Feature Elimination (RFE)**

# Random Forest

In [20]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier


'''clf = RandomForestClassifier(n_estimators=10,n_jobs=2)
rfe = RFE(estimator=clf, n_features_to_select=13, step=1)

rfe.fit(X_DoS, Y_DoS.astype(int))
X_rfeDoS=rfe.transform(X_DoS)
true=rfe.support_
rfecolindex_DoS=[i for i, x in enumerate(true) if x]
rfecolnpply the classifier we trained to the test data ame_DoS=list(colNames[i] for i in rfecolindex_DoS)'''

'clf = RandomForestClassifier(n_estimators=10,n_jobs=2)\nrfe = RFE(estimator=clf, n_features_to_select=13, step=1)\n\nrfe.fit(X_DoS, Y_DoS.astype(int))\nX_rfeDoS=rfe.transform(X_DoS)\ntrue=rfe.support_\nrfecolindex_DoS=[i for i, x in enumerate(true) if x]\nrfecolnpply the classifier we trained to the test data ame_DoS=list(colNames[i] for i in rfecolindex_DoS)'

In [21]:
'''rfe.fit(X_Probe, Y_Probe.astype(int))
X_rfeProbe=rfe.transform(X_Probe)
true=rfe.support_
rfecolindex_Probe=[i for i, x in enumerate(true) if x]
rfecolname_Probe=list(colNames[i] for i in rfecolindex_Probe)'''

'rfe.fit(X_Probe, Y_Probe.astype(int))\nX_rfeProbe=rfe.transform(X_Probe)\ntrue=rfe.support_\nrfecolindex_Probe=[i for i, x in enumerate(true) if x]\nrfecolname_Probe=list(colNames[i] for i in rfecolindex_Probe)'

In [22]:
'''rfe.fit(X_R2L, Y_R2L.astype(int))
X_rfeR2L=rfe.transform(X_R2L)
true=rfe.support_
rfecolindex_R2L=[i for i, x in enumerate(true) if x]
rfecolname_R2L=list(colNames[i] for i in rfecolindex_R2L)'''

'rfe.fit(X_R2L, Y_R2L.astype(int))\nX_rfeR2L=rfe.transform(X_R2L)\ntrue=rfe.support_\nrfecolindex_R2L=[i for i, x in enumerate(true) if x]\nrfecolname_R2L=list(colNames[i] for i in rfecolindex_R2L)'

In [23]:
'''rfe.fit(X_U2R, Y_U2R.astype(int))
X_rfeU2R=rfe.transform(X_U2R)
true=rfe.support_
rfecolindex_U2R=[i for i, x in enumerate(true) if x]
rfecolname_U2R=list(colNames[i] for i in rfecolindex_U2R)'''

'rfe.fit(X_U2R, Y_U2R.astype(int))\nX_rfeU2R=rfe.transform(X_U2R)\ntrue=rfe.support_\nrfecolindex_U2R=[i for i, x in enumerate(true) if x]\nrfecolname_U2R=list(colNames[i] for i in rfecolindex_U2R)'

**Summary of features selected by RFE**

In [24]:
'''print('Features selected for DoS:',rfecolname_DoS)
print()
print('Features selected for Probe:',rfecolname_Probe)
print()
print('Features selected for R2L:',rfecolname_R2L)
print()
print('Features selected for U2R:',rfecolname_U2R)'''

"print('Features selected for DoS:',rfecolname_DoS)\nprint()\nprint('Features selected for Probe:',rfecolname_Probe)\nprint()\nprint('Features selected for R2L:',rfecolname_R2L)\nprint()\nprint('Features selected for U2R:',rfecolname_U2R)"

In [25]:
'''print(X_rfeDoS.shape)
print(X_rfeProbe.shape)
print(X_rfeR2L.shape)
print(X_rfeU2R.shape)'''


'print(X_rfeDoS.shape)\nprint(X_rfeProbe.shape)\nprint(X_rfeR2L.shape)\nprint(X_rfeU2R.shape)'

**Step 4: Load the model:**

Classifier is trained for all features and for reduced features, for later comparison.

The classifier model itself is stored in the clf variable.

In [56]:
# all features
#clf_DoS=RandomForestClassifier(n_estimators=10,n_jobs=2)
#clf_Probe=RandomForestClassifier(n_estimators=10,n_jobs=2)
#clf_R2L=RandomForestClassifier(n_estimators=10,n_jobs=2)
#clf_U2R=RandomForestClassifier(n_estimators=10,n_jobs=2)

filename_DoS = "model_DoS.sav"
filename_Probe = "model_Probe.sav"
filename_R2L = "model_R2L.sav"
filename_U2R = "model_U2R.sav"

loaded_model_DoS = pickle.load(open(filename_DoS, 'rb'))
loaded_model_DoS.fit(X_DoS_test, Y_DoS_test)

loaded_model_Probe = pickle.load(open(filename_DoS, 'rb'))
loaded_model_Probe.fit(X_Probe_test, Y_Probe_test)

loaded_model_R2L = pickle.load(open(filename_DoS, 'rb'))
loaded_model_R2L.fit(X_R2L_test, Y_R2L_test)

loaded_model_U2R = pickle.load(open(filename_DoS, 'rb'))
loaded_model_U2R.fit(X_U2R_test, Y_U2R_test)


RandomForestClassifier(n_estimators=10, n_jobs=2)

In [43]:
# selected features
clf_rfeDoS=RandomForestClassifier(n_estimators=10,n_jobs=2)
clf_rfeProbe=RandomForestClassifier(n_estimators=10,n_jobs=2)
clf_rfeR2L=RandomForestClassifier(n_estimators=10,n_jobs=2)
clf_rfeU2R=RandomForestClassifier(n_estimators=10,n_jobs=2)

#clf_rfeDoS.fit(X_rfeDoS, Y_DoS.astype(int))
#clf_rfeProbe.fit(X_rfeProbe, Y_Probe.astype(int))
#clf_rfeR2L.fit(X_rfeR2L, Y_R2L.astype(int))
#clf_rfeU2R.fit(X_rfeU2R, Y_U2R.astype(int))

**Step 5: Prediction & Evaluation (validation):**


---



Using all Features for each category

Confusion Matrices


---



**DoS**

In [27]:
# Apply the classifier we trained to the test data (which it has never seen before)
loaded_model_DoS.predict(X_DoS_test)

array([1, 1, 0, ..., 0, 1, 0])

In [28]:
# View the predicted probabilities of the first 10 observations
loaded_model_DoS.predict_proba(X_DoS_test)[0:10]

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [29]:
Y_DoS_pred=loaded_model_DoS.predict(X_DoS_test)

# Create confusion matrix
pd.crosstab(Y_DoS_test, Y_DoS_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,1
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9711,0
1,1,7459


**Probe**

In [30]:
Y_Probe_pred=loaded_model_Probe.predict(X_Probe_test)

# Create confusion matrix
pd.crosstab(Y_Probe_test, Y_Probe_pred, rownames=['Actual attackics'], colnames=['Predted attacks'])

Predted attacks,0,2
Actual attackics,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9711,0
2,3,2418


**R2L**

In [31]:
Y_R2L_pred=loaded_model_R2L.predict(X_R2L_test)

# Create confusion matrix
pd.crosstab(Y_R2L_test, Y_R2L_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,3
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9692,19
3,48,2837


**U2R**

In [32]:
Y_U2R_pred=loaded_model_U2R.predict(X_U2R_test)

# Create confusion matrix
pd.crosstab(Y_U2R_test, Y_U2R_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,4
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9711,0
4,3,64


**Cross Validation: Accuracy, Precision, Recall, F-measure**

**DoS**

In [33]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
accuracy = cross_val_score(clf_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

precision = cross_val_score(clf_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(clf_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

f = cross_val_score(clf_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring='f1')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99785 (+/- 0.00195)
Precision: 0.99893 (+/- 0.00107)
Recall: 0.99651 (+/- 0.00322)
F-measure: 0.99779 (+/- 0.00262)


**Probe**

In [34]:
accuracy = cross_val_score(clf_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99670 (+/- 0.00195)
Precision: 0.99592 (+/- 0.00584)
Recall: 0.99334 (+/- 0.00673)
F-measure: 0.99431 (+/- 0.00684)


**U2R**

In [35]:
accuracy = cross_val_score(clf_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99765 (+/- 0.00260)
Precision: 0.98277 (+/- 0.06681)
Recall: 0.85937 (+/- 0.17639)
F-measure: 0.90403 (+/- 0.11054)


**R2L**

In [36]:
accuracy = cross_val_score(clf_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.97984 (+/- 0.00667)
Precision: 0.97478 (+/- 0.00849)
Recall: 0.96808 (+/- 0.01138)
F-measure: 0.97222 (+/- 0.00822)


In [37]:
from IPython.display import display
pd.set_option('display.max_columns', None)
!python3 zeek_anomaly_detector.py -a 20 -f conn.log

Simple Anomaly Detector for Zeek conn.log files. Version: 0.2
Author: Sebastian Garcia (eldraco@gmail.com), Veronica Valeros (vero.valeros@gmail.com)

Flows of the top anomalies
       id.orig_h  id.orig_p  ... resp_ip_bytes         score
270  10.0.90.215      49205  ...        313114  2.136671e+09
130  10.0.90.215      49224  ...        866188  8.076550e+08
269  10.0.90.215      49227  ...          6102  7.894519e+08
77   10.0.90.215      49204  ...        835508  7.688101e+08
219  10.0.90.215      49279  ...        643041  5.926922e+08
113  10.0.90.215      49214  ...        643001  5.922629e+08
141  10.0.90.215      49235  ...        542056  4.958893e+08
235  10.0.90.215      49291  ...        519275  4.721308e+08
123  10.0.90.215      49220  ...        519073  4.710229e+08
180    10.0.90.9      49155  ...             0  3.163401e+08
75   10.0.90.215      49194  ...          2253  3.159937e+08
135  10.0.90.215      49231  ...          2196  3.159202e+08
57   10.0.90

In [38]:
from zat.log_to_dataframe import LogToDataFrame
log_to_df = LogToDataFrame()
zeek_df = log_to_df.create_dataframe('dns.log')
pd.set_option('display.max_columns', None)
zeek_df

Unnamed: 0_level_0,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,trans_id,rtt,query,qclass,qclass_name,qtype,qtype_name,rcode,rcode_name,AA,TC,RD,RA,Z,answers,TTLs,rejected
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2019-03-19 01:44:55.654616064,CPB1V03iUT0K033Nvd,10.0.90.215,63262,10.0.90.9,53,udp,49344,0 days 00:00:00.000444,_ldap._tcp.default-first-site-name._sites.dc._...,1,C_INTERNET,33,SRV,0,NOERROR,T,F,T,T,0,littletigers-dc.littletigers.info,600.000000,F
2019-03-19 01:44:55.657289984,ChWH8K1OKSi0mN05Rk,10.0.90.215,55332,10.0.90.9,53,udp,43386,0 days 00:00:00.000169,littletigers-dc.littletigers.info,1,C_INTERNET,1,A,0,NOERROR,T,F,T,T,0,10.0.90.9,3600.000000,F
2019-03-19 01:44:55.853591040,CIZgTz2rKj4HcYbW36,10.0.90.215,53248,10.0.90.9,53,udp,52321,0 days 00:00:00.000177,_ldap._tcp.default-first-site-name._sites.fore...,1,C_INTERNET,33,SRV,0,NOERROR,T,F,T,T,0,littletigers-dc.littletigers.info,600.000000,F
2019-03-19 01:44:55.922339072,CQ3Qc14uDmuNV9fYA,10.0.90.215,64687,10.0.90.9,53,udp,19099,0 days 00:00:00.000277,_ldap._tcp.default-first-site-name._sites.litt...,1,C_INTERNET,33,SRV,0,NOERROR,T,F,T,T,0,littletigers-dc.littletigers.info,600.000000,F
2019-03-19 01:44:55.961795072,Cdp8py3610OKbdw9mf,10.0.90.215,59191,10.0.90.9,53,udp,10437,0 days 00:00:00.000205,_ldap._tcp.default-first-site-name._sites.doma...,1,C_INTERNET,33,SRV,0,NOERROR,T,F,T,T,0,littletigers-dc.littletigers.info,600.000000,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-03-19 04:54:32.000838912,CCwik94gCNsV0e3UYb,10.0.90.215,59497,10.0.90.9,53,udp,50462,0 days 00:00:00.024172,dns.msftncsi.com,1,C_INTERNET,1,A,0,NOERROR,F,F,T,T,0,131.107.255.255,1.000000,F
2019-03-19 04:54:32.025595904,CgD6UY2sxrSKb1Pu9c,10.0.90.215,51962,10.0.90.9,53,udp,44250,NaT,dns.msftncsi.com,1,C_INTERNET,28,AAAA,0,NOERROR,F,F,T,F,0,,,T
2019-03-19 04:57:23.728198912,CbROLz4PESkUJZJe35,10.0.90.215,55354,10.0.90.9,53,udp,46204,NaT,_ldap._tcp.default-first-site-name._sites.litt...,1,C_INTERNET,33,SRV,3,NXDOMAIN,F,F,T,F,0,,,F
2019-03-19 04:57:23.729139968,CVC7Ob3Jle36DxNz3l,10.0.90.215,61628,10.0.90.9,53,udp,18060,NaT,_ldap._tcp.littletigers-dc.littletigers.info,1,C_INTERNET,33,SRV,3,NXDOMAIN,F,F,T,F,0,,,F
