In [1]:
import json
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Location of the training CSV. The default is the original machine-specific
# path; set the TRAIN_MOSAIC_CSV environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get(
    'TRAIN_MOSAIC_CSV',
    r'C:\Users\Mark Silla\Downloads\archiv\train_mosaic.csv',
)

# load dataset
df = pd.read_csv(DATA_PATH)

# set input matrix and target column: every column except 'Label' is a feature
x_cols = [c for c in df.columns if c != 'Label']
X = df[x_cols]
y = df['Label']

# show first rows of data
df.head()

Unnamed: 0,Destination_Port,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Total_Length_of_Fwd_Packets,Total_Length_of_Bwd_Packets,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,...,min_seg_size_forward,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label
0,80,101168794,20,1,969,0,353,0,48.45,119.083551,...,0.0,739228.5,743103.4661,1264682.0,213775.0,49700000.0,41400000.0,79000000.0,20500000.0,DoS Hulk
1,60711,58,1,1,0,0,0,0,0.0,0.0,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,53,31146,4,2,148,244,37,37,37.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,80,254704,3,4,429,389,423,0,143.0,242.50567,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,443,11932077,12,16,5030,15703,1525,0,419.166667,644.896586,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [13]:
# look at a single sample (index label 1) as a {column: value} mapping
sample_row = X.loc[1]
dict(sample_row)

{'Destination_Port': 60711.0,
 'Flow_Duration': 58.0,
 'Total_Fwd_Packets': 1.0,
 'Total_Backward_Packets': 1.0,
 'Total_Length_of_Fwd_Packets': 0.0,
 'Total_Length_of_Bwd_Packets': 0.0,
 'Fwd_Packet_Length_Max': 0.0,
 'Fwd_Packet_Length_Min': 0.0,
 'Fwd_Packet_Length_Mean': 0.0,
 'Fwd_Packet_Length_Std': 0.0,
 'Bwd_Packet_Length_Max': 0.0,
 'Bwd_Packet_Length_Min': 0.0,
 'Bwd_Packet_Length_Mean': 0.0,
 'Bwd_Packet_Length_Std': 0.0,
 'Flow_Bytes_Sec': 2.0,
 'Flow_Packets_Sec': 176182.0,
 'Flow_IAT_Mean': 58.0,
 'Flow_IAT_Std': 0.0,
 'Flow_IAT_Max': 58.0,
 'Flow_IAT_Min': 58.0,
 'Fwd_IAT_Total': 0.0,
 'Fwd_IAT_Mean': 0.0,
 'Fwd_IAT_Std': 0.0,
 'Fwd_IAT_Max': 0.0,
 'Fwd_IAT_Min': 0.0,
 'Bwd_IAT_Total': 0.0,
 'Bwd_IAT_Mean': 0.0,
 'Bwd_IAT_Std': 0.0,
 'Bwd_IAT_Max': 0.0,
 'Bwd_IAT_Min': 0.0,
 'Fwd_PSH_Flags': 0.0,
 'Bwd_PSH_Flags': 0.0,
 'Fwd_URG_Flags': 0.0,
 'Bwd_URG_Flags': 0.0,
 'Fwd_Header_Length': 32.0,
 'Bwd_Header_Length': 32.0,
 'Fwd_Packets_Sec': 17241.37931,
 'Bwd_Packets_Sec':

In [3]:
# split features and labels into train / test partitions
# (test_size=0.3 holds out 30% of the rows; the fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

In [7]:
# fill missing values with the per-column mode of the training set
# DataFrame.mode() returns one row per tied mode and pads columns that have
# fewer modes with NaN. The first row therefore holds a real mode for every
# column that has any non-missing value, whereas the last row can be NaN
# (or an arbitrary largest tied mode), which would leave gaps unfilled.
train_mode = dict(X_train.mode().iloc[0])
X_train = X_train.fillna(train_mode)
print(train_mode)

{'Destination_Port': 80.0, 'Flow_Duration': 3.0, 'Total_Fwd_Packets': 2.0, 'Total_Backward_Packets': 1.0, 'Total_Length_of_Fwd_Packets': 0.0, 'Total_Length_of_Bwd_Packets': 0.0, 'Fwd_Packet_Length_Max': 168.0, 'Fwd_Packet_Length_Min': 0.0, 'Fwd_Packet_Length_Mean': 0.0, 'Fwd_Packet_Length_Std': 0.0, 'Bwd_Packet_Length_Max': 0.0, 'Bwd_Packet_Length_Min': 0.0, 'Bwd_Packet_Length_Mean': 0.0, 'Bwd_Packet_Length_Std': 0.0, 'Flow_Bytes_Sec': 0.0, 'Flow_Packets_Sec': 219948.0, 'Flow_IAT_Mean': 3.0, 'Flow_IAT_Std': 0.0, 'Flow_IAT_Max': 3.0, 'Flow_IAT_Min': 3.0, 'Fwd_IAT_Total': 0.0, 'Fwd_IAT_Mean': 0.0, 'Fwd_IAT_Std': 0.0, 'Fwd_IAT_Max': 0.0, 'Fwd_IAT_Min': 3.0, 'Bwd_IAT_Total': 0.0, 'Bwd_IAT_Mean': 0.0, 'Bwd_IAT_Std': 0.0, 'Bwd_IAT_Max': 0.0, 'Bwd_IAT_Min': 0.0, 'Fwd_PSH_Flags': 0.0, 'Bwd_PSH_Flags': 0.0, 'Fwd_URG_Flags': 0.0, 'Bwd_URG_Flags': 0.0, 'Fwd_Header_Length': 40.0, 'Bwd_Header_Length': 40.0, 'Fwd_Packets_Sec': 666666.6667, 'Bwd_Packets_Sec': 0.0, 'Min_Packet_Length': 0.0, 'Max_Packe

In [5]:
# train the Random Forest algorithm
# random_state pins the forest's internal randomness so that re-running the
# notebook rebuilds the same model (same seed value as the train/test split).
rf = RandomForestClassifier(n_estimators=100, random_state=1234)
rf = rf.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# train the Extra Trees algorithm
# random_state pins the randomized tree construction so that re-running the
# notebook rebuilds the same model (same seed value as the train/test split).
et = ExtraTreesClassifier(n_estimators=100, random_state=1234)
et = et.fit(X_train, y_train)

In [None]:
# save preprocessing objects and both trained classifiers
# No visible cell defines `encoders` (the feature values shown in the outputs
# are all numeric), so fall back to an empty mapping instead of failing with
# NameError before the models are written. An `encoders` object defined
# elsewhere in the session is saved unchanged.
if "encoders" not in globals():
    encoders = {}

joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(rf, "./random_forest.joblib", compress=True)
joblib.dump(et, "./extra_trees.joblib", compress=True)