### Imports

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [13]:
# Read the NSL-KDD train and test files.
# header=None: the first line of each file is treated as a data row, not as
# column names (the labels are assigned separately).
training_set = './nslkdd/KDDTrain+.txt'
test_set = './nslkdd/KDDTest+.txt'

trainDf, testDf = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (training_set, test_set)
)


In [14]:
# Column labels for the header-less data files, in file order.
# The last two entries are the attack label and the 'level' column.
columns = [
    'duration',
    'protocol_type',
    'service',
    'status_flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'long_count',
    'srv_long_count',
    'long_serror_rate',
    'srv_long_serror_rate',
    'long_rerror_rate',
    'srv_long_rerror_rate',
    'long_same_srv_rate',
    'long_diff_srv_rate',
    'srv_long_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'attack',
    'level',
]

# Columns removed before modelling: 'level', every 'dst_host_*' statistic
# (taken from `columns`, so they stay in file order) and 'num_shells'.
drop_columns = (
    ['level']
    + [label for label in columns if label.startswith('dst_host_')]
    + ['num_shells']
)

# Give both frames the same labels; pandas raises a ValueError if a frame's
# column count differs from len(columns).
trainDf.columns = testDf.columns = columns

#### Preprocessing

Collapse the `attack` label into a binary flag and drop the columns that are not used as features.

In [15]:

# Binary target: 0 where the label is exactly 'normal', 1 for any other label
is_attack = (trainDf['attack'] != 'normal').astype('int64')
test_attack = (testDf['attack'] != 'normal').astype('int64')

trainDf['attack_flag'] = is_attack
testDf['attack_flag'] = test_attack

# Remove the unused columns together with the raw 'attack' label, which
# attack_flag now stands in for
trainDf.drop(columns=[*drop_columns, 'attack'], inplace=True)
testDf.drop(columns=[*drop_columns, 'attack'], inplace=True)

# view the result
trainDf.head()

Unnamed: 0,duration,protocol_type,service,status_flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,long_count,srv_long_count,long_serror_rate,srv_long_serror_rate,long_rerror_rate,srv_long_rerror_rate,long_same_srv_rate,long_diff_srv_rate,srv_long_diff_host_rate,attack_flag
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,0,udp,other,SF,146,0,0,0,0,0,...,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,0
2,0,tcp,private,S0,0,0,0,0,0,0,...,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,1
3,0,tcp,http,SF,232,8153,0,0,0,0,...,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,0
4,0,tcp,http,SF,199,420,0,0,0,0,...,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,0


In [16]:
# Distinct values of the service column, in order of first appearance
trainDf['service'].unique()

array(['ftp_data', 'other', 'private', 'http', 'remote_job', 'name',
       'netbios_ns', 'eco_i', 'mtp', 'telnet', 'finger', 'domain_u',
       'supdup', 'uucp_path', 'Z39_50', 'smtp', 'csnet_ns', 'uucp',
       'netbios_dgm', 'urp_i', 'auth', 'domain', 'ftp', 'bgp', 'ldap',
       'ecr_i', 'gopher', 'vmnet', 'systat', 'http_443', 'efs', 'whois',
       'imap4', 'iso_tsap', 'echo', 'klogin', 'link', 'sunrpc', 'login',
       'kshell', 'sql_net', 'time', 'hostnames', 'exec', 'ntp_u',
       'discard', 'nntp', 'courier', 'ctf', 'ssh', 'daytime', 'shell',
       'netstat', 'pop_3', 'nnsp', 'IRC', 'pop_2', 'printer', 'tim_i',
       'pm_dump', 'red_i', 'netbios_ssn', 'rje', 'X11', 'urh_i',
       'http_8001', 'aol', 'http_2784', 'tftp_u', 'harvest'], dtype=object)

In [17]:
# Integer-encode the categorical columns.
#
# Codes are learned from the training frame only (numbered by first
# appearance) and applied to both frames. A category that does not occur in
# the training data is encoded as len(mapping) instead of being left as a raw
# string, so both frames end up with int64 columns. The literal fallbacks
# 3 / 11 used when external records are scored further down presumably follow
# the same "one past the last code" convention -- verify against the map sizes.


def _encode_with_codes(values, codes):
    """Return `values` with every category replaced by its integer code.

    Parameters
    ----------
    values : pandas.Series
        Column of raw category labels.
    codes : dict
        Mapping from category label to integer code.

    Returns
    -------
    pandas.Series
        int64 Series aligned with `values`; labels missing from `codes`
        are encoded as len(codes).
    """
    unknown_code = len(codes)
    return values.map(lambda label: codes.get(label, unknown_code)).astype('int64')


# Guard against re-running the cell: once 'service' is gone the frames are
# already encoded, and rebuilding the maps from integers would overwrite the
# saved .npy files with useless identity maps.
if 'service' not in trainDf.columns:
    raise RuntimeError(
        'trainDf is already encoded; reload the data before re-running this cell'
    )

# Map the protocol types to integers
protocol_map = {
    label: code for code, label in enumerate(trainDf.protocol_type.unique())
}
trainDf['protocol_type'] = _encode_with_codes(trainDf.protocol_type, protocol_map)
testDf['protocol_type'] = _encode_with_codes(testDf.protocol_type, protocol_map)

# save map to file
# np.save stores a dict as a pickled 0-d object array; read it back with
# np.load('protocol_map.npy', allow_pickle=True).item()
np.save('protocol_map.npy', protocol_map)

# drop service column (it is not encoded and not used as a feature)
trainDf.drop('service', axis=1, inplace=True)
testDf.drop('service', axis=1, inplace=True)

# map the flags to integers
flag_map = {
    label: code for code, label in enumerate(trainDf.status_flag.unique())
}
trainDf['status_flag'] = _encode_with_codes(trainDf.status_flag, flag_map)
testDf['status_flag'] = _encode_with_codes(testDf.status_flag, flag_map)

# save map to file (same pickled-dict format as protocol_map.npy)
np.save('flag_map.npy', flag_map)

# view the result
trainDf.head()

Unnamed: 0,duration,protocol_type,status_flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,long_count,srv_long_count,long_serror_rate,srv_long_serror_rate,long_rerror_rate,srv_long_rerror_rate,long_same_srv_rate,long_diff_srv_rate,srv_long_diff_host_rate,attack_flag
0,0,0,0,491,0,0,0,0,0,0,...,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,0,1,0,146,0,0,0,0,0,0,...,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,0
2,0,0,1,0,0,0,0,0,0,0,...,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,1
3,0,0,0,232,8153,0,0,0,0,0,...,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,0
4,0,0,0,199,420,0,0,0,0,0,...,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,0


In [18]:
# Hold out 33% of the labelled training frame for evaluation; the fixed
# random_state keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    trainDf.drop(columns='attack_flag'),
    trainDf['attack_flag'],
    test_size=0.33,
    random_state=42,
)


In [19]:
# Create the model with 100 trees.
# random_state is fixed (same seed as the train/validation split) so that the
# fitted forest, and therefore the metrics printed below, are reproducible
# across kernel restarts.
model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    max_features='sqrt',
    random_state=42,
)

# Fit on training data
model.fit(X_train, y_train)

# Actual class predictions on the held-out part of the training data
rf_predictions = model.predict(X_test)

# accuracy
print("Accuracy:", accuracy_score(y_test, rf_predictions))
# classification report
print(classification_report(y_test, rf_predictions))



Accuracy: 0.9947320311748292
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     22193
           1       1.00      0.99      0.99     19379

    accuracy                           0.99     41572
   macro avg       0.99      0.99      0.99     41572
weighted avg       0.99      0.99      0.99     41572



In [20]:
# get importance of features
importances = list(model.feature_importances_)

# Pair every importance with its feature name. The names are taken from
# X_train (the frame the model was fitted on) rather than trainDf, which also
# contains the 'attack_flag' target and only lined up because zip() truncates.
feature_importances = [
    (feature, round(importance, 2))
    for feature, importance in zip(X_train.columns, importances)
]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)

# Print out the feature and importances. A plain loop is used so the cell does
# not also display a list of None values returned by print().
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: src_bytes            Importance: 0.25
Variable: dst_bytes            Importance: 0.17
Variable: status_flag          Importance: 0.11
Variable: long_same_srv_rate   Importance: 0.09
Variable: long_diff_srv_rate   Importance: 0.08
Variable: protocol_type        Importance: 0.05
Variable: long_count           Importance: 0.05
Variable: long_serror_rate     Importance: 0.05
Variable: logged_in            Importance: 0.04
Variable: srv_long_serror_rate Importance: 0.03
Variable: srv_long_count       Importance: 0.02
Variable: duration             Importance: 0.01
Variable: hot                  Importance: 0.01
Variable: num_compromised      Importance: 0.01
Variable: long_rerror_rate     Importance: 0.01
Variable: srv_long_rerror_rate Importance: 0.01
Variable: srv_long_diff_host_rate Importance: 0.01
Variable: land                 Importance: 0.0
Variable: wrong_fragment       Importance: 0.0
Variable: urgent               Importance: 0.0
Variable: num_failed_logins    Importanc

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [21]:
# Evaluate the fitted model on the separate test frame
test_features = testDf.drop(columns='attack_flag')
test_labels = testDf['attack_flag']
test_rf_predictions = model.predict(test_features)

# accuracy
print("Accuracy:", accuracy_score(test_labels, test_rf_predictions))

# classification report
print(classification_report(test_labels, test_rf_predictions))


Accuracy: 0.7850425833924769
              precision    recall  f1-score   support

           0       0.68      0.96      0.79      9711
           1       0.96      0.65      0.78     12833

    accuracy                           0.79     22544
   macro avg       0.82      0.81      0.78     22544
weighted avg       0.84      0.79      0.78     22544



In [22]:
# Read the external records to score. read_csv is called with its default
# header handling, so the first row of the file supplies the column names.
P_test_set_path = 'record.csv'
P_test_set = pd.read_csv(filepath_or_buffer=P_test_set_path)
P_test_set.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: 'record.csv'

In [None]:
# 'service' is not a model feature, so remove it from the external records too
P_test_set.drop(columns=['service'], inplace=True)

In [None]:
# Encode the categorical columns of the external records with the maps built
# from the training data (protocol_map / flag_map).
#
# Series.map performs the lookup in one pass and yields int64 columns. A value
# that is missing from a map is encoded as len(map), i.e. one past the last
# known code, instead of relying on hard-coded literals that would silently go
# stale if the training categories change.

# Map the protocol types to integers
protocol_unknown = len(protocol_map)
P_test_set['protocol_type'] = P_test_set.protocol_type.map(
    lambda label: protocol_map.get(label, protocol_unknown)
).astype('int64')

# Map the flags to integers
flag_unknown = len(flag_map)
P_test_set['status_flag'] = P_test_set.status_flag.map(
    lambda label: flag_map.get(label, flag_unknown)
).astype('int64')

# test set predictions
# NOTE(review): assumes record.csv carries exactly the model's feature columns
# in training order -- confirm against X_train.columns.
P_test_rf_predictions = model.predict(P_test_set)

In [None]:
# Tally the predicted classes (0 = normal, 1 = attack), most frequent first
pd.DataFrame(data=P_test_rf_predictions).value_counts(sort=True)

0    46
1     3
Name: count, dtype: int64

In [None]:
# Persist the fitted classifier to model.pkl.
# Only unpickle this file from a trusted source: loading a pickle can execute
# arbitrary code.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)