In [68]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Column names for the KDD connection records. The data file is read with
# header=None, so every name is supplied explicitly; the last entry, "label",
# holds the class of each record.
col_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment",
    "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label",
]

# Read the KDD data file (path is relative to the notebook's working directory).
kdd_csv_path = "kddcup.data_10_percent"
df = pd.read_csv(kdd_csv_path, names=col_names, header=None)

In [69]:
# Encode the textual connection label as an integer class:
#   0 = normal, 1 = smurf, 2 = neptune, 3 = any of the other listed attacks.
label_codes = {"normal.": 0, "smurf.": 1, "neptune.": 2}
other_attacks = [
    "back.", "satan.", "ipsweep.", "portsweep.", "warezclient.",
    "teardrop.", "pod.", "nmap.", "guess_passwd.", "buffer_overflow.",
    "land.", "warezmaster.", "imap.", "rootkit.", "loadmodule.",
    "ftp_write.", "multihop.", "phf.", "perl.", "spy.",
]
label_codes.update(dict.fromkeys(other_attacks, 3))

# One vectorised lookup replaces a separate boolean-mask assignment per label
# and yields a new Series, leaving df['label'] untouched.
labels = df['label'].map(label_codes)

# A label missing from the table maps to NaN. Stop here with a clear message
# rather than letting an unencoded value reach the classifiers.
unmapped = df.loc[labels.isna(), 'label'].unique()
if len(unmapped) > 0:
    raise ValueError("unmapped label values: {}".format(list(unmapped)))

# Every value is now a known code, so the Series can carry an integer dtype.
labels = labels.astype(int)

labels.value_counts()

1    280790
2    107201
0     97278
3      8752
Name: label, dtype: int64

In [70]:
# Remove the target column so the frame holds predictor columns only.
df = df.drop(columns=["label"])

# One-hot encode the three categorical columns; each resulting indicator
# column is named with the shared "Type_is" prefix.
categorical_cols = ["protocol_type", "service", "flag"]
df = pd.get_dummies(
    df,
    columns=categorical_cols,
    prefix=["Type_is"] * len(categorical_cols),
)

In [71]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
test_fraction = 0.2
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    df,
    labels,
    test_size=test_fraction,
    random_state=split_seed,
)

In [72]:
#define the decision tree
tree_classifier=DecisionTreeClassifier(criterion='gini',max_depth=5,max_features=72)

#check the trainning time of decision tree
%time tree_classifier.fit(X_train,y_train.astype('int'))

CPU times: user 2.27 s, sys: 257 ms, total: 2.53 s
Wall time: 1.36 s


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=72, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [73]:
# Measure how long the decision tree takes to predict the test split;
# the predictions are stored in pred_tree.
%time pred_tree=tree_classifier.predict(X_test)

CPU times: user 52.2 ms, sys: 44.6 ms, total: 96.8 ms
Wall time: 95.6 ms


In [74]:
# Time the accuracy computation for the decision tree's test predictions and
# display the score. The true labels are cast to int, as they were for training.
%time accuracy_score(y_test.astype('int'),pred_tree)

CPU times: user 8.99 ms, sys: 3.24 ms, total: 12.2 ms
Wall time: 11.3 ms


0.992864733566115

In [75]:
#define the random forest classifier
rf=RandomForestClassifier(max_depth=8,n_estimators=200,criterion="gini",max_features=72)

#check the trainning time of rf
%time rf.fit(X_train,y_train.astype('int'))

CPU times: user 4min 37s, sys: 2.02 s, total: 4min 39s
Wall time: 2min 31s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=8, max_features=72, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [76]:
# Measure how long the random forest takes to predict the test split;
# the predictions are stored in pred_rf.
%time pred_rf=rf.predict(X_test)

CPU times: user 2.02 s, sys: 65.9 ms, total: 2.09 s
Wall time: 1.1 s


In [77]:
# Time the accuracy computation for the random forest's test predictions and
# display the score. The true labels are cast to int, as they were for training.
%time accuracy_score(y_test.astype('int'),pred_rf)

CPU times: user 17.7 ms, sys: 3.22 ms, total: 20.9 ms
Wall time: 10.1 ms


0.9991802034310004