In [1]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Column names for the header-less NSL-KDD files, in file order:
# 41 feature columns followed by "label" and "level".
col_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes",
    "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
    "num_failed_logins", "logged_in", "num_compromised", "root_shell",
    "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login",
    "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate", "label", "level",
]


def _read_split(path):
    """Read one dataset split and return ``(features, labels)``.

    The file is parsed without a header row, using ``col_names`` as the
    column names. ``features`` is every column except the last two and
    ``labels`` is the second-to-last column ("label").
    """
    frame = pd.read_csv(path, header=None, names=col_names)
    return frame.iloc[:, :-2], frame.iloc[:, -2]


X_train, y_train = _read_split('dataset/NSL-KDD/KDDTrain+.txt')
X_test, y_test = _read_split('dataset/NSL-KDD/KDDTest+.txt')

In [3]:
# One-hot encode the three categorical feature columns.
categorical_cols = ["protocol_type", "service", "flag"]

# The category lists are pinned to the values present in the training split
# (in order of first appearance), so both splits produce the same output
# columns in the same order.
enc = OneHotEncoder(categories=[X_train[col].unique() for col in categorical_cols])

# Fit on the training split only. The test split is transformed with the
# already-fitted encoder rather than refitting it on held-out data.
# The source index is carried over so a later index-based join lines up
# row-for-row even if the frames do not have a default RangeIndex.
X_train_enc = pd.DataFrame(
    enc.fit_transform(X_train[categorical_cols]).toarray(),
    index=X_train.index,
)
X_test_enc = pd.DataFrame(
    enc.transform(X_test[categorical_cols]).toarray(),
    index=X_test.index,
)

# Convert the positional integer column labels to strings so they are the
# same type as the named feature columns they will sit next to.
X_train_enc.columns = X_train_enc.columns.astype(str)
X_test_enc.columns = X_test_enc.columns.astype(str)

In [4]:
# Replace the raw categorical columns with their one-hot encoded versions.
# DataFrame.join matches rows on the index.
_raw_categoricals = ["protocol_type", "service", "flag"]

X_train = X_train.drop(columns=_raw_categoricals).join(X_train_enc)
X_test = X_test.drop(columns=_raw_categoricals).join(X_test_enc)

In [5]:
# Maps an integer class id to the raw attack labels belonging to that class.
# The 'normal' label is not listed here.
attack_category_dict = {
    # 1: DoS (denial of service)
    1: ['apache2', 'back', 'land', 'neptune', 'mailbomb', 'pod',
        'processtable', 'smurf', 'teardrop', 'udpstorm', 'worm'],
    # 2: Probe (scanning / reconnaissance)
    2: ['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan'],
    # 3: U2R (user-to-root privilege escalation)
    3: ['buffer_overflow', 'loadmodule', 'perl', 'ps', 'rootkit',
        'sqlattack', 'xterm'],
    # 4: R2L (remote-to-local access)
    4: ['ftp_write', 'guess_passwd', 'httptunnel', 'imap', 'multihop',
        'named', 'phf', 'sendmail', 'snmpgetattack', 'spy', 'snmpguess',
        'warezclient', 'warezmaster', 'xlock', 'xsnoop'],
}

In [6]:
def get_key_from_value(value):
    """Translate a raw attack label into its integer class id.

    Args:
        value: Label string to classify.

    Returns:
        ``0`` when ``value`` is ``'normal'``; otherwise the key of the
        ``attack_category_dict`` entry whose list contains ``value``.

    Raises:
        ValueError: If ``value`` is neither ``'normal'`` nor present in any
            list of ``attack_category_dict``.
    """
    if value == 'normal':
        return 0

    for key, vals in attack_category_dict.items():
        if value in vals:
            return key

    # Fail loudly instead of falling through and returning None, which would
    # silently put missing values into the target column.
    raise ValueError(f"unknown attack label: {value!r}")

# Replace the string labels of both splits with their integer class ids.
y_train, y_test = [
    labels.apply(get_key_from_value) for labels in (y_train, y_test)
]

In [7]:
# Model settings, named so they are easy to find and tune.
# NOTE(review): 10000 trees is a very large forest and fitting is slow;
# confirm this size is intended.
N_ESTIMATORS = 10000
RANDOM_STATE = 42  # fixed seed so repeated runs build the same forest

# Standardise the features, then classify with a random forest.
pipe = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(
        n_estimators=N_ESTIMATORS,
        n_jobs=-1,  # build trees in parallel on all available cores
        random_state=RANDOM_STATE,
    ),
    verbose=True,  # print the elapsed time of each step as it finishes fitting
)

In [None]:
# Fit every pipeline step on the training split.
pipe.fit(X=X_train, y=y_train)

[Pipeline] .... (step 1 of 2) Processing standardscaler, total=   0.2s


In [None]:
# Score the fitted pipeline on the held-out test split.
pipe.score(X=X_test, y=y_test)