# 机器学习算法模型

## 1. 导入包

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## 2. 定义预处理函数

In [2]:
def convertstringtonumber(df, lst):
    """Encode category labels as integers.

    Every value in ``df`` equal to ``lst[i]`` is replaced by ``i``; values
    that do not appear in ``lst`` are left untouched.

    Args:
        df: pandas DataFrame or Series to encode. It is not modified.
        lst: Sequence of hashable category labels (strings in this file);
            a label's position in the sequence becomes its integer code.

    Returns:
        A new object with the labels replaced, or ``df`` itself when ``lst``
        is empty.
    """
    # When a label is listed more than once the first position wins, which is
    # the result replacing the labels one at a time would also produce.
    codes = {}
    for code, label in enumerate(lst):
        codes.setdefault(label, code)
    if not codes:
        return df
    # A single list-to-list replace handles every label in one pass instead
    # of copying the whole frame once per label.
    return df.replace(list(codes), list(codes.values()))

def scalex(X, nmin=0.0, nmax=1.0):
    """Min-max scale ``X`` linearly into the range ``[nmin, nmax]``.

    Args:
        X: Numeric pandas Series/DataFrame or NumPy array. The range is
            whatever ``X.min()`` / ``X.max()`` return for that type.
        nmin: Value the minimum of ``X`` is mapped to (default 0.0).
        nmax: Value the maximum of ``X`` is mapped to (default 1.0).

    Returns:
        The scaled values, same shape as ``X``. Where the input is constant
        (max equals min) the result is ``nmin`` instead of the NaN that a
        0/0 division would give.
    """
    lowest = X.min()
    span = X.max() - lowest
    # Guard against 0/0 for constant input by dividing by 1 instead.
    if np.ndim(span) == 0:
        if span == 0:
            span = 1
    else:
        # Non-scalar span: presumably a pandas Series of per-column ranges
        # (DataFrame input) — only the zero entries are replaced.
        span = span.mask(span == 0, 1)
    X_std = (X - lowest) / span
    X_scaled = X_std * (nmax - nmin) + nmin
    return X_scaled

## 3. 读取数据集

In [3]:
# Names for the 43 columns of the data files: 41 features, then the
# 'attack_type' label, then a trailing 'Class' column.
col_names = [
    "Duration", "Protocol_type", "Service", "Flag",
    "Src_bytes", "Dst_bytes", "Land", "Wrong_fragment",
    "Urgent", "Hot", "Num_failed_logins", "Logged_in",
    "Num_compromised", "Root_shell", "Su_attempted", "Num_root",
    "Num_file_creations", "Num_shells", "Num_access_files", "Num_outbound_cmds",
    "Is_hot_login", "Is_guest_login", "Count", "Srv_count",
    "Serror_rate", "Srv_serror_rate", "Rerror_rate", "Srv_rerror_rate",
    "Same_srv_rate", "Diff_srv_rate", "Srv_diff_host_rate", "Dst_host_count",
    "Dst_host_srv_count", "Dst_host_same_srv_rate", "Dst_host_diff_srv_rate",
    "Dst_host_same_src_port_rate", "Dst_host_srv_diff_host_rate",
    "Dst_host_serror_rate", "Dst_host_srv_serror_rate", "Dst_host_rerror_rate",
    "Dst_host_srv_rerror_rate", "attack_type", "Class",
]

# The files are read without a header row, so the names are attached afterwards.
df_train, df_test = (
    pd.read_csv(path, sep=",", header=None)
    for path in ("./dataset/KDDTrain+.txt", "./dataset/KDDTest+.txt")
)
df_train.columns = df_test.columns = col_names

# The 'Class' column is not used by the rest of the notebook.
df_train, df_test = (
    frame.drop(columns=['Class']) for frame in (df_train, df_test)
)

print('训练集维度:', df_train.shape)
print('测试集维度:', df_test.shape)

训练集维度: (125973, 42)
测试集维度: (22544, 42)


## 4. 数据预处理

In [None]:
# Category vocabularies: a label's position in its list is its integer code.
protocol_type = ['icmp', 'tcp', 'udp']
service = ['IRC','X11','Z39_50','aol','auth','bgp','courier','csnet_ns','ctf',
           'daytime','discard','domain','domain_u','echo','eco_i','ecr_i','efs',
           'exec','finger','ftp','ftp_data','gopher','harvest','hostnames','http',
           'http_2784','http_443','http_8001','imap4','iso_tsap','klogin','kshell',
           'ldap','link','login','mtp','name','netbios_dgm','netbios_ns',
           'netbios_ssn','netstat','nnsp','nntp','ntp_u','other','pm_dump','pop_2',
           'pop_3','printer','private','red_i','remote_job','rje','shell','smtp',
           'sql_net','ssh','sunrpc','supdup','systat','telnet','tftp_u','tim_i',
           'time','urh_i','urp_i','uucp','uucp_path','vmnet','whois']
flag = ['OTH','REJ','RSTO','RSTOS0','RSTR','S0','S1','S2','S3','SF','SH']

# Encode each categorical column with its own vocabulary only, so a label can
# never be rewritten in an unrelated column (for example the attack label).
for frame in (df_train, df_test):
    frame["Protocol_type"] = convertstringtonumber(frame["Protocol_type"], protocol_type)
    frame["Service"] = convertstringtonumber(frame["Service"], service)
    frame["Flag"] = convertstringtonumber(frame["Flag"], flag)

# Min-max scale every feature whose training maximum exceeds 1.
# The range comes from the training set and is reused for the test set, so the
# same raw value maps to the same model input in both splits. Scaling each
# split by its own range (and deciding per split whether to scale at all)
# would put the two splits on different scales. Test values outside the
# training range consequently fall outside [0, 1].
for m in col_names[:-2]:  # the last two names are 'attack_type' and 'Class'
    train_min, train_max = df_train[m].min(), df_train[m].max()
    if not train_max > 1:
        continue
    if train_max == train_min:
        # Constant in training: the feature carries no information, so it is
        # collapsed to the constant 1 in both splits.
        df_train[m] = np.int64(1)
        df_test[m] = np.int64(1)
    else:
        df_test[m] = (df_test[m] - train_min) / (train_max - train_min)
        df_train[m] = scalex(df_train[m])

df_test.head(5)

In [None]:
labeldf_train = df_train['attack_type']
labeldf_test = df_test['attack_type']

# Binary target: 0 for 'normal', 1 for every listed attack name.
# The mapping is defined once and applied to both splits so the two can never
# drift apart. The line grouping is kept from the original; it presumably
# follows the R2L / Probe / DoS / U2R attack families — confirm before
# relying on it.
attack_to_binary = {
    'normal': 0,
    'ftp_write': 1, 'guess_passwd': 1, 'imap': 1, 'multihop': 1, 'phf': 1,
    'spy': 1, 'warezclient': 1, 'warezmaster': 1, 'sendmail': 1, 'named': 1,
    'snmpgetattack': 1, 'snmpguess': 1, 'xlock': 1, 'xsnoop': 1, 'httptunnel': 1,
    'ipsweep': 1, 'nmap': 1, 'portsweep': 1, 'satan': 1, 'mscan': 1, 'saint': 1,
    'neptune': 1, 'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,
    'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
    'buffer_overflow': 1, 'loadmodule': 1, 'perl': 1, 'rootkit': 1, 'ps': 1,
    'sqlattack': 1, 'xterm': 1,
}

# The explicit cast keeps the target integer-typed whether or not replace()
# downcasts the column, and fails loudly on an attack name missing from the
# mapping instead of leaving a string among the labels.
newlabeldf_train = labeldf_train.replace(attack_to_binary).astype('int64')
newlabeldf_test = labeldf_test.replace(attack_to_binary).astype('int64')

df_train['attack_type'] = newlabeldf_train
df_test['attack_type'] = newlabeldf_test

df_test.head(5)

## 5. 拆分数据集

In [None]:
# Binary classification: the label is the last column (the multi-class labels
# are used later); every column before it is a feature.
train_X, train_y = df_train.iloc[:, :-1], df_train.iloc[:, -1]
test_X, test_y = df_test.iloc[:, :-1], df_test.iloc[:, -1]

In [None]:
# Preview the first five rows of the training features.
train_X.head(5)

In [None]:
# Preview the first five training labels.
train_y.head(5)

## 6. ML 实验

### 导入模型

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [10]:
# Baseline classifiers for the binary task; seeds are fixed where the
# constructor call passes one.
DT_b = DecisionTreeClassifier(random_state=0)  # decision tree
RF_b = RandomForestClassifier(n_estimators=10, random_state=0)  # random forest
# Gradient-boosted trees.
# - loss is left at its default: the former loss='deviance' was the old name
#   of that default and is rejected by newer scikit-learn releases.
# - max_features='sqrt' replaces 'auto', which newer releases also reject and
#   which meant sqrt(n_features) for this classifier.
GB_b = GradientBoostingClassifier(learning_rate=0.5, n_estimators=75, max_features='sqrt', random_state=0)
SVM_b = svm.SVC()  # support vector machine
NB_b = BernoulliNB()  # naive Bayes
KNN_b = KNeighborsClassifier(n_jobs=1)  # k-nearest neighbours
MLP_b = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), max_iter=1000, random_state=1)  # multi-layer perceptron

### 6.1 决策树

In [None]:
# Fit the decision tree on the training split, predict the test split and
# print the classification report with 4 decimal places.
DT_pred_y = DT_b.fit(train_X, train_y).predict(test_X)
DT_t = classification_report(test_y, DT_pred_y, digits=4)
divider = "______________________________________________________________"
print(divider, "DT分类结果：", DT_t, divider, sep="\n")

### 6.2 随机森林

In [None]:
# Fit the random forest on the training split, predict the test split and
# print the classification report with 4 decimal places.
RF_pred_y = RF_b.fit(train_X, train_y).predict(test_X)
RF_t = classification_report(test_y, RF_pred_y, digits=4)
divider = "______________________________________________________________"
print(divider, "RF分类结果：", RF_t, divider, sep="\n")

### 6.3 梯度提升树

In [None]:
# Fit the gradient-boosted trees on the training split, predict the test
# split and print the classification report with 4 decimal places.
GB_pred_y = GB_b.fit(train_X, train_y).predict(test_X)
GB_t = classification_report(test_y, GB_pred_y, digits=4)
divider = "______________________________________________________________"
print(divider, "GBDT分类结果：", GB_t, divider, sep="\n")

### 6.4 支持向量机

In [None]:
# Fit the SVM on the training split and report its test-split performance.
# The earlier SVM_b.score(train_X, train_y) call was dropped: its result was
# neither stored nor printed, so it only added a full prediction pass over the
# training set. Print that expression explicitly if training accuracy is
# wanted.
SVM_b.fit(train_X, train_y)
SVM_pred_y = SVM_b.predict(test_X)
SVM_t = classification_report(test_y, SVM_pred_y, digits=4)
print("______________________________________________________________")
print("SVM分类结果：")
print(SVM_t)
print("______________________________________________________________")

### 6.5 朴素贝叶斯

In [None]:
# Fit the Bernoulli naive Bayes model on the training split, predict the test
# split and print the classification report with 4 decimal places.
NB_pred_y = NB_b.fit(train_X, train_y).predict(test_X)
NB_t = classification_report(test_y, NB_pred_y, digits=4)
divider = "______________________________________________________________"
print(divider, "NB分类结果：", NB_t, divider, sep="\n")

### 6.6 K-NN

In [None]:
# Fit the k-nearest-neighbours model on the training split, predict the test
# split and print the classification report with 4 decimal places.
KNN_pred_y = KNN_b.fit(train_X, train_y).predict(test_X)
KNN_t = classification_report(test_y, KNN_pred_y, digits=4)
divider = "______________________________________________________________"
print(divider, "K-NN分类结果：", KNN_t, divider, sep="\n")

### 6.7 多层感知机（MLP ANN）

In [None]:
# Fit the multi-layer perceptron on the training split, predict the test
# split and print the classification report with 4 decimal places.
MLP_pred_y = MLP_b.fit(train_X, train_y).predict(test_X)
MLP_t = classification_report(test_y, MLP_pred_y, digits=4)
divider = "______________________________________________________________"
print(divider, "MLP分类结果：", MLP_t, divider, sep="\n")