# 数据预处理

## 实验版本：
> 1. pandas版本 0.19.2   ---  0.22.0
> 2. numpy 版本 1.11.3   ---  1.14.5
> 3. anaconda 4.3.0   
> 4. sklearn版本 0.18.1  ---  0.19.2

In [1]:
import pandas as pd
import numpy as np
import sklearn
import os
import matplotlib
import time

In [2]:
print("Pandas 版本:", pd.__version__)
print("NumPy 版本:", np.__version__)
print("Scikit-learn 版本:", sklearn.__version__)
print("Matplotlib 版本:", matplotlib.__version__)

Pandas 版本: 0.22.0
NumPy 版本: 1.14.5
Scikit-learn 版本: 0.19.2
Matplotlib 版本: 2.1.2


In [3]:
import matplotlib.pyplot as plt

In [4]:
# 加载数据库的列名
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label","difficulty_level"]

# 加载训练集和测试集
train_path=r"F:\Jupyter\kaggle\data\NSL-KDD\KDDTrain+.txt"
test_path=r"F:\Jupyter\kaggle\data\NSL-KDD\KDDTest+.txt"
df = pd.read_csv(train_path, header=None, names = col_names)
df_test = pd.read_csv(test_path, header=None, names = col_names)

df.drop('difficulty_level',inplace=True,axis=1)
df_test.drop('difficulty_level',inplace=True,axis=1)
#数据集的shape
print('Dimensions of the Training set:',df.shape)
print('Dimensions of the Test set:',df_test.shape)

Dimensions of the Training set: (125973, 42)
Dimensions of the Test set: (22544, 42)


# Step 1: 数据预处理:
   

## 识别分类特征（一共有四个离散特征，分为两类——protocol_type，service，flag;和label）
>protocol_type 有三种：TCP, UDP, ICMP 

>service：目标主机的网络服务类型，一共有七十种；训练集有66种，测试集有64种 

>flag：一共有11种 

In [5]:
#  protocol_type (column 2), service (column 3), flag (column 4).这三个特征是离散的，非二进制的，
# 遍历离散型特征
print('Training set:')
for col_name in df.columns:
    if df[col_name].dtypes == 'object' :
        unique_cat = len(df[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

print('Testing set:')        
for col_name in df_test.columns:
    if df_test[col_name].dtypes == 'object' :
        unique_cat = len(df_test[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))     

Training set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 70 categories
Feature 'flag' has 11 categories
Feature 'label' has 23 categories
Testing set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 64 categories
Feature 'flag' has 11 categories
Feature 'label' has 38 categories


## service : 目标主机的网络服务类型，离散类型，共有70种。
aol, auth, bgp, courier, csnet_ns, ctf, daytime, discard, domain, domain_u, echo, eco_i, ecr_i, efs, exec, finger, ftp, ftp_data, gopher, harvest, hostnames, http, http_2784, http_443, http_8001, imap4, IRC, iso_tsap, klogin, kshell, ldap, link, login, mtp, name, netbios_dgm, netbios_ns, netbios_ssn, netstat, nnsp, nntp, ntp_u, other, pm_dump, pop_2, pop_3, printer, private, red_i, remote_job, rje, shell, smtp, sql_net, ssh, sunrpc, supdup, systat, telnet, tftp_u, tim_i, time, urh_i, urp_i, uucp, uucp_path, vmnet, whois, X11, Z39_50。

# 分离离散型特征

In [6]:
#df_categorical_values————存储了包含'protocol_type'，'service'和'flag'。的训练集数据
#testdf_categorical_values——'protocol_type'，'service'和'flag'的测试集数据
from sklearn.preprocessing import LabelEncoder,OneHotEncoder


categorical_columns=['protocol_type', 'service', 'flag'] 
#将这三个离散特征分离出来
df_categorical_values = df[categorical_columns]
testdf_categorical_values = df_test[categorical_columns]
df_categorical_values.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,ftp_data,SF
1,udp,other,SF
2,tcp,private,S0
3,tcp,http,SF
4,tcp,http,SF


In [7]:
testdf_categorical_values.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,private,REJ
1,tcp,private,REJ
2,tcp,ftp_data,SF
3,icmp,eco_i,SF
4,tcp,telnet,RSTO


### 对测试集和训练集每一种可能的取值都进行命名
dumcols:含有这三个特征的训练集        
testcumcols:有这三个特征的测试集        
对于每个分类特征（'protocol_type'，'service'，'flag'），代码通过调用unique()方法获取其所有唯一值，然后使用sorted()函数对这些值进行排序。这样可以确保新的列名的顺序与独热编码后的列的顺序一致。


In [8]:
# protocol type
unique_protocol=sorted(df.protocol_type.unique())
string1 = 'Protocol_type_'
unique_protocol2=[string1 + x for x in unique_protocol]
# service
unique_service=sorted(df.service.unique())
string2 = 'service_'
unique_service2=[string2 + x for x in unique_service]
# flag
unique_flag=sorted(df.flag.unique())
string3 = 'flag_'
unique_flag2=[string3 + x for x in unique_flag]
# 合并
dumcols=unique_protocol2 + unique_service2 + unique_flag2
print(dumcols)

#同理于测试集，由于测试集和训练集只有service有区别，所以只要对service特别处理
unique_service_test=sorted(df_test.service.unique())
unique_service2_test=[string2 + x for x in unique_service_test]
testdumcols=unique_protocol2 + unique_service2_test + unique_flag2

['Protocol_type_icmp', 'Protocol_type_tcp', 'Protocol_type_udp', 'service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_harvest', 'service_hostnames', 'service_http', 'service_http_2784', 'service_http_443', 'service_http_8001', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private', 'service_red_i', 'service_remote_job', 'serv

## 使用LabelEncoder()将离散特征转换为数字
LabelEncoder会为每个类别分配一个唯一的整数。这些整数是按照类别的字母顺序分配的，从0开始。

In [9]:
#df_categorical_values_enc————将dumcols中的类别标签转换为数值。
#testdf_categorical_values_enc同理
df_categorical_values_enc=df_categorical_values.apply(LabelEncoder().fit_transform)
print(df_categorical_values_enc.head())
# test set
testdf_categorical_values_enc=testdf_categorical_values.apply(LabelEncoder().fit_transform)

   protocol_type  service  flag
0              1       20     9
1              2       44     9
2              1       49     5
3              1       24     9
4              1       24     9


In [10]:
df_pre = df.copy()
df_pre.update(df_categorical_values_enc)
df_pre.head()




Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,1,20,9,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,2,44,9,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,1,49,5,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,1,24,9,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,1,24,9,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [11]:
df_test_pre=df_test.copy()
df_test_pre.update(testdf_categorical_values_enc)
df_test_pre.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,1,45,1,0,0,0,0,0,0,...,10,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune
1,0,1,45,1,0,0,0,0,0,0,...,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune
2,2,1,19,9,12983,0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal
3,0,0,13,9,20,0,0,0,0,0,...,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint
4,1,1,55,2,0,15,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan


In [12]:
 
label=df_pre['label']
label_test=df_test_pre['label']

#把.洗掉
label = label.str.rstrip('.')
label_test = label_test.str.rstrip('.')
newlabel=label.replace({'normal':0,
                        'neptune':1,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 
                        'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2,
                        'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'worm': 3,
                        'httptunnel': 4,'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})
newlabel_test=label_test.replace({'normal' : 0,
                                  'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 
                                 'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2,
                                 'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'worm': 3,
                                 'httptunnel': 4,'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})

df_pre['label'] = newlabel
df_test_pre['label'] = newlabel_test
# print(newdf['label'].tail())


In [13]:
df_pre.head()


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,1,20,9,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,0
1,0,2,44,9,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0
2,0,1,49,5,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1
3,0,1,24,9,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0
4,0,1,24,9,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [14]:
# 将数据集分割为特征（X）和目标变量（Y）
# assign X as a dataframe of feautures and Y as a series of outcome variables
X_pre = df_pre.drop('label',axis=1)
Y_pre = df_pre.label

X_pre_test = df_test_pre.drop('label',axis=1)
Y_pre_test = df_test_pre.label

In [15]:
colNames=list(X_pre)
colNames_test=list(X_pre_test)

In [16]:
# 计算训练集X的特征方差
variances_X = np.var(X_pre, axis=0)

# 计算测试集X_test的特征方差
variances_X_test = np.var(X_pre_test, axis=0)

# 检查是否有特征方差为零
zero_variance_features_X = np.where(variances_X == 0)[0]
zero_variance_features_X_test = np.where(variances_X_test == 0)[0]

if len(zero_variance_features_X) > 0:
    print("训练集X中有特征方差为零的特征：")
    for idx in zero_variance_features_X:
        print("特征名:", colNames[idx], "，索引:", idx)
        X_pre.drop(X_pre.columns[idx], axis=1, inplace=True)
else:
    print("训练集X中没有特征方差为零的特征。")

if len(zero_variance_features_X_test) > 0:
    print("测试集X_test中有特征方差为零的特征：")
    for idx in zero_variance_features_X_test:
        print("特征名:", colNames_test[idx], "，索引:", idx)
        X_pre_test.drop(X_pre_test.columns[idx], axis=1, inplace=True)
else:
    print("测试集X_test中没有特征方差为零的特征。")

训练集X中有特征方差为零的特征：
特征名: num_outbound_cmds ，索引: 19
测试集X_test中有特征方差为零的特征：
特征名: num_outbound_cmds ，索引: 19


## 标准化：StandardScaler() 
``` python
scaler1 = preprocessing.StandardScaler().fit(X_DoS) #这行代码创建了一个 StandardScaler 对象 scaler1，并使用 X_DoS 数据计算了均值和标准差。
X_DoS=scaler1.transform(X_DoS) #这行代码将 scaler1 的缩放参数应用于 X_DoS，实现标准化。标准化后的数据将具有零均值和单位标准差,并将结果保存回 X_DoS。
print(X_DoS.std(axis=0))#代码使用 std 函数检查了每种攻击类型的训练数据的标准差是否为1。如果数据已经正确地标准化，那么其标准差应该接近1。
```

In [17]:
from sklearn.preprocessing import MinMaxScaler

# 创建 MinMaxScaler 对象
scaler1 = MinMaxScaler()
scaler2 = MinMaxScaler()

# 对训练集特征进行缩放
X_scaled_train = scaler1.fit_transform(X_pre)

# 对测试集特征进行缩放（使用训练集的缩放参数）
X_scaled_test = scaler2.fit_transform(X_pre_test)


In [18]:
from sklearn.naive_bayes import MultinomialNB
start_time = time.time()

# all features
clf = MultinomialNB()
clf.fit(X_pre, Y_pre)
end_time = time.time()
# 计算执行时间
training_time = end_time - start_time
print(f"训练时间: {training_time:.2f} 秒")

训练时间: 0.27 秒


In [19]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
# 使用 cross_val_predict 进行交叉验证预测
y_pre_pred = cross_val_predict(clf, X_pre_test, Y_pre_test, cv=10)

# 混淆矩阵
pd.crosstab(Y_pre_test, y_pre_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])



Predicted attacks,0,1,2,3,4
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,6872,473,1999,201,166
1,0,1721,5216,518,3
2,32,1,2383,0,5
3,479,18,1798,234,225
4,49,13,111,7,20


In [20]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_auc_score,f1_score
print(classification_report(Y_pre_test, y_pre_pred, digits=3))

             precision    recall  f1-score   support

          0      0.925     0.708     0.802      9711
          1      0.773     0.231     0.355      7458
          2      0.207     0.984     0.342      2421
          3      0.244     0.085     0.126      2754
          4      0.048     0.100     0.065       200

avg / total      0.707     0.498     0.516     22544



In [21]:
from sklearn.metrics import confusion_matrix

# 计算混淆矩阵
conf_matrix = confusion_matrix(Y_pre_test, y_pre_pred)

# 提取各分类的真正例、假正例、真负例、假负例
true_positives = np.diag(conf_matrix)
false_positives = np.sum(conf_matrix, axis=0) - true_positives
false_negatives = np.sum(conf_matrix, axis=1) - true_positives
true_negatives = np.sum(conf_matrix) - (true_positives + false_positives + false_negatives)

# 计算各分类的准确率
class_accuracies = true_positives / (true_positives + false_negatives)

# 将整体准确率加入到列表中
overall_accuracy = np.sum(true_positives) / np.sum(conf_matrix)

# 创建包含准确率的数据帧
index = [str(i) for i in range(len(class_accuracies))] + ['Overall Accuracy']
df_acc = pd.DataFrame({'Accuracy': np.append(class_accuracies, overall_accuracy)}, index=index)

# 打印数据帧
print(df_acc)

                  Accuracy
0                 0.707651
1                 0.230759
2                 0.984304
3                 0.084967
4                 0.100000
Overall Accuracy  0.498137
