# 数据预处理

## 实验版本：
> 1. pandas版本 0.19.2   ---  0.22.0
> 2. numpy 版本 1.11.3   ---  1.14.5
> 3. anaconda 4.3.0   
> 4. sklearn版本 0.18.1  ---  0.19.2

In [1]:
import pandas as pd
import numpy as np
import sklearn
import os
import matplotlib
import time

In [2]:
# Report the version of each core library the notebook is running against.
for banner, module in (
    ("Pandas 版本:", pd),
    ("NumPy 版本:", np),
    ("Scikit-learn 版本:", sklearn),
    ("Matplotlib 版本:", matplotlib),
):
    print(banner, module.__version__)

Pandas 版本: 0.22.0
NumPy 版本: 1.14.5
Scikit-learn 版本: 0.19.2
Matplotlib 版本: 2.1.2


## 数据集NSL-KDD和KDD-99

> col_names：43 个列名（41 个特征 + label + difficulty_level）；加载后删除 difficulty_level，剩余 42 列    
> 训练集：KDDTrain--df   及其列名（xxx,42）    
> 测试集：KDDTest--df_test    及其列名（xxx,42）    

In [3]:
import matplotlib.pyplot as plt

In [4]:
# Column names for the raw NSL-KDD files: the feature columns, followed by
# the attack label and the difficulty level (43 names in total).
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label","difficulty_level"]

# Folder holding the NSL-KDD files. It defaults to the original local
# folder so existing setups keep working; set the NSL_KDD_DIR environment
# variable to run the notebook on another machine without editing this cell.
data_dir = os.environ.get("NSL_KDD_DIR", r"F:\Jupyter\kaggle\data\NSL-KDD")
train_path = os.path.join(data_dir, "KDDTrain+.txt")
test_path = os.path.join(data_dir, "KDDTest+.txt")

# The files have no header row, so the column names are supplied explicitly.
# difficulty_level is dropped right after loading.
df = pd.read_csv(train_path, header=None, names=col_names).drop('difficulty_level', axis=1)
df_test = pd.read_csv(test_path, header=None, names=col_names).drop('difficulty_level', axis=1)

# Shape of each dataset
print('Dimensions of the Training set:',df.shape)
print('Dimensions of the Test set:',df_test.shape)

Dimensions of the Training set: (125973, 42)
Dimensions of the Test set: (22544, 42)


## 数据集样例

In [5]:
# Preview the first rows of the test set.
df_test.head()



Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,private,REJ,0,0,0,0,0,0,...,10,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune
1,0,tcp,private,REJ,0,0,0,0,0,0,...,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan


In [6]:
# Show the dtype pandas inferred for every column of the training set.
df.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

## Label Distribution of Training and Test set
理论上训练集有23种label（22种攻击+normal），测试集有38种
df的label有23种    
df_test的label有38种   

In [7]:
# Count the distinct labels and show how often each one occurs, first for
# the training set and then for the test set.
train_labels = df['label']
test_labels = df_test['label']
num_train = train_labels.nunique()
num_test = test_labels.nunique()

print('Number of unique labels in Training set:', num_train)
print('Label distribution Training set:')
print(train_labels.value_counts())
print()
print('Number of unique labels in Test set:', num_test)
print('Label distribution Test set:')
print(test_labels.value_counts())

Number of unique labels in Training set: 23
Label distribution Training set:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: label, dtype: int64

Number of unique labels in Test set: 38
Label distribution Test set:
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess   

# 计算 DataFrame（df）中每一列的缺失值数量
具体来说，df.isnull() 方法会返回一个布尔类型的 DataFrame，其中缺失值位置为 True，非缺失值位置为 False。而 sum() 方法会对这个布尔类型的 DataFrame 按列求和，因为在 Python 中 True 被当作 1、False 被当作 0，所以最终的求和结果就是每列缺失值的数量。

In [8]:
# Number of missing values in each column of the training set.
df.isnull().sum()


duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [9]:
# Number of missing values in each column of the test set.
df_test.isnull().sum()

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

# Step 1: 数据预处理:
   

## 识别分类特征（一共有四个离散特征，分为两类——protocol_type，service，flag;和label）
>protocol_type 有三种：TCP, UDP, ICMP 

>service：目标主机的网络服务类型，一共有七十种；训练集有70种，测试集有64种 

>flag：一共有11种 

In [10]:
# protocol_type, service and flag are the nominal (non-binary, string-valued)
# features; the label column is stored as strings as well.
def _report_object_columns(frame):
    """Print the number of distinct values of every object-dtype column of ``frame``."""
    for col_name in frame.columns:
        if frame[col_name].dtypes != 'object':
            continue
        unique_cat = len(frame[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))


print('Training set:')
_report_object_columns(df)

print('Testing set:')
_report_object_columns(df_test)

Training set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 70 categories
Feature 'flag' has 11 categories
Feature 'label' has 23 categories
Testing set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 64 categories
Feature 'flag' has 11 categories
Feature 'label' has 38 categories


## service : 目标主机的网络服务类型，离散类型，共有70种。
aol, auth, bgp, courier, csnet_ns, ctf, daytime, discard, domain, domain_u, echo, eco_i, ecr_i, efs, exec, finger, ftp, ftp_data, gopher, harvest, hostnames, http, http_2784, http_443, http_8001, imap4, IRC, iso_tsap, klogin, kshell, ldap, link, login, mtp, name, netbios_dgm, netbios_ns, netbios_ssn, netstat, nnsp, nntp, ntp_u, other, pm_dump, pop_2, pop_3, printer, private, red_i, remote_job, rje, shell, smtp, sql_net, ssh, sunrpc, supdup, systat, telnet, tftp_u, tim_i, time, urh_i, urp_i, uucp, uucp_path, vmnet, whois, X11, Z39_50。

# 分离离散型特征

In [11]:
# df_categorical_values     -- the 'protocol_type', 'service' and 'flag'
#                              columns of the training set
# testdf_categorical_values -- the same three columns of the test set
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

categorical_columns = ['protocol_type', 'service', 'flag']

# Pull the three nominal features out into their own frames.
df_categorical_values = df.loc[:, categorical_columns]
testdf_categorical_values = df_test.loc[:, categorical_columns]
df_categorical_values.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,ftp_data,SF
1,udp,other,SF
2,tcp,private,S0
3,tcp,http,SF
4,tcp,http,SF


In [12]:
# Preview the categorical columns extracted from the test set.
testdf_categorical_values.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,private,REJ
1,tcp,private,REJ
2,tcp,ftp_data,SF
3,icmp,eco_i,SF
4,tcp,telnet,RSTO


### 对测试集和训练集每一种可能的取值都进行命名
dumcols：训练集中这三个特征独热编码后的列名列表        
testdumcols：测试集中这三个特征独热编码后的列名列表        
对于每个分类特征（'protocol_type'，'service'，'flag'），代码通过调用unique()方法获取其所有唯一值，然后使用sorted()函数对这些值进行排序。这样可以确保新的列名的顺序与独热编码后的列的顺序一致。


In [13]:
def _prefixed(prefix, values):
    """Return ``prefix + value`` for every value, keeping the given order."""
    return [prefix + value for value in values]


# Prefixes used to name the one-hot columns of each nominal feature.
string1 = 'Protocol_type_'
string2 = 'service_'
string3 = 'flag_'

# Sorted distinct values of each nominal feature in the training set.
unique_protocol = sorted(df.protocol_type.unique())
unique_service = sorted(df.service.unique())
unique_flag = sorted(df.flag.unique())

unique_protocol2 = _prefixed(string1, unique_protocol)
unique_service2 = _prefixed(string2, unique_service)
unique_flag2 = _prefixed(string3, unique_flag)

# Column names for the training set: protocol, then service, then flag.
dumcols = unique_protocol2 + unique_service2 + unique_flag2
print(dumcols)

# Test set: only the service values are recomputed; the protocol and flag
# names are reused from the training set (the original author states that
# service is the only feature whose values differ between the two sets).
unique_service_test = sorted(df_test.service.unique())
unique_service2_test = _prefixed(string2, unique_service_test)
testdumcols = unique_protocol2 + unique_service2_test + unique_flag2

['Protocol_type_icmp', 'Protocol_type_tcp', 'Protocol_type_udp', 'service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_harvest', 'service_hostnames', 'service_http', 'service_http_2784', 'service_http_443', 'service_http_8001', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private', 'service_red_i', 'service_remote_job', 'serv

## 使用LabelEncoder()将离散特征转换为数字
LabelEncoder会为每个类别分配一个唯一的整数。这些整数是按照类别的字母顺序分配的，从0开始。

In [14]:
# df_categorical_values_enc     -- training categorical columns with every
#                                  category replaced by an integer code
# testdf_categorical_values_enc -- the same for the test set
# fit_transform is applied column by column, so each column (and each
# dataset) is encoded independently of the others.
label_encoder = LabelEncoder()

df_categorical_values_enc = df_categorical_values.apply(label_encoder.fit_transform)
print(df_categorical_values_enc.head())

# test set
testdf_categorical_values_enc = testdf_categorical_values.apply(label_encoder.fit_transform)

   protocol_type  service  flag
0              1       20     9
1              2       44     9
2              1       49     5
3              1       24     9
4              1       24     9


# One-Hot-Encoding独热编码
One-Hot-Encoding 独热编码，用于将所有分类特征转换为二元特征。    
该转换器的输入应该是一个整数矩阵，表示分类（离散）特征的值。输出将是一个稀疏矩阵，其中每列对应一个特征的一个可能值。

> 对分类特征进行独热编码（One-Hot-Encoding）。首先，创建了一个OneHotEncoder对象enc。然后，使用 fit_transform方法对df_categorical_values_enc进行拟合和转换，结果存储在df_categorical_values_encenc中。这个方法会返回一个稀疏矩阵，表示独热编码后的数据。
接着，将独热编码后的数据转换为一个DataFrame，列名由dumcols指定。这样，每一列都对应一个原始的分类特征的一个类别，如果某个样本的该特征是该类别，那么对应的值为1，否则为0。
对于测试集，也进行了同样的处理，只是列名由testdumcols指定

In [15]:
# df_cat_data     -- one indicator column per category value for the
#                    training set (1 where the row has that value, else 0)
# testdf_cat_data -- the same for the test set
enc = OneHotEncoder()

# The encoder is fitted separately on each dataset; both results are sparse.
df_categorical_values_encenc = enc.fit_transform(df_categorical_values_enc)
testdf_categorical_values_encenc = enc.fit_transform(testdf_categorical_values_enc)

# Densify and attach the column names prepared for each dataset.
df_cat_data = pd.DataFrame(data=df_categorical_values_encenc.toarray(), columns=dumcols)
testdf_cat_data = pd.DataFrame(data=testdf_categorical_values_encenc.toarray(), columns=testdumcols)

df_cat_data.head()

Unnamed: 0,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Add  missing categories 
> 训练集和测试集中都有各自独有的属性，为了训练需要统一化；
首先，代码从训练集df和测试集df_test的 DataFrame 中提取出 'service' 列，并将其转换为列表（tolist() 方法）。
然后，使用 Python 的集合（set）操作找出训练集中存在但测试集中不存在的类别train_difference
和测试集中存在但训练集中不存在的类别test_difference
将这些新添加的类别的值全部设为0——————————————这样之后二者的维度和内容就完全可以对应上

> 使用 join() 方法将 df 和 df_cat_data 合并为一个新的 DataFrame newdf。这个操作基于索引进行，也就是说，它会将 df_cat_data 中的列添加到 df 中，同时保持行索引的对应关系。然后，使用 drop() 方法删除 'flag'、'protocol_type' 和 'service' 这三列。这可能是因为这些列已经被编码并添加到数据框中，原始的列就不再需要了。
对测试数据执行相同的操作：将 df_test 和 testdf_cat_data 合并，并删除 'flag'、'protocol_type' 和 'service' 这三列。


In [16]:

# All service values observed in each dataset.
trainservice = df['service'].tolist()
testservice = df_test['service'].tolist()

# Services that occur in only one of the two datasets, as one-hot column
# names. The set differences are sorted because set iteration order for
# strings is not stable between interpreter runs; sorting keeps the order
# of the columns added from these lists reproducible.
string = 'service_'
train_difference = [string + x for x in sorted(set(trainservice) - set(testservice))]
test_difference = [string + x for x in sorted(set(testservice) - set(trainservice))]

print (train_difference)
print(test_difference)

['service_red_i', 'service_aol', 'service_http_2784', 'service_urh_i', 'service_harvest', 'service_http_8001']
[]


In [17]:
# Give each dataset an all-zero indicator column for every service value
# that only occurs in the other dataset.
for col in train_difference:
    testdf_cat_data[col] = 0

for col in test_difference:
    df_cat_data[col] = 0

# The columns added above are appended at the end, so the two frames hold
# the same columns but in different positions. Reorder the test frame to
# the training column order so that position i means the same feature in
# both frames once they are converted to plain arrays.
testdf_cat_data = testdf_cat_data[df_cat_data.columns]

testdf_cat_data.shape

(22544, 84)

In [18]:
# Shape of the one-hot encoded training categorical features.
df_cat_data.shape

(125973, 84)

In [19]:
# Shape of the test set before the one-hot columns are joined in.
df_test.shape

(22544, 42)

In [20]:
# Shape of the training set before the one-hot columns are joined in.
df.shape

(125973, 42)

## 将处理过的离散型特征与原本的数据型特征进行合并
newdf——训练集
newdf_test——测试集

In [21]:
# The original string-valued columns are replaced by their one-hot columns.
encoded_away = ['flag', 'protocol_type', 'service']

# Join on the row index, then drop the three original nominal columns.
newdf = df.join(df_cat_data).drop(encoded_away, axis=1)

# test data
newdf_test = df_test.join(testdf_cat_data).drop(encoded_away, axis=1)

print(newdf.shape)
print(newdf_test.shape)

(125973, 123)
(22544, 123)


# 将数据集划分给四种攻击类别
## 重命名五种类别: 0=normal, 1=DoS, 2=Probe, 3=R2L and 4=U2R.
labeldf————从newdf中把label提取出来，使用replace函数（将离散转换为数值）————》得到newlabeldf，最后再赋回到newdf的label中    
labeldf_test————同理

In [22]:
# Attack names grouped by target class:
# 0 = normal, 1 = DoS, 2 = Probe, 3 = R2L, 4 = U2R.
# NOTE(review): 'httptunnel' is placed in class 4 here, exactly as in the
# original mapping; some NSL-KDD taxonomies list it under R2L -- confirm.
attack_groups = [
    (0, ['normal']),
    (1, ['neptune', 'back', 'land', 'pod', 'smurf', 'teardrop', 'mailbomb',
         'apache2', 'processtable', 'udpstorm']),
    (2, ['ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint']),
    (3, ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
         'warezclient', 'warezmaster', 'sendmail', 'named', 'snmpgetattack',
         'snmpguess', 'xlock', 'xsnoop', 'worm']),
    (4, ['httptunnel', 'buffer_overflow', 'loadmodule', 'perl', 'rootkit',
         'ps', 'sqlattack', 'xterm']),
]
label_to_class = {name: class_id for class_id, names in attack_groups for name in names}

# Strip any trailing '.' from the raw label strings.
labeldf = newdf['label'].str.rstrip('.')
labeldf_test = newdf_test['label'].str.rstrip('.')

# Replace each attack name by its class id; the same mapping serves both sets.
newlabeldf = labeldf.replace(label_to_class)
newlabeldf_test = labeldf_test.replace(label_to_class)

# Write the numeric classes back into the label column.
newdf['label'] = newlabeldf
newdf_test['label'] = newlabeldf_test
# print(newdf['label'].tail())

# Step 2: 特征缩放

In [23]:
# Split each dataset into the target series (Y) and the feature frame (X),
# where X is every column except 'label'.
Y = newdf['label']
X = newdf.drop('label', axis=1)

Y_test = newdf_test['label']
X_test = newdf_test.drop('label', axis=1)

### 保存特征名称列表以备后用（每个攻击类别都相同）。在此阶段将删除列名。

In [24]:
# colNames / colNames_test hold the column names of X / X_test as plain lists.
colNames = X.columns.tolist()
colNames_test = X_test.columns.tolist()

In [25]:
# Per-feature variance of the training and the test feature matrices.
variances_X = np.var(X, axis=0)
variances_X_test = np.var(X_test, axis=0)

# Positional indices of the features whose variance is exactly zero.
zero_variance_features_X = np.where(variances_X == 0)[0]
zero_variance_features_X_test = np.where(variances_X_test == 0)[0]

if len(zero_variance_features_X) > 0:
    print("训练集X中有特征方差为零的特征：")
    # Resolve every position to a column name BEFORE dropping anything.
    # Dropping in place while still indexing X.columns by the original
    # positions shifts the remaining columns, so the test set would lose a
    # different column than the training set.
    zero_variance_names = [X.columns[idx] for idx in zero_variance_features_X]
    for idx, name in zip(zero_variance_features_X, zero_variance_names):
        print("特征名:", name, "，索引:", idx)
    # Remove the same named columns from both feature matrices.
    X.drop(zero_variance_names, axis=1, inplace=True)
    X_test.drop(zero_variance_names, axis=1, inplace=True)
else:
    print("训练集X中没有特征方差为零的特征。")





训练集X中有特征方差为零的特征：
特征名: num_outbound_cmds ，索引: 16


## 标准化：StandardScaler()（示例说明；下方代码单元实际使用 MinMaxScaler 进行归一化）
``` python
scaler1 = preprocessing.StandardScaler().fit(X_DoS) #这行代码创建了一个 StandardScaler 对象 scaler1，并使用 X_DoS 数据计算了均值和标准差。
X_DoS=scaler1.transform(X_DoS) #这行代码将 scaler1 的缩放参数应用于 X_DoS，实现标准化。标准化后的数据将具有零均值和单位标准差,并将结果保存回 X_DoS。
print(X_DoS.std(axis=0))#代码使用 std 函数检查了每种攻击类型的训练数据的标准差是否为1。如果数据已经正确地标准化，那么其标准差应该接近1。
```

In [26]:
from sklearn.preprocessing import MinMaxScaler

# One scaler, fitted on the training features only.
scaler1 = MinMaxScaler()
# scaler2 used to be a second scaler fitted on the test set. It is kept as
# an alias of the training scaler so code referring to it still works.
scaler2 = scaler1

# Learn the per-feature min/max from the training set and scale it.
X_scaled_train = scaler1.fit_transform(X)

# Scale the test set with the parameters learned from the training set.
# The test columns are first aligned to the training columns by name, since
# the scaler works on column positions; a column absent from X_test is
# filled with 0.
X_scaled_test = scaler1.transform(X_test.reindex(columns=X.columns, fill_value=0))

%%time 和 time 模块测量时间的方式不同，导致计算结果存在差异。
%%time 魔术命令：
%%time 是 Jupyter Notebook 或 IPython 环境中的一种方便的计时工具。
它会自动测量代码块的执行时间，包括代码块中的所有操作（包括内部函数调用、循环等）。
%%time 会考虑到代码块的整体执行时间，包括一切开销。
由于它是自动计时的，因此不需要手动记录开始和结束时间戳。
time 模块：
time 模块是 Python 标准库中的一个模块，用于处理时间和日期。
通过手动记录开始和结束时间戳，然后计算两者之间的差值，可以精确测量代码块的执行时间。
但是，time 模块只会计算你明确指定的代码块的执行时间，不会包括其他操作的开销。
因此，如果你在同一个代码块中同时使用了 %%time 和 time 模块，你会看到它们的计算结果存在差异。%%time 考虑了更多的因素，而 time 模块只计算了你明确指定的代码块的执行时间。

如果你想要更精确的计时结果，建议使用 time 模块手动记录开始和结束时间戳。

In [27]:

from sklearn.naive_bayes import MultinomialNB
# Wall-clock timestamp taken just before the model is created and fitted.
start_time = time.time()

# all features
# NOTE(review): the model is fitted on X (the unscaled feature frame), not
# on X_scaled_train produced by the scaling step -- confirm this is intended.
clf = MultinomialNB()
clf.fit(X, Y)
end_time = time.time()
# Compute the elapsed time
training_time = end_time - start_time
print(f"训练时间: {training_time:.2f} 秒")

训练时间: 0.19 秒


In [28]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
# Wall-clock timestamp taken just before the cross-validated prediction.
start_time = time.time()


# Cross-validated predictions with 10 folds.
# NOTE(review): cross_val_predict is given X_test / Y_test, so the estimator
# is fitted and evaluated on folds of the test set; the fit performed on
# (X, Y) in the training cell is not what produces y_pred -- confirm this
# evaluation protocol is intended.
y_pred = cross_val_predict(clf, X_test, Y_test, cv=10)
end_time = time.time()
# Compute the elapsed time (this interval covers the cross_val_predict call,
# although the printed label below reads "training time").
training_time = end_time - start_time
print(f"训练时间: {training_time:.2f} 秒")
# Confusion matrix: actual classes as rows, predicted classes as columns.
pd.crosstab(Y_test, y_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])



训练时间: 0.44 秒


Predicted attacks,0,1,2,3,4
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,6896,481,1967,204,163
1,0,1717,5218,520,3
2,66,5,2346,0,4
3,650,48,1581,235,240
4,50,13,111,7,19


In [29]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_auc_score,f1_score

# Per-class precision / recall / F1 of the predictions, to three decimals.
report_text = classification_report(Y_test, y_pred, digits=3)
print(report_text)

             precision    recall  f1-score   support

          0      0.900     0.710     0.794      9711
          1      0.758     0.230     0.353      7458
          2      0.209     0.969     0.344      2421
          3      0.243     0.085     0.126      2754
          4      0.044     0.095     0.060       200

avg / total      0.691     0.497     0.512     22544



In [30]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the predictions against the true test labels.
conf_matrix = confusion_matrix(Y_test, y_pred)
total_samples = conf_matrix.sum()

# Per-class counts derived from the matrix: the diagonal holds the correct
# predictions, column sums give everything predicted as a class and row
# sums give everything that actually belongs to it.
true_positives = conf_matrix.diagonal()
false_positives = conf_matrix.sum(axis=0) - true_positives
false_negatives = conf_matrix.sum(axis=1) - true_positives
true_negatives = total_samples - (true_positives + false_positives + false_negatives)

# Per-class score TP / (TP + FN): the share of each actual class that was
# predicted correctly.
class_accuracies = true_positives / (true_positives + false_negatives)

# Overall accuracy: all correct predictions over all samples.
overall_accuracy = true_positives.sum() / total_samples

# One row per class plus a final row for the overall accuracy.
index = list(map(str, range(len(class_accuracies)))) + ['Overall Accuracy']
df_ac = pd.DataFrame({'Accuracy': np.append(class_accuracies, overall_accuracy)}, index=index)

# Show the table
print(df_ac)

                  Accuracy
0                 0.710123
1                 0.230223
2                 0.969021
3                 0.085330
4                 0.095000
Overall Accuracy  0.497383
