In [1]:
# Load the raw train and test datasets
import pandas as pd
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("KDDTrain.csv", "KDDTest.csv")
)

In [2]:
# Encode the categorical feature columns of the raw dataset as integer codes
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# LabelEncoder numbers categories by their position in the sorted vocabulary it
# was fitted on. Fitting it separately on the train and test frames therefore
# gives the same category different codes whenever the two vocabularies differ.
# Fit each column on the union of both frames so the codes agree everywhere.
for column in ['protocol_type', 'service', 'flag']:
    label_encoder.fit(pd.concat([train_data[column], test_data[column]], axis=0))
    train_data[column] = label_encoder.transform(train_data[column])
    test_data[column] = label_encoder.transform(test_data[column])

# Save the encoded frames
train_data.to_csv("train_data.csv",index=False)
test_data.to_csv("test_data.csv",index=False)

In [3]:
# Build the min-max scaled "self" (class == 'normal') and "non-self" (every
# other class) feature sets for the train and test frames.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler ONCE, on the features of every training row. Re-fitting it on
# each subset (as fit_transform does) rescales every file by that subset's own
# min/max, so the same raw value ends up with different scaled values in
# different files and the sets are no longer comparable.
scaler.fit(train_data.drop(['class', 'num'], axis=1))


def _scaled_subset(frame, keep_rows):
    """Return the scaled feature columns of the rows of ``frame`` selected by ``keep_rows``.

    The 'class' and 'num' columns are dropped, the remaining columns are
    transformed with the already-fitted ``scaler``, clipped to [0, 1] (test
    values may fall outside the training range) and rounded to 5 decimals.
    The result has default integer column labels, as before.
    """
    features = frame[keep_rows].drop(['class', 'num'], axis=1)
    return pd.DataFrame(scaler.transform(features).clip(0.0, 1.0).round(5))


train_is_normal = train_data['class'] == 'normal'
test_is_normal = test_data['class'] == 'normal'

# Self data
train_self = _scaled_subset(train_data, train_is_normal)
test_self = _scaled_subset(test_data, test_is_normal)
print('train_self:')
print(len(train_self))
print('test_self:')
print(len(test_self))
test_self.to_csv("self/test_self.csv",index=False)
train_self.to_csv("self/train_self.csv",index=False)

# Non-self data
train_nonself = _scaled_subset(train_data, ~train_is_normal)
test_nonself = _scaled_subset(test_data, ~test_is_normal)
print('train_nonself:')
print(len(train_nonself))
print('test_nonself:')
print(len(test_nonself))
test_nonself.to_csv("nonself/test_nonself.csv",index=False)
train_nonself.to_csv("nonself/train_nonself.csv",index=False)


train_self:
13449
test_self:
9711
train_nonself:
11743
test_nonself:
12833


In [4]:
# Print every distinct value of the 'class' column in the train and test sets
train_class_names = train_data['class'].unique()
test_class_names = test_data['class'].unique()
print(
    "训练集中的攻击类型:",
    train_class_names,
    "\n测试集中的攻击类型:",
    test_class_names,
    sep="\n",
)


训练集中的攻击类型:
['normal' 'neptune' 'warezclient' 'ipsweep' 'portsweep' 'teardrop' 'nmap'
 'satan' 'smurf' 'pod' 'back' 'guess_passwd' 'ftp_write' 'multihop'
 'rootkit' 'buffer_overflow' 'imap' 'warezmaster' 'phf' 'land'
 'loadmodule' 'spy']

测试集中的攻击类型:
['neptune' 'normal' 'saint' 'mscan' 'guess_passwd' 'smurf' 'apache2'
 'satan' 'buffer_overflow' 'back' 'warezmaster' 'snmpgetattack'
 'processtable' 'pod' 'httptunnel' 'nmap' 'ps' 'snmpguess' 'ipsweep'
 'mailbomb' 'portsweep' 'multihop' 'named' 'sendmail' 'loadmodule' 'xterm'
 'worm' 'teardrop' 'rootkit' 'xlock' 'perl' 'land' 'xsnoop' 'sqlattack'
 'ftp_write' 'imap' 'udpstorm' 'phf']


In [5]:
# Attack names grouped by the category they are mapped to
_category_members = {
    'dos': ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop', 'udpstorm',
            'processtable', 'mailbomb', 'apache2'],
    'u2r': ['buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'ps', 'xterm',
            'sqlattack'],
    'r2l': ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'warezmaster',
            'snmpgetattack', 'httptunnel', 'snmpguess', 'named', 'sendmail',
            'xlock', 'xsnoop', 'worm', 'warezclient', 'spy'],
    'probe': ['ipsweep', 'nmap', 'portsweep', 'satan', 'saint', 'mscan'],
}

# Flat lookup: attack name -> category
attack_mapping = {
    attack_name: category
    for category, members in _category_members.items()
    for attack_name in members
}


def _to_category(label):
    """Return the category of ``label``; labels not in the mapping (e.g. 'normal') pass through."""
    return attack_mapping.get(label, label)


# Replace the specific attack names with their category in both frames
for labeled_frame in (train_data, test_data):
    labeled_frame['class'] = labeled_frame['class'].map(_to_category)

# Save the frames with the mapped labels
train_data.to_csv("seed_4_type/train_data.csv", index=False)
test_data.to_csv("seed_4_type/test_data.csv", index=False)

# Report how many rows each class has
train_class_counts = train_data['class'].value_counts()
test_class_counts = test_data['class'].value_counts()
print(
    "训练集中各类别的数量:",
    train_class_counts,
    "\n测试集中各类别的数量:",
    test_class_counts,
    sep="\n",
)


训练集中各类别的数量:
class
normal    13449
dos        9234
probe      2289
r2l         209
u2r          11
Name: count, dtype: int64

测试集中各类别的数量:
class
normal    9711
dos       7458
r2l       2887
probe     2421
u2r         67
Name: count, dtype: int64


In [6]:
# # （全部种类）数据处理
# # (全部种类)测试集数据攻击分类
# test_back = test_data[test_data['class'] == 'back'].drop(['class', 'num'], axis=1)
# test_back = pd.DataFrame(scaler.fit_transform(test_back).round(5))

# test_buffer_overflow = test_data[test_data['class'] == 'buffer_overflow'].drop(['class', 'num'], axis=1) 
# test_buffer_overflow = pd.DataFrame(scaler.fit_transform(test_buffer_overflow).round(5))

# test_ftp_write = test_data[test_data['class'] == 'ftp_write'].drop(['class', 'num'], axis=1)  
# test_ftp_write = pd.DataFrame(scaler.fit_transform(test_ftp_write).round(5))

# test_guess_passwd = test_data[test_data['class'] == 'guess_passwd'].drop(['class', 'num'], axis=1)
# test_guess_passwd = pd.DataFrame(scaler.fit_transform(test_guess_passwd).round(5))

# test_httptunnel = test_data[test_data['class'] == 'httptunnel'].drop(['class', 'num'], axis=1)
# test_httptunnel = pd.DataFrame(scaler.fit_transform(test_httptunnel).round(5))

# test_imap = test_data[test_data['class'] == 'imap'].drop(['class', 'num'], axis=1)
# test_imap = pd.DataFrame(scaler.fit_transform(test_imap).round(5))

# test_ipsweep = test_data[test_data['class'] == 'ipsweep'].drop(['class', 'num'], axis=1)
# test_ipsweep = pd.DataFrame(scaler.fit_transform(test_ipsweep).round(5))

# test_land = test_data[test_data['class'] == 'land'].drop(['class', 'num'], axis=1)
# test_land = pd.DataFrame(scaler.fit_transform(test_land).round(5))

# test_loadmodule = test_data[test_data['class'] == 'loadmodule'].drop(['class', 'num'], axis=1)
# test_loadmodule = pd.DataFrame(scaler.fit_transform(test_loadmodule).round(5))

# test_mailbomb = test_data[test_data['class'] == 'mailbomb'].drop(['class', 'num'], axis=1)
# test_mailbomb = pd.DataFrame(scaler.fit_transform(test_mailbomb).round(5))

# test_mscan = test_data[test_data['class'] == 'mscan'].drop(['class', 'num'], axis=1)
# test_mscan = pd.DataFrame(scaler.fit_transform(test_mscan).round(5))

# test_multihop = test_data[test_data['class'] == 'multihop'].drop(['class', 'num'], axis=1)
# test_multihop = pd.DataFrame(scaler.fit_transform(test_multihop).round(5))

# test_named = test_data[test_data['class'] == 'named'].drop(['class', 'num'], axis=1)
# test_named = pd.DataFrame(scaler.fit_transform(test_named).round(5))

# test_neptune = test_data[test_data['class'] == 'neptune'].drop(['class', 'num'], axis=1)
# test_neptune = pd.DataFrame(scaler.fit_transform(test_neptune).round(5))

# test_nmap = test_data[test_data['class'] == 'nmap'].drop(['class', 'num'], axis=1)
# test_nmap = pd.DataFrame(scaler.fit_transform(test_nmap).round(5))

# test_perl = test_data[test_data['class'] == 'perl'].drop(['class', 'num'], axis=1)
# test_perl = pd.DataFrame(scaler.fit_transform(test_perl).round(5))

# test_phf = test_data[test_data['class'] == 'phf'].drop(['class', 'num'], axis=1)
# test_phf = pd.DataFrame(scaler.fit_transform(test_phf).round(5))

# test_pod = test_data[test_data['class'] == 'pod'].drop(['class', 'num'], axis=1)
# test_pod = pd.DataFrame(scaler.fit_transform(test_pod).round(5))

# test_portsweep = test_data[test_data['class'] == 'portsweep'].drop(['class', 'num'], axis=1)
# test_portsweep = pd.DataFrame(scaler.fit_transform(test_portsweep).round(5))

# test_processtable = test_data[test_data['class'] == 'processtable'].drop(['class', 'num'], axis=1)
# test_processtable = pd.DataFrame(scaler.fit_transform(test_processtable).round(5))

# test_rootkit = test_data[test_data['class'] == 'rootkit'].drop(['class', 'num'], axis=1)
# test_rootkit = pd.DataFrame(scaler.fit_transform(test_rootkit).round(5))

# test_saint = test_data[test_data['class'] == 'saint'].drop(['class', 'num'], axis=1)
# test_saint = pd.DataFrame(scaler.fit_transform(test_saint).round(5))

# test_satan = test_data[test_data['class'] == 'satan'].drop(['class', 'num'], axis=1)
# test_satan = pd.DataFrame(scaler.fit_transform(test_satan).round(5))

# test_sendmail = test_data[test_data['class'] == 'sendmail'].drop(['class', 'num'], axis=1)
# test_sendmail = pd.DataFrame(scaler.fit_transform(test_sendmail).round(5))

# test_snmpgetattack = test_data[test_data['class'] == 'snmpgetattack'].drop(['class', 'num'], axis=1)
# test_snmpgetattack = pd.DataFrame(scaler.fit_transform(test_snmpgetattack).round(5))

# test_snmpguess = test_data[test_data['class'] == 'snmpguess'].drop(['class', 'num'], axis=1)
# test_snmpguess = pd.DataFrame(scaler.fit_transform(test_snmpguess).round(5))

# test_sqlattack = test_data[test_data['class'] == 'sqlattack'].drop(['class', 'num'], axis=1)
# test_sqlattack = pd.DataFrame(scaler.fit_transform(test_sqlattack).round(5))

# test_teardrop = test_data[test_data['class'] == 'teardrop'].drop(['class', 'num'], axis=1)
# test_teardrop = pd.DataFrame(scaler.fit_transform(test_teardrop).round(5))

# test_udpstorm = test_data[test_data['class'] == 'udpstorm'].drop(['class', 'num'], axis=1)
# test_udpstorm = pd.DataFrame(scaler.fit_transform(test_udpstorm).round(5))

# test_warezmaster = test_data[test_data['class'] == 'warezmaster'].drop(['class', 'num'], axis=1)
# test_warezmaster = pd.DataFrame(scaler.fit_transform(test_warezmaster).round(5))

# test_worm = test_data[test_data['class'] == 'worm'].drop(['class', 'num'], axis=1)
# test_worm = pd.DataFrame(scaler.fit_transform(test_worm).round(5))

# test_xlock = test_data[test_data['class'] == 'xlock'].drop(['class', 'num'], axis=1)
# test_xlock = pd.DataFrame(scaler.fit_transform(test_xlock).round(5))

# test_xsnoop = test_data[test_data['class'] == 'xsnoop'].drop(['class', 'num'], axis=1)
# test_xsnoop = pd.DataFrame(scaler.fit_transform(test_xsnoop).round(5))

# test_xterm = test_data[test_data['class'] == 'xterm'].drop(['class', 'num'], axis=1)
# test_xterm = pd.DataFrame(scaler.fit_transform(test_xterm).round(5))

# test_apache2 = test_data[test_data['class'] == 'apache2'].drop(['class', 'num'], axis=1)
# test_apache2 = pd.DataFrame(scaler.fit_transform(test_apache2).round(5))

# test_ps = test_data[test_data['class'] == 'ps'].drop(['class', 'num'], axis=1)
# test_ps = pd.DataFrame(scaler.fit_transform(test_ps).round(5))

# test_smurf = test_data[test_data['class'] == 'smurf'].drop(['class', 'num'], axis=1)
# test_smurf = pd.DataFrame(scaler.fit_transform(test_smurf).round(5))

# total_len = len(test_nonself)

# # 打印每一类长度和比例
# print(f"test_Total: {len(test_nonself)}")
# print(f"test_back: {len(test_back)} ({len(test_back) / total_len:.2%})")
# print(f"test_buffer_overflow: {len(test_buffer_overflow)} ({len(test_buffer_overflow) / total_len:.2%})")
# print(f"test_ftp_write: {len(test_ftp_write)} ({len(test_ftp_write) / total_len:.2%})")
# print(f"test_guess_passwd: {len(test_guess_passwd)} ({len(test_guess_passwd) / total_len:.2%})")
# print(f"test_httptunnel: {len(test_httptunnel)} ({len(test_httptunnel) / total_len:.2%})")
# print(f"test_imap: {len(test_imap)} ({len(test_imap) / total_len:.2%})")
# print(f"test_ipsweep: {len(test_ipsweep)} ({len(test_ipsweep) / total_len:.2%})")
# print(f"test_land: {len(test_land)} ({len(test_land) / total_len:.2%})")
# print(f"test_loadmodule: {len(test_loadmodule)} ({len(test_loadmodule) / total_len:.2%})")
# print(f"test_mailbomb: {len(test_mailbomb)} ({len(test_mailbomb) / total_len:.2%})")
# print(f"test_mscan: {len(test_mscan)} ({len(test_mscan) / total_len:.2%})")
# print(f"test_multihop: {len(test_multihop)} ({len(test_multihop) / total_len:.2%})")
# print(f"test_named: {len(test_named)} ({len(test_named) / total_len:.2%})")
# print(f"test_neptune: {len(test_neptune)} ({len(test_neptune) / total_len:.2%})")
# print(f"test_nmap: {len(test_nmap)} ({len(test_nmap) / total_len:.2%})")
# print(f"test_perl: {len(test_perl)} ({len(test_perl) / total_len:.2%})")
# print(f"test_phf: {len(test_phf)} ({len(test_phf) / total_len:.2%})")
# print(f"test_pod: {len(test_pod)} ({len(test_pod) / total_len:.2%})")
# print(f"test_portsweep: {len(test_portsweep)} ({len(test_portsweep) / total_len:.2%})")
# print(f"test_processtable: {len(test_processtable)} ({len(test_processtable) / total_len:.2%})")
# print(f"test_rootkit: {len(test_rootkit)} ({len(test_rootkit) / total_len:.2%})")
# print(f"test_saint: {len(test_saint)} ({len(test_saint) / total_len:.2%})")
# print(f"test_satan: {len(test_satan)} ({len(test_satan) / total_len:.2%})")
# print(f"test_sendmail: {len(test_sendmail)} ({len(test_sendmail) / total_len:.2%})")
# print(f"test_snmpgetattack: {len(test_snmpgetattack)} ({len(test_snmpgetattack) / total_len:.2%})")
# print(f"test_snmpguess: {len(test_snmpguess)} ({len(test_snmpguess) / total_len:.2%})")
# print(f"test_sqlattack: {len(test_sqlattack)} ({len(test_sqlattack) / total_len:.2%})")
# print(f"test_teardrop: {len(test_teardrop)} ({len(test_teardrop) / total_len:.2%})")
# print(f"test_udpstorm: {len(test_udpstorm)} ({len(test_udpstorm) / total_len:.2%})")
# print(f"test_warezmaster: {len(test_warezmaster)} ({len(test_warezmaster) / total_len:.2%})")
# print(f"test_worm: {len(test_worm)} ({len(test_worm) / total_len:.2%})")
# print(f"test_xlock: {len(test_xlock)} ({len(test_xlock) / total_len:.2%})")
# print(f"test_xsnoop: {len(test_xsnoop)} ({len(test_xsnoop) / total_len:.2%})")
# print(f"test_xterm: {len(test_xterm)} ({len(test_xterm) / total_len:.2%})")
# print(f"test_apache2: {len(test_apache2)} ({len(test_apache2) / total_len:.2%})")
# print(f"test_ps: {len(test_ps)} ({len(test_ps) / total_len:.2%})")
# print(f"test_smurf: {len(test_smurf)} ({len(test_smurf) / total_len:.2%})")
# print('\n')

# # （全部种类）训练集数据攻击分类
# train_neptune = train_data[train_data['class'] == 'neptune'].drop(['class', 'num'], axis=1)
# train_neptune = pd.DataFrame(scaler.fit_transform(train_neptune).round(5))

# train_warezclient = train_data[train_data['class'] == 'warezclient'].drop(['class', 'num'], axis=1)
# train_warezclient = pd.DataFrame(scaler.fit_transform(train_warezclient).round(5))

# train_ipsweep = train_data[train_data['class'] == 'ipsweep'].drop(['class', 'num'], axis=1)
# train_ipsweep = pd.DataFrame(scaler.fit_transform(train_ipsweep).round(5))

# train_portsweep = train_data[train_data['class'] == 'portsweep'].drop(['class', 'num'], axis=1)
# train_portsweep = pd.DataFrame(scaler.fit_transform(train_portsweep).round(5))

# train_teardrop = train_data[train_data['class'] == 'teardrop'].drop(['class', 'num'], axis=1)
# train_teardrop = pd.DataFrame(scaler.fit_transform(train_teardrop).round(5))

# train_nmap = train_data[train_data['class'] == 'nmap'].drop(['class', 'num'], axis=1)
# train_nmap = pd.DataFrame(scaler.fit_transform(train_nmap).round(5))

# train_guess_passwd = train_data[train_data['class'] == 'guess_passwd'].drop(['class', 'num'], axis=1)
# train_guess_passwd = pd.DataFrame(scaler.fit_transform(train_guess_passwd).round(5))

# train_ftp_write = train_data[train_data['class'] == 'ftp_write'].drop(['class', 'num'], axis=1)
# train_ftp_write = pd.DataFrame(scaler.fit_transform(train_ftp_write).round(5))

# train_multihop = train_data[train_data['class'] == 'multihop'].drop(['class', 'num'], axis=1)
# train_multihop = pd.DataFrame(scaler.fit_transform(train_multihop).round(5))

# train_satan = train_data[train_data['class'] == 'satan'].drop(['class', 'num'], axis=1)
# train_satan = pd.DataFrame(scaler.fit_transform(train_satan).round(5))

# train_smurf = train_data[train_data['class'] == 'smurf'].drop(['class', 'num'], axis=1)
# train_smurf = pd.DataFrame(scaler.fit_transform(train_smurf).round(5))

# train_pod = train_data[train_data['class'] == 'pod'].drop(['class', 'num'], axis=1)
# train_pod = pd.DataFrame(scaler.fit_transform(train_pod).round(5))

# train_back = train_data[train_data['class'] == 'back'].drop(['class', 'num'], axis=1)
# train_back = pd.DataFrame(scaler.fit_transform(train_back).round(5))

# train_rootkit = train_data[train_data['class'] == 'rootkit'].drop(['class', 'num'], axis=1)
# train_rootkit = pd.DataFrame(scaler.fit_transform(train_rootkit).round(5))

# train_buffer_overflow = train_data[train_data['class'] == 'buffer_overflow'].drop(['class', 'num'], axis=1)
# train_buffer_overflow = pd.DataFrame(scaler.fit_transform(train_buffer_overflow).round(5))

# train_phf = train_data[train_data['class'] == 'phf'].drop(['class', 'num'], axis=1)
# train_phf = pd.DataFrame(scaler.fit_transform(train_phf).round(5))

# train_land = train_data[train_data['class'] == 'land'].drop(['class', 'num'], axis=1)
# train_land = pd.DataFrame(scaler.fit_transform(train_land).round(5))

# train_imap = train_data[train_data['class'] == 'imap'].drop(['class', 'num'], axis=1)
# train_imap = pd.DataFrame(scaler.fit_transform(train_imap).round(5))

# train_warezmaster = train_data[train_data['class'] == 'warezmaster'].drop(['class', 'num'], axis=1)
# train_warezmaster = pd.DataFrame(scaler.fit_transform(train_warezmaster).round(5))

# train_loadmodule = train_data[train_data['class'] == 'loadmodule'].drop(['class', 'num'], axis=1)
# train_loadmodule = pd.DataFrame(scaler.fit_transform(train_loadmodule).round(5))

# train_spy = train_data[train_data['class'] == 'spy'].drop(['class', 'num'], axis=1)
# train_spy = pd.DataFrame(scaler.fit_transform(train_spy).round(5))

# train_perl = train_data[train_data['class'] == 'perl'].drop(['class', 'num'], axis=1)
# train_perl = pd.DataFrame(scaler.fit_transform(train_perl).round(5))

# total_len = len(train_nonself)

# # 打印每一类长度和比例
# print(f"train_Total: {len(train_nonself)}")
# print(f"train_neptune: {len(train_neptune)} ({len(train_neptune) / total_len:.2%})")
# print(f"train_warezclient: {len(train_warezclient)} ({len(train_warezclient) / total_len:.2%})")
# print(f"train_ipsweep: {len(train_ipsweep)} ({len(train_ipsweep) / total_len:.2%})")
# print(f"train_portsweep: {len(train_portsweep)} ({len(train_portsweep) / total_len:.2%})")
# print(f"train_teardrop: {len(train_teardrop)} ({len(train_teardrop) / total_len:.2%})")
# print(f"train_nmap: {len(train_nmap)} ({len(train_nmap) / total_len:.2%})")
# print(f"train_guess_passwd: {len(train_guess_passwd)} ({len(train_guess_passwd) / total_len:.2%})")
# print(f"train_ftp_write: {len(train_ftp_write)} ({len(train_ftp_write) / total_len:.2%})")
# print(f"train_multihop: {len(train_multihop)} ({len(train_multihop) / total_len:.2%})")
# print(f"train_satan: {len(train_satan)} ({len(train_satan) / total_len:.2%})")
# print(f"train_smurf: {len(train_smurf)} ({len(train_smurf) / total_len:.2%})")
# print(f"train_pod: {len(train_pod)} ({len(train_pod) / total_len:.2%})")
# print(f"train_back: {len(train_back)} ({len(train_back) / total_len:.2%})")
# print(f"train_rootkit: {len(train_rootkit)} ({len(train_rootkit) / total_len:.2%})")
# print(f"train_buffer_overflow: {len(train_buffer_overflow)} ({len(train_buffer_overflow) / total_len:.2%})")
# print(f"train_phf: {len(train_phf)} ({len(train_phf) / total_len:.2%})")
# print(f"train_land: {len(train_land)} ({len(train_land) / total_len:.2%})")
# print(f"train_imap: {len(train_imap)} ({len(train_imap) / total_len:.2%})")
# print(f"train_warezmaster: {len(train_warezmaster)} ({len(train_warezmaster) / total_len:.2%})")
# print(f"train_loadmodule: {len(train_loadmodule)} ({len(train_loadmodule) / total_len:.2%})")
# print(f"train_spy: {len(train_spy)} ({len(train_spy) / total_len:.2%})")
# print(f"train_perl: {len(train_perl)} ({len(train_perl) / total_len:.2%})")

# # （全部种类）测试集未知数据攻击分类
# test_saint = test_data[test_data['class'] == 'saint'].drop(['class', 'num'], axis=1)
# test_saint = pd.DataFrame(scaler.fit_transform(test_saint).round(5))

# test_mscan = test_data[test_data['class'] == 'mscan'].drop(['class', 'num'], axis=1)
# test_mscan = pd.DataFrame(scaler.fit_transform(test_mscan).round(5))

# test_apache2 = test_data[test_data['class'] == 'apache2'].drop(['class', 'num'], axis=1)
# test_apache2 = pd.DataFrame(scaler.fit_transform(test_apache2).round(5))

# test_snmpgetattack = test_data[test_data['class'] == 'snmpgetattack'].drop(['class', 'num'], axis=1)
# test_snmpgetattack = pd.DataFrame(scaler.fit_transform(test_snmpgetattack).round(5))

# test_processtable = test_data[test_data['class'] == 'processtable'].drop(['class', 'num'], axis=1)
# test_processtable = pd.DataFrame(scaler.fit_transform(test_processtable).round(5))

# test_httptunnel = test_data[test_data['class'] == 'httptunnel'].drop(['class', 'num'], axis=1)
# test_httptunnel = pd.DataFrame(scaler.fit_transform(test_httptunnel).round(5))

# test_ps = test_data[test_data['class'] == 'ps'].drop(['class', 'num'], axis=1)
# test_ps = pd.DataFrame(scaler.fit_transform(test_ps).round(5))

# test_snmpguess = test_data[test_data['class'] == 'snmpguess'].drop(['class', 'num'], axis=1)
# test_snmpguess = pd.DataFrame(scaler.fit_transform(test_snmpguess).round(5))

# test_mailbomb = test_data[test_data['class'] == 'mailbomb'].drop(['class', 'num'], axis=1)
# test_mailbomb = pd.DataFrame(scaler.fit_transform(test_mailbomb).round(5))

# test_named = test_data[test_data['class'] == 'named'].drop(['class', 'num'], axis=1)
# test_named = pd.DataFrame(scaler.fit_transform(test_named).round(5))

# test_sendmail = test_data[test_data['class'] == 'sendmail'].drop(['class', 'num'], axis=1)
# test_sendmail = pd.DataFrame(scaler.fit_transform(test_sendmail).round(5))

# test_xterm = test_data[test_data['class'] == 'xterm'].drop(['class', 'num'], axis=1)
# test_xterm = pd.DataFrame(scaler.fit_transform(test_xterm).round(5))

# test_worm = test_data[test_data['class'] == 'worm'].drop(['class', 'num'], axis=1)
# test_worm = pd.DataFrame(scaler.fit_transform(test_worm).round(5))

# test_xlock = test_data[test_data['class'] == 'xlock'].drop(['class', 'num'], axis=1)
# test_xlock = pd.DataFrame(scaler.fit_transform(test_xlock).round(5))

# test_xsnoop = test_data[test_data['class'] == 'xsnoop'].drop(['class', 'num'], axis=1)
# test_xsnoop = pd.DataFrame(scaler.fit_transform(test_xsnoop).round(5))

# test_sqlattack = test_data[test_data['class'] == 'sqlattack'].drop(['class', 'num'], axis=1)
# test_sqlattack = pd.DataFrame(scaler.fit_transform(test_sqlattack).round(5))

# test_udpstorm = test_data[test_data['class'] == 'udpstorm'].drop(['class', 'num'], axis=1)
# test_udpstorm = pd.DataFrame(scaler.fit_transform(test_udpstorm).round(5))

# # 打印每一类长度
# print(f"test_saint: {len(test_saint)}")
# print(f"test_mscan: {len(test_mscan)}")
# print(f"test_apache2: {len(test_apache2)}")
# print(f"test_snmpgetattack: {len(test_snmpgetattack)}")
# print(f"test_processtable: {len(test_processtable)}")
# print(f"test_httptunnel: {len(test_httptunnel)}")
# print(f"test_ps: {len(test_ps)}")
# print(f"test_snmpguess: {len(test_snmpguess)}")
# print(f"test_mailbomb: {len(test_mailbomb)}")
# print(f"test_named: {len(test_named)}")
# print(f"test_sendmail: {len(test_sendmail)}")
# print(f"test_xterm: {len(test_xterm)}")
# print(f"test_worm: {len(test_worm)}")
# print(f"test_xlock: {len(test_xlock)}")
# print(f"test_xsnoop: {len(test_xsnoop)}")
# print(f"test_sqlattack: {len(test_sqlattack)}")
# print(f"test_udpstorm: {len(test_udpstorm)}")
# print('\n')


# (all attack types) Build the unknown-attack set from the test rows whose
# attack name is in the list below.
#
# This step used to concatenate per-attack frames (test_saint, test_mscan, ...)
# that are only defined in commented-out code, so it raised NameError on a
# fresh kernel. The rows are now selected directly. They are read from the
# "test_data.csv" snapshot saved by the label-encoding step, because the
# category-mapping step replaces the attack names in test_data['class'].
# NOTE(review): assumes "test_data.csv" still holds the per-attack names, i.e.
# the label-encoding cell was not re-run after the category mapping - confirm.
unknown_attack_names = [
    'saint', 'mscan', 'apache2', 'snmpgetattack', 'processtable', 'httptunnel',
    'ps', 'snmpguess', 'mailbomb', 'named', 'sendmail', 'xterm', 'worm',
    'xlock', 'xsnoop', 'sqlattack', 'udpstorm',
]
test_labeled = pd.read_csv("test_data.csv")
unknown_rows = pd.concat(
    [test_labeled[test_labeled['class'] == attack_name] for attack_name in unknown_attack_names],
    axis=0,
).drop(['class', 'num'], axis=1)

# Scale with a scaler fitted on the training features (not on these rows) so
# the values share one scale; clip because test values may fall outside the
# training range.
unknown_scaler = MinMaxScaler().fit(train_data.drop(['class', 'num'], axis=1))
unknown = pd.DataFrame(unknown_scaler.transform(unknown_rows).clip(0.0, 1.0).round(5))
unknown.to_csv("unknown.csv",index=False)

print(f"unknown: {len(unknown)}")
print('\n')


# # （全部种类）训练集采样
# import pandas as pd
# trainset_neptune_sampled = train_neptune.sample(n=1000, random_state=42)
# trainset_warezclient_sampled = train_warezclient.sample(n=50, random_state=42)
# trainset_ipsweep_sampled = train_ipsweep.sample(n=100, random_state=42)
# trainset_portsweep_sampled = train_portsweep.sample(n=100, random_state=42)
# trainset_teardrop_sampled = train_teardrop.sample(n=100, random_state=42)
# trainset_nmap_sampled = train_nmap.sample(n=50, random_state=42)
# trainset_guess_passwd_sampled = train_guess_passwd.sample(n=50, random_state=42)
# trainset_ftp_write_sampled = train_ftp_write
# trainset_multihop_sampled = train_multihop
# trainset_satan_sampled = train_satan.sample(n=100, random_state=42)
# trainset_smurf_sampled = train_smurf.sample(n=100, random_state=42)
# trainset_pod_sampled = train_pod.sample(n=50, random_state=42)
# trainset_back_sampled = train_back.sample(n=100, random_state=42)
# trainset_rootkit_sampled = train_rootkit
# trainset_buffer_overflow_sampled = train_buffer_overflow
# trainset_phf_sampled = train_phf
# trainset_land_sampled = train_land
# trainset_imap_sampled = train_imap
# trainset_warezmaster_sampled = train_warezmaster
# trainset_loadmodule_sampled = train_loadmodule
# trainset_spy_sampled = train_spy
# trainset_perl_sampled = train_perl

# seed = pd.concat([
#     trainset_neptune_sampled,
#     trainset_warezclient_sampled,
#     trainset_ipsweep_sampled, 
#     trainset_portsweep_sampled,
#     trainset_teardrop_sampled,
#     trainset_nmap_sampled,
#     trainset_guess_passwd_sampled,
#     trainset_ftp_write_sampled,
#     trainset_multihop_sampled,
#     trainset_satan_sampled,
#     trainset_smurf_sampled,
#     trainset_pod_sampled,
#     trainset_back_sampled,
#     trainset_rootkit_sampled,
#     trainset_buffer_overflow_sampled,
#     trainset_phf_sampled,
#     trainset_land_sampled,
#     trainset_imap_sampled,
#     trainset_warezmaster_sampled,
#     trainset_loadmodule_sampled,
#     trainset_spy_sampled,
#     trainset_perl_sampled
# ], axis=0)

# print(f"seed: {len(seed)}")
# # 打乱数据
# seed = seed.sample(frac=1, random_state=42)

# # 计算每组大小
# group_size = len(seed) // 7

# # 分成7组并保存
# for i in range(7):
#     start_idx = i * group_size
#     end_idx = start_idx + group_size if i < 6 else len(seed)
#     group = seed.iloc[start_idx:end_idx]
#     group.to_csv(f"seed_{i+1}.csv", index=False)
    
# print('\n')
# seed.to_csv("seed.csv",index=False)


In [6]:
# (four categories) Split the attack rows by category and min-max scale them.
#
# A single scaler is fitted on the features of every training row and reused
# for all eight subsets. Re-fitting per subset (fit_transform on each) scales
# every category by its own min/max, so frames that are concatenated later
# would not share a scale.
category_scaler = MinMaxScaler().fit(train_data.drop(['class', 'num'], axis=1))


def _scaled_category(frame, category):
    """Return the scaled feature columns of the rows of ``frame`` whose 'class' equals ``category``.

    The 'class' and 'num' columns are dropped, the remaining columns are
    transformed with ``category_scaler``, clipped to [0, 1] (test values may
    fall outside the training range) and rounded to 5 decimals. The result has
    default integer column labels, as before.
    """
    rows = frame[frame['class'] == category].drop(['class', 'num'], axis=1)
    return pd.DataFrame(category_scaler.transform(rows).clip(0.0, 1.0).round(5))


# (four categories) Test-set attack rows: one frame and one CSV per category
test_dos = _scaled_category(test_data, 'dos')
test_dos.to_csv("seed_4_type/dos.csv",index=False)

test_probe = _scaled_category(test_data, 'probe')
test_probe.to_csv("seed_4_type/probe.csv",index=False)

test_r2l = _scaled_category(test_data, 'r2l')
test_r2l.to_csv("seed_4_type/r2l.csv",index=False)

test_u2r = _scaled_category(test_data, 'u2r')
test_u2r.to_csv("seed_4_type/u2r.csv",index=False)

total_len = len(test_nonself)

# Print each category's row count and its share of the non-self test rows
print(f"test_Total: {len(test_nonself)}")
print(f"test_dos: {len(test_dos)} ({len(test_dos) / total_len:.2%})")
print(f"test_probe: {len(test_probe)} ({len(test_probe) / total_len:.2%})")
print(f"test_r2l: {len(test_r2l)} ({len(test_r2l) / total_len:.2%})")
print(f"test_u2r: {len(test_u2r)} ({len(test_u2r) / total_len:.2%})")

print('\n')

# (four categories) Training-set attack rows, one frame per category
train_dos = _scaled_category(train_data, 'dos')
train_probe = _scaled_category(train_data, 'probe')
train_r2l = _scaled_category(train_data, 'r2l')
train_u2r = _scaled_category(train_data, 'u2r')

total_len = len(train_nonself)

# Print each category's row count and its share of the non-self training rows
print(f"train_Total: {len(train_nonself)}")
print(f"train_dos: {len(train_dos)} ({len(train_dos) / total_len:.2%})")
print(f"train_probe: {len(train_probe)} ({len(train_probe) / total_len:.2%})")
print(f"train_r2l: {len(train_r2l)} ({len(train_r2l) / total_len:.2%})")
print(f"train_u2r: {len(train_u2r)} ({len(train_u2r) / total_len:.2%})")


test_Total: 12833
test_dos: 7458 (58.12%)
test_probe: 2421 (18.87%)
test_r2l: 2887 (22.50%)
test_u2r: 67 (0.52%)


train_Total: 11743
train_dos: 9234 (78.63%)
train_probe: 2289 (19.49%)
train_r2l: 209 (1.78%)
train_u2r: 11 (0.09%)


In [11]:
# (four categories) Sample the training attack frames and build the seed sets
_category_frames = {'dos': train_dos, 'probe': train_probe, 'r2l': train_r2l, 'u2r': train_u2r}
_sample_sizes = {'dos': 1000, 'probe': 500, 'r2l': 209, 'u2r': 11}

# One fixed-seed sample per category
_sampled = {
    category: _category_frames[category].sample(n=_sample_sizes[category], random_state=42)
    for category in _category_frames
}
trainset_dos_sampled = _sampled['dos']
trainset_probe_sampled = _sampled['probe']
trainset_r2l_sampled = _sampled['r2l']
trainset_u2r_sampled = _sampled['u2r']


def _shuffled_union(parts):
    """Stack ``parts`` row-wise and return the rows in a fixed-seed shuffled order."""
    stacked = pd.concat(list(parts), axis=0)
    return stacked.sample(frac=1, random_state=42)


# All four samples together
trainset_sampled = _shuffled_union(_sampled.values())
print(f"trainset_sampled: {len(trainset_sampled)}")
trainset_sampled.to_csv("seed_4_type/trainset_sampled.csv",index=False)

# Leave-one-out seeds: seed_<category> holds the samples of the OTHER three categories
_seeds = {}
for held_out in _sampled:
    other_samples = [sample for category, sample in _sampled.items() if category != held_out]
    held_out_seed = _shuffled_union(other_samples)
    print(f"seed_{held_out}: {len(held_out_seed)}")
    held_out_seed.to_csv(f"seed_4_type/{held_out}/seed_{held_out}.csv",index=False)
    _seeds[held_out] = held_out_seed

seed_dos = _seeds['dos']
seed_probe = _seeds['probe']
seed_r2l = _seeds['r2l']
seed_u2r = _seeds['u2r']

trainset_sampled: 1720
seed_dos: 720
seed_probe: 1220
seed_r2l: 1511
seed_u2r: 1709
