In [10]:
# Load the raw train/test CSV files from the working directory.
import pandas as pd

train_data = pd.read_csv("KDDTrain.csv")
test_data = pd.read_csv("KDDTest.csv")

In [11]:
# Encode the categorical feature columns of both splits as integers.
from sklearn.preprocessing import LabelEncoder

# One encoder per column, fitted on the values of BOTH splits and then applied
# to each split. Calling fit_transform separately on train and test derives the
# codes from each split's own sorted vocabulary, so the same category can end
# up with different integers in the two files whenever the vocabularies differ.
for column in ('protocol_type', 'service', 'flag'):
    label_encoder = LabelEncoder()
    label_encoder.fit(
        pd.concat([train_data[column], test_data[column]], ignore_index=True)
    )
    train_data[column] = label_encoder.transform(train_data[column])
    test_data[column] = label_encoder.transform(test_data[column])

# Save the label-encoded data
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

In [12]:
# Min-max normalise every column except 'class' and 'num'.
from sklearn.preprocessing import MinMaxScaler

train_scaler = MinMaxScaler()
# The test split must be scaled with the statistics learned from the training
# split; fitting a second scaler on the test data maps the two splits with
# different min/max values, so equal raw values no longer compare equal.
# `test_scaler` is kept as an alias so existing references still resolve.
test_scaler = train_scaler

# Feature columns only (drop 'class' and 'num')
train_features = train_data.drop(['class', 'num'], axis=1)
test_features = test_data.drop(['class', 'num'], axis=1)

# Fit on train, then apply the same transform to test. Test values outside the
# training range will fall outside [0, 1].
# The source index is passed through so that re-attaching 'class' below aligns
# row-for-row even if the frames do not carry a default RangeIndex.
# NOTE: as before, the feature columns are written out under positional
# integer names (0, 1, ...) rather than their original names.
train_normalized = pd.DataFrame(
    train_scaler.fit_transform(train_features), index=train_features.index
)
test_normalized = pd.DataFrame(
    train_scaler.transform(test_features), index=test_features.index
)

# Re-attach the 'class' column to the normalised features
train_normalized['class'] = train_data['class']
test_normalized['class'] = test_data['class']

# From here on train_data / test_data refer to the normalised frames; the CSVs
# written by the previous step are overwritten.
train_data = train_normalized
test_data = test_normalized
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

In [13]:
# Split each dataset into "self" rows (class == 'normal') and "non-self" rows
# (every other class); the 'class' column is removed from the resulting tables.
train_is_normal = train_data['class'] == 'normal'
test_is_normal = test_data['class'] == 'normal'

# Self data
train_self = train_data[train_is_normal].drop(columns='class')
test_self = test_data[test_is_normal].drop(columns='class')
print('train_self:', len(train_self), sep='\n')
print('test_self:', len(test_self), sep='\n')
test_self.to_csv("self/test_self.csv", index=False)
train_self.to_csv("self/train_self.csv", index=False)

# Non-self data
train_nonself = train_data[~train_is_normal].drop(columns='class')
test_nonself = test_data[~test_is_normal].drop(columns='class')
print('train_nonself:', len(train_nonself), sep='\n')
print('test_nonself:', len(test_nonself), sep='\n')
test_nonself.to_csv("nonself/test_nonself.csv", index=False)
train_nonself.to_csv("nonself/train_nonself.csv", index=False)


train_self:
13449
test_self:
9711
train_nonself:
11743
test_nonself:
12833


In [14]:
# Print the distinct 'class' labels present in the training and test sets.
for heading, frame in (
    ("训练集中的攻击类型:", train_data),
    ("\n测试集中的攻击类型:", test_data),
):
    print(heading)
    print(frame['class'].unique())


训练集中的攻击类型:
['normal' 'neptune' 'warezclient' 'ipsweep' 'portsweep' 'teardrop' 'nmap'
 'satan' 'smurf' 'pod' 'back' 'guess_passwd' 'ftp_write' 'multihop'
 'rootkit' 'buffer_overflow' 'imap' 'warezmaster' 'phf' 'land'
 'loadmodule' 'spy']

测试集中的攻击类型:
['neptune' 'normal' 'saint' 'mscan' 'guess_passwd' 'smurf' 'apache2'
 'satan' 'buffer_overflow' 'back' 'warezmaster' 'snmpgetattack'
 'processtable' 'pod' 'httptunnel' 'nmap' 'ps' 'snmpguess' 'ipsweep'
 'mailbomb' 'portsweep' 'multihop' 'named' 'sendmail' 'loadmodule' 'xterm'
 'worm' 'teardrop' 'rootkit' 'xlock' 'perl' 'land' 'xsnoop' 'sqlattack'
 'ftp_write' 'imap' 'udpstorm' 'phf']


In [15]:
# Attack labels grouped by the coarse family each one is mapped to.
_attacks_by_family = {
    'dos': [
        'back', 'land', 'neptune', 'pod', 'smurf', 'teardrop', 'udpstorm',
        'processtable', 'mailbomb', 'apache2',
    ],
    'u2r': [
        'buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'ps', 'xterm',
        'sqlattack',
    ],
    'r2l': [
        'ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'warezmaster',
        'snmpgetattack', 'httptunnel', 'snmpguess', 'named', 'sendmail',
        'xlock', 'xsnoop', 'worm', 'warezclient', 'spy',
    ],
    'probe': ['ipsweep', 'nmap', 'portsweep', 'satan', 'saint', 'mscan'],
}

# Flat lookup: specific attack label -> family name
attack_mapping = {
    attack: family
    for family, attacks in _attacks_by_family.items()
    for attack in attacks
}


def _to_family(label):
    """Return the family for `label`; labels missing from the mapping are returned unchanged."""
    return attack_mapping.get(label, label)


# Replace the specific attack labels with their family in both splits
train_data['class'] = train_data['class'].map(_to_family)
test_data['class'] = test_data['class'].map(_to_family)

print("映射后训练集中的攻击类型:")
print(train_data['class'].unique())
print("\n映射后测试集中的攻击类型:")
print(test_data['class'].unique())


映射后训练集中的攻击类型:
['normal' 'dos' 'r2l' 'probe' 'u2r']

映射后测试集中的攻击类型:
['dos' 'normal' 'probe' 'r2l' 'u2r']


In [16]:
# Persist the family-mapped splits
for frame, path in (
    (train_data, "4_type_train_data.csv"),
    (test_data, "4_type_test_data.csv"),
):
    frame.to_csv(path, index=False)

# Number of rows per class in each split
train_class_counts = train_data['class'].value_counts()
test_class_counts = test_data['class'].value_counts()

print("训练集中各类别的数量:")
print(train_class_counts)

print("\n测试集中各类别的数量:")
print(test_class_counts)


训练集中各类别的数量:
class
normal    13449
dos        9234
probe      2289
r2l         209
u2r          11
Name: count, dtype: int64

测试集中各类别的数量:
class
normal    9711
dos       7458
r2l       2887
probe     2421
u2r         67
Name: count, dtype: int64


In [17]:
# (Four families) split the test and training sets by attack family and report
# each family's share of the corresponding non-self set.
_families = ('dos', 'probe', 'r2l', 'u2r')


def _family_rows(frame, family):
    """Return the rows of `frame` whose 'class' equals `family`, without the 'class' column."""
    return frame[frame['class'] == family].drop(columns='class')


def _print_shares(split, total, rows_by_family):
    """Print the total, then the row count and percentage of `total` for each family."""
    print(f"{split}_Total: {total}")
    for family, rows in rows_by_family.items():
        print(f"{split}_{family}: {len(rows)} ({len(rows) / total:.2%})")


# (Four families) test set: one table per family, each written to its own file
_test_by_family = {family: _family_rows(test_data, family) for family in _families}
for family, rows in _test_by_family.items():
    rows.to_csv(f"unknown/{family}.csv", index=False)
test_dos, test_probe, test_r2l, test_u2r = _test_by_family.values()

# Shares are relative to the test non-self set built in the self/non-self split
total_len = len(test_nonself)
_print_shares("test", total_len, _test_by_family)

print('\n')

# (Four families) training set: same split, kept in memory only
_train_by_family = {family: _family_rows(train_data, family) for family in _families}
train_dos, train_probe, train_r2l, train_u2r = _train_by_family.values()

# Shares are relative to the training non-self set
total_len = len(train_nonself)
_print_shares("train", total_len, _train_by_family)


test_Total: 12833
test_dos: 7458 (58.12%)
test_probe: 2421 (18.87%)
test_r2l: 2887 (22.50%)
test_u2r: 67 (0.52%)


train_Total: 11743
train_dos: 9234 (78.63%)
train_probe: 2289 (19.49%)
train_r2l: 209 (1.78%)
train_u2r: 11 (0.09%)


In [18]:
# (Four families) down-sample the training attacks, then build one
# leave-one-family-out "seed" set per family.

# Requested rows per family. The r2l/u2r figures equal the class sizes printed
# by the split step, i.e. those two families are taken whole.
_sample_sizes = {'dos': 1000, 'probe': 500, 'r2l': 209, 'u2r': 11}
_train_by_family = {
    'dos': train_dos,
    'probe': train_probe,
    'r2l': train_r2l,
    'u2r': train_u2r,
}

# DataFrame.sample raises ValueError when n exceeds the number of rows (without
# replacement), so each request is capped at what is actually available. With
# the class sizes shown above this yields exactly the same samples as before.
_sampled = {
    family: rows.sample(n=min(_sample_sizes[family], len(rows)), random_state=42)
    for family, rows in _train_by_family.items()
}
trainset_dos_sampled = _sampled['dos']
trainset_probe_sampled = _sampled['probe']
trainset_r2l_sampled = _sampled['r2l']
trainset_u2r_sampled = _sampled['u2r']

# All four sampled families together, shuffled
trainset_sampled = pd.concat(list(_sampled.values()), axis=0)
trainset_sampled = trainset_sampled.sample(frac=1, random_state=42)
print(f"trainset_sampled: {len(trainset_sampled)}")
trainset_sampled.to_csv("4_type_trainset_sampled.csv", index=False)

# seed_<family> = the sampled rows of every OTHER family (concatenated in
# dos, probe, r2l, u2r order), shuffled and written to that family's folder.
_seeds = {}
for held_out in _sampled:
    seed = pd.concat(
        [rows for family, rows in _sampled.items() if family != held_out],
        axis=0,
    )
    seed = seed.sample(frac=1, random_state=42)
    print(f"seed_{held_out}: {len(seed)}")
    seed.to_csv(f"seed_4_type/{held_out}/seed_{held_out}.csv", index=False)
    _seeds[held_out] = seed

seed_dos = _seeds['dos']
seed_probe = _seeds['probe']
seed_r2l = _seeds['r2l']
seed_u2r = _seeds['u2r']

trainset_sampled: 1720
seed_dos: 720
seed_probe: 1220
seed_r2l: 1511
seed_u2r: 1709
