In [1]:
import numpy as np
import pandas as pd

from keras.models import Model, Sequential
from keras.layers import Input, Dense,Activation
from keras.optimizers import RMSprop
from keras.layers.merge import _Merge
from keras import backend as K
from functools import partial
from sklearn.metrics import mean_squared_error

Using TensorFlow backend.


# 1. 定义所需函数

In [2]:
def convertstringtonumber(df, lst):
    """Replace every value listed in ``lst`` by its position in that list.

    Args:
        df: pandas object exposing ``replace`` (a DataFrame in this notebook).
        lst: Sequence of values to encode; ``lst[i]`` becomes ``i``.

    Returns:
        The object obtained by applying the replacements one after another.
        Matching is by value, so any column containing a listed value is
        affected. The input object itself is not modified.
    """
    encoded = df
    for code, label in enumerate(lst):
        encoded = encoded.replace(label, code)
    return encoded

def scalex(X):
    """Min-max scale ``X`` linearly into the range [0, 1].

    Args:
        X: Numeric numpy array, pandas Series or pandas DataFrame. ``X.min()``
           and ``X.max()`` decide the range, so a DataFrame is scaled per
           column while an array or Series is scaled over all its values.

    Returns:
        An object shaped like ``X`` whose smallest value maps to 0.0 and whose
        largest maps to 1.0. A constant input (max == min) maps to 0.0
        everywhere instead of producing NaN through a 0/0 division.
    """
    nmin, nmax = 0.0, 1.0
    lowest = X.min()
    span = X.max() - lowest
    # Avoid dividing by zero for constant data: adding the boolean
    # ``span == 0`` bumps a zero range to 1 and leaves any other range
    # unchanged (x + False == x), for scalars and per-column Series alike.
    span = span + (span == 0)
    X_std = (X - lowest) / span
    X_scaled = X_std * (nmax - nmin) + nmin
    return X_scaled

def calcrmse(X_train, gensamples):
    """Return the mean of the per-column RMSE between two 2-D arrays.

    For every column index of ``X_train`` the root of ``mean_squared_error``
    between that column of ``X_train`` and the same column of ``gensamples``
    is computed; the column results are then averaged.

    Args:
        X_train: 2-D array indexed as ``[:, col]``.
        gensamples: 2-D array with at least as many columns as ``X_train``.

    Returns:
        Sum of the per-column RMSE values divided by the number of columns.
    """
    n_columns = X_train.shape[1]
    per_column_rmse = [
        np.sqrt(mean_squared_error(X_train[:, idx], gensamples[:, idx]))
        for idx in range(n_columns)
    ]
    return np.sum(per_column_rmse) / n_columns

def wasserstein_loss(y_true, y_pred):
    """Wasserstein loss: mean of the element-wise product ``y_true * y_pred``.

    The training loop in this notebook passes +1 / -1 arrays as ``y_true``,
    so the label only selects the sign applied to the model output.
    """
    signed_scores = y_true * y_pred
    return K.mean(signed_scores)

def gradient_penalty_loss(y_true, y_pred, averaged_samples, lamba_reg):
    """Gradient-penalty term of the GP-WGAN loss.

    Computes ``lamba_reg * (1 - ||g||_2) ** 2`` averaged over all entries,
    where ``g`` is the gradient of ``y_pred`` with respect to
    ``averaged_samples`` and the norm is taken over every axis except the
    first.

    Args:
        y_true: Unused; present only because Keras losses take (y_true, y_pred).
        y_pred: Model output tensor to differentiate.
        averaged_samples: Tensor the gradient is taken with respect to.
        lamba_reg: Weight of the penalty.

    Returns:
        Scalar tensor holding the mean penalty.
    """
    grads = K.gradients(y_pred, averaged_samples)[0]
    squared = K.square(grads)
    # Reduce over all axes but the first one.
    reduce_axes = np.arange(1, len(squared.shape))
    l2_norm = K.sqrt(K.sum(squared, axis=reduce_axes))
    penalty = lamba_reg * K.square(1 - l2_norm)
    return K.mean(penalty)

class RandomWeightedAverage(_Merge):
    """Merge layer producing a random weighted average of its two inputs.

    One uniform random weight is drawn per row. The weight tensor has shape
    ``(BATCH_SIZE, 1)``, so the layer relies on the module-level ``BATCH_SIZE``
    being defined before the layer is called.
    """

    def _merge_function(self, inputs):
        first, second = inputs[0], inputs[1]
        mix = K.random_uniform((BATCH_SIZE, 1))
        return (mix * first) + ((1 - mix) * second)

def generate_samples(generator_model, noise_dim, num_samples):
    """Generate samples by running the model on uniform random noise.

    Args:
        generator_model: Object exposing ``predict``.
        noise_dim: Number of columns of the noise matrix.
        num_samples: Number of rows of the noise matrix.

    Returns:
        Whatever ``generator_model.predict`` returns for a
        ``(num_samples, noise_dim)`` matrix drawn with ``np.random.rand``.
    """
    noise = np.random.rand(num_samples, noise_dim)
    return generator_model.predict(noise)

def writetocsv(mtrx, flnm):
    """Write a matrix to a comma-separated file without index or header.

    Args:
        mtrx: Data accepted by ``pd.DataFrame`` (e.g. a 2-D numpy array).
        flnm: Target path, or any buffer accepted by ``DataFrame.to_csv``.

    When ``flnm`` is a path, its parent directory is created if missing, so a
    long run does not fail at the very end because the output folder is absent.
    """
    import os  # local import: the notebook's import cell does not provide os

    if isinstance(flnm, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(flnm))
        if parent:
            os.makedirs(parent, exist_ok=True)
    dtfrm = pd.DataFrame(mtrx)
    # Explicit False (rather than None) is the documented way to omit both.
    dtfrm.to_csv(flnm, sep=',', index=False, header=False)

# 2. 读取数据集

In [3]:
# Column labels for the training file, in file order. The last two entries
# are the label column ("attack_type") and "Class", which is dropped below.
col_names = [
    "Duration", "Protocol_type", "Service", "Flag",
    "Src_bytes", "Dst_bytes", "Land", "Wrong_fragment",
    "Urgent", "Hot", "Num_failed_logins", "Logged_in",
    "Num_compromised", "Root_shell", "Su_attempted", "Num_root",
    "Num_file_creations", "Num_shells", "Num_access_files", "Num_outbound_cmds",
    "Is_hot_login", "Is_guest_login", "Count", "Srv_count",
    "Serror_rate", "Srv_serror_rate", "Rerror_rate", "Srv_rerror_rate",
    "Same_srv_rate", "Diff_srv_rate", "Srv_diff_host_rate", "Dst_host_count",
    "Dst_host_srv_count", "Dst_host_same_srv_rate", "Dst_host_diff_srv_rate",
    "Dst_host_same_src_port_rate", "Dst_host_srv_diff_host_rate",
    "Dst_host_serror_rate", "Dst_host_srv_serror_rate", "Dst_host_rerror_rate",
    "Dst_host_srv_rerror_rate", "attack_type", "Class",
]

# The file is read without a header row; labels are attached afterwards.
df_train = pd.read_csv("./dataset/KDDTrain+.txt", sep=",", header=None)
df_train.columns = col_names

# "Class" is not used by the rest of the notebook.
df_train = df_train.drop('Class', axis=1)

print('Dimensions of the Training set:', df_train.shape)

df_train.head()

Dimensions of the Training set: (125973, 42)


Unnamed: 0,Duration,Protocol_type,Service,Flag,Src_bytes,Dst_bytes,Land,Wrong_fragment,Urgent,Hot,...,Dst_host_srv_count,Dst_host_same_srv_rate,Dst_host_diff_srv_rate,Dst_host_same_src_port_rate,Dst_host_srv_diff_host_rate,Dst_host_serror_rate,Dst_host_srv_serror_rate,Dst_host_rerror_rate,Dst_host_srv_rerror_rate,attack_type
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


# 3. 数据预处理

In [4]:
# Known values of the three string-valued columns; each value is encoded as
# its position in the corresponding list.
protocol_type = ['icmp', 'tcp', 'udp']
service = ['IRC','X11','Z39_50','aol','auth','bgp','courier','csnet_ns','ctf',
           'daytime','discard','domain','domain_u','echo','eco_i','ecr_i','efs',
           'exec','finger','ftp','ftp_data','gopher','harvest','hostnames','http',
           'http_2784','http_443','http_8001','imap4','iso_tsap','klogin','kshell',
           'ldap','link','login','mtp','name','netbios_dgm','netbios_ns',
           'netbios_ssn','netstat','nnsp','nntp','ntp_u','other','pm_dump','pop_2',
           'pop_3','printer','private','red_i','remote_job','rje','shell','smtp',
           'sql_net','ssh','sunrpc','supdup','systat','telnet','tftp_u','tim_i',
           'time','urh_i','urp_i','uucp','uucp_path','vmnet','whois']
flag = ['OTH','REJ','RSTO','RSTOS0','RSTR','S0','S1','S2','S3','SF','SH']

for categories in (protocol_type, service, flag):
    df_train = convertstringtonumber(df_train, categories)

# Normalise the feature columns. col_names[:-2] leaves out the last two names
# ("attack_type" and "Class").
for name in col_names[:-2]:
    column = df_train[name]
    if not np.max(column) > 1:
        # Columns whose maximum does not exceed 1 are left untouched.
        continue
    if len(np.unique(column)) > 1:
        df_train[name] = scalex(column)
    else:
        # A single distinct value above 1: store the constant 1.
        df_train[name] = np.int64(1)

df_train.head()

Unnamed: 0,Duration,Protocol_type,Service,Flag,Src_bytes,Dst_bytes,Land,Wrong_fragment,Urgent,Hot,...,Dst_host_srv_count,Dst_host_same_srv_rate,Dst_host_diff_srv_rate,Dst_host_same_src_port_rate,Dst_host_srv_diff_host_rate,Dst_host_serror_rate,Dst_host_srv_serror_rate,Dst_host_rerror_rate,Dst_host_srv_rerror_rate,attack_type
0,0.0,0.5,0.289855,0.9,3.558064e-07,0.0,0,0.0,0.0,0.0,...,0.098039,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0.0,1.0,0.637681,0.9,1.057999e-07,0.0,0,0.0,0.0,0.0,...,0.003922,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0.0,0.5,0.710145,0.5,0.0,0.0,0,0.0,0.0,0.0,...,0.101961,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0.0,0.5,0.347826,0.9,1.681203e-07,6.223962e-06,0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0.0,0.5,0.347826,0.9,1.442067e-07,3.20626e-07,0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [5]:
labeldf_train = df_train['attack_type']

# Attack names grouped by numeric code. The later subset cell keeps code 0 for
# "Normal", 1 for "DoS", 2 for "Probe", 3 for "R2L" and 4 for "U2R".
attack_names_by_code = {
    0: ['normal'],
    1: ['neptune', 'back', 'land', 'pod', 'smurf', 'teardrop', 'mailbomb',
        'apache2', 'processtable', 'udpstorm'],
    2: ['ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint'],
    3: ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
        'warezclient', 'worm', 'warezmaster', 'sendmail', 'named',
        'snmpgetattack', 'snmpguess', 'xlock', 'xsnoop', 'httptunnel'],
    4: ['buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'ps', 'sqlattack',
        'xterm'],
}
attack_code_by_name = {
    attack: code
    for code, attacks in attack_names_by_code.items()
    for attack in attacks
}

# Replace each attack name by its numeric code.
newlabeldf_train = labeldf_train.replace(attack_code_by_name)

df_train['attack_type'] = newlabeldf_train

df_train.head()

Unnamed: 0,Duration,Protocol_type,Service,Flag,Src_bytes,Dst_bytes,Land,Wrong_fragment,Urgent,Hot,...,Dst_host_srv_count,Dst_host_same_srv_rate,Dst_host_diff_srv_rate,Dst_host_same_src_port_rate,Dst_host_srv_diff_host_rate,Dst_host_serror_rate,Dst_host_srv_serror_rate,Dst_host_rerror_rate,Dst_host_srv_rerror_rate,attack_type
0,0.0,0.5,0.289855,0.9,3.558064e-07,0.0,0,0.0,0.0,0.0,...,0.098039,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,0
1,0.0,1.0,0.637681,0.9,1.057999e-07,0.0,0,0.0,0.0,0.0,...,0.003922,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.5,0.710145,0.5,0.0,0.0,0,0.0,0.0,0.0,...,0.101961,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1
3,0.0,0.5,0.347826,0.9,1.681203e-07,6.223962e-06,0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0
4,0.0,0.5,0.347826,0.9,1.442067e-07,3.20626e-07,0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [6]:
# Codes to exclude for each subset: every code except the one being kept.
to_drop_attack_forgen_R2L = [0, 1, 2, 4]
to_drop_attack_forgen_U2R = [0, 1, 2, 3]
to_drop_attack_forgen_Normal = [1, 2, 3, 4]
to_drop_attack_forgen_DoS = [0, 2, 3, 4]
to_drop_attack_forgen_Probe = [0, 1, 3, 4]

# Each subset keeps the rows whose attack_type is NOT in its drop list.
attack_codes = df_train['attack_type']
attack_df_forgen_R2L = df_train.loc[~attack_codes.isin(to_drop_attack_forgen_R2L)]
attack_df_forgen_U2R = df_train.loc[~attack_codes.isin(to_drop_attack_forgen_U2R)]
attack_df_forgen_Normal = df_train.loc[~attack_codes.isin(to_drop_attack_forgen_Normal)]
attack_df_forgen_DoS = df_train.loc[~attack_codes.isin(to_drop_attack_forgen_DoS)]
attack_df_forgen_Probe = df_train.loc[~attack_codes.isin(to_drop_attack_forgen_Probe)]

for subset in (attack_df_forgen_R2L, attack_df_forgen_U2R, attack_df_forgen_Normal,
               attack_df_forgen_DoS, attack_df_forgen_Probe):
    print(subset.shape)

(995, 42)
(52, 42)
(67343, 42)
(45927, 42)
(11656, 42)


In [7]:
# Preview the first five rows of the R2L subset.
attack_df_forgen_R2L.head()

Unnamed: 0,Duration,Protocol_type,Service,Flag,Src_bytes,Dst_bytes,Land,Wrong_fragment,Urgent,Hot,...,Dst_host_srv_count,Dst_host_same_srv_rate,Dst_host_diff_srv_rate,Dst_host_same_src_port_rate,Dst_host_srv_diff_host_rate,Dst_host_serror_rate,Dst_host_srv_serror_rate,Dst_host_rerror_rate,Dst_host_srv_rerror_rate,attack_type
13,0.0,0.5,0.289855,0.9,2.420353e-07,0.0,0,0.0,0.0,0.0,...,0.078431,1.0,0.0,1.0,0.2,0.0,0.0,0.0,0.0,3
48,0.0,0.5,0.289855,0.9,2.420353e-07,0.0,0,0.0,0.0,0.0,...,0.14902,1.0,0.0,1.0,0.18,0.0,0.0,0.0,0.0,3
148,0.0,0.5,0.289855,0.9,2.420353e-07,0.0,0,0.0,0.0,0.0,...,0.101961,1.0,0.0,1.0,0.19,0.0,0.0,0.0,0.0,3
190,0.353291,0.5,0.275362,0.9,2.536298e-07,9.046234e-07,0,0.0,0.0,0.077922,...,0.556863,0.56,0.02,0.0,0.0,0.0,0.0,0.0,0.0,3
222,9.3e-05,0.5,0.289855,0.9,6.029143e-07,0.0,0,0.0,0.0,0.0,...,0.109804,1.0,0.0,1.0,0.18,0.0,0.0,0.0,0.0,3


# 4. 定义GAN框架

In [8]:
# Training configuration. The data selected below and the CSV written at the
# end of this cell are both for the Probe subset.
GRADIENT_PENALTY_WEIGHT = 0.01  # weight of the gradient-penalty term (author's alternative: 0.1)
MAX_SIM = 10000                 # upper bound on the number of real rows used
MAX_EPOCH = 100                 # author's alternative: 15000
TRAINING_RATIO = 99             # discriminator batches per generator batch
BATCH_SIZE = 100

# Real samples: the first MAX_SIM Probe rows, without the last column
# (attack_type), shuffled in place along the row axis.
probe_features = attack_df_forgen_Probe.iloc[:MAX_SIM, :-1]
X_train = np.asarray(probe_features)
np.random.shuffle(X_train)

### Building the model
def make_generator(noise_dim=100):
    """Build the generator network: noise vector -> synthetic record.

    The network takes ``noise_dim`` inputs, passes them through four
    64-unit ReLU layers (He-normal initialised) and ends in a linear layer
    with ``INPUT_DIM`` units, i.e. one output per feature of a real record.

    The previous version had the two widths swapped (``INPUT_DIM`` inputs,
    ``noise_dim`` outputs), which only worked because the notebook sets
    ``noise_dim == INPUT_DIM``; for that call the model built here is the same.

    Args:
        noise_dim: Length of the noise vector fed to the generator.

    Returns:
        An uncompiled ``Sequential`` model. Requires the module-level
        ``INPUT_DIM`` to be defined before the call.
    """
    model = Sequential()
    for position in range(4):
        # Only the first layer declares the input width.
        extra = {'input_dim': noise_dim} if position == 0 else {}
        model.add(Dense(64, kernel_initializer='he_normal', **extra))
        model.add(Activation('relu'))
    # Linear output layer: one unit per feature of a real record.
    model.add(Dense(units=INPUT_DIM, activation='linear'))
    return model
#
#    End of make_generator.
#    
def make_discriminator():
    """Build the discriminator network: record -> single linear score.

    Four 64-unit ReLU layers (He-normal initialised) followed by one linear
    output unit. The first layer expects ``INPUT_DIM`` inputs, so the
    module-level ``INPUT_DIM`` must be defined before the call.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = Sequential()
    for position in range(4):
        # Only the first layer declares the input width.
        extra = {'input_dim': INPUT_DIM} if position == 0 else {}
        model.add(Dense(64, kernel_initializer='he_normal', **extra))
        model.add(Activation('relu'))
    model.add(Dense(units=1, activation='linear'))
    return model
#
#    End of make_discriminator.
#
#     print("current_gradpenalty:", GRADIENT_PENALTY_WEIGHT)

# Width of one real record; the noise vector is given the same width.
INPUT_DIM = X_train.shape[1]
noise_dim = INPUT_DIM

generator = make_generator(noise_dim)
discriminator = make_discriminator() # create the generator and the discriminator

# --- Model used to train the generator: noise -> generator -> discriminator ---
# The discriminator is frozen before this model is compiled.
# NOTE(review): this relies on Keras using the trainable flags in effect at
# compile() time - confirm against the Keras version in use.
for layer in discriminator.layers: 
    layer.trainable = False
discriminator.trainable = False # freeze the discriminator; the learning process must be configured (compile) before training

generator_input = Input(shape=(noise_dim,)) # input layer: must be a tensor created by InputLayer or Input
generator_layers = generator(generator_input)
discriminator_layers_for_generator = discriminator(generator_layers)

generator_model = Model(inputs=[generator_input], outputs=[discriminator_layers_for_generator])
generator_model.compile(optimizer=RMSprop(lr=0.001, rho=0.9, epsilon=1e-6), loss=wasserstein_loss)# configure the optimizer and the loss function

# --- Model used to train the discriminator ---
# The flags are flipped before the second compile: discriminator trainable,
# generator frozen.
for layer in discriminator.layers: # unfreeze the discriminator, then freeze the generator
    layer.trainable = True
for layer in generator.layers:
    layer.trainable = False
discriminator.trainable = True
generator.trainable = False 


real_samples = Input(shape=X_train.shape[1:]) # shape of one real record
generator_input_for_discriminator = Input(shape=(noise_dim,))
generated_samples_for_discriminator = generator(generator_input_for_discriminator)
discriminator_output_from_generator = discriminator(generated_samples_for_discriminator)
discriminator_output_from_real_samples = discriminator(real_samples)

# Random per-row mix of real and generated samples; the gradient penalty is
# evaluated on the discriminator output for these mixed samples.
averaged_samples = RandomWeightedAverage()([real_samples, generated_samples_for_discriminator])
averaged_samples_out = discriminator(averaged_samples)

# Three outputs: score on real samples, score on generated samples, score on
# the mixed samples (used only by the gradient-penalty loss).
discriminator_model = Model(inputs=[real_samples, generator_input_for_discriminator], 
                            outputs=[discriminator_output_from_real_samples, discriminator_output_from_generator, 
                                     averaged_samples_out])


### the loss function takes more inputs than the standard y_true and y_pred 
### values usually required for a loss function. Therefore, we will make it partial.
partial_gp_loss = partial(gradient_penalty_loss, averaged_samples=averaged_samples, lamba_reg=GRADIENT_PENALTY_WEIGHT)
# A functools.partial object has no __name__ of its own, so one is assigned here.
# NOTE(review): presumably Keras needs it to name the loss - confirm.
partial_gp_loss.__name__ = 'gp_loss' 


# finally, we compile the model (one loss per output, in output order)
discriminator_model.compile(optimizer=RMSprop(lr=0.001, rho=0.9, epsilon=1e-6), 
                            loss=[wasserstein_loss, wasserstein_loss, partial_gp_loss])

# Targets: +1 for the real-sample output, -1 for the generated-sample output.
positive_y = np.ones((BATCH_SIZE, 1), dtype=np.float32)
negative_y = -positive_y
dummy_y = np.zeros((BATCH_SIZE, 1), dtype=np.float32) # placeholder target for the gradient-penalty output; that loss ignores y_true

# NOTE(review): range(MAX_EPOCH + 1) performs MAX_EPOCH + 1 passes - confirm
# the extra pass is intended.
for epoch in range(MAX_EPOCH + 1):
    np.random.shuffle(X_train)

    # Rows consumed per generator update: TRAINING_RATIO batches of BATCH_SIZE.
    minibatches_size = BATCH_SIZE * TRAINING_RATIO
    # NOTE(review): when X_train has fewer than BATCH_SIZE * TRAINING_RATIO
    # rows this range is empty and neither model is trained in the epoch;
    # leftover rows beyond a whole multiple are skipped for the epoch.
    for i in range(int(X_train.shape[0] // (BATCH_SIZE * TRAINING_RATIO))):
        discriminator_minibatches = X_train[i * minibatches_size:(i + 1) * minibatches_size]
        for j in range(TRAINING_RATIO):
            sample_batch = discriminator_minibatches[j * BATCH_SIZE:(j + 1) * BATCH_SIZE]
            # Uniform noise in [0, 1) for the generator input.
            noise = np.random.rand(BATCH_SIZE, noise_dim).astype(np.float32)

            discriminator_model.train_on_batch([sample_batch, noise], [positive_y, negative_y, dummy_y])

        # One generator update after TRAINING_RATIO discriminator updates.
        generator_model.train_on_batch(np.random.rand(BATCH_SIZE, noise_dim), positive_y)

generatorgpwgan = generator
generated = generate_samples(generatorgpwgan, noise_dim, 10000) # the generator's predict() output is the generated attack traffic
writetocsv(generated, "./generated_sample/generated_Probe_10000.csv")