In [45]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import GRU, Dense
from keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import random


In [38]:
import pandas as pd
import numpy as np
from keras.utils import to_categorical

# Load the scan results and keep only the rows whose Resultnaxsi value is 403.
df = pd.read_csv('jianceresults20000-40.csv')  # change to the actual file path
df = df[df['Resultnaxsi'].eq(403)]
payloads = df['Payload'].values
print(len(payloads))

# Special markers for the beginning and the end of a sequence.
start_token = "<start>"
end_token = "<end>"

# Vocabulary: the two markers first (indices 0 and 1), followed by every
# distinct character that occurs in any payload, in sorted order.
chars = [start_token, end_token] + sorted(set(''.join(payloads)))

# Lookup tables between vocabulary symbols and their integer indices.
char_to_index = dict(zip(chars, range(len(chars))))
index_to_char = dict(enumerate(chars))

# Integer-encode a payload (despite the name, no one-hot vectors are built here).
def one_hot_encode(payload):
    """Return the vocabulary index of every character in ``payload``.

    Looks each character up in the notebook-level ``char_to_index`` mapping,
    so a character missing from that mapping raises ``KeyError``.
    """
    indices = []
    for symbol in payload:
        indices.append(char_to_index[symbol])
    return indices

# Convert a sequence of integer indices into a one-hot matrix.
def integer_to_one_hot(encoded_payload, num_classes=None):
    """Return a ``(len(encoded_payload), num_classes)`` float one-hot matrix.

    Parameters
    ----------
    encoded_payload : sequence of int
        Vocabulary indices, one per timestep.
    num_classes : int, optional
        Width of each one-hot row. Defaults to ``len(chars)`` (the
        notebook-level vocabulary), which is what the original
        single-argument call used.

    Returns
    -------
    numpy.ndarray
        Row ``i`` is all zeros except for a 1 in column ``encoded_payload[i]``.

    Raises
    ------
    IndexError
        If an index falls outside ``[-num_classes, num_classes)``.
    """
    if num_classes is None:
        num_classes = len(chars)
    indices = np.asarray(encoded_payload, dtype=np.intp).reshape(-1)
    one_hot_encoded = np.zeros((indices.size, num_classes))
    # Set one cell per row in a single vectorised assignment.
    one_hot_encoded[np.arange(indices.size), indices] = 1
    return one_hot_encoded

# Build the training set for next-symbol prediction.
sequence_length = 50  # encoded length of every sample, including the leading <start>
X, y = [], []

for payload in payloads:
    # Prefix the integer-encoded payload with the <start> index.
    payload_encoded = [char_to_index[start_token]] + one_hot_encode(payload)

    # Short sequences are right-padded with the <end> index up to
    # sequence_length (a negative count adds nothing); long ones are cut to
    # sequence_length.
    payload_encoded += [char_to_index[end_token]] * (sequence_length - len(payload_encoded))
    payload_encoded = payload_encoded[:sequence_length]

    # Input is the first sequence_length - 1 symbols; the target is the same
    # sequence shifted left by one position.
    X.append(payload_encoded[:-1])
    y.append(payload_encoded[1:])

# One-hot encode every sequence and stack the results into NumPy arrays.
X = np.array(list(map(integer_to_one_hot, X)))
y = np.array(list(map(integer_to_one_hot, y)))

# Print the shapes of X and y to confirm the data was generated as expected.
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

from keras.models import Sequential
from keras.layers import GRU, Dense
from keras.optimizers import Adam

# Build the GRU model: 256 GRU units emitting an output at every timestep,
# followed by a softmax layer sized to the vocabulary.
model = Sequential([
    GRU(256, input_shape=X.shape[1:], return_sequences=True),
    Dense(len(chars), activation='softmax'),
])

# Compile the model.
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary.
model.summary()

# Train the GRU model.
model.fit(X, y, batch_size=128, epochs=50)

4601
Shape of X: (4601, 49, 52)
Shape of y: (4601, 49, 52)
Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_6 (GRU)                 (None, 49, 256)           238080    
                                                                 
 dense_6 (Dense)             (None, 49, 52)            13364     
                                                                 
Total params: 251444 (982.20 KB)
Trainable params: 251444 (982.20 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50

<keras.src.callbacks.History at 0x7f061c108790>

In [39]:
# Width of a single one-hot vector in X.
X.shape[2]

52

In [14]:
# Display the vocabulary list.
chars

['<start>',
 '<end>',
 '!',
 '%',
 '&',
 "'",
 '(',
 ')',
 '+',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '<',
 '=',
 '>',
 '@',
 'C',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'i',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'x',
 '{',
 '|',
 '}',
 '~']

In [41]:
import numpy as np

def sample(predictions, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    Parameters
    ----------
    predictions : array-like of float
        Probabilities, one per vocabulary entry.
    temperature : float, default 1.0
        Values below 1 sharpen the distribution, values above 1 flatten it.
        ``temperature <= 0`` selects the most probable index deterministically.

    Returns
    -------
    int
        The sampled index.

    Raises
    ------
    ValueError
        If ``predictions`` is empty.
    """
    probabilities = np.asarray(predictions).astype('float64')
    if temperature <= 0:
        # Limit of the scaling as temperature -> 0; also avoids dividing by zero.
        return int(np.argmax(probabilities))

    # Small epsilon keeps log() finite for zero probabilities.
    scaled_logits = np.log(probabilities + 1e-8) / temperature
    # Shift so the largest logit is 0: the normalised result is unchanged, but
    # exp() can no longer underflow to zero for every entry at low temperature
    # (which previously produced a NaN distribution).
    scaled_logits -= np.max(scaled_logits)
    weights = np.exp(scaled_logits)
    weights = weights / np.sum(weights)

    return int(np.random.choice(len(weights), p=weights))

def generate_payload_with_sampling(model, start_token="<start>", end_token="<end>", sequence_length=100, max_length=200, temperature=1.0):
    """Generate one payload string by sampling the model symbol by symbol.

    The context always begins as ``[<start>]`` and grows with every sampled
    symbol. Each step feeds a window of ``sequence_length - 1`` timesteps to
    the model and reads the prediction at the position of the last real
    symbol, so the next symbol is conditioned on the generated prefix rather
    than on filler. Unused trailing positions are filled with ``end_token``,
    the same symbol the training cell uses for padding. Once the context is
    longer than the window, only the most recent symbols are kept.

    Relies on the notebook-level ``chars``, ``char_to_index``,
    ``index_to_char`` and ``sample``.

    Parameters
    ----------
    model : object
        Must expose ``predict(batch, verbose=0)`` returning per-timestep
        probabilities of shape ``(1, sequence_length - 1, len(chars))``.
        NOTE(review): reading an intermediate timestep assumes the model is
        causal (as a unidirectional GRU with ``return_sequences=True`` is) -
        confirm if another architecture is passed in.
    start_token, end_token : str
        Vocabulary symbols marking sequence start and end.
    sequence_length : int
        Encoded sequence length used at training time; the model window is
        ``sequence_length - 1`` timesteps.
    max_length : int
        Upper bound on ``len(result) + 1`` (the start token counts towards it).
    temperature : float
        Passed to ``sample``.

    Returns
    -------
    str
        The generated characters, without the start or end token.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 2.
    """
    window = sequence_length - 1
    if window < 1:
        raise ValueError("sequence_length must be at least 2")

    vocab_size = len(chars)
    pad_index = char_to_index[end_token]

    token_ids = [char_to_index[start_token]]
    generated_chars = []

    # The start token counts towards max_length, as in the original loop.
    while len(generated_chars) < max_length - 1:
        # Most recent symbols that fit in the model window.
        context = token_ids[-window:]
        read_position = len(context) - 1

        window_one_hot = np.zeros((1, window, vocab_size))
        window_one_hot[0, np.arange(len(context)), context] = 1
        # Fill the unused tail with the training-time padding symbol.
        window_one_hot[0, len(context):, pad_index] = 1

        predictions = model.predict(window_one_hot, verbose=0)

        # Distribution over the symbol that follows the last real one.
        predicted_index = int(sample(predictions[0, read_position], temperature))
        predicted_char = index_to_char[predicted_index]

        # Stop as soon as the model emits the end marker.
        if predicted_char == end_token:
            break

        generated_chars.append(predicted_char)
        token_ids.append(predicted_index)

    return ''.join(generated_chars)

# Generate one payload with the trained model; adjust the temperature to
# control how random the sampling is.
generated_payload = generate_payload_with_sampling(
    model,
    sequence_length=sequence_length,
    temperature=0.7,
)
print("Generated Payload:", generated_payload, sep="\n")


Generated Payload:
+&&%0bnot%7e0%0bor%270


In [None]:
import pandas as pd

# Sample 4000 payloads from the model, echoing each one as it is produced.
generations = []
for sample_number in range(4000):
    generated_text = generate_payload_with_sampling(
        model,
        sequence_length=sequence_length,
        temperature=1,
    )
    print(generated_text)
    generations.append(generated_text)

# Collect the payloads in a single-column DataFrame and write it to CSV.
output_csv = 'generated_payloads.csv'  # output file name, change as needed
df_generations = pd.DataFrame({"Generated_Payload": generations})
df_generations.to_csv(output_csv, index=False)

# Report where the results were saved.
print(f"Generated payloads saved to {output_csv}")

In [25]:
import pandas as pd

def find_max_payload_length(csv_file):
    """Print and return the length of the longest entry in the 'Payload' column.

    Parameters
    ----------
    csv_file : str or path-like
        CSV file containing a 'Payload' column.

    Returns
    -------
    int
        Character count of the longest payload, or 0 when the column holds no
        values.

    Raises
    ------
    KeyError
        If the file has no 'Payload' column.
    """
    # Read the column as text so numeric-looking payloads keep their exact
    # characters instead of being parsed into numbers.
    df = pd.read_csv(csv_file, dtype={'Payload': str})

    # Empty cells are read as NaN, which has no len(); skip them.
    payload_column = df['Payload'].dropna()

    if payload_column.empty:
        max_length = 0
    else:
        max_length = int(payload_column.str.len().max())

    # Report the maximum length.
    print(f"The maximum length of the elements in the 'Payload' column is: {max_length}")
    return max_length

# Run the maximum-length computation.
csv_file = 'output40.csv'  # make sure the file path is correct
max_length = find_max_payload_length(csv_file)


The maximum length of the elements in the 'Payload' column is: 81
