# In [None]:
import torch
from tqdm import tqdm
import os
import pandas as pd
tqdm.pandas()
import datetime
import argparse
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Model, GPT2PreTrainedModel
from transformers.modeling_utils import top_k_top_p_filtering
from torch import nn
from torch.nn import Identity
import torch.nn.functional as F
import numpy as np


class ValueHead(nn.Module):
    """Head for GPT2 that maps each token's hidden state to a value.

    The head is a fixed pipeline, dropout -> projection -> activation ->
    dropout, in which every stage falls back to ``Identity`` unless the
    config switches it on:

    * ``summary_use_proj``: use a linear projection; its output width is
      ``config.num_labels`` when ``summary_proj_to_labels`` is set and
      ``num_labels > 0``, otherwise ``config.hidden_size``.
    * ``summary_activation == "tanh"``: apply ``tanh`` after the projection.
    * ``summary_first_dropout`` / ``summary_last_dropout`` (> 0): dropout
      before the projection / after the activation.

    Raises:
        NotImplementedError: if ``config.summary_type`` is ``"attn"``.
    """

    def __init__(self, config):
        super().__init__()
        # When True, forward() detaches its input before the head runs.
        self.detach_head = False
        self.summary_type = getattr(config, "summary_type", "last")
        if self.summary_type == "attn":
            raise NotImplementedError

        # Sub-modules are assigned in the same order as before so the
        # module / state_dict layout is unchanged.
        if getattr(config, "summary_use_proj", False):
            to_labels = getattr(config, "summary_proj_to_labels", False) and config.num_labels > 0
            out_features = config.num_labels if to_labels else config.hidden_size
            self.summary = nn.Linear(config.hidden_size, out_features)
        else:
            self.summary = Identity()

        if getattr(config, "summary_activation", None) == "tanh":
            self.activation = nn.Tanh()
        else:
            self.activation = Identity()

        if getattr(config, "summary_first_dropout", 0) > 0:
            self.first_dropout = nn.Dropout(config.summary_first_dropout)
        else:
            self.first_dropout = Identity()

        if getattr(config, "summary_last_dropout", 0) > 0:
            self.last_dropout = nn.Dropout(config.summary_last_dropout)
        else:
            self.last_dropout = Identity()

        # Not used by forward(); kept so the module keeps the same attributes.
        self.flatten = nn.Flatten()

    def forward(self, hidden_states, cls_index=None):
        """Run the head on ``hidden_states``; ``cls_index`` is accepted but ignored."""
        features = hidden_states.detach() if self.detach_head else hidden_states
        pipeline = (self.first_dropout, self.summary, self.activation, self.last_dropout)
        for stage in pipeline:
            features = stage(features)
        return features


class GPT2HeadWithValueModel(GPT2PreTrainedModel):
    """GPT2 language model with a second head (``ValueHead``) on the same hidden states."""

    def __init__(self, config):
        super().__init__(config)
        # ValueHead reads num_labels as its projection width when the config
        # enables projection to labels.
        config.num_labels = 1
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.v_head = ValueHead(config)
        self.init_weights()

    def get_output_embeddings(self):
        """Return the language-model head."""
        return self.lm_head

    def detach_value_head(self):
        """Make the value head detach its input from now on."""
        self.v_head.detach_head = True

    def forward(
        self,
        input_ids=None,
        past=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        mc_token_ids=None,
        lm_labels=None,
        mc_labels=None,
    ):
        """Run the transformer and both heads.

        ``mc_token_ids``, ``lm_labels`` and ``mc_labels`` are accepted but not
        used. Returns a tuple ``(lm_logits, *extra_transformer_outputs, value)``
        where ``value`` is the value-head output with its last dimension
        squeezed (presumably one scalar per token when the head projects to
        a single label; verify against the config in use).
        """
        backbone_out = self.transformer(
            input_ids,
            past=past,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        hidden = backbone_out[0]
        token_logits = self.lm_head(hidden)
        token_values = self.v_head(hidden).squeeze(-1)
        # Keep whatever else the transformer returned between the two heads.
        return (token_logits,) + backbone_out[1:] + (token_values,)

def respond_to_batch(model , queries, txt_len=20, top_k=0, top_p=1.0):
    """Sample ``txt_len`` continuation tokens for every query in the batch.

    Each step runs ``model`` on the whole sequence generated so far, filters
    the logits of the last position with top-k / top-p, samples one token per
    row and appends it. No gradients are recorded.

    Args:
        model: callable returning a tuple whose element 0 holds the logits
            (indexed ``[batch, position, vocab]``) and whose element 2 is
            returned as the second result (presumably the value-head output
            of ``GPT2HeadWithValueModel``; verify for other models).
        queries: tensor of prompt token ids, one row per query.
        txt_len: number of tokens to sample; must be at least 1.
        top_k: forwarded to ``top_k_top_p_filtering``.
        top_p: forwarded to ``top_k_top_p_filtering``.

    Returns:
        ``(input_ids, outputs[2], outputs[0])``: the prompts with the sampled
        tokens appended, plus elements 2 and 0 of the model output from the
        LAST forward pass. That pass ran before the final token was appended,
        so those two tensors cover one position fewer than ``input_ids``.

    Raises:
        ValueError: if ``txt_len`` is smaller than 1 (there would be no model
            output to return).
    """
    if txt_len < 1:
        raise ValueError("txt_len must be at least 1")

    input_ids = queries
    with torch.no_grad():
        for _ in range(txt_len):
            outputs = model(input_ids)
            next_token_logits = outputs[0][:, -1, :]
            next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            probs = F.softmax(next_token_logits, dim=-1)
            # multinomial already returns one column per row, ready to append.
            next_token = torch.multinomial(probs, num_samples=1)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
    return input_ids, outputs[2], outputs[0]

def generate(model, total_nums,  txt_out_len):
    """Sample at least ``total_nums`` responses from ``model``.

    Builds a pool of 300 identical prompts (the string ``'0'`` tokenized and
    truncated to ``txt_in_len`` tokens), then repeatedly draws 64 prompts and
    lets ``respond_to_batch`` extend them by ``txt_out_len`` tokens until at
    least ``total_nums`` responses exist. Whole batches are kept, so more than
    ``total_nums`` rows are returned when it is not a multiple of 64.

    Relies on the module-level ``gpt2_tokenizer``, ``device`` and
    ``txt_in_len``.

    Args:
        model: policy passed to ``respond_to_batch``.
        total_nums: minimum number of responses to produce.
        txt_out_len: number of tokens sampled per response.

    Returns:
        ``(valueList, probList, responseList_original)``, each concatenated
        over batches along dim 0: the second and third results of
        ``respond_to_batch`` (values and un-softmaxed logits from its last
        forward pass) and the generated token ids (not decoded).
    """
    bs = 64
    pool_size = 300

    wafData = pd.DataFrame()
    wafData['content'] = ['0' for _ in range(pool_size)]
    wafData['tokens'] = wafData['content'].progress_apply(
        lambda x: gpt2_tokenizer.encode(x, return_tensors="pt").to(device)[0, :txt_in_len]
    )

    # Collect per-batch results and concatenate once at the end. This avoids
    # seeding the accumulators with empty tensors pinned to 'cuda:0' and
    # re-copying everything generated so far on every batch.
    value_chunks = []
    logit_chunks = []
    response_chunks = []
    generated = 0
    while generated < total_nums:
        torch.cuda.empty_cache()
        df_batch = wafData.sample(bs)
        query_tensors = torch.stack(df_batch['tokens'].tolist())
        response_tensors, batch_values, batch_logits = respond_to_batch(
            model, query_tensors, txt_len=txt_out_len
        )
        value_chunks.append(batch_values)
        logit_chunks.append(batch_logits)
        response_chunks.append(response_tensors)
        generated += len(response_tensors)

    if not response_chunks:
        # Nothing requested: return empty tensors, as the loop-free path did.
        return (
            torch.tensor([], device=device),
            torch.tensor([], device=device),
            torch.tensor([], device=device).int(),
        )

    valueList = torch.cat(value_chunks, dim=0)
    probList = torch.cat(logit_chunks, dim=0)
    responseList_original = torch.cat(response_chunks, dim=0)
    return valueList, probList, responseList_original

from collections import defaultdict
import random
import numpy as np
import time
import csv
from tqdm import tqdm
import argparse
import torch
from tqdm import tqdm
import os
import pandas as pd
tqdm.pandas()
import datetime
import argparse
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Model, GPT2PreTrainedModel
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from sklearn.model_selection import train_test_split
import ast

# Read the data
# Initialise the GPT-2 model, tokenizer and config from the pre-trained SQL checkpoint.
# NOTE(review): `model` is not referenced again in the visible part of this
# file and `tokenizer` is rebound in a later cell; confirm they are still needed.
base_model_path = "/data/shikangwei/gptrlt/givenmodel/pretrain_model_sql"
model = GPT2Model.from_pretrained(base_model_path)
tokenizer = GPT2Tokenizer.from_pretrained(base_model_path)
config = GPT2Config.from_pretrained(base_model_path)

class GPT2RewardPredictor(nn.Module):
    """GPT-2 backbone with a linear + sigmoid head that scores token sequences."""

    def __init__(self, config):
        super().__init__()
        self.gpt2 = GPT2Model(config)
        self.reward_prediction_head = nn.Linear(config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs):
        """Score every tensor in ``inputs`` and stack the results along dim 0.

        ``inputs`` is an iterable of token-id tensors; each one is scored
        independently by ``predict_single``.
        """
        per_input_scores = []
        for token_ids in inputs:
            per_input_scores.append(self.predict_single(token_ids))
        return torch.cat(per_input_scores, dim=0)

    def predict_single(self, input_ids):
        """Return the sigmoid score computed from the last position's hidden state.

        Assumes ``input_ids`` is ``(batch_size, sequence_length)`` so the
        backbone's first output is ``(batch_size, sequence_length, hidden_size)``.
        """
        hidden_states = self.gpt2(input_ids)[0]
        last_position = hidden_states[:, -1, :]
        return self.sigmoid(self.reward_prediction_head(last_position))




# Pick the device first so the checkpoint and the classifier land on it.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Build the reward classifier and restore its trained weights.
classifier_model = GPT2RewardPredictor(config)
trained_model_path = "/data/shikangwei/gptrlt/model/gpt2sqlreward4"
# map_location lets a checkpoint that was saved from GPU tensors load on a
# CPU-only host instead of failing while deserialising.
checkpoint = torch.load(trained_model_path + "/model_checkpoint.pth", map_location=device)
classifier_model.load_state_dict(checkpoint['model_state_dict'])

# Use the selected device rather than a hard-coded 'cuda:0'.
classifier_model.to(device)
classifier_model.eval()  # evaluation mode


# In [None]:
# Alternative checkpoint: the already fine-tuned model released with the paper.
# lm_name = "/data/shikangwei/gptrlt/givenmodel/fine_tune_model_sql_modsecurity"
# Checkpoint in use: the released pre-trained model.
lm_name = "/data/shikangwei/gptrlt/givenmodel/pretrain_model_sql"

total_nums = 64    # minimum number of responses generate() produces per call
txt_in_len = 1     # prompt tokens kept per query
txt_out_len = 75   # tokens sampled per response

# In [None]:
# Inference setup: tokenizer plus a GPT2HeadWithValueModel instance kept in
# eval mode on `device`.
tokenizer = gpt2_tokenizer = GPT2Tokenizer.from_pretrained(lm_name)
gpt2_model = GPT2HeadWithValueModel.from_pretrained(lm_name)

_ = gpt2_model.eval()
_ = gpt2_model.to(device)
# Load the model that will be trained (it may be a second instance of the same
# checkpoint as the inference model).
rl_name = "/data/shikangwei/gptrlt/givenmodel/pretrain_model_sql"  

model_training = GPT2HeadWithValueModel.from_pretrained(rl_name)   # LM, trained from the start
model_training.train()  # put the model in training mode
# Check whether several GPUs are available and, if so, use DataParallel.
# if torch.cuda.device_count() > 1:
#     print(f"Let's use {torch.cuda.device_count()} GPUs!")
#     # Wrap the model in DataParallel here.
#     model_training = nn.DataParallel(model_training)
model_training.to(device)

# In [None]:
import matplotlib.pyplot as plt


def _token_log_probs(logits, tokens):
    """Return the log-probability that ``logits`` assigns to each entry of ``tokens``.

    ``logits`` is indexed ``[..., position, vocab]`` and ``tokens``
    ``[..., position]``; the result has the shape of ``tokens``.
    """
    log_probs = F.log_softmax(logits, dim=-1)
    return log_probs.gather(-1, tokens.unsqueeze(-1)).squeeze(-1)


def _discounted_returns(rewards, gamma):
    """Return R_hat with R_hat[:, t] = rewards[:, t] + gamma * R_hat[:, t + 1].

    ``rewards`` is ``[batch, steps]``; the return past the last step is 0.
    """
    returns = torch.zeros_like(rewards)
    running = torch.zeros_like(rewards[:, 0])
    for t in reversed(range(rewards.shape[1])):
        running = rewards[:, t] + gamma * running
        returns[:, t] = running
    return returns


# PPO-style fine-tuning of `model_training`, rewarded by `classifier_model`.
stopcount=0            # not used in this cell; kept in case other cells read it
stopthreshold=0.01     # not used in this cell; kept in case other cells read it
optimizer = optim.SGD(model_training.parameters(), lr=1.4e-5)
average=torch.tensor([], device=device)   # average reward of every iteration so far

gamma = 0.99           # discount factor
epsilon = 0.2          # clipping range of the probability ratio
kl_coef = 0.0          # weight of the per-token penalty against `gpt2_model`;
                       # 0 disables it (it was commented out, with weight 0.2)
num_iterations = 1001  # kk runs 0..1000 inclusive, exactly as the earlier loop did

for kk in range(num_iterations):
    valueList, probList, responseList_original = generate(model_training, total_nums, txt_out_len)

    responses = responseList_original.long()
    batch_size = responses.shape[0]
    # T + 1 scored positions with T = txt_out_len - 1: position t is scored on
    # the token found at t + 1.
    steps = txt_out_len
    next_tokens = responses[:, 1:steps + 1]

    # Forward pass of the policy being trained (the only part that needs gradients).
    response1 = model_training(responses)
    new_logits = response1[0][:, :steps]
    newvalueList = response1[2][:, :steps]

    # Everything below is a constant with respect to the policy parameters, so
    # it is computed without autograd. Previously the reward classifier was
    # part of the graph: backward() reached into it and `average` kept one
    # graph alive per iteration.
    with torch.no_grad():
        old_logp = _token_log_probs(probList[:, :steps], next_tokens)
        old_values = valueList[:, :steps]

        # Only the last step carries the classifier score; one batched call
        # replaces one call per sequence.
        rewards = torch.zeros(batch_size, steps, device=device)
        rewards[:, -1] = classifier_model([responses]).reshape(-1)

        if kl_coef > 0:
            # Optional per-token penalty for drifting from the reference
            # model, applied to steps 0..T-1. The reference forward pass is
            # skipped when the penalty is off (its output used to be computed
            # and then never read).
            ref_logp = _token_log_probs(gpt2_model(responses)[0][:, :steps], next_tokens)
            rewards[:, :-1] = -kl_coef * (old_logp - ref_logp)[:, :-1]

        returns = _discounted_returns(rewards, gamma)
        # Discounted sum of future rewards minus the value predicted at
        # sampling time.
        advantages = returns - old_values

    # Probability ratio new/old, formed in log space so that two underflowing
    # probabilities cannot produce 0/0.
    new_logp = _token_log_probs(new_logits, next_tokens)
    ratio = torch.exp(new_logp - old_logp)
    clipped_ratio = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)

    # Both losses are summed over tokens and averaged over sequences.
    policy_loss = -torch.min(ratio * advantages, clipped_ratio * advantages).sum() / batch_size
    value_loss = ((newvalueList - returns) ** 2).sum() / batch_size
    total_loss = policy_loss + value_loss
    print(total_loss)

    # Gradient step.
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    # Average final-step return (the classifier score) of this iteration.
    averagereward = returns[:, -1].mean().reshape(1)
    average = torch.cat((average, averagereward), 0)
    average1 = average.detach().cpu().numpy()

    plt.figure(figsize=(10, 5))
    plt.plot(average1, marker='o', linestyle='-', color='b')
    plt.title('Average Reward per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Average Reward')
    plt.grid(True)
    plt.show()
    plt.close()  # do not let one open figure accumulate per iteration

# Timing notes, recorded with the earlier per-token Python loops:
# 2024-03-20 12:05, 2 GPUs, 1000 iterations: finished 03-21 19:00, about 31 hours.
# 2024-03-22 11:03, 2 GPUs (NVLink), 1000 iterations: seen finished 03-23 15:49, under 29 hours.
# 2024-04-04 11:28, 1 GPU, 1000 iterations.