In [None]:
'''
对抗训练
1. adversarial training
2. adversarial fine-tuning
'''
import pandas as pd
import numpy as np
import json
from datasets import Dataset
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm
from sklearn.metrics import f1_score
import os
from torch.functional import F
from transformers import RobertaTokenizer,RobertaForSequenceClassification
from transformers import AdamW,get_linear_schedule_with_warmup
from datasets import load_dataset

In [None]:
def adversari_training(model,tokenizer,train_data_path,valid_data_path,train_batch_size,eval_batch_size,num_epochs,lr,early_stopping,outputdir,trainlogdir,adv_training_data_path,training_or_finetuning = "training"):
    '''
    Train ``model`` with adversarial examples, per-epoch validation and early stopping.

    Modes (selected by ``training_or_finetuning``):

    * ``"training"``    -- adversarial training: every row of the adversarial CSV
      is appended to the training set before training starts.
    * ``"fine-tuning"`` -- adversarial fine-tuning: the model is trained on the
      regular training set, and each time the validation accuracy reaches a new
      best, one extra pass over the adversarial set is run before the model is
      saved.
    * any other value   -- plain training; the adversarial CSV is not loaded.

    Args:
        model: classifier called as ``model(**batch)``; its output must expose
            ``.loss`` and ``.logits``. It is moved to CUDA when available.
        tokenizer: tokenizer used to encode the ``text`` column and to pad batches.
        train_data_path: CSV file with ``text`` and ``label`` columns.
        valid_data_path: CSV file with ``text`` and ``label`` columns.
        train_batch_size: batch size for the training and adversarial loaders.
        eval_batch_size: batch size for the validation loader.
        num_epochs: maximum number of epochs.
        lr: learning rate passed to AdamW (peak of the warmup/linear-decay schedule).
        early_stopping: number of epochs with validation accuracy below the best
            one (counted since the last improvement) after which training stops.
        outputdir: directory receiving ``model.save_pretrained`` (on every new
            best accuracy) and ``tokenizer.save_pretrained`` (at the end).
        trainlogdir: path of the CSV training log (one row per training batch;
            validation loss, accuracy and macro F1 are filled in on the last row
            of each epoch).
        adv_training_data_path: CSV file of adversarial examples with ``text``
            and ``label`` columns.
        training_or_finetuning: mode selector, see above.

    Returns:
        None. Besides ``outputdir`` and ``trainlogdir``, model / optimizer /
        scheduler state dicts are written to ``./checkpoint``.
    '''
    # The helpers must exist before the "fine-tuning" branch below uses them;
    # defining them after that branch raised UnboundLocalError in that mode.
    def tokenize_function(examples):
        return tokenizer(examples["text"],truncation = True,padding=True)

    def collate_fn(examples):
        return tokenizer.pad(examples, padding="max_length", return_tensors="pt")

    traindataset = load_dataset("csv",data_files=train_data_path)["train"]
    validdataset = load_dataset("csv",data_files=valid_data_path)["train"]

    adv_dataloader = None
    if training_or_finetuning == "training":
        # Adversarial training: merge the adversarial rows into the training set.
        advdataset = load_dataset("csv",data_files=adv_training_data_path)["train"]
        for adv_example in advdataset:
            traindataset = traindataset.add_item(adv_example)
    elif training_or_finetuning == "fine-tuning":
        # Adversarial fine-tuning: keep the adversarial rows in their own loader.
        advdataset = load_dataset("csv",data_files=adv_training_data_path)["train"]
        advdataset = advdataset.map(tokenize_function,batched=True,remove_columns=["text"]).rename_column("label","labels")
        adv_dataloader = DataLoader(advdataset , collate_fn=collate_fn,batch_size = train_batch_size)

    train_tokenized_dataset = traindataset.map(tokenize_function,batched=True,remove_columns=["text"]).rename_column("label","labels")
    valid_tokenized_dataset = validdataset.map(tokenize_function,batched=True,remove_columns=["text"]).rename_column("label","labels")
    train_dataloader = DataLoader(train_tokenized_dataset,shuffle=True,collate_fn=collate_fn,batch_size = train_batch_size)
    eval_dataloader = DataLoader(valid_tokenized_dataset , collate_fn=collate_fn,batch_size = eval_batch_size)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.resize_token_embeddings(len(tokenizer))
    trainlogdf = pd.DataFrame(columns=["step","trainloss","validloss","acc","f1-score"])
    rowindex = 0
    eval_no_progress_count = 0
    max_eval_acc = 0
    optimizer = AdamW(params=model.parameters(), lr=lr)
    iter_to_accumlate = 4  # number of batches whose gradients are summed per optimizer update
    num_train_batches = len(train_dataloader)
    # Metrics are normalised by the real validation-set size instead of the
    # previously hard-coded 132 (presumably the size of the original split).
    # max(..., 1) only protects against an empty validation file.
    num_valid_examples = max(len(valid_tokenized_dataset), 1)

    # NOTE(review): the schedule is sized in *batches*, but the scheduler is
    # stepped once per optimizer update (every `iter_to_accumlate` batches, plus
    # once per adversarial batch in fine-tuning mode), so the decay is normally
    # not traversed to the end. Left unchanged because resizing it would alter
    # the learning-rate curve -- confirm the intended schedule.
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0.06 * (num_train_batches * num_epochs),
        num_training_steps=(num_train_batches * num_epochs),
    )

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        allloss = 0
        for step,batch in enumerate(tqdm(train_dataloader)):
            batch = batch.to(device)
            outputs = model(**batch)
            # Scale so the accumulated gradient is the mean over the window.
            loss = outputs.loss/iter_to_accumlate
            loss.backward()
            step_loss = loss.item()
            allloss += step_loss
            trainlogdf.loc[rowindex] = [rowindex,step_loss,None,None,None]
            rowindex += 1
            # Also update on the last batch of the epoch: otherwise gradients of
            # a partial window would leak into the next epoch or into the
            # adversarial fine-tuning pass.
            is_last_batch = (step+1) == num_train_batches
            if (step+1)%iter_to_accumlate==0 or is_last_batch:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
            if (step+1)%(10*iter_to_accumlate) == 0:
                print("epoch",epoch,"step",step,"loss",loss,sep=" ")
        print("epoch",epoch,"step",step,"trainLoss:",allloss/(len(train_dataloader)*train_batch_size))

        # ---- validation pass ----
        count = 0
        model.eval()
        validloss = 0
        preds = []
        labels = []
        for evalstep,batch in enumerate(tqdm(eval_dataloader)):
            batch_labels = batch['labels'].cpu()
            labels.extend(batch_labels.tolist())
            batch = batch.to(device)
            with torch.no_grad():
                output = model(**batch)
            validloss += output.loss.item()
            pred = torch.argmax(F.softmax(output.logits.cpu(),dim=1),dim=1)
            preds.extend(pred.tolist())
            count += int((batch_labels == pred).sum())
        model.train()
        eval_acc = count/num_valid_examples
        # Validation metrics are stored on the row of the epoch's last training step.
        trainlogdf.loc[rowindex-1,"validloss"] = validloss/num_valid_examples
        trainlogdf.loc[rowindex-1,"acc"] = eval_acc
        trainlogdf.loc[rowindex-1,"f1-score"] = f1_score(np.array(labels),np.array(preds),average="macro")
        print("epoch ",epoch,"step",step,"acc ",eval_acc)

        # ---- early stopping / checkpointing ----
        if eval_acc < max_eval_acc:
            eval_no_progress_count += 1
            if eval_no_progress_count >=early_stopping:
                print("Early Stopping:Epoch",epoch," Step",step,"Eval_acc",eval_acc,sep=" ")
                break
            else:
                print("Early Stopping record count",eval_no_progress_count,"Max eval acc",max_eval_acc,sep=" ")
        if eval_acc > max_eval_acc:
            if training_or_finetuning == "fine-tuning":
                # New best accuracy: run one pass over the adversarial examples
                # (one optimizer update per batch) before saving.
                model.train()
                print("adver fine-tuning")
                allloss = 0
                for batch in tqdm(adv_dataloader):
                    batch = batch.to(device)
                    output = model(**batch)
                    loss = output.loss
                    loss.backward()
                    allloss += loss.item()
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()
                print("adver fine-tuning loss",allloss/(len(adv_dataloader)*train_batch_size))

            max_eval_acc = eval_acc
            print("Update Max eval acc",max_eval_acc)
            eval_no_progress_count = 0
            model.save_pretrained(outputdir)
            # torch.save does not create missing parent directories.
            os.makedirs("checkpoint", exist_ok=True)
            torch.save(model.state_dict(),os.path.join("checkpoint","model.bin"))
            torch.save(optimizer.state_dict(),os.path.join("checkpoint","optimizer.bin"))
            torch.save(lr_scheduler.state_dict(),os.path.join("checkpoint","lr_scheduler.bin"))

    trainlogdf.to_csv(trainlogdir)
    tokenizer.save_pretrained(outputdir)



In [None]:
# Adversarial-training run: start from the pretrained CodeBERT checkpoint with a
# 66-class classification head and train in "training" mode.
model_path = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels=66)
adversari_training(
    model=model,
    tokenizer=tokenizer,
    train_data_path="../dataset/data_folder/processed_gcjpy/train.csv",
    valid_data_path="../dataset/data_folder/processed_gcjpy/valid.csv",
    train_batch_size=2,
    eval_batch_size=2,
    num_epochs=30,
    lr=5e-5,
    early_stopping=5,
    outputdir="CODEBERT-ADV-TRAINING",
    trainlogdir="adv-training.log",
    adv_training_data_path="../dataset/data_folder/processed_gcjpy/adv_training.csv",
    training_or_finetuning="training",
)

In [None]:
# Adversarial fine-tuning run: reload a fresh pretrained CodeBERT checkpoint with
# a 66-class classification head and train in "fine-tuning" mode.
model_path = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels=66)
adversari_training(
    model=model,
    tokenizer=tokenizer,
    train_data_path="../dataset/data_folder/processed_gcjpy/train.csv",
    valid_data_path="../dataset/data_folder/processed_gcjpy/valid.csv",
    train_batch_size=2,
    eval_batch_size=2,
    num_epochs=30,
    lr=5e-5,
    early_stopping=5,
    outputdir="CODEBERT-ADV-FINE-TUNING",
    trainlogdir="adv-fine-tung.log",
    adv_training_data_path="../dataset/data_folder/processed_gcjpy/adv_training.csv",
    training_or_finetuning="fine-tuning",
)