In [1]:
# (0) Import dependency packages
import os
import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt
import copy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset, load_metric
from transformers import RobertaTokenizer, RobertaModel

# Export TOKENIZERS_PARALLELISM=false to this process's environment.
# NOTE(review): presumably this keeps the Hugging Face tokenizers library from
# running its own parallelism alongside the DataLoader worker processes created
# later in the notebook (num_workers=5) -- confirm against the tokenizers docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# (1) Define the model, helper functions and dataset class
class RobertaC2(nn.Module):
    """RoBERTa encoder with a single linear classification layer on top.

    Args:
        hidden_size: Input width of the linear layer; must equal the width of
            the encoder's hidden states.
        num_classes: Number of output logits.
        gpt_model_name: Name or path handed to ``RobertaModel.from_pretrained``.
    """

    def __init__(self, hidden_size: int, num_classes: int, gpt_model_name: str):
        super().__init__()
        self.roberta_model = RobertaModel.from_pretrained(gpt_model_name)
        # The head stays wrapped in nn.Sequential so its parameters keep the
        # "fc1.0.*" names in the state dict.
        classifier = nn.Linear(hidden_size, num_classes)
        self.fc1 = nn.Sequential(classifier)

    def forward(self, input_id, mask):
        """Return classification logits for a batch.

        Args:
            input_id: Encoded input ids of the sentences.
            mask: Attention mask forwarded to the encoder unchanged.

        Returns:
            Output of the linear head applied to the hidden state at sequence
            position 0 of every example.
        """
        encoder_output = self.roberta_model(input_ids=input_id, attention_mask=mask)
        # Keep only the hidden state of the first token of each sequence.
        first_token_state = encoder_output.last_hidden_state[:, 0, :]
        return self.fc1(first_token_state)
def set_seed(seed):
    """Seed every random number generator used here so runs are repeatable.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode with
    benchmarking turned off.
    """
    # Python-level sources of randomness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN flags: deterministic mode on, benchmarking off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
def evaluate_loss(net, device, criterion, val_dataloader):
    """Return the mean per-batch loss of ``net`` over ``val_dataloader``.

    The model is switched to eval mode (and left in it) and gradients are
    disabled while iterating.

    Args:
        net: Model called as ``net(input_id, mask)``.
        device: Device the batch tensors are moved to.
        criterion: Loss called as ``criterion(logits, labels)``.
        val_dataloader: Iterable of ``(inputs, labels)`` pairs where ``inputs``
            provides ``'input_ids'`` and ``'attention_mask'`` tensors.

    Returns:
        The unweighted average of the batch losses as a Python float. Every
        batch counts equally regardless of its size.

    Raises:
        ValueError: If ``val_dataloader`` yields no batches.
    """
    net.eval()
    batch_losses = []
    with torch.no_grad():
        for val_input, val_label in val_dataloader:
            val_label = val_label.to(device)
            mask = val_input['attention_mask'].to(device)
            # Drop the size-1 second dimension of the stacked input ids.
            input_id = val_input["input_ids"].squeeze(1).to(device)
            logits = net(input_id, mask)
            batch_losses.append(criterion(logits, val_label).item())

    if not batch_losses:
        # Previously this case ended in a bare ZeroDivisionError.
        raise ValueError("val_dataloader produced no batches; cannot compute a validation loss")

    return sum(batch_losses) / len(batch_losses)
def train_roberta(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):
    """Train ``net`` with mixed precision and gradient accumulation, then save the best weights.

    After every epoch the validation loss is computed with ``evaluate_loss``;
    a deep copy of the model is kept whenever that loss improves, and the
    state dict of the best copy is written under ``./models/`` at the end.

    Relies on the module-level names ``device`` (where batches are moved) and
    ``model_name`` (embedded in the output file name).

    Args:
        net: Model called as ``net(input_id, mask)``.
        criterion: Loss called as ``criterion(logits, labels)``.
        opti: Optimizer stepped through the gradient scaler.
        lr: Learning rate; only used to build the output file name.
        lr_scheduler: Scheduler stepped once per optimizer step.
        train_loader: Iterable of ``(inputs, labels)`` training batches; must support ``len``.
        val_loader: Validation batches passed to ``evaluate_loss``.
        epochs: Number of passes over ``train_loader``.
        iters_to_accumulate: Number of batches whose gradients are summed
            before each optimizer step.
    """
    # float("inf") rather than np.Inf: that alias was removed in NumPy 2.0.
    best_loss = float("inf")
    best_ep = 1
    net_copy = None  # best model so far; stays None if the validation loss never improves
    loss = None  # bound up front so the final `del loss` is safe with an empty loader
    nb_iterations = len(train_loader)
    # Print the training loss about 5 times per epoch. Clamp to 1 so loaders
    # with fewer than 5 batches do not trigger a modulo-by-zero below.
    print_every = max(1, nb_iterations // 5)

    scaler = GradScaler()
    # Start from clean gradients in case an earlier (interrupted) run left some behind.
    opti.zero_grad()

    for ep in range(epochs):

        net.train()
        running_loss = 0.0
        for it, (train_input, train_label) in enumerate(tqdm(train_loader)):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input["input_ids"].squeeze(1).to(device)
            with autocast():
                # Obtaining the logits from the model
                logits = net(input_id, mask)

                # Computing loss
                loss = criterion(logits, train_label)
                loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged

            # Scales the loss and calls backward() on it to create scaled gradients.
            scaler.scale(loss).backward()

            if (it + 1) % iters_to_accumulate == 0:
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # Adjust the learning rate based on the number of iterations.
                lr_scheduler.step()
                # Clear gradients
                opti.zero_grad()

            # NOTE: `loss` was already divided by iters_to_accumulate, so the value
            # printed below is the raw batch loss scaled down by that factor.
            running_loss += loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))

                running_loss = 0.0

        val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_ep = ep + 1

    if net_copy is None:
        # No epoch ran, or every validation loss was NaN/inf: there is no best model.
        print("Validation loss never improved; no model was saved.")
    else:
        # Saving the model
        path_to_model = './models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(model_name, lr, round(best_loss, 5), best_ep)
        # model_name may itself contain directories, so create the full parent path first.
        os.makedirs(os.path.dirname(path_to_model), exist_ok=True)
        torch.save(net_copy.state_dict(), path_to_model)
        print("The model has been saved in {}".format(path_to_model))

    # Drop the reference to the last loss tensor before releasing cached GPU memory.
    del loss
    torch.cuda.empty_cache()
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset of tokenized (hs, cn) text pairs with their labels.

    Args:
        df: DataFrame with ``'hs'``, ``'cn'`` and ``'label'`` columns.
        tokenizer: Callable tokenizer exposing ``cls_token`` and ``sep_token``.
        max_length: Length every example is padded/truncated to (default 256,
            the value that was previously hard-coded).

    Rows are read positionally, so texts and labels stay aligned whatever the
    DataFrame's index looks like, and no ``'index'`` column is required.
    """

    def __init__(self, df, tokenizer, max_length=256):
        self.labels = list(df['label'])
        # Each text is "<cls> hs <sep> cn <sep>" built from the tokenizer's own
        # special-token strings.
        # NOTE(review): the tokenizer is called with its default settings, so it
        # may add its own special tokens around this string as well -- left
        # unchanged so the input format matches previously trained checkpoints.
        self.texts = [
            tokenizer(
                tokenizer.cls_token + hs + tokenizer.sep_token + cn + tokenizer.sep_token,
                padding='max_length',
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )
            for hs, cn in zip(df['hs'], df['cn'])
        ]

    def classes(self):
        """Return the list of labels, one per example."""
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` as a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [13]:
# (2) Parameter settings

# --- Paths ---
# CSV file holding the data that is later split into train / validation / test.
data_files = './path/hs_cn_dataset.csv'
# Directory prefix under which the fine-tuned model is saved.
save_path = './saved_models/'
# Location of the pretrained roberta-large files.
model_name = './path/roberta-large'

# --- Model ---
# Width of the classification layer's input.
hidden_size = 1024
# Intended switch for freezing the encoder and training only the classification
# layer; no cell visible here reads it.
freeze = False
# Intended maximum tokenized length of a sentence pair (longer inputs truncated,
# shorter ones padded); no cell visible here reads it.
maxlen = 256

# --- Optimisation ---
batch_size = 16
# Gradients are accumulated over this many batches, i.e. an effective batch of
# batch_size * iters_to_accumulate. Set to 1 for the usual batch size.
iters_to_accumulate = 2
# Learning rate.
lr = 2e-5
# Number of training epochs.
epochs = 4

# --- Reproducibility and tokenizer ---
set_seed(42)
tokenizer = RobertaTokenizer.from_pretrained(model_name)

In [16]:
# (3) Preprocess the dataset
datasets = load_dataset(
    'csv',
    data_files=data_files,
    cache_dir="./data/",
    delimiter=",",
)

# First split: 90 % for training, 10 % held out.
split = datasets['train'].train_test_split(test_size=0.1, seed=2, shuffle=True)
train_data = split['train']
val_test = split['test']

# Second split: the held-out 10 % is halved into validation and test sets.
split = val_test.train_test_split(test_size=0.5, seed=1)
valid_data = split['train']
test_data = split['test']

# Convert each split into a pandas DataFrame.
df_train = pd.DataFrame(train_data)
df_val = pd.DataFrame(valid_data)
df_test = pd.DataFrame(test_data)

# Tokenize the training and validation splits.
train = Dataset(df_train, tokenizer)
val = Dataset(df_val, tokenizer)

# Wrap them in data loaders; only the training loader shuffles.
print("Reading training data...")
train_loader = DataLoader(train, batch_size, shuffle=True, num_workers=5)
print("Reading validation data...")
val_loader = DataLoader(val, batch_size, num_workers=5)



In [15]:
# (4) Build the model, loss, optimizer and learning-rate schedule
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Use the first GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

net = RobertaC2(
    hidden_size=hidden_size,
    num_classes=2,
    gpt_model_name=model_name,
)
net.to(device)

criterion = nn.CrossEntropyLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)

# No warmup phase.
num_warmup_steps = 0
# Number of training batches over all epochs.
num_training_steps = epochs * len(train_loader)
# Number of optimizer steps over all epochs: one step every
# `iters_to_accumulate` batches. This is the value the scheduler receives.
t_total = (len(train_loader) // iters_to_accumulate) * epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=opti,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=t_total,
)

In [10]:
# (5) Train the model and save the weights of the best epoch.
# float("inf") rather than np.Inf: that alias was removed in NumPy 2.0.
best_loss = float("inf")
best_ep = 1
net_copy = None  # best model so far; stays None if the validation loss never improves
loss = None  # bound up front so the final `del loss` is safe with an empty loader
nb_iterations = len(train_loader)
# Print the training loss about 5 times per epoch. Clamp to 1 so loaders with
# fewer than 5 batches do not trigger a modulo-by-zero below.
print_every = max(1, nb_iterations // 5)
iters = []
train_losses = []
val_losses = []

scaler = GradScaler()
# Start from clean gradients in case an earlier (interrupted) run of this cell left some behind.
opti.zero_grad()

for ep in range(epochs):

    net.train()
    running_loss = 0.0
    for it, (train_input, train_label) in enumerate(tqdm(train_loader)):
        train_label = train_label.to(device)
        mask = train_input['attention_mask'].to(device)
        input_id = train_input["input_ids"].squeeze(1).to(device)
        with autocast():
            logits = net(input_id, mask)
            loss = criterion(logits, train_label)
            loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged

        # Scale the loss and backpropagate to create scaled gradients.
        scaler.scale(loss).backward()

        # Take an optimizer step once every `iters_to_accumulate` batches.
        if (it + 1) % iters_to_accumulate == 0:
            scaler.step(opti)
            scaler.update()
            lr_scheduler.step()
            opti.zero_grad()

        # NOTE: `loss` was already divided by iters_to_accumulate, so the value
        # printed below is the raw batch loss scaled down by that factor.
        running_loss += loss.item()

        if (it + 1) % print_every == 0:  # Print training loss information
            print()
            print("Iteration {}/{} of epoch {} complete. Loss : {} "
                  .format(it+1, nb_iterations, ep+1, running_loss / print_every))

            running_loss = 0.0

    val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
    print()
    print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))

    if val_loss < best_loss:
        print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
        print()
        net_copy = copy.deepcopy(net)  # save a copy of the model
        best_loss = val_loss
        best_ep = ep + 1

if net_copy is None:
    # No epoch ran, or every validation loss was NaN/inf: there is no best model.
    print("Validation loss never improved; no model was saved.")
else:
    # Saving the model
    path_to_model = save_path + '{}_lr_{}_val_loss_{}_ep_{}.pt'.format(model_name, lr, round(best_loss, 5), best_ep)
    # model_name may itself contain directories, so create the full parent path first.
    save_dir = os.path.dirname(path_to_model)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))

# Drop the reference to the last loss tensor before releasing cached GPU memory.
del loss
torch.cuda.empty_cache()

In [None]:
# (6) Evaluate the fine-tuned model on the test split
def cn_detect_model(model_path):
    """Load fine-tuned ``RobertaC2`` weights for inference.

    Relies on the module-level names ``model_name`` (pretrained RoBERTa
    location) and ``device``.

    Args:
        model_path: Path of a state dict saved with ``torch.save``.

    Returns:
        ``(net, tokenizer)``: the model on ``device``, in eval mode with all
        parameters frozen, and the matching tokenizer.
    """
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    net = RobertaC2(hidden_size=1024, num_classes=2,
                    gpt_model_name=model_name)  # RobertaSequenceClassifier
    print("Loading the  weights of the model...")
    # The stray leading comma in the original torch.load(...) call was a syntax error.
    # NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
    state_dict = torch.load(model_path, map_location=device)
    net.load_state_dict(state_dict)
    net.to(device)
    net.eval()
    # Inference only: no parameter needs gradients.
    for param in net.parameters():
        param.requires_grad = False
    return net, tokenizer
def evaluate(model, test_data):
    """Run ``model`` over ``test_data`` and report its accuracy.

    Relies on the module-level ``tokenizer`` to build the test ``Dataset``.
    The model is moved to the GPU when CUDA is available and is switched to
    eval mode (and left in it).

    Args:
        model: Model called as ``model(input_id, mask)`` returning one row of
            logits per example.
        test_data: DataFrame accepted by ``Dataset``.

    Returns:
        ``(true_labels, predictions_labels)``: two flat Python lists in
        dataset order.
    """
    test = Dataset(test_data, tokenizer)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=8)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    # Make sure dropout is disabled even if the caller passed a model that is
    # still in training mode.
    model.eval()

    # Tracking variables
    predictions_labels = []
    true_labels = []

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in tqdm(test_dataloader):

            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            # Predicted class = index of the largest logit; computed once and reused.
            predicted = output.argmax(dim=1)

            total_acc_test += (predicted == test_label).sum().item()

            # add original labels
            true_labels += test_label.cpu().numpy().flatten().tolist()
            # get predictions to list
            predictions_labels += predicted.cpu().numpy().flatten().tolist()

    # An empty test set yields NaN instead of a ZeroDivisionError.
    n_examples = len(test_data)
    accuracy = total_acc_test / n_examples if n_examples else float("nan")
    print(f'Test Accuracy: {accuracy: .3f}')
    return true_labels, predictions_labels
# Path of the fine-tuned checkpoint. The original literal wrapped the path in an
# extra pair of double quotes, which made them part of the file name.
finetuned_model_path = "./saved_models/CnDetection-ReZG.pt"
# Load the fine-tuned model, then score it on the held-out test split.
net, tokenizer = cn_detect_model(finetuned_model_path)
true_labels, pred_labels = evaluate(net, df_test)