In [1]:
'''
fine-tune CodeBERT
Author: Liu Jin Cheng
'''
from transformers import RobertaForSequenceClassification,RobertaTokenizer,get_linear_schedule_with_warmup
from datasets import load_dataset,Dataset
import os
import random
import numpy as np
import evaluate
import torch
import argparse
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.functional import F
from torch.cuda.amp import autocast as autocast,GradScaler
import pandas as pd
from sklearn.metrics import f1_score
from torch.optim import AdamW
import pandas as pd


In [2]:
data_path = os.path.join("..","dataset")
train_batch_size = 8
eval_batch_size = 16
lr = 5e-5
num_epochs = 5
model_name = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name,num_labels = 66)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be 

In [3]:

def compute_metrics(eval_pred):
        metirc = evaluate.load("accuracy")
        logits , labels = eval_pred
        predictions = np.argmax(logits,axis=-1)
        return metirc.compute(predictions=predictions,references=labels)


def tokenize_function(examples):
        return tokenizer(examples["text"],truncation = True,padding=True) 

def collate_fn(examples):
    return tokenizer.pad(examples, padding="max_length", return_tensors="pt")

In [4]:
dftrain = pd.read_pickle(os.path.join(data_path,"train.pkl"))
dfvalid = pd.read_pickle(os.path.join(data_path,"valid.pkl"))
traindatasets = Dataset.from_pandas(dftrain)
validdatasets = Dataset.from_pandas(dfvalid)
train_tokenized_dataset = traindatasets.map(tokenize_function,batched=True,remove_columns=["text","__index_level_0__"]).rename_column("label","labels")
valid_tokenized_dataset = validdatasets.map(tokenize_function,batched=True,remove_columns=["text","__index_level_0__"]).rename_column("label","labels")
train_dataloader = DataLoader(train_tokenized_dataset,shuffle=True,collate_fn=collate_fn,batch_size = train_batch_size)
eval_dataloader = DataLoader(valid_tokenized_dataset , collate_fn=collate_fn,batch_size = eval_batch_size)
model.resize_token_embeddings(len(tokenizer))
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

Map:   0%|          | 0/90102 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [5]:
optimizer = AdamW(params=model.parameters(), lr=lr)

# Instantiate scheduler

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
    num_training_steps=(len(train_dataloader) * num_epochs),
)


In [20]:
model.to(device)
max_eval_acc = 0
iter_to_accumlate = 4
epochloss = []
trainlogdf = pd.DataFrame(columns=["step","trainloss","validloss","acc","f1-score"])
rowindex = 11264
eval_no_progress_count = 1
early_stopping = 6
early_stopping_flag = False
trainlogdf = pd.read_csv("trainlog.csv")
del trainlogdf["Unnamed: 0"]

In [22]:
for epoch in range(num_epochs):
    model.train()
    allloss = 0
    for step,batch in enumerate(tqdm(train_dataloader)):
        batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss/iter_to_accumlate
        loss.backward()
        allloss += loss.item()
        trainlogdf.loc[rowindex] = [rowindex,loss.item(),None,None,None]
        rowindex += 1
        epochloss.append(loss.item())
        if (step+1)%iter_to_accumlate==0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
        if (step+1)%(50*iter_to_accumlate) == 0:
            print("epoch",epoch,"step",step,"loss",loss,sep=" ")
        
        if (step+1)%800 == 0:
            count = 0
            model.eval()
            validloss = 0
            preds = []
            labels = []
            for evalstep,batch in enumerate(tqdm(eval_dataloader)):
                labels += batch['labels'].cpu()
                batch.to(device)
                with torch.no_grad():
                    output = model(**batch)
                validloss += output.loss.item()
                pred = torch.argmax(F.softmax(output.logits.cpu(),dim=1),dim=1)
                preds += pred
                count += int(sum(batch['labels'].cpu() == pred))
            model.train()
            eval_acc = count/4000
            trainlogdf.loc[rowindex-1,"validloss"] = validloss/4000
            trainlogdf.loc[rowindex-1,"acc"] = eval_acc
            trainlogdf.loc[rowindex-1,"f1-score"] = f1_score(np.array(labels),np.array(preds),average="macro")
            print("epoch ",epoch,"step",step,"acc ",eval_acc)
            if eval_acc < max_eval_acc:
                eval_no_progress_count += 1
                if eval_no_progress_count >=early_stopping:
                    print("Early Stopping:Epoch",epoch," Step",step,"Eval_acc",eval_acc,sep=" ")
                    early_stopping_flag = True
                    break
                else:
                    print("Early Stopping record count",eval_no_progress_count,"/",early_stopping,"Will stop","Max eval acc",max_eval_acc,sep=" ")
            if eval_acc > max_eval_acc:
                max_eval_acc = eval_acc
                print("Update Max eval acc",max_eval_acc)
                eval_no_progress_count = 0
                model.save_pretrained("CodeBERTsaved_models")
                torch.save(model.state_dict(),os.path.join("checkpoint","model.bin"))
                torch.save(optimizer.state_dict(),os.path.join("checkpoint","optimizer.bin"))
                torch.save(lr_scheduler.state_dict(),os.path.join("checkpoint","lr_scheduler.bin"))
    print("epoch",epoch,"step",step,"trainLoss:",allloss/(len(train_dataloader)*train_batch_size))
    if early_stopping_flag:
        break
trainlogdf.to_csv("trainlog.csv")
tokenizer.save_pretrained("CodeBERTsaved_models")

  2%|▏         | 201/11263 [00:37<34:51,  5.29it/s]

epoch 0 step 199 loss tensor(0.1004, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▎         | 401/11263 [01:15<34:06,  5.31it/s]

epoch 0 step 399 loss tensor(0.0927, device='cuda:0', grad_fn=<DivBackward0>)


  5%|▌         | 601/11263 [01:53<33:47,  5.26it/s]

epoch 0 step 599 loss tensor(0.0404, device='cuda:0', grad_fn=<DivBackward0>)


  7%|▋         | 799/11263 [02:30<32:24,  5.38it/s]

epoch 0 step 799 loss tensor(0.1115, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 250/250 [00:32<00:00,  7.70it/s]


epoch  0 step 799 acc  0.84275
Update Max eval acc 0.84275


  9%|▉         | 1001/11263 [03:44<32:49,  5.21it/s]  

epoch 0 step 999 loss tensor(0.0719, device='cuda:0', grad_fn=<DivBackward0>)


 11%|█         | 1201/11263 [04:22<32:13,  5.20it/s]

epoch 0 step 1199 loss tensor(0.0650, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 1401/11263 [05:00<31:22,  5.24it/s]

epoch 0 step 1399 loss tensor(0.0320, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 1599/11263 [05:38<30:33,  5.27it/s]

epoch 0 step 1599 loss tensor(0.0767, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 250/250 [00:32<00:00,  7.69it/s]
 14%|█▍        | 1601/11263 [06:11<18:51:56,  7.03s/it]

epoch  0 step 1599 acc  0.84225
Early Stopping record count 1 / 6 Will stop Max eval acc 0.84275


 16%|█▌        | 1801/11263 [06:49<29:47,  5.29it/s]   

epoch 0 step 1799 loss tensor(0.0253, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 2001/11263 [07:27<29:41,  5.20it/s]

epoch 0 step 1999 loss tensor(0.0649, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 2200/11263 [08:07<53:37,  2.82it/s]

epoch 0 step 2199 loss tensor(0.0578, device='cuda:0', grad_fn=<DivBackward0>)


 21%|██▏       | 2399/11263 [09:14<53:14,  2.78it/s]

epoch 0 step 2399 loss tensor(0.0592, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 250/250 [00:57<00:00,  4.31it/s]
 21%|██▏       | 2400/11263 [10:13<43:47:27, 17.79s/it]

epoch  0 step 2399 acc  0.842
Early Stopping record count 2 / 6 Will stop Max eval acc 0.84275


 23%|██▎       | 2600/11263 [11:23<48:28,  2.98it/s]   

epoch 0 step 2599 loss tensor(0.0440, device='cuda:0', grad_fn=<DivBackward0>)


 25%|██▍       | 2800/11263 [12:26<53:13,  2.65it/s]

epoch 0 step 2799 loss tensor(0.1995, device='cuda:0', grad_fn=<DivBackward0>)


 27%|██▋       | 3000/11263 [13:33<51:21,  2.68it/s]

epoch 0 step 2999 loss tensor(0.0598, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 3199/11263 [14:41<50:19,  2.67it/s]

epoch 0 step 3199 loss tensor(0.0873, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 250/250 [00:57<00:00,  4.35it/s]
 28%|██▊       | 3200/11263 [15:39<39:30:29, 17.64s/it]

epoch  0 step 3199 acc  0.8355
Early Stopping record count 3 / 6 Will stop Max eval acc 0.84275


 30%|███       | 3400/11263 [16:47<46:38,  2.81it/s]   

epoch 0 step 3399 loss tensor(0.0119, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 3600/11263 [17:55<45:15,  2.82it/s]

epoch 0 step 3599 loss tensor(0.0276, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 3801/11263 [18:37<23:36,  5.27it/s]

epoch 0 step 3799 loss tensor(0.1235, device='cuda:0', grad_fn=<DivBackward0>)


 36%|███▌      | 3999/11263 [19:15<23:08,  5.23it/s]

epoch 0 step 3999 loss tensor(0.0599, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 250/250 [00:36<00:00,  6.87it/s]
 36%|███▌      | 4001/11263 [19:52<15:49:47,  7.85s/it]

epoch  0 step 3999 acc  0.84125
Early Stopping record count 4 / 6 Will stop Max eval acc 0.84275


 37%|███▋      | 4201/11263 [20:30<22:31,  5.23it/s]   

epoch 0 step 4199 loss tensor(0.0175, device='cuda:0', grad_fn=<DivBackward0>)


 39%|███▉      | 4401/11263 [21:08<22:00,  5.20it/s]

epoch 0 step 4399 loss tensor(0.0686, device='cuda:0', grad_fn=<DivBackward0>)


 41%|████      | 4601/11263 [21:50<21:09,  5.25it/s]

epoch 0 step 4599 loss tensor(0.0470, device='cuda:0', grad_fn=<DivBackward0>)


 43%|████▎     | 4799/11263 [22:28<20:30,  5.25it/s]

epoch 0 step 4799 loss tensor(0.0439, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 250/250 [00:32<00:00,  7.62it/s]
 43%|████▎     | 4801/11263 [23:02<12:44:50,  7.10s/it]

epoch  0 step 4799 acc  0.84275


 44%|████▍     | 5001/11263 [23:40<20:05,  5.19it/s]   

epoch 0 step 4999 loss tensor(0.0288, device='cuda:0', grad_fn=<DivBackward0>)


 46%|████▌     | 5200/11263 [24:29<31:57,  3.16it/s]

epoch 0 step 5199 loss tensor(0.0628, device='cuda:0', grad_fn=<DivBackward0>)


 48%|████▊     | 5400/11263 [25:33<31:24,  3.11it/s]

epoch 0 step 5399 loss tensor(0.0103, device='cuda:0', grad_fn=<DivBackward0>)


 50%|████▉     | 5599/11263 [26:38<29:25,  3.21it/s]

epoch 0 step 5599 loss tensor(0.0279, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 250/250 [00:53<00:00,  4.68it/s]
 50%|████▉     | 5600/11263 [27:32<25:45:44, 16.38s/it]

epoch  0 step 5599 acc  0.841
Early Stopping record count 5 / 6 Will stop Max eval acc 0.84275


 51%|█████▏    | 5800/11263 [28:37<30:34,  2.98it/s]   

epoch 0 step 5799 loss tensor(0.1296, device='cuda:0', grad_fn=<DivBackward0>)


 53%|█████▎    | 6000/11263 [29:41<24:38,  3.56it/s]

epoch 0 step 5999 loss tensor(0.0886, device='cuda:0', grad_fn=<DivBackward0>)


 55%|█████▌    | 6200/11263 [30:42<27:23,  3.08it/s]

epoch 0 step 6199 loss tensor(0.0919, device='cuda:0', grad_fn=<DivBackward0>)


 57%|█████▋    | 6399/11263 [31:47<25:32,  3.17it/s]

epoch 0 step 6399 loss tensor(0.0128, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 250/250 [00:53<00:00,  4.67it/s]
 57%|█████▋    | 6399/11263 [32:41<24:50,  3.26it/s]


epoch  0 step 6399 acc  0.8405
Early Stopping:Epoch 0  Step 6399 Eval_acc 0.8405
epoch 0 step 6399 trainLoss: 0.004451404953563261


('CodeBERTsaved_models/tokenizer_config.json',
 'CodeBERTsaved_models/special_tokens_map.json',
 'CodeBERTsaved_models/vocab.json',
 'CodeBERTsaved_models/merges.txt',
 'CodeBERTsaved_models/added_tokens.json')