# Import Libs

In [1]:
from datasets import Dataset, load_metric, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_scheduler, DataCollatorWithPadding
from tqdm.auto import tqdm

from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch
import numpy as np
import pandas as pd
import os
import sys

In [2]:
# Uncomment the line below to enable the widgets nbextension, then restart the kernel before proceeding to the next step
# !jupyter nbextension enable --py widgetsnbextension

# Configuration

In [3]:
class Config:
    """Every tunable setting of this notebook, grouped in one place.

    The values are plain class attributes, so they can be read from the class
    or from an instance.
    """

    # --- hardware ---
    # True targets a Habana Gaudi (HPU) device; False uses CUDA when available, else CPU.
    TRAIN_ON_HPU = False

    # --- data ---
    TRAIN_DS_PATH = './dataset/train.csv'
    EVAL_DS_PATH = './dataset/eval.csv'

    # --- model ---
    # Hub checkpoint shared by the tokenizer and the classifier.
    CHECKPOINT = 'distilbert-base-uncased'
    # Upper bound on tokens per example; longer inputs are truncated at tokenization.
    MAX_SEQ_LENGTH = 256

    # --- optimisation ---
    EPOCHS = 5
    LR = 3e-5
    BATCH_SIZE = 16

    # --- output ---
    # Directory that receives the fine-tuned model and its tokenizer.
    MODEL_DIR = './model/distil-bert-uncased-finetuned-github-issues/'

In [4]:
# Settings object used by the cells below (paths, checkpoint, hyperparameters)
cfg = Config()

# Load the data

In [5]:
# Load both splits as pandas DataFrames and wrap them in Hugging Face Datasets.
# NOTE(review): the configured paths end in ".csv" but are read with
# pd.read_pickle, so the files are presumably pickled DataFrames - confirm.
# Unpickling can execute arbitrary code; only load files from a trusted source.
train_ds = Dataset.from_pandas(pd.read_pickle(cfg.TRAIN_DS_PATH))
eval_ds = Dataset.from_pandas(pd.read_pickle(cfg.EVAL_DS_PATH))

In [6]:
# Display the training split summary (column names and row count)
train_ds

Dataset({
    features: ['descriptions', 'labels'],
    num_rows: 20604
})

In [7]:
# Display the evaluation split summary (column names and row count)
eval_ds

Dataset({
    features: ['descriptions', 'labels'],
    num_rows: 8831
})

# Preprocessing

In [8]:
# Download the tokenizer belonging to cfg.CHECKPOINT, the same checkpoint the
# classifier is loaded from later on
tokenizer = AutoTokenizer.from_pretrained(cfg.CHECKPOINT)

In [9]:
# Create the padding collator that the DataLoaders use as collate_fn;
# tokenize() itself requests no padding, so padding is left to this collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
# tokenizer helper function
def tokenize(batch):
    """Tokenize the ``descriptions`` field of ``batch``.

    Inputs longer than ``cfg.MAX_SEQ_LENGTH`` tokens are truncated; no padding
    is requested here.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for these texts.
    """
    texts = batch['descriptions']
    encoded = tokenizer(texts, max_length=cfg.MAX_SEQ_LENGTH, truncation=True)
    return encoded

In [11]:
# Run the tokenizer over both splits in batched mode (train first, then eval)
train_ds, eval_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, eval_ds)
)

  0%|          | 0/21 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

In [12]:
# Convert the string labels into a ClassLabel feature (integer class ids).
# The names are sorted so the label -> id mapping is identical on every run
# instead of depending on the order in which labels first appear in the data.
label_names = sorted(train_ds.unique("labels"))
label_feature = ClassLabel(names=label_names)

# Target schemas. Work on copies: `Dataset.features` hands back the dataset's
# own feature mapping, and editing it in place would change the declared
# schema before the values have actually been converted.
train_features = train_ds.features.copy()
train_features["labels"] = label_feature
eval_features = eval_ds.features.copy()
eval_features["labels"] = label_feature


def _encode_label(example):
    """Replace the string label of one example with its integer class id."""
    return {"labels": label_feature.str2int(example["labels"])}


# Re-encode both splits against the same label feature so ids agree
train_ds = train_ds.map(_encode_label, features=train_features)
eval_ds = eval_ds.map(_encode_label, features=eval_features)


0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [13]:
# Inspect the schema to confirm `labels` is now a ClassLabel feature
train_ds.features

{'descriptions': Value(dtype='string', id=None),
 'labels': ClassLabel(num_classes=3, names=['bug', 'enhancement', 'question'], names_file=None, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [14]:
# Drop the raw text column (it has already been tokenized) and make each
# split return torch tensors.
train_ds = train_ds.remove_columns(['descriptions'])
train_ds.set_format(type="torch")

eval_ds = eval_ds.remove_columns(['descriptions'])
eval_ds.set_format(type="torch")

In [15]:
# Both loaders share the batch size and the padding collator; only the
# training loader shuffles its examples.
shared_loader_args = dict(batch_size=cfg.BATCH_SIZE, collate_fn=data_collator)
train_dataloader = DataLoader(train_ds, shuffle=True, **shared_loader_args)
eval_dataloader = DataLoader(eval_ds, **shared_loader_args)

In [16]:
# sanity check
# for batch in train_dataloader:
#     break
# {k: v.shape for k, v in batch.items()}

# Training and Evaluating Model

In [17]:
# Accuracy metric; eval_epoch() accumulates batches into it and computes the score
metric = load_metric("accuracy")

# Label maps stored in the model config - useful in the inference API.
# Class ids are kept as ints rather than strings, so `label2id[name]` can be
# used directly as a class index and `id2label` can be looked up with the
# integer produced by argmax.
labels = train_ds.features["labels"].names
num_labels = len(labels)
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for idx, label in enumerate(labels)}

In [18]:
# Download the checkpoint from the model hub and wrap it in a sequence
# classification model configured with this dataset's label set
classifier_kwargs = dict(
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(cfg.CHECKPOINT, **classifier_kwargs)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

In [19]:
# AdamW over every model parameter, using the configured learning rate
optimizer = AdamW(model.parameters(), lr=cfg.LR)

# "linear" schedule without warm-up, sized to the full run:
# one scheduler step per batch, for every epoch
steps_per_epoch = len(train_dataloader)
num_training_steps = cfg.EPOCHS * steps_per_epoch
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(f"Training steps: {num_training_steps}")

Training steps: 6440


In [20]:
# Choose the device to train on and move the model there
if not cfg.TRAIN_ON_HPU:
    # Prefer a CUDA GPU when one is available, otherwise stay on the CPU
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(backend)
    print(f"Using device: {device}")
else:
    # Imported here rather than at the top of the notebook so the Habana
    # package is only required when the HPU flag is enabled
    from habana_frameworks.torch.utils.library_loader import load_habana_module
    load_habana_module()
    print("Targeting HPU")
    device = torch.device("hpu")

model = model.to(device)

Using device: cuda


In [21]:
# train & eval helpers
def train_epoch(train_dataloader, model, optimizer, lr_scheduler, global_progress_bar=None, log_every=100):
    """Train ``model`` for one full pass over ``train_dataloader``.

    Args:
        train_dataloader: Sized iterable of batches. Each batch is a dict of
            tensors that is unpacked into ``model(**batch)``.
        model: Module whose forward output exposes a ``loss`` attribute.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Learning-rate scheduler stepped once per batch.
        global_progress_bar: Optional object with an ``update(n)`` method
            (e.g. a tqdm bar); advanced by one after every batch.
        log_every: Print the current loss every ``log_every`` batches.
            A falsy value disables the printing.
    """
    model.train()
    # Send batches to wherever the model lives, so the helper does not depend
    # on a notebook-level `device` variable being defined and up to date.
    target_device = next(model.parameters()).device
    size = len(train_dataloader)
    for batch_idx, batch in enumerate(train_dataloader):
        batch = {k: v.to(target_device) for k, v in batch.items()}
        # forward propagation
        outputs = model(**batch)
        loss = outputs.loss

        # backpropagation
        optimizer.zero_grad()
        loss.backward()

        # step grad and lr
        optimizer.step()
        lr_scheduler.step()

        # print out training progress every `log_every` steps
        if log_every and batch_idx % log_every == 0:
            print(f"loss: {loss.item():>7f}  [{batch_idx:>5d}/{size:>5d}]")

        # update global progress bar; compare against None explicitly because
        # the truth value of a progress bar depends on its total
        if global_progress_bar is not None:
            global_progress_bar.update(1)

def eval_epoch(eval_dataloader, model, eval_metric=None):
    """Evaluate ``model`` over ``eval_dataloader`` and report the metric.

    Args:
        eval_dataloader: Iterable of batches. Each batch is a dict of tensors
            containing a ``"labels"`` entry and is unpacked into
            ``model(**batch)``.
        model: Module whose forward output exposes a ``logits`` attribute.
        eval_metric: Object with ``add_batch(predictions=..., references=...)``
            and ``compute()``. Defaults to the notebook-level ``metric``.

    Returns:
        The value returned by ``eval_metric.compute()``, which is also printed.
    """
    if eval_metric is None:
        # fall back to the metric defined at notebook level
        eval_metric = metric

    model.eval()
    # Send batches to wherever the model lives, so the helper does not depend
    # on a notebook-level `device` variable being defined and up to date.
    target_device = next(model.parameters()).device
    for batch in eval_dataloader:
        batch = {k: v.to(target_device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        # post-process pred: highest-scoring class per example
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # accumulate all batches' metrics
        eval_metric.add_batch(predictions=predictions, references=batch["labels"])

    scores = eval_metric.compute()
    print(scores)
    return scores

In [22]:
# Training and eval loops.
# One progress bar spans all epochs; the `with` block closes it when the loop
# finishes or is interrupted, so no unfinished bar is left behind.
with tqdm(range(num_training_steps)) as global_progress_bar:
    for epoch in range(cfg.EPOCHS):
        print(f"Epoch {epoch+1}\n-------------------------------")
        train_epoch(train_dataloader, model, optimizer, lr_scheduler, global_progress_bar)
        print("Evaluation:")
        eval_epoch(eval_dataloader, model)
        print("\n")

  0%|          | 0/6440 [00:00<?, ?it/s]

Epoch 1
-------------------------------
loss: 1.144286  [    0/ 1288]
loss: 0.501930  [  100/ 1288]
loss: 0.363136  [  200/ 1288]
loss: 0.360400  [  300/ 1288]
loss: 0.478606  [  400/ 1288]
loss: 0.542012  [  500/ 1288]
loss: 0.286772  [  600/ 1288]
loss: 0.601538  [  700/ 1288]
loss: 0.748330  [  800/ 1288]
loss: 0.682168  [  900/ 1288]
loss: 0.281032  [ 1000/ 1288]
loss: 0.563609  [ 1100/ 1288]
loss: 0.487928  [ 1200/ 1288]
Evaluation:

{'accuracy': 0.7662778847242668}
Epoch 2
-------------------------------
loss: 0.698949  [    0/ 1288]
loss: 0.491764  [  100/ 1288]
loss: 0.297947  [  200/ 1288]
loss: 0.310821  [  300/ 1288]
loss: 0.406685  [  400/ 1288]
loss: 0.526123  [  500/ 1288]
loss: 0.450059  [  600/ 1288]
loss: 0.521728  [  700/ 1288]
loss: 0.360575  [  800/ 1288]
loss: 0.324181  [  900/ 1288]
loss: 0.286618  [ 1000/ 1288]
loss: 0.323537  [ 1100/ 1288]
loss: 0.588727  [ 1200/ 1288]
Evaluation:

{'accuracy': 0.7999094100328389}
Epoch 3
-------------------------------
loss: 0.

# Saving Model

In [28]:
    # Persist the fine-tuned model together with its tokenizer. A failure is
    # reported on stdout instead of being raised.
    try:
        model.save_pretrained(cfg.MODEL_DIR)
        tokenizer.save_pretrained(cfg.MODEL_DIR)
    except Exception as err:
        print(err)
        print("Saving model failed")
    else:
        print(f"Trained model and its tokenizer are saved to {cfg.MODEL_DIR}")
            

Trained model and its tokenizer are saved to ./model/


In [None]:
# More info on saving HF models: https://huggingface.co/transformers/v1.0.0/model_doc/overview.html#serialization-best-practices