# Import Libs

In [1]:
from datasets import Dataset, load_metric, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW, get_scheduler, DataCollatorWithPadding
from tqdm.auto import tqdm

from torch.utils.data import DataLoader
import torch
import numpy as np
import pandas as pd
import os

In [2]:
# Uncomment this line to enable nbextension and restart the kernel before proceeding to next step
# !jupyter nbextension enable --py widgetsnbextension

# Configuration

In [40]:
class Config:
    # flag to target on Gaudi
    TRAIN_ON_HPU = False
    
    # dataset
    TRAIN_DS_PATH = './dataset/train.pkl'
    EVAL_DS_PATH = './dataset/eval.pkl'
    
    # checkpoint used in preprocessing and modelling
    CHECKPOINT = 'distilbert-base-uncased'
    
    # HF params
    MAX_SEQ_LENGTH = 256
    
    # training hyperparams
    EPOCHS = 1
    LR = 3e-5
    BATCH_SIZE = 8
    
    # saved model path
    MODEL_DIR = './model/'

In [41]:
cfg = Config()

# Load the data

In [42]:
train_ds = Dataset.from_pandas(pd.read_pickle(cfg.TRAIN_DS_PATH))
eval_ds = Dataset.from_pandas(pd.read_pickle(cfg.EVAL_DS_PATH))

In [43]:
train_ds

Dataset({
    features: ['descriptions', 'labels'],
    num_rows: 20604
})

In [44]:
eval_ds

Dataset({
    features: ['descriptions', 'labels'],
    num_rows: 8831
})

# Preprocessing

In [45]:
# download checkpoint tokenizer
tokenizer = AutoTokenizer.from_pretrained(cfg.CHECKPOINT)

In [46]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [47]:
# tokenizer helper function
def tokenize(batch):
    return tokenizer(batch['descriptions'], truncation=True, max_length=cfg.MAX_SEQ_LENGTH)

In [48]:
# tokenize dataset
train_ds = train_ds.map(tokenize, batched=True)
eval_ds = eval_ds.map(tokenize, batched=True)

  0%|          | 0/21 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

In [49]:
## convert dataset labels from str to ClassLabel
lbels = train_ds.unique("labels")
label_feature = ClassLabel(names=lbels)

# Update default features
train_features = train_ds.features
train_features["labels"] = label_feature
eval_features = eval_ds.features
eval_features["labels"] = label_feature

# Update dataset
train_ds = train_ds.map(lambda x : {"labels": label_feature.str2int(x["labels"])}, features=train_features)
eval_ds = eval_ds.map(lambda x : {"labels": label_feature.str2int(x["labels"])}, features=eval_features)


0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [50]:
train_ds.features

{'descriptions': Value(dtype='string', id=None),
 'labels': ClassLabel(num_classes=3, names=['bug', 'enhancement', 'question'], names_file=None, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [51]:
# remove redundant column and set torch format to the datasets
train_ds = train_ds.remove_columns(['descriptions'])
eval_ds = eval_ds.remove_columns(['descriptions'])
train_ds.set_format("torch")
eval_ds.set_format("torch")

In [52]:
train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=cfg.BATCH_SIZE, collate_fn=data_collator)
eval_dataloader = DataLoader(eval_ds, batch_size=cfg.BATCH_SIZE, collate_fn=data_collator)

In [53]:
# sanity check
# for batch in train_dataloader:
#     break
# {k: v.shape for k, v in batch.items()}

# Train Model

In [79]:
# define metrics
metric = load_metric("glue", "mrpc")
# def compute_metrics(eval_pred):
#     metric = load_metric("accuracy")
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return metric.compute(predictions=predictions, references=labels)

# Prepare model labels - useful in inference API
labels = train_ds.features["labels"].names
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

Downloading:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [55]:
# download model from model hub
model = AutoModelForSequenceClassification.from_pretrained(
    cfg.CHECKPOINT, 
    num_labels=num_labels, 
    label2id=label2id, 
    id2label=id2label,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

In [56]:
# optimizer
optimizer = AdamW(model.parameters(), lr=cfg.LR)

# lr scheduler
num_training_steps = cfg.EPOCHS * len(train_dataloader) # epoch * train_steps_per_epoch
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(f"Training steps: {num_training_steps}")

Training steps: 2576




In [None]:
# Prepare target device for training
if cfg.TRAIN_ON_HPU:
    habana_modules_directory = "/usr/lib/habanalabs"
    habana_pth_plugin = "libhabana_pytorch_plugin.so"
    sys.path.insert(0, habana_modules_directory)
    torch.ops.load_library(
        os.path.abspath(
            os.path.join(habana_modules_directory, habana_pth_plugin)
        )
    )
    print("Targeting HPU")
    device = torch.device("hpu")
else:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
model = model.to(device)

In [80]:
# train & eval helpers
def train_epoch(train_dataloader, model, optimizer, lr_scheduler, global_progress_bar=None):
    model.train()
    size = len(train_dataloader)
    for batch_idx, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        # forward propagation
        outputs = model(**batch)
        loss = outputs.loss
        
        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        
        # step grad and lr
        optimizer.step()
        lr_scheduler.step()
        
        # print out training progress every 500 steps
        if batch_idx % 500 == 0:
            print(f"loss: {loss.item():>7f}  [{batch_idx:>5d}/{size:>5d}]")
        
        # update gobal bar progress
        if global_progress_bar:
            global_progress_bar.update(1)

def eval_epoch(eval_dataloader, model):
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            
        # post-process pred
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        
        # accumulate all batches' metrics
        metric.add_batch(predictions=predictions, references=batch["labels"])
        
    print(metric.compute())

In [78]:
# training and eval loops
global_progress_bar = tqdm(range(num_training_steps))
for epoch in range(cfg.EPOCHS):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_epoch(train_dataloader, model, optimizer, lr_scheduler, global_progress_bar)
    eval_epoch(eval_dataloader, model)

  0%|          | 0/2576 [00:00<?, ?it/s]

Epoch 1
-------------------------------
loss: 0.457990  [    0/ 2576]
loss: 0.134473  [  100/ 2576]
loss: 0.232406  [  200/ 2576]
loss: 0.781294  [  300/ 2576]
loss: 0.319747  [  400/ 2576]
loss: 0.480070  [  500/ 2576]
loss: 0.372879  [  600/ 2576]
loss: 0.242397  [  700/ 2576]
loss: 1.029765  [  800/ 2576]
loss: 0.588807  [  900/ 2576]
loss: 0.463248  [ 1000/ 2576]
loss: 0.325843  [ 1100/ 2576]
loss: 0.394999  [ 1200/ 2576]
loss: 0.581026  [ 1300/ 2576]
loss: 0.522310  [ 1400/ 2576]
loss: 0.203650  [ 1500/ 2576]
loss: 0.187857  [ 1600/ 2576]
loss: 0.303378  [ 1700/ 2576]
loss: 0.292468  [ 1800/ 2576]
loss: 0.436149  [ 1900/ 2576]
loss: 0.508642  [ 2000/ 2576]
loss: 0.572653  [ 2100/ 2576]
loss: 0.371249  [ 2200/ 2576]
loss: 0.932959  [ 2300/ 2576]
loss: 0.585826  [ 2400/ 2576]
loss: 0.363524  [ 2500/ 2576]
{'accuracy': 0.7931151624957536}


# Save Model

In [58]:
    try:
        torch.save(model.state_dict(), cfg.MODEL_DIR + "model.pt")
        print(f"Trained model is saved to {cfg.MODEL_DIR} as model.pt")
    except Exception as e:
        print(e)
        try:
            torch.save(model.state_dict(), "model.pt")
            print(f"Trained model is saved to current work directory as model.pt")
        except Exception as e:
            print(e)
            print("Saving model failed")

[Errno 2] No such file or directory: './model/model.pt'
Trained model is saved to current work directory as model.pt
