# Import Libs

In [1]:
import os
import sys

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_metric, ClassLabel
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW, get_scheduler, DataCollatorWithPadding

In [2]:
# Enable the ipywidgets notebook extension via a shell escape
# (presumably so the tqdm.auto progress bars render as widgets - TODO confirm).
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


# Configuration

In [25]:
class Config:
    """Tunable settings for the notebook: data paths, checkpoint, and hyperparameters."""

    # flag to target on Gaudi (HPU); when False the training cell picks CUDA if available, else CPU
    TRAIN_ON_HPU = False

    # dataset: pickled pandas frames for the train and evaluation splits
    TRAIN_DS_PATH = './dataset/train.pkl'
    EVAL_DS_PATH = './dataset/eval.pkl'

    # checkpoint used in preprocessing and modelling
    CHECKPOINT = 'distilbert-base-uncased'

    # HF params
    # Truncation length passed to the tokenizer. distilbert-base-uncased has
    # 512 position embeddings, so anything longer fails in the embedding
    # lookup; keep this at or below 512 for this checkpoint.
    MAX_SEQ_LENGTH = 512

    # training hyperparams
    EPOCHS = 1
    LR = 3e-5
    BATCH_SIZE = 8

    # saved model path
    MODEL_DIR = './model/'

In [26]:
# Instantiate the settings object; the following cells read every setting through `cfg`.
cfg = Config()

# Load the data

In [6]:
# Read the pickled pandas frames and wrap each one as a Hugging Face Dataset.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
train_ds = Dataset.from_pandas(pd.read_pickle(cfg.TRAIN_DS_PATH))
eval_ds = Dataset.from_pandas(pd.read_pickle(cfg.EVAL_DS_PATH))

In [7]:
# Display the training split summary (column names and row count).
train_ds

Dataset({
    features: ['descriptions', 'labels'],
    num_rows: 20604
})

In [8]:
# Display the evaluation split summary (column names and row count).
eval_ds

Dataset({
    features: ['descriptions', 'labels'],
    num_rows: 8831
})

# Preprocessing

In [9]:
# download checkpoint tokenizer
# The tokenizer is loaded from the same checkpoint name (cfg.CHECKPOINT) that the model cell uses.
tokenizer = AutoTokenizer.from_pretrained(cfg.CHECKPOINT)

In [10]:
# create data collator
# Used as the DataLoader collate_fn; the tokenize helper does not pad, so padding happens here per batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
# tokenizer helper function
def tokenize(batch):
    """Tokenize the ``descriptions`` entries of ``batch``.

    Inputs longer than ``cfg.MAX_SEQ_LENGTH`` tokens are truncated; no padding
    is applied here.

    Args:
        batch: mapping with a ``descriptions`` entry holding the text(s) to encode.

    Returns:
        The tokenizer's output for those texts.
    """
    texts = batch['descriptions']
    encoded = tokenizer(texts, truncation=True, max_length=cfg.MAX_SEQ_LENGTH)
    return encoded

In [12]:
# tokenize dataset
# batched=True hands `tokenize` a batch of rows per call instead of a single row.
train_ds = train_ds.map(tokenize, batched=True)
eval_ds = eval_ds.map(tokenize, batched=True)

  0%|          | 0/21 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

In [13]:
## convert dataset labels from str to ClassLabel
# Sort the names so the label -> id assignment does not depend on the order in
# which the distinct labels happen to be returned for this particular dataset.
lbels = sorted(train_ds.unique("labels"))
label_feature = ClassLabel(names=lbels)


def _encode_label(example):
    """Return the example's ``labels`` value converted from name to integer id."""
    return {"labels": label_feature.str2int(example["labels"])}


# Update default features
features = train_ds.features
features["labels"] = label_feature

# Update dataset
train_ds = train_ds.map(_encode_label, features=features)

# The evaluation split must use the very same encoding: otherwise its labels
# stay in their original form and cannot be batched into tensors or compared
# with the model's integer predictions.
# Assumes eval labels are strings drawn from the training label set - a label
# unseen in training makes str2int raise.
eval_features = eval_ds.features
eval_features["labels"] = label_feature
eval_ds = eval_ds.map(_encode_label, features=eval_features)


0ex [00:00, ?ex/s]

In [14]:
# Inspect the schema to confirm that `labels` is now a ClassLabel feature.
train_ds.features

{'descriptions': Value(dtype='string', id=None),
 'labels': ClassLabel(num_classes=3, names=['bug', 'enhancement', 'question'], names_file=None, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [15]:
# Drop the raw text column from both splits (the tokenized fields remain),
# then switch both splits to the torch output format.
train_ds, eval_ds = (
    split.remove_columns(['descriptions']) for split in (train_ds, eval_ds)
)
train_ds.set_format("torch")
eval_ds.set_format("torch")

In [16]:
# Batch iterators for both splits; only the training loader shuffles.
train_dataloader = DataLoader(
    dataset=train_ds,
    batch_size=cfg.BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    dataset=eval_ds,
    batch_size=cfg.BATCH_SIZE,
    collate_fn=data_collator,
)

In [17]:
# sanity check
# for batch in train_dataloader:
#     break
# {k: v.shape for k, v in batch.items()}

# Train Model

In [18]:
# Helpers
# Accuracy metric object; the training cell's evaluation loop also feeds batches into it.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score a ``(predictions, labels)`` pair with the accuracy metric.

    Args:
        eval_pred: two-item sequence; the first item is an array of per-class
            scores (reduced with argmax over axis 1), the second the reference
            labels.

    Returns:
        Whatever ``metric.compute`` returns for the predicted ids.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

# Prepare model labels - useful in inference API
labels = train_ds.features["labels"].names
num_labels = len(labels)
# Ids are kept as integers: the model config documents id2label as
# Dict[int, str] and label2id as Dict[str, int], and predicted class indices
# are integers, so string ids would not match them on lookup.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [19]:
# download model from model hub
# Loads cfg.CHECKPOINT with a sequence-classification head sized to num_labels;
# the label maps are passed along so they are stored on the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    cfg.CHECKPOINT, 
    num_labels=num_labels, 
    label2id=label2id, 
    id2label=id2label,
)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

In [22]:
# One optimizer update per training batch, repeated for every epoch.
num_training_steps = len(train_dataloader) * cfg.EPOCHS

# optimizer
optimizer = AdamW(model.parameters(), lr=cfg.LR)

# lr scheduler: "linear" schedule with zero warmup steps over the whole run
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(f"Training steps: {num_training_steps}")

Training steps: 7728




In [None]:
# Prepare training
if cfg.TRAIN_ON_HPU:
    # Load the Habana PyTorch plugin library before selecting the "hpu" device.
    habana_modules_directory = "/usr/lib/habanalabs"
    habana_pth_plugin = "libhabana_pytorch_plugin.so"
    sys.path.insert(0, habana_modules_directory)
    torch.ops.load_library(
        os.path.abspath(
            os.path.join(habana_modules_directory, habana_pth_plugin)
        )
    )
    print("Targeting HPU")
    device = torch.device("hpu")
else:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

model = model.to(device)
# One tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))


# training and eval loops
for epoch in range(cfg.EPOCHS):
    print(f"Epoch {epoch+1}\n-------------------------------")
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    ### EVAL
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    # A bare expression inside a loop is never displayed by the notebook, so
    # keep the result in a variable and print it for every epoch.
    eval_results = metric.compute()
    print(f"Eval metrics: {eval_results}")

# Finalize the progress bar once all epochs are done.
progress_bar.close()
    


Using device: cuda


  0%|          | 0/7728 [00:00<?, ?it/s]

Epoch 1
-------------------------------


# Evaluate Model

# Save Model

In [None]:
    # Save the trained weights (state_dict only) to cfg.MODEL_DIR, falling back
    # to the current working directory if that fails. Saving is best-effort:
    # failures are printed rather than raised.
    try:
        # Create the target directory first; without it the save fails when
        # the directory is missing and the fallback below is used instead.
        os.makedirs(cfg.MODEL_DIR, exist_ok=True)
        torch.save(model.state_dict(), os.path.join(cfg.MODEL_DIR, "model.pt"))
        print(f"Trained model is saved to {cfg.MODEL_DIR} as model.pt")
    except Exception as e:
        print(e)
        try:
            torch.save(model.state_dict(), "model.pt")
            print("Trained model is saved to current work directory as model.pt")
        except Exception as fallback_error:
            print(fallback_error)
            print("Saving model failed")