<a href="https://colab.research.google.com/github/Michael-David-Lam/NLP-Final-Project/blob/main/Experiment_3_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Define General DistilBERT Pipeline

## Install dependencies

In [32]:
# Install dependencies
!pip install torch torchvision
!pip install transformers
!pip install datasets
!pip install evaluate
# !pip install seqeval



## Load packages and dataset

In [33]:
# Load packages
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset
import evaluate

In [34]:
# Fetch the PLOD-CW-25 dataset and show its splits
DATASET_ID = "surrey-nlp/PLOD-CW-25"
dataset = load_dataset(DATASET_ID)
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 250
    })
    validation: Dataset({
        features: ['tokens', 'pos_tags', 'ner_tags'],
        num_rows: 150
    })
})

In [35]:
# Inspect the first training example: parallel lists of tokens, POS tags and NER tags
dataset['train'][0]

{'tokens': ['MRR',
  ',',
  'mortality',
  'rate',
  'ratio',
  ';',
  'TBI',
  ',',
  'traumatic',
  'brain',
  'injury',
  '.'],
 'pos_tags': ['NOUN',
  'PUNCT',
  'NOUN',
  'NOUN',
  'NOUN',
  'PUNCT',
  'PROPN',
  'PUNCT',
  'ADJ',
  'NOUN',
  'NOUN',
  'PUNCT'],
 'ner_tags': ['B-AC',
  'O',
  'B-LF',
  'I-LF',
  'I-LF',
  'O',
  'B-AC',
  'O',
  'B-LF',
  'I-LF',
  'I-LF',
  'O']}

## Define Model & Tokenizer
### Tokenize data

In [49]:
# define model and tokenizer
# Load both from one checkpoint id so the tokenizer vocabulary and the model
# weights can never be changed independently of each other by accident.
MODEL_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=4,  # Class labels: B-AC, B-LF, I-LF, O
)

def tokenize_and_align_labels(batch):
    """Tokenize a batch of pre-split sentences and attach per-token label ids.

    Args:
        batch: mapping with "tokens" (one list of words per sentence) and
            "ner_tags" (one list of string tags per sentence).

    Returns:
        The tokenizer output with an extra "labels" entry: one list of ints per
        sentence. Positions whose word id is None get -100; every other position
        gets the id of the tag of the word it came from, so all sub-tokens of a
        word share that word's tag. Tags missing from the mapping also get -100.
    """
    tag_to_index = {"B-AC": 0, "B-LF": 1, "I-LF": 2, "O": 3}
    ignore_index = -100

    encoding = tokenizer(
        batch["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    def label_for(word_position, sentence_tags):
        # No source word behind this position (padding / special token)
        if word_position is None:
            return ignore_index
        # Clamp to the last tag so an out-of-range word index cannot raise
        clamped = min(word_position, len(sentence_tags) - 1)
        return tag_to_index.get(sentence_tags[clamped], ignore_index)

    encoding["labels"] = [
        [label_for(position, sentence_tags) for position in encoding.word_ids(row)]
        for row, sentence_tags in enumerate(batch["ner_tags"])
    ]
    return encoding
# Run the tokenize-and-align step over the dataset, one batch of examples per call
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define F1 Metric Function

In [37]:
# F1 scorer from the `evaluate` library; compute_metrics calls metric.compute(...) on it
metric = evaluate.load("f1")

def compute_metrics(pred):
    """Compute weighted F1 over every position whose label is not -100.

    Args:
        pred: object exposing `predictions` (scores whose last axis is reduced
            with argmax) and `label_ids` (label ids, -100 marking positions to skip).

    Returns:
        Whatever `metric.compute(..., average="weighted")` returns for the
        flattened, filtered predictions and references.
    """
    predicted_ids = pred.predictions.argmax(-1)
    gold_ids = pred.label_ids

    # Flatten row by row, keeping only positions that carry a real label
    kept_pairs = [
        (guess, gold)
        for guess_row, gold_row in zip(predicted_ids, gold_ids)
        for guess, gold in zip(guess_row, gold_row)
        if gold != -100
    ]
    kept_preds = [guess for guess, _ in kept_pairs]
    kept_refs = [gold for _, gold in kept_pairs]

    return metric.compute(predictions=kept_preds, references=kept_refs, average="weighted")

## Training Arguments

In [38]:
# Shared training configuration used by every trainer in this notebook
training_args = TrainingArguments(
    # where checkpoints are written, and no external experiment tracker
    output_dir="./results",
    report_to="none",
    # schedule: evaluate, save and log once per epoch
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # optimisation hyper-parameters
    # NOTE(review): the trainers below pass their own optimizer objects, so these two
    # values presumably do not reach those optimizers - confirm.
    learning_rate=1e-5,
    weight_decay=0.01,
    # model selection: "f1" is the key produced by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

## Define Loss Function

In [39]:
import torch
from torch.nn import CrossEntropyLoss

# Manually chosen class weights, ordered by label id: [B-AC, B-LF, I-LF, O]
# Fall back to the CPU when no GPU is present instead of failing on a hard-coded "cuda".
class_weights = torch.tensor(
    [1.0, 2.0, 2.0, 0.5],
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Custom Trainer subclass that overrides the default loss function
class CustomTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy over token logits.

    The per-class weights come from the module-level `class_weights` tensor.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and score it with weighted CrossEntropyLoss.

        Args:
            model: the model to call; invoked as `model(**inputs)`.
            inputs: batch mapping that must contain "labels"; positions labelled
                -100 are excluded from the loss.
            return_outputs: when True, return `(loss, outputs)` instead of `loss`.
            **kwargs: accepted for signature compatibility and ignored.

        Returns:
            The scalar loss, or `(loss, outputs)` when `return_outputs` is True.
        """
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits
        labels = inputs["labels"]

        # Flatten to (positions, classes) / (positions,). The class count is read
        # from the logits themselves so this also works when `model` is a wrapper
        # that does not expose `.config`.
        flat_logits = logits.reshape(-1, logits.size(-1))
        flat_labels = labels.reshape(-1)

        # Keep the weights on the same device and dtype as the logits, wherever
        # the global tensor was originally created.
        weights = class_weights.to(device=logits.device, dtype=logits.dtype)
        loss_fn = CrossEntropyLoss(weight=weights, ignore_index=-100)
        loss = loss_fn(flat_logits, flat_labels)

        return (loss, outputs) if return_outputs else loss

# Experiment Setup-1
Optimizer = Adam

In [52]:
from torch.optim import Adam, AdamW

# Build one Adam and one AdamW optimizer over the parameters of `model`
trainable_params = list(model.parameters())
optimizer1 = Adam(trainable_params, lr=1e-5)
optimizer2 = AdamW(trainable_params, lr=1e-5)

### Default Trainer Class:
 * Loss Function - Cross Entropy Loss
 * Optimizer - Adam

In [53]:
# Baseline run: stock Trainer (default loss) driven by the Adam optimizer.
# `optimizers` is an (optimizer, lr_scheduler) pair; the scheduler slot is left as None.
trainer1_cross = Trainer(
    model=model,
    args=training_args,
    optimizers=(optimizer1, None),
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)
trainer1_cross.train()

Epoch,Training Loss,Validation Loss,F1
1,0.504,0.355043,0.875538
2,0.292,0.353592,0.880516
3,0.2513,0.33727,0.882887
4,0.2274,0.338602,0.885893
5,0.215,0.339942,0.886398


TrainOutput(global_step=1250, training_loss=0.29795047912597655, metrics={'train_runtime': 148.6217, 'train_samples_per_second': 67.285, 'train_steps_per_second': 8.411, 'total_flos': 326644561920000.0, 'train_loss': 0.29795047912597655, 'epoch': 5.0})

In [54]:
# Score the Adam + cross-entropy run on the test split and keep the metrics for later use
test_metrics_adam_ce = trainer1_cross.evaluate(eval_dataset=tokenized_dataset["test"])
test_metrics_adam_ce

{'eval_loss': 0.2549479603767395,
 'eval_f1': 0.9076819111211382,
 'eval_runtime': 1.0305,
 'eval_samples_per_second': 242.602,
 'eval_steps_per_second': 31.053,
 'epoch': 5.0}

### Custom Trainer Class:
  * Loss Function - Weighted Cross Entropy Loss
  * Optimizer - Adam

In [55]:
# Init custom trainer - Weighted Cross Entropy loss, Adam Optimizer
trainer1_crossW = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    optimizers=(optimizer1, None) #custom optimizer
)
trainer1_crossW.train()

Epoch,Training Loss,Validation Loss,F1
1,0.2335,0.41579,0.887523
2,0.1878,0.435304,0.885267
3,0.1632,0.477409,0.888828
4,0.1477,0.463142,0.886348
5,0.1411,0.47017,0.887665


TrainOutput(global_step=1250, training_loss=0.17465748291015626, metrics={'train_runtime': 148.4646, 'train_samples_per_second': 67.356, 'train_steps_per_second': 8.42, 'total_flos': 326644561920000.0, 'train_loss': 0.17465748291015626, 'epoch': 5.0})

In [56]:
# Score the Adam + weighted cross-entropy run on the test split and keep the metrics
test_metrics_adam_wce = trainer1_crossW.evaluate(eval_dataset=tokenized_dataset["test"])
test_metrics_adam_wce

{'eval_loss': 0.3371643126010895,
 'eval_f1': 0.9076053030230822,
 'eval_runtime': 1.1806,
 'eval_samples_per_second': 211.762,
 'eval_steps_per_second': 27.106,
 'epoch': 5.0}

# Experiment Setup-2
Optimizer = AdamW

### Default Trainer Class:
  * Loss Function - Cross Entropy Loss
  * Optimizer - AdamW

In [57]:
# Init second default trainer - Cross Entropy loss, AdamW Optimizer
trainer2_cross = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    optimizers=(optimizer2, None) #custom optimizer
)
trainer2_cross.train()

Epoch,Training Loss,Validation Loss,F1
1,0.1387,0.428351,0.886464
2,0.1136,0.469192,0.885303
3,0.1009,0.453622,0.882394
4,0.0979,0.457666,0.886917
5,0.0925,0.463837,0.884596


TrainOutput(global_step=1250, training_loss=0.10869351501464844, metrics={'train_runtime': 154.5043, 'train_samples_per_second': 64.723, 'train_steps_per_second': 8.09, 'total_flos': 326644561920000.0, 'train_loss': 0.10869351501464844, 'epoch': 5.0})

In [58]:
# Score the AdamW + cross-entropy run on the test split and keep the metrics
test_metrics_adamw_ce = trainer2_cross.evaluate(eval_dataset=tokenized_dataset["test"])
test_metrics_adamw_ce

{'eval_loss': 0.31509044766426086,
 'eval_f1': 0.9048358952897892,
 'eval_runtime': 1.0179,
 'eval_samples_per_second': 245.608,
 'eval_steps_per_second': 31.438,
 'epoch': 5.0}

### Custom Trainer Class:
  * Loss Function - Weighted Cross Entropy Loss
  * Optimizer - AdamW

In [59]:
# Init second custom trainer - Weighted Cross Entropy loss, AdamW Optimizer
trainer2_crossW = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    optimizers=(optimizer2, None) #custom optimizer
)
trainer2_crossW.train()

Epoch,Training Loss,Validation Loss,F1
1,0.0734,0.657009,0.885023
2,0.0577,0.662524,0.886114
3,0.0509,0.703807,0.885237
4,0.053,0.689678,0.884397
5,0.0616,0.676018,0.88478


TrainOutput(global_step=1250, training_loss=0.059328802490234374, metrics={'train_runtime': 153.1119, 'train_samples_per_second': 65.312, 'train_steps_per_second': 8.164, 'total_flos': 326644561920000.0, 'train_loss': 0.059328802490234374, 'epoch': 5.0})

In [60]:
# Score the AdamW + weighted cross-entropy run on the test split and keep the metrics
test_metrics_adamw_wce = trainer2_crossW.evaluate(eval_dataset=tokenized_dataset["test"])
test_metrics_adamw_wce

{'eval_loss': 0.4350109398365021,
 'eval_f1': 0.910539946631976,
 'eval_runtime': 1.0774,
 'eval_samples_per_second': 232.05,
 'eval_steps_per_second': 29.702,
 'epoch': 5.0}

# Visualization

In [61]:
# TODO: add visualization code comparing the four optimizer / loss-function runs