In [3]:
# TODO:
# Clean-up notebook
# Documentation

# Topics for further research:
# More advanced adapters from the transformers library
# Prompt Tuning and Prefix Tuning
# Quantization
# RAGs, Agents and LangChain
# Additional MLFlow features

## Set-up

In [1]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding, AdamW, get_scheduler
from tqdm import tqdm
from peft import get_peft_model, LoraConfig, PromptTuningConfig, PrefixTuningConfig, TaskType
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, auc
import mlflow
import mlflow.transformers
import petname
import random
import numpy as np

# Seed the torch and NumPy generators so that the randomly initialised
# classification head, DataLoader shuffling and dropout are repeatable.
# Python's `random` module is left unseeded on purpose: it is used further
# down to build unique run names.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the IMDB train/test splits and carve a stratified validation set
# (10% of the original training data) out of the training split.
raw_train, raw_test = load_dataset("imdb", split=["train", "test"])
train_test_split = raw_train.train_test_split(
    test_size=0.1,
    seed=42,
    stratify_by_column='label',
)

dataset = DatasetDict(
    {
        "train": train_test_split["train"],      # train split
        "validation": train_test_split["test"],  # validation split
        "test": raw_test,                        # test split
    }
)

In [3]:
# Mean label value per split (the share of examples labelled 1)
for split_name in ("train", "validation", "test"):
    split_labels = dataset[split_name]['label']
    print(sum(split_labels) / len(split_labels))

0.5
0.5
0.5


In [4]:
# Number of examples in each split
for split_name in ("train", "validation", "test"):
    print(len(dataset[split_name]['label']))

22500
2500
25000


In [5]:
# Function that evaluates a model on the test dataset using PyTorch
def eval_model(model, eval_dataset, eval_batch_size, collate_function=None):
    """Print and return the classification accuracy of ``model`` on ``eval_dataset``.

    The model is switched to evaluation mode for the duration of the call so
    that dropout does not perturb the predictions; the mode it was in before
    the call is restored afterwards.

    Args:
        model: Module whose forward pass accepts ``input_ids`` and
            ``attention_mask`` and returns an object with a ``logits``
            attribute. It must expose a ``device`` attribute.
        eval_dataset: Dataset whose items provide ``input_ids``,
            ``attention_mask`` and a label column.
        eval_batch_size: Number of examples per batch.
        collate_function: Optional ``collate_fn`` handed to the DataLoader.
            Without one the labels are read from the ``'label'`` key of each
            batch, otherwise from ``'labels'``.

    Returns:
        The fraction of examples whose arg-max prediction equals the label.

    Raises:
        ValueError: If ``eval_dataset`` yields no batches.
    """
    label_col_name = 'label' if collate_function is None else 'labels'

    dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=eval_batch_size, pin_memory=True, num_workers=4,
                                             collate_fn=collate_function)

    predictions = []
    true_labels = []

    # Remember the current mode: the caller may be in the middle of training.
    was_training = model.training
    model.eval()
    try:
        with torch.inference_mode():
            for batch in tqdm(dataloader, total=len(dataloader)):
                input_ids = batch['input_ids'].to(model.device)
                attention_mask = batch['attention_mask'].to(model.device)
                labels = batch[label_col_name].to(model.device)

                # Casting the logits to float16 was tried and gave no speed-up.
                logits = model(input_ids, attention_mask=attention_mask).logits

                # Get predictions: the index of the max value in each row (for classification)
                predictions.append(torch.argmax(logits, dim=1))
                true_labels.append(labels)
    finally:
        model.train(was_training)

    if not predictions:
        raise ValueError("eval_dataset produced no batches; cannot compute accuracy")

    # Convert lists to tensors for easy comparison
    predictions = torch.cat(predictions)
    true_labels = torch.cat(true_labels)

    correct = (predictions == true_labels).sum().item()  # Number of correct predictions
    total = true_labels.size(0)  # Total number of examples
    accuracy = correct / total  # Accuracy as the fraction of correct predictions

    print(f"Model accuracy on provided set: {accuracy:.4f}")
    return accuracy


In [6]:
def compute_metrics(eval_pred):
    """Compute binary-classification metrics from logits and labels.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is a 2-D array with
            one column per class (column 1 is treated as the positive class);
            ``labels`` holds the true class indices.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc``
        and ``pr_auc``.
    """
    # Unpack predictions and labels
    logits, labels = eval_pred
    logits = np.asarray(logits, dtype=np.float64)
    predictions = logits.argmax(axis=1)

    # Score the positive class with its softmax probability. The raw class-1
    # logit is not a valid ranking score on its own, because the probability
    # also depends on the class-0 logit. Subtracting the row maximum keeps
    # the exponentials numerically stable.
    exp_logits = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs = exp_logits[:, 1] / exp_logits.sum(axis=1)

    # Calculate metrics
    accuracy = accuracy_score(labels, predictions)  # Accuracy
    # zero_division=0 keeps the value sklearn already falls back to when no
    # positive is predicted, without emitting a warning.
    precision = precision_score(labels, predictions, average="binary", zero_division=0)  # Precision
    recall = recall_score(labels, predictions, average="binary", zero_division=0)  # Recall
    f1 = f1_score(labels, predictions, average="binary", zero_division=0)  # F1 Score

    # ROC-AUC
    roc_auc = roc_auc_score(labels, probs)

    # Precision-Recall AUC
    precision_vals, recall_vals, _ = precision_recall_curve(labels, probs)
    pr_auc = auc(recall_vals, precision_vals)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc,
        "pr_auc": pr_auc,
    }

In [7]:
# Load the tokenizer and a two-label classification model from the same checkpoint
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Move the weights to the selected device; as the last expression this also
# displays the module tree.
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [8]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# The raw text is dropped once it has been encoded.
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
)
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# No padding is applied above; the collator is what batches the examples.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 22500/22500 [00:04<00:00, 5555.16 examples/s]
Map: 100%|██████████| 2500/2500 [00:00<00:00, 6408.85 examples/s]
Map: 100%|██████████| 25000/25000 [00:02<00:00, 10701.44 examples/s]


## Random weights - 50% achieved accuracy

In [None]:
# The head is initialized with random weights so the results will be 0.5 accuracy.
# The padding collator is required: the examples were tokenized without padding,
# so the default collate function cannot stack sequences of different lengths.
eval_model(model, tokenized_dataset["test"], 16, data_collator)

## Classification head fine-tuning - 85% achieved accuracy

In [None]:
# Set MLFlow experiment
# Runs started in this section are recorded under this experiment name.
experiment_name = "head_fine_tune"
mlflow.set_experiment(experiment_name)

In [None]:
# Build a readable, unique run name (two random words plus a 4-digit number)
# and derive the checkpoint folder from it.
run_id = f"{petname.Generate(words=2, separator='_')}_{random.randint(1000, 9999)}"
output_dir = f"./results/{run_id}"
run_id

In [11]:
# Freeze the backbone (model.base_model); parameters outside it stay trainable
for backbone_param in model.base_model.parameters():
    backbone_param.requires_grad_(False)

In [None]:
# List every parameter together with its trainable / frozen status
for name, param in model.named_parameters():
    status = "Trainable" if param.requires_grad else "Frozen"
    print(f"{status} parameter: {name}")

In [None]:
# Count all parameters and the subset that will receive gradient updates
total_params = 0
trainable_params = 0
for p in model.parameters():
    n_elements = p.numel()
    total_params += n_elements
    if p.requires_grad:
        trainable_params += n_elements

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

In [None]:
# Training arguments for the head-only run
training_args = TrainingArguments(
    learning_rate=2e-5,
    weight_decay=0.01,
    # Only request bfloat16 when the hardware supports it; the notebook falls
    # back to CPU when no GPU is present, and an unconditional bf16=True is
    # rejected on devices without bfloat16 support.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    save_strategy='epoch',
    save_total_limit=4, # 4 needed for early stopping with patience of 3
    output_dir=output_dir,
    # NOTE(review): newer transformers releases call this `eval_strategy` — confirm against the installed version
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    logging_first_step=True,
    logging_steps=50
)

In [14]:
# Early-stopping configuration.
# NOTE(review): this object is not referenced by the Trainer cell that follows, which
# builds its own EarlyStoppingCallback(early_stopping_patience=3) — confirm which one is intended.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,   # Number of evaluation steps with no improvement before stopping
    early_stopping_threshold=0.0  # Minimum improvement to reset patience
)

In [None]:
# Build the Trainer for the head-only run: train on the training split,
# evaluate on the validation split, stop early after 3 evaluations without improvement.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
with mlflow.start_run(run_name=run_id, log_system_metrics=True):
    # Log custom model parameters
    mlflow.log_param("dataset_version", "imdb test")
    # Record the checkpoint that was actually loaded rather than a placeholder.
    mlflow.log_param("model_name", model_name)
    mlflow.log_param("learning_rate", training_args.learning_rate)
    mlflow.log_param("num_train_epochs", training_args.num_train_epochs)
    mlflow.log_param("batch_size", training_args.per_device_train_batch_size)

    #mlflow.transformers.autolog()
    trainer.train()

# TODO: There are a lot more features in MLflow - for example auto-logging, versioning, serving models and datasets etc.

In [None]:
# Re-run evaluation through the Trainer. The Trainer was built with the validation
# split as eval_dataset, so these numbers describe validation data, not the test split.
results = trainer.evaluate()
print(results)

In [None]:
# Accuracy on the test split, computed with the custom PyTorch evaluation helper
eval_model(model, tokenized_dataset["test"], 16, data_collator)

In [None]:
# Write the head-tuned model and its tokenizer to the same directory
model.save_pretrained("./results/head_fine_tune")
tokenizer.save_pretrained("./results/head_fine_tune")

In [None]:
# # Load fine-tuned model
# fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("./results/head_fine_tune")
# fine_tuned_model.to(device)

# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained("./results/head_fine_tune")

## Full fine-tuning - achieved accuracy 93%

In [None]:
# Set MLFlow experiment
# Runs from the hand-written training loop are recorded under this experiment name.
experiment_name = "custom_training_loop"
mlflow.set_experiment(experiment_name)

In [None]:
# Fresh run name for this section (two random words plus a 4-digit number)
# and the matching output folder.
pet_words = petname.Generate(words=2, separator='_')
run_suffix = random.randint(1000, 9999)
run_id = f"{pet_words}_{run_suffix}"
output_dir = f"./results/{run_id}"
run_id

In [7]:
# Hyperparameters for the custom training loop
num_epochs = 3
learning_rate = 2e-5
weight_decay = 0.01
train_batch_size = 16
eval_batch_size = 16
log_training_loss_steps = 50  # the training loss is logged to MLflow every this many steps

In [10]:
# Both loaders batch through the shared collator; only the training data is shuffled
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=eval_batch_size,
    collate_fn=data_collator,
)

In [None]:
# Verify that the data prep is correct: push one training batch through the
# model and print its loss, predictions and accuracy.
for batch in train_dataloader:
    input_ids = batch['input_ids'].to(model.device)
    attention_mask = batch['attention_mask'].to(model.device)
    labels = batch['labels'].to(model.device)

    # This is only a sanity check, so no autograd graph is needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    # The class dimension is given explicitly; the implicit-dim form of softmax is deprecated.
    probs = torch.nn.functional.softmax(logits, dim=1)
    correct = (preds == labels).sum().item()
    total = labels.size(0)
    accuracy = correct / total
    print(outputs.loss)
    #print(logits)
    print(preds)
    #print(probs)
    print(accuracy)
    break


In [None]:
# Full fine-tuning has to update every weight. The head-only section froze
# model.base_model, so gradients are re-enabled here; on a freshly loaded
# model this is a no-op.
for param in model.parameters():
    param.requires_grad = True

# Print all trainable parameters
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Trainable parameter: {name}")
    else:
        print(f"Frozen parameter: {name}")

In [None]:
# Parameter totals before the custom training loop
param_counts = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in param_counts)
trainable_params = sum(count for count, is_trainable in param_counts if is_trainable)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

In [None]:
# Set the optimizer and learning rate.
# PyTorch's own AdamW is used: the AdamW class exported by transformers is deprecated.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [None]:
# "linear" learning-rate schedule with no warm-up, stepped once per optimizer
# step across all epochs
steps_per_epoch = len(train_dataloader)
num_training_steps = num_epochs * steps_per_epoch
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
# Training loop and validation evaluation loop.
# Each epoch trains over the whole training set, logs epoch-level training
# metrics, then evaluates on the validation set and logs loss and metrics.
with mlflow.start_run(run_name=run_id, log_system_metrics=True):
    # Log custom model parameters
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("num_train_epochs", num_epochs)
    mlflow.log_param("train_batch_size", train_batch_size)
    mlflow.log_param("eval_batch_size", eval_batch_size)
    mlflow.log_param("weight_decay", weight_decay)

    model.train()
    step = 0  # Global optimizer step, used as the x-axis of the training loss

    progress_bar = tqdm(range(num_training_steps))
    for epoch in range(num_epochs):
        logits_train_list = []
        labels_train_list = []

        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            # Keep the logits produced during training so that epoch-level
            # training metrics can be computed without a second pass.
            logits = outputs.logits
            labels = batch['labels']
            logits_train_list.append(logits.detach().cpu().numpy())
            labels_train_list.append(labels.cpu().numpy())

            if step % log_training_loss_steps == 0:
                mlflow.log_metric("training_loss", loss.item(), step=step)

            step += 1
            progress_bar.update(1)

        # Calculate additional metrics for the training set
        logits_train = np.concatenate(logits_train_list, axis=0)
        labels_train = np.concatenate(labels_train_list, axis=0)
        train_metrics = compute_metrics((logits_train, labels_train))

        # Log train metrics to MLflow
        for train_metric_name, train_metric_value in train_metrics.items():
            mlflow.log_metric(f"{train_metric_name}_train", train_metric_value, step=epoch)

        # Evaluate the model on the validation set
        model.eval()
        logits_list = []
        labels_list = []
        eval_loss_sum = 0.0  # Sum of per-example losses
        eval_examples = 0    # Number of validation examples seen

        eval_bar = tqdm(range(len(eval_dataloader)), desc=f"Evaluating Epoch {epoch + 1}")

        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            logits = outputs.logits
            labels = batch['labels']
            # outputs.loss is the mean over the batch; weight it by the batch
            # size so a smaller final batch does not skew the epoch average.
            batch_examples = labels.size(0)
            eval_loss_sum += outputs.loss.item() * batch_examples
            eval_examples += batch_examples
            logits_list.append(logits.cpu().numpy())
            labels_list.append(labels.cpu().numpy())
            eval_bar.update(1)

        eval_bar.close()

        logits = np.concatenate(logits_list, axis=0)
        labels = np.concatenate(labels_list, axis=0)

        metrics = compute_metrics((logits, labels))

        # Log evaluation loss and metrics to MLflow
        eval_loss = eval_loss_sum / eval_examples
        mlflow.log_metric("eval_loss", eval_loss, step=epoch)
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value, step=epoch)

        # Back to training mode for the next epoch
        model.train()

    progress_bar.close()

In [None]:
# TODO: Additional features that can be implemented:
# bf16 precision
# Saving checkpoints
# Early stopping & keeping only necessary amount of checkpoints (patience + 1)

In [None]:
# Evaluate on the test set
# The padding collator is passed so that eval_model reads labels from the 'labels' key.
eval_model(model, tokenized_dataset["test"], 16, data_collator)

In [None]:
# Write the model trained by the custom loop and its tokenizer to the same directory
model.save_pretrained("./results/full_fine_tune_custom_loop")
tokenizer.save_pretrained("./results/full_fine_tune_custom_loop")

In [4]:
# # Load fine-tuned model
# model = AutoModelForSequenceClassification.from_pretrained("./results/full_fine_tune_custom_loop")
# model.to(device)

# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained("./results/full_fine_tune_custom_loop")

## LoRA fine-tuning (90% accuracy)

In [None]:
# Set MLFlow experiment
# Runs from the LoRA section are recorded under this experiment name.
experiment_name = "lora_fine_tune"
mlflow.set_experiment(experiment_name)

In [None]:
# Run name for the LoRA experiment (two random words plus a 4-digit number)
# and the folder its checkpoints go to.
run_id = "_".join(
    [petname.Generate(words=2, separator='_'), str(random.randint(1000, 9999))]
)
output_dir = f"./results/{run_id}"
run_id

In [None]:
# Show the module structure of the model that the LoRA cell below wraps
print(model)

In [None]:
# Trainable / frozen status of every parameter before LoRA is applied
for name, param in model.named_parameters():
    label = "Trainable parameter" if param.requires_grad else "Frozen parameter"
    print(f"{label}: {name}")

In [None]:
# Parameter totals before LoRA is applied
all_params = list(model.parameters())
total_params = sum(map(lambda p: p.numel(), all_params))
trainable_params = sum(map(lambda p: p.numel(), filter(lambda p: p.requires_grad, all_params)))
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

In [11]:
# Configure LoRA
#
# target_modules - the layers of the pretrained model that receive low-rank matrices
#                  (here the query and key projections of every attention block)
# r              - the rank of the low-rank matrices that will be trained
# lora_alpha     - a higher value gives more weight to the LoRA updates, letting the
#                  model adapt more strongly to the new task
# lora_dropout   - regularization on the low-rank updates to limit overfitting
lora_config = LoraConfig(
    target_modules=["attention.q_lin", "attention.k_lin"],
    r=8,
    lora_alpha=4,
    lora_dropout=0.2,
    task_type=TaskType.SEQ_CLS,
)

# NOTE(review): `model` is whatever the earlier cells left in the kernel (on a linear
# run, the model trained in the previous section) — confirm whether a freshly loaded
# checkpoint was intended as the LoRA base.
lora_model = get_peft_model(model, lora_config)

In [None]:
# Trainable / frozen status of every parameter of the LoRA-wrapped model
for name, param in lora_model.named_parameters():
    prefix = "Trainable" if param.requires_grad else "Frozen"
    print(f"{prefix} parameter: {name}")

In [None]:
# Parameter totals of the LoRA-wrapped model
total_params = 0
trainable_params = 0
for p in lora_model.parameters():
    size = p.numel()
    total_params += size
    trainable_params += size if p.requires_grad else 0

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

In [None]:
# Training arguments
training_args = TrainingArguments(
    learning_rate=2e-5,
    weight_decay=0.01,
    # Only request bfloat16 when the hardware supports it; the notebook falls
    # back to CPU when no GPU is present, and an unconditional bf16=True is
    # rejected on devices without bfloat16 support.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    save_strategy='epoch',
    save_total_limit=4, # 4 needed for early stopping with patience of 3
    output_dir=output_dir,
    # NOTE(review): newer transformers releases call this `eval_strategy` — confirm against the installed version
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    logging_first_step=True,
    logging_steps=50
)

In [14]:
# Early-stopping configuration.
# NOTE(review): this object is not referenced by the Trainer cell that follows, which
# builds its own EarlyStoppingCallback(early_stopping_patience=3) — confirm which one is intended.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,   # Number of evaluation steps with no improvement before stopping
    early_stopping_threshold=0.0  # Minimum improvement to reset patience
)

In [None]:
# Build the Trainer around the LoRA-wrapped model: train on the training split,
# evaluate on the validation split, stop early after 3 evaluations without improvement.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
with mlflow.start_run(run_name=run_id, log_system_metrics=True):
    # Log custom model parameters
    for param_name, param_value in (
        ("learning_rate", training_args.learning_rate),
        ("num_train_epochs", training_args.num_train_epochs),
        ("batch_size", training_args.per_device_train_batch_size),
    ):
        mlflow.log_param(param_name, param_value)

    #mlflow.transformers.autolog()
    trainer.train()


In [None]:
# Re-run evaluation through the Trainer. The Trainer was built with the validation
# split as eval_dataset, so these numbers describe validation data, not the test split.
results = trainer.evaluate()
print(results)

In [None]:
# Evaluate on the test set
# The LoRA-wrapped model is evaluated; the padding collator makes eval_model read labels from 'labels'.
eval_model(lora_model, tokenized_dataset["test"], 16, data_collator)

In [None]:
# Save the model and tokenizer
# NOTE(review): lora_model is a PEFT wrapper; presumably only the adapter weights are
# written here — confirm before reloading this folder with a plain AutoModel class.
lora_model.save_pretrained("./results/lora_fine_tune")
tokenizer.save_pretrained("./results/lora_fine_tune")

In [None]:
# # Load fine-tuned model
# lora_model = AutoModelForSequenceClassification.from_pretrained("./results/lora_fine_tune")
# lora_model.to(device)

# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained("./results/lora_fine_tune")