# Finetune DistilBERT

Notebook by Jenna Sparks

In [1]:
!pip install -q codecarbon

In [2]:
!pip install datasets



In [3]:
!pip install transformers datasets codecarbon



In [4]:
!pip install peft



In [5]:
# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Carbon and eval libraries
from codecarbon import EmissionsTracker
import random
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score
from datetime import datetime

# ML libraries
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, Dataset

import torch
from torch.utils.data import DataLoader

# Incorporating LoRA
from peft import get_peft_model, LoraConfig, TaskType

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=8)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Define LoRA config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin"],  # Target specific Linear layers within attention
    lora_dropout=0.05,
    bias="none",
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Freeze some layers

In [7]:
for param in model.distilbert.embeddings.parameters():
    param.requires_grad = False

for layer in model.distilbert.transformer.layer[:4]:  # Freeze first 4 layers
    for param in layer.parameters():
        param.requires_grad = False

# Load data

In [8]:
folder = '/content/drive/My Drive/Juliana/'

# load X_train, X_test, y_train, y_test from folder
X_train = pd.read_csv(folder + 'X_train.csv')
X_test = pd.read_csv(folder + 'X_test.csv')
y_train = pd.read_csv(folder + 'y_train.csv')
y_test = pd.read_csv(folder + 'y_test.csv')

In [9]:
# Combine features and labels
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Define the LABEL_MAPPING dictionary
LABEL_MAPPING = {
    "0_not_relevant": 0,
    "1_not_happening": 1,
    "2_not_human": 2,
    "3_not_bad": 3,
    "4_solutions_harmful_unnecessary": 4,
    "5_science_unreliable": 5,
    "6_proponents_biased": 6,
    "7_fossil_fuels_needed": 7
}

# Apply LABEL_MAPPING to the 'label' column in both train_df and test_df
train_df['label'] = train_df['label'].map(LABEL_MAPPING)
test_df['label'] = test_df['label'].map(LABEL_MAPPING)

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Combine into a single dataset dictionary
dataset = {
    'train': train_dataset,
    'test': test_dataset
}

In [10]:
def tokenize_function(examples):
    return tokenizer(examples['quote'], padding='max_length', truncation=True)

tokenized_datasets = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3599 [00:00<?, ? examples/s]

In [11]:
tokenized_test_datasets = dataset['test'].map(tokenize_function, batched=True)

Map:   0%|          | 0/945 [00:00<?, ? examples/s]

In [22]:
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=64,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='./logs',
# )

In [12]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,  # Increased from 16
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    learning_rate=5e-5,  # Added learning rate
    lr_scheduler_type="linear",  # Added learning rate scheduler type
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save at the end of each epoch
    load_best_model_at_end=True,  # Load the best model when finished training (in terms of evaluation metrics)
    metric_for_best_model="accuracy",  # Use accuracy to determine the best model
)



In [None]:
trainer = Trainer(
    model=model,                         # The instantiated 🤗 Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=tokenized_datasets,    # Training dataset
    eval_dataset=tokenized_test_datasets # Evaluation dataset
)

trainer.train()

[codecarbon INFO @ 23:24:52] [setup] RAM Tracking...
[codecarbon INFO @ 23:24:52] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 23:24:53] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.20GHz
[codecarbon INFO @ 23:24:53] [setup] GPU Tracking...
[codecarbon INFO @ 23:24:53] No GPU found.
[codecarbon INFO @ 23:24:53] >>> Tracker's metadata:
[codecarbon INFO @ 23:24:53]   Platform system: Linux-6.1.85+-x86_64-with-glibc2.35
[codecarbon INFO @ 23:24:53]   Python version: 3.11.11
[codecarbon INFO @ 23:24:53]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 23:24:53]   Available RAM : 12.675 GB
[codecarbon INFO @ 23:24:53]   CPU count: 2
[codecarbon INFO @ 23:24:53]   CPU model: Intel(R) Xeon(R) CPU @ 2.20GHz
[codecarbon INFO @ 23:24:53]   GPU count: None
[codecarbon INFO @ 23:24:53]   GPU model: None
[codecarbon INFO @ 23:24:54] Saving emissions data to file /content/results/emis

[codecarbon INFO @ 23:25:12] Energy consumed for RAM : 0.000020 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 23:25:12] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 23:25:12] 0.000197 kWh of electricity used since the beginning.
[codecarbon INFO @ 23:25:27] Energy consumed for RAM : 0.000040 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 23:25:27] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 23:25:27] 0.000394 kWh of electricity used since the beginning.
[codecarbon INFO @ 23:25:42] Energy consumed for RAM : 0.000059 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 23:25:42] Energy consumed for all CPUs : 0.000531 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 23:25:42] 0.000590 kWh of electricity used since the beginning.
[codecarbon INFO @ 23:25:57] Energy consumed for RAM : 0.000079 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 23:25:57] Energy consumed f

Epoch,Training Loss,Validation Loss
1,No log,No log


[codecarbon INFO @ 23:26:27] Energy consumed for RAM : 0.000119 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 23:26:27] Energy consumed for all CPUs : 0.001062 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 23:26:27] 0.001181 kWh of electricity used since the beginning.
[codecarbon INFO @ 23:26:42] Energy consumed for RAM : 0.000139 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 23:26:42] Energy consumed for all CPUs : 0.001239 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 23:26:42] 0.001378 kWh of electricity used since the beginning.
[codecarbon INFO @ 23:26:57] Energy consumed for RAM : 0.000158 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 23:26:57] Energy consumed for all CPUs : 0.001416 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 23:26:57] 0.001575 kWh of electricity used since the beginning.
[codecarbon INFO @ 23:26:57] 0.003510 g.CO2eq/s mean an estimation of 110.69369988598176 kg.CO2eq/year
[codecarbon INFO @ 23:27:12] Energy consumed 

KeyError: "The `metric_for_best_model` training argument is set to 'eval_accuracy', which is not found in the evaluation metrics. The available evaluation metrics are: []. Consider changing the `metric_for_best_model` via the TrainingArguments."

In [24]:
results = trainer.evaluate()
print(f"Evaluation results: {results}")

Evaluation results: {'eval_runtime': 987.8977, 'eval_samples_per_second': 0.957, 'eval_steps_per_second': 0.015, 'epoch': 3.0}


In [34]:
def evaluate_model(model, dataset, tokenizer):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataset:
            inputs = tokenizer(batch['quote'], padding=True, truncation=True, return_tensors="pt")
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            labels = torch.tensor(batch['label']).to(model.device)

            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = (np.array(all_preds) == np.array(all_labels)).mean()
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    conf_matrix = confusion_matrix(all_labels, all_preds)
    auc_roc = roc_auc_score(all_labels, all_preds, average='weighted', multi_class='ovr')

    print(f'Test Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')
    print(f'AUC-ROC: {auc_roc:.2f}')
    print('Confusion Matrix:')
    print(conf_matrix)

    return accuracy, precision, recall, f1, auc_roc, conf_matrix

# Use the function
metrics = evaluate_model(model, dataset['test'], tokenizer)

TypeError: iteration over a 0-d array