In [None]:
!pip install -q transformers datasets evaluate seqeval peft accelerate bitsandbytes


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
import torch
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from peft import LoraConfig, get_peft_model
import evaluate


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

data_path = "/content/drive/MyDrive/NLP_PROJECT/data/"

def read_conll(path):
    sentences, labels = [], []
    temp_tokens, temp_labels = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:  # End of sentence
                if temp_tokens:
                    sentences.append(temp_tokens)
                    labels.append(temp_labels)
                    temp_tokens, temp_labels = [], []
            else:
                splits = line.split()
                temp_tokens.append(splits[0])
                temp_labels.append(splits[-1])
    return sentences, labels

sentences, labels = read_conll(os.path.join(data_path, "dataset.txt"))

print("Total samples:", len(sentences))
print("First example:", sentences[0])
print("Labels:", labels[0])


Total samples: 19205
First example: ['-DOCSTART-']
Labels: ['O']


In [None]:
from sklearn.model_selection import train_test_split

train_sents, temp_sents, train_labels, temp_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)
val_sents, test_sents, val_labels, test_labels = train_test_split(
    temp_sents, temp_labels, test_size=0.5, random_state=42
)

print("Train:", len(train_sents))
print("Validation:", len(val_sents))
print("Test:", len(test_sents))


Train: 15364
Validation: 1920
Test: 1921


In [None]:
from collections import defaultdict

# Collect all unique labels
all_labels = sorted(set([lab for seq in train_labels + val_labels + test_labels for lab in seq]))

# Create label dictionaries
label2idx = {label: idx for idx, label in enumerate(all_labels)}
idx2label = {idx: label for label, idx in label2idx.items()}

print("Label mapping:", label2idx)

# Convert string labels → integer IDs
def encode_labels(label_sequences, mapping):
    return [[mapping[label] for label in seq] for seq in label_sequences]

train_label_ids = encode_labels(train_labels, label2idx)
val_label_ids = encode_labels(val_labels, label2idx)
test_label_ids = encode_labels(test_labels, label2idx)

# Quick sanity check
print("Sample tokens:", train_sents[0])
print("Sample labels (str):", train_labels[0])
print("Sample labels (int):", train_label_ids[0])


Label mapping: {'.O': 0, 'B-AffectedPopulation': 1, 'B-CollapsedStructure': 2, 'B-Date': 3, 'B-Death_And_Toll': 4, 'B-Fire': 5, 'B-Floods': 6, 'B-InfrastructureDamage': 7, 'B-Location': 8, 'B-MissingPersons': 9, 'B-NaturalHazards': 10, 'B-PowerOutage': 11, 'B-RoadBlocked': 12, 'B-WaterShortage': 13, 'B-WaterShrotage': 14, 'I-AffectedPopulation': 15, 'I-CollapsedStructure': 16, 'I-Date': 17, 'I-Death_And_Toll': 18, 'I-Floods': 19, 'I-InfrastructureDamage': 20, 'I-Location': 21, 'I-MissingPersons': 22, 'I-NaturalHazards': 23, 'I-PowerOutage': 24, 'I-RoadBlocked': 25, 'I-WaterShortage': 26, 'I-WaterShrotage': 27, 'O': 28}
Sample tokens: ['-', 'I', 'believe', 'there', 'should', 'be', 'consistent', 'standards', 'for', 'flood', 'and', 'coastal', 'resilience', 'in', 'England', '.']
Sample labels (str): ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Floods', 'O', 'O', 'O', 'O', 'B-Location', 'O']
Sample labels (int): [28, 28, 28, 28, 28, 28, 28, 28, 28, 6, 28, 28, 28, 28, 8, 28]


In [None]:
from transformers import AutoModelForTokenClassification
model_checkpoint = "distilbert-base-uncased"

base_model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2idx),
    id2label=idx2label,
    label2id=label2idx
)



# Apply LoRA
lora_config = LoraConfig(
    r=8,              # rank
    lora_alpha=32,    # scaling
    target_modules=["q_lin", "v_lin"],  # DistilBERT uses q_lin, v_lin for attention
    lora_dropout=0.1,
    bias="none",
    task_type="TOKEN_CLS"
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 169,757 || all params: 66,554,938 || trainable%: 0.2551


In [None]:
!pip install --upgrade transformers

In [None]:
import transformers
print(transformers.__version__)

4.56.2


In [None]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Create datasets
train_ds = Dataset.from_dict({"tokens": train_sents, "labels": train_label_ids})
val_ds = Dataset.from_dict({"tokens": val_sents, "labels": val_label_ids})
test_ds = Dataset.from_dict({"tokens": test_sents, "labels": test_label_ids})

# Tokenize the datasets
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            elif word_id != previous_word_idx:
                label_ids.append(label[word_id])
            else:
                label_ids.append(-100)
            previous_word_idx = word_id
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = DatasetDict({
    "train": train_ds.map(tokenize_and_align_labels, batched=True),
    "validation": val_ds.map(tokenize_and_align_labels, batched=True),
    "test": test_ds.map(tokenize_and_align_labels, batched=True),
})

print(tokenized_datasets)

Map:   0%|          | 0/15364 [00:00<?, ? examples/s]

Map:   0%|          | 0/1920 [00:00<?, ? examples/s]

Map:   0%|          | 0/1921 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 15364
    })
    validation: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1920
    })
    test: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1921
    })
})


In [None]:
# %% Cell 7 – Training setup for LoRA disaster NER

import numpy as np
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
import evaluate
import torch

batch_size = 16

# --- Training arguments ---
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to=[]  #Disable all integrations (wandb, tensorboard, etc.)
)



# --- Data collator ---
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# --- Metrics setup ---
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_preds = []
    true_labels = []

    for pred_seq, label_seq in zip(predictions, labels):
        temp_preds = []
        temp_labels = []
        for p, l in zip(pred_seq, label_seq):
            if l != -100:  # ignore padding
                temp_preds.append(idx2label[p])
                temp_labels.append(idx2label[l])
        true_preds.append(temp_preds)
        true_labels.append(temp_labels)

    results = metric.compute(predictions=true_preds, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

# --- Initialize Trainer ---
trainer = Trainer(
    model=model,                         # LoRA-adapted model
    args=training_args,
    train_dataset=tokenized_datasets["train"], # Using tokenized dataset
    eval_dataset=tokenized_datasets["validation"], # Using tokenized dataset
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# --- Optional: inspect trainable parameters ---
model.print_trainable_parameters()

Downloading builder script: 0.00B [00:00, ?B/s]

trainable params: 169,757 || all params: 66,554,938 || trainable%: 0.2551


  trainer = Trainer(


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.6294,0.188658,0.607987,0.655248,0.630733,0.956244
2,0.176,0.153837,0.650396,0.684673,0.667095,0.96024
3,0.1504,0.147042,0.660591,0.716293,0.687316,0.962343


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=2883, training_loss=0.2508291283540332, metrics={'train_runtime': 14224.5422, 'train_samples_per_second': 3.24, 'train_steps_per_second': 0.203, 'total_flos': 733211384839056.0, 'train_loss': 0.2508291283540332, 'epoch': 3.0})

In [None]:
metrics = trainer.evaluate(tokenized_datasets["test"])
print(metrics)




  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.12677904963493347, 'eval_precision': 0.6513761467889908, 'eval_recall': 0.7319587628865979, 'eval_f1': 0.6893203883495146, 'eval_accuracy': 0.9649224090153334, 'eval_runtime': 246.6809, 'eval_samples_per_second': 7.787, 'eval_steps_per_second': 0.491, 'epoch': 3.0}
