In [None]:
!pip install transformers datasets seqeval --quiet

In [None]:
import os
# Switch off the Weights & Biases integration via environment variable.
# NOTE(review): the Trainer output further down reports that WANDB_DISABLED is
# deprecated and will be removed in v5, and recommends `report_to` instead —
# consider migrating when the TrainingArguments are next touched.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from google.colab import files
import pandas as pd
import ast
from sklearn.model_selection import train_test_split, ParameterGrid
from datasets import Dataset
from transformers import BertTokenizerFast
import numpy as np
from transformers import BertForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import f1_score
import torch

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the data-loading cell reads "train.csv" / "test.csv" by relative
# path rather than from /content/drive — confirm whether this mount is still needed.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read train.csv / test.csv from the current working directory.
# The list-valued columns are stored as stringified Python lists, so they are
# parsed back into real lists while the file is being read
# (adjust the column names here if the CSV schema differs).
train_df = pd.read_csv(
    "train.csv",
    converters={"Sentence": ast.literal_eval, "NER Tag": ast.literal_eval},
)
test_df = pd.read_csv(
    "test.csv",
    converters={"Sentence": ast.literal_eval},
)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# print(train_df)

Train shape: (40000, 3)
Test shape: (5000, 2)


In [None]:
# Keep a handle on the complete labelled set; the 90/10 split below rebinds `train_df`.
full_df = train_df
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Reduced subsets for hyperparameter tuning: test_size=0.8 leaves 20% of the
# training split and 20% of the validation split in the *_tuning frames.
train_tuning, remaining_test = train_test_split(train_df, test_size=0.8, random_state=42)
val_tuning, remaining_validation = train_test_split(val_df, test_size=0.8, random_state=42)

# Build the tag vocabulary from the FULL labelled set rather than the training
# split only: a tag that happens to occur solely in validation rows would
# otherwise be missing from label2id and raise a KeyError during encoding.
unique_labels = sorted({label for tags in full_df['NER Tag'] for label in tags})
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

print("Unique labels:", unique_labels)

Unique labels: ['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']


In [None]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")


def encode_examples(example):
    """Tokenize one pre-split sentence and align its NER tags with the tokens.

    Args:
        example: mapping with a "Sentence" entry (list of words) and a
            "NER Tag" entry (one tag per word, looked up in ``label2id``).

    Returns:
        dict holding every tokenizer output plus a "labels" entry; tensor
        values are squeezed so the leading batch dimension of 1 is dropped.
        In "labels", only the first token of each word carries the word's
        label id; special/padding tokens and continuation pieces get -100.
    """
    encoding = tokenizer(
        example["Sentence"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length",
        return_tensors="pt",
    )

    word_tags = example["NER Tag"]
    token_word_ids = encoding.word_ids(batch_index=0)  # single example -> batch index 0

    # Pair every token's word index with the one just before it (None ahead of
    # the first token); a token is labelled only when it opens a new word.
    preceding = [None] + token_word_ids[:-1]
    aligned_labels = [
        label2id[word_tags[current]]
        if current is not None and current != before
        else -100
        for before, current in zip(preceding, token_word_ids)
    ]
    encoding["labels"] = torch.tensor(aligned_labels)

    squeezed = {}
    for key, value in encoding.items():
        squeezed[key] = value.squeeze() if isinstance(value, torch.Tensor) else value
    return squeezed

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def _encode_frame(frame):
    """Wrap a DataFrame in a Dataset and replace its columns with the encode_examples output."""
    dataset = Dataset.from_pandas(frame)
    return dataset.map(encode_examples, remove_columns=dataset.column_names)


# Full train / validation splits
train_dataset = _encode_frame(train_df)
val_dataset = _encode_frame(val_df)

# Reduced splits used for hyperparameter tuning.
# NOTE: these names are rebound from DataFrames to Datasets, so this cell
# cannot be re-run without first re-running the split cell.
train_tuning = _encode_frame(train_tuning)
val_tuning = _encode_frame(val_tuning)

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [None]:
# Token-classification model initialised from the bert-large-cased checkpoint,
# with one output class per NER tag and 0.1 dropout on attention and hidden states.
# NOTE(review): the tokenizer cell loads "bert-base-cased" while the model is
# "bert-large-cased" — confirm the two checkpoints share a vocabulary.
model_options = dict(
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1,
)
model = BertForTokenClassification.from_pretrained("bert-large-cased", **model_options)

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Add up the element counts of every parameter tensor in the model.
param_count = 0
for weight in model.parameters():
    param_count += weight.numel()
print(f"Total parameters: {param_count:,}")

Total parameters: 332,547,089


In [None]:
# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
print("Using device:", device)

Using device: cuda


In [None]:
# Metric callback handed to the Trainer
def compute_metrics(p):
    """Compute the seqeval F1 score for one evaluation pass.

    Args:
        p: pair ``(predictions, labels)``; the arg-max over axis 2 of
            ``predictions`` is taken as the predicted label id per token,
            and ``labels`` holds the gold ids with -100 at ignored positions.

    Returns:
        dict with a single "f1" entry.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        gold_tags = []
        predicted_tags = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # -100 marks positions that carry no label and must not be scored
            if gold_id == -100:
                continue
            gold_tags.append(id2label[gold_id])
            predicted_tags.append(id2label[predicted_id])
        true_labels.append(gold_tags)
        true_predictions.append(predicted_tags)

    # Using seqeval's f1 score
    return {"f1": f1_score(true_labels, true_predictions)}

# Hyperparameter Tuning

We select the best combination of hyperparameters with a grid search on reduced data (20% of the training split, evaluated on 20% of the validation split), so that every candidate configuration can be tried at an affordable cost.

In [None]:
param_grid = {
    'learning_rate': [2e-5, 6e-5, 1e-4],
    'per_device_train_batch_size': [8, 16],
    'num_train_epochs': [2],
    'weight_decay': [0.01]
}


def init_tuning_model():
    """Return a freshly loaded pretrained model for one grid-search trial.

    Passing this factory to the Trainer (instead of one shared model object)
    makes every trial start from the same pretrained weights. Re-using a
    single model would let each configuration continue from the weights
    fine-tuned by the previous one, so the F1 scores would not be comparable.
    The arguments mirror the ones used in the model set-up cell.
    """
    return BertForTokenClassification.from_pretrained(
        "bert-large-cased",
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
        attention_probs_dropout_prob=0.1,
        hidden_dropout_prob=0.1,
    )


best_f1 = 0
best_params = {}

for params in ParameterGrid(param_grid):
    print(f"Training with parameters: {params}")
    model_dir = f"model_lr_{params['learning_rate']}_bs_{params['per_device_train_batch_size']}_epochs_{params['num_train_epochs']}_wd_{params['weight_decay']}"
    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=params['learning_rate'],
        per_device_train_batch_size=params['per_device_train_batch_size'],
        per_device_eval_batch_size=16,
        num_train_epochs=params['num_train_epochs'],
        weight_decay=params['weight_decay'],
        save_total_limit=1,
        push_to_hub=False,
        # Reload the best-F1 checkpoint at the end so the evaluation below
        # scores the best epoch rather than simply the last one.
        load_best_model_at_end=True,
        metric_for_best_model="f1"
    )

    trainer = Trainer(
        # model_init (not model=...) so this trial trains a new model instance
        # loaded from the pretrained checkpoint.
        model_init=init_tuning_model,
        args=training_args,
        train_dataset=train_tuning,
        eval_dataset=val_tuning,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    trainer.train()

    eval_results = trainer.evaluate()
    f1 = eval_results.get("eval_f1")
    print(f1)
    trainer.save_model(model_dir)

    if f1 > best_f1:
        best_f1 = f1
        best_params = params
        print(f"New best F1: {best_f1} with parameters: {best_params}")

        #Saving the best model
        trainer.save_model(f"best_model_f1_{best_f1:.4f}")

print(f"Best hyperparameters: {best_params}")
print(f"Best F1 score: {best_f1}")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with parameters: {'learning_rate': 2e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'weight_decay': 0.01}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,0.2154,0.110526,0.822777
2,0.0899,0.103426,0.826124


0.826124319679175
New best F1: 0.826124319679175 with parameters: {'learning_rate': 2e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'weight_decay': 0.01}


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with parameters: {'learning_rate': 2e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,No log,0.105668,0.828875
2,0.072600,0.110063,0.834191


0.834190966266438
New best F1: 0.834190966266438 with parameters: {'learning_rate': 2e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with parameters: {'learning_rate': 6e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'weight_decay': 0.01}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,0.0793,0.133238,0.820986
2,0.0378,0.129131,0.822254


0.8222539229671896


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with parameters: {'learning_rate': 6e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,No log,0.156882,0.822991
2,0.030300,0.15261,0.834758


0.8347578347578348
New best F1: 0.8347578347578348 with parameters: {'learning_rate': 6e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with parameters: {'learning_rate': 0.0001, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'weight_decay': 0.01}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,0.048,0.166208,0.826409
2,0.0284,0.147853,0.823529


0.8264086511098463


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training with parameters: {'learning_rate': 0.0001, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,No log,0.150861,0.820986
2,0.039200,0.148165,0.827389


0.8273894436519258
Best hyperparameters: {'learning_rate': 6e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}
Best F1 score: 0.8347578347578348


In [None]:
print(f"Best hyperparameters: {best_params}")
print(f"Best F1 score: {best_f1}")

Best hyperparameters: {'learning_rate': 6e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}
Best F1 score: 0.8347578347578348


# Final Training on Full Dataset

In [None]:
# Reload the pretrained checkpoint so the final run does not start from
# whatever weights an earlier cell (e.g. the grid search) left in `model`.
# The arguments mirror the ones used in the model set-up cell.
model = BertForTokenClassification.from_pretrained(
    "bert-large-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1,
)

# Set training arguments.
# learning_rate / batch size / weight_decay are hard-coded copies of the best
# grid-search configuration printed above; update them if the search is re-run
# and selects a different combination. The final run uses 4 epochs.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=6e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    save_total_limit=1,
    push_to_hub=False,
)

# Set up the Trainer on the train / validation splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [None]:
# Run the final fine-tuning with the Trainer configured in the previous cell;
# the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,F1
1,0.0999,0.091133,0.826858
2,0.0688,0.086657,0.835594
3,0.0393,0.096981,0.844492
4,0.0179,0.118275,0.848465


TrainOutput(global_step=9000, training_loss=0.0594336699379815, metrics={'train_runtime': 10410.8758, 'train_samples_per_second': 13.832, 'train_steps_per_second': 0.864, 'total_flos': 3.3435152068608e+16, 'train_loss': 0.0594336699379815, 'epoch': 4.0})

## Test Set

In [None]:
# Tokenization-only helper for unlabelled (test) sentences.
# NOTE(review): nothing in the visible notebook calls this function — the
# prediction cell uses tokenize_with_word_ids instead; confirm before relying on it.
def tokenize_test(example):
    """Tokenize one pre-split sentence and attach its token-to-word mapping.

    Args:
        example: mapping with a "Sentence" entry (list of words).

    Returns:
        The tokenizer output (padded / truncated to 128 tokens) with an
        extra "word_ids" entry taken from the encoding's ``word_ids()``.
    """
    sentence_words = example["Sentence"]
    encoded = tokenizer(
        sentence_words,
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length",
        return_tensors=None,
    )
    encoded["word_ids"] = encoded.word_ids()
    return encoded

In [None]:
# # 1) Check for duplicate IDs
# dupes = test_df["id"][test_df["id"].duplicated()]
# if len(dupes):
#     print(f"⚠️ Duplicate IDs found: {dupes.tolist()}")
# else:
#     print("✅ No duplicate IDs.")

# # 2) Check for sentences that get truncated by the tokenizer
# too_long = []
# for idx, sent in enumerate(test_df["Sentence"]):
#     toks = tokenizer(
#         sent,
#         truncation=True,
#         padding=False,
#         max_length=128,
#         is_split_into_words=True
#     )
#     # count actual word_ids (excluding special tokens)
#     wids = toks.word_ids()
#     # words retained = max word_idx + 1
#     max_word = max([w for w in wids if w is not None], default=-1) + 1
#     if max_word < len(sent):
#         too_long.append((test_df["id"].iloc[idx], len(sent), max_word))

# if too_long:
#     print("⚠️ Sentences being truncated (id, orig_len, kept_words):")
#     for t in too_long[:5]:
#         print(" ", t)
# else:
#     print("✅ No truncation issues (all sentences ≤128 tokens).")


In [None]:
def tokenize_with_word_ids(example):
    """Tokenize one test sentence and store the results on the example itself.

    Args:
        example: mutable mapping with a "Sentence" entry (list of words).

    Returns:
        The same ``example`` object, extended in place with a "word_ids"
        entry and with every field produced by the tokenizer.
    """
    encoding = tokenizer(
        example["Sentence"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length",
    )
    # word_ids first, then the tokenizer fields
    example["word_ids"] = encoding.word_ids()
    for field, value in encoding.items():
        example[field] = value
    return example

def _word_level_tags(words, token_label_ids, token_word_ids):
    """Collapse per-token predictions into exactly one tag per word of the sentence.

    The prediction on the first token of each word is used as that word's tag.
    If fewer tags than words were produced, the remainder is filled with 'O';
    a surplus (not expected) is cut off so the result always has len(words) tags.
    """
    tags = []
    last_word = None
    for label_id, word_id in zip(token_label_ids, token_word_ids):
        if word_id is not None and word_id != last_word:
            tags.append(id2label[label_id])
        last_word = word_id

    shortfall = len(words) - len(tags)
    if shortfall > 0:
        tags.extend(["O"] * shortfall)
    elif shortfall < 0:
        tags = tags[: len(words)]
    return tags


# Tokenize the test sentences, keeping the original columns next to the new ones
test_dataset = Dataset.from_pandas(test_df).map(
    tokenize_with_word_ids,
    remove_columns=[]
)

# Copy without the raw-text / id columns, formatted as torch tensors, for prediction
pred_dataset = test_dataset.remove_columns(["Sentence", "id"])
pred_dataset.set_format(type="torch")

raw_preds = trainer.predict(pred_dataset)
preds = np.argmax(raw_preds.predictions, axis=2)

final_preds = [
    _word_level_tags(sent, pred_row, word_ids)
    for sent, pred_row, word_ids in zip(
        test_dataset["Sentence"], preds, test_dataset["word_ids"]
    )
]

# Sanity check: every sentence must have received exactly one tag per word
mismatches = []
for ex, p in zip(test_dataset, final_preds):
    n_words = len(ex["Sentence"])
    if n_words != len(p):
        mismatches.append((ex["id"], n_words, len(p)))

if mismatches:
    print("Still mismatches (should be none):", mismatches)
else:
    print("All lengths match after padding!")

# Finally building submission: tag lists are written in their string form
submission_df = pd.DataFrame({
    "id": test_dataset["id"],
    "NER Tag": list(map(str, final_preds))
})
submission_df.to_csv("submission_final.csv", index=False)
files.download("submission_final.csv")

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

All lengths match after padding!


In [None]:
# # Make submission dataframe; assume test_df has an "id" column
# submission_df = pd.DataFrame({
#     "id": test_df["id"],
#     "NER Tag": final_preds
# })

In [None]:
# submission_df["NER Tag"] = submission_df["NER Tag"].apply(str)
# submission_df.to_csv("submission_higherlearningrate32.csv", index=False)
# print("Submission file generated!")
# files.download("submission_higherlearningrate32.csv")