In [2]:
pip install --no-cache-dir transformers datasets seqeval --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m811.4 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.8.4.1 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cu

In [3]:
import os
# Switch off the Weights & Biases integration via environment variable; this is
# set before the cell that imports transformers runs.
# NOTE(review): the Trainer output further down reports WANDB_DISABLED as
# deprecated in favour of `report_to` — consider migrating.
os.environ["WANDB_DISABLED"] = "true"

In [5]:
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer, TrainerCallback
from datasets import Dataset
from seqeval.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import ast
import torch
import shutil

In [6]:
# Read the train/test CSV files from the Kaggle input directory
train_df = pd.read_csv("/kaggle/input/comp4211/train.csv")
test_df = pd.read_csv("/kaggle/input/comp4211/test.csv")

# List-valued columns are stored as strings in the CSV; evaluate them back into
# Python lists (adjust column names if different)
train_df["Sentence"] = train_df["Sentence"].map(ast.literal_eval)
train_df["NER Tag"] = train_df["NER Tag"].map(ast.literal_eval)
test_df["Sentence"] = test_df["Sentence"].map(ast.literal_eval)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (40000, 3)
Test shape: (5000, 2)


In [7]:
# Split into train/validation (adjust test_size as preferred)
# NOTE: this cell rebinds `train_df`, so re-running it without re-running the
# loading cell would split the already-reduced training frame again.
full_df = train_df # saving the full dataset for further training
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# sets used for hyperparameter tuning (20% of training set and validation set)
# train_tuning, remaining_test = train_test_split(train_df, test_size=0.8, random_state=42)
# val_tuning, remaining_validation = train_test_split(val_df, test_size=0.8, random_state=42)

# Get unique labels and create mapping dictionaries.
# The vocabulary is built from the full labelled data (not just the training
# split) so that a tag which happens to fall only into the validation split
# still gets an id instead of raising KeyError during encoding.
unique_labels = sorted({label for tags in full_df['NER Tag'] for label in tags})
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

print("Unique labels:", unique_labels)

Unique labels: ['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']


In [8]:
# Cased BERT-large fast tokenizer shared by every tokenization function in this
# notebook; `word_ids()` is called on its output to map sub-word tokens back to
# the original words.
tokenizer = BertTokenizerFast.from_pretrained("bert-large-cased")

def encode_examples(example):
    """Tokenize one pre-split sentence and align its NER tags to the tokens.

    Parameters
    ----------
    example : mapping
        Must provide "Sentence" (list of words) and "NER Tag" (list of tag
        strings, indexed by word position).

    Returns
    -------
    dict
        The tokenizer's output fields as plain Python lists, padded/truncated
        to 128 tokens, plus "labels" holding one id per token. Tokens without
        a word (special/padding) and every non-initial sub-word piece get
        -100, the value that compute_metrics filters out.

    Raises
    ------
    KeyError
        If a tag is not present in ``label2id``.
    """
    # No return_tensors here: for a single example the tensor round trip
    # (batch axis added, then squeezed away) buys nothing, because Dataset.map
    # stores the values as list columns either way.
    tokenized_input = tokenizer(
        example["Sentence"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,
    )

    tags = example["NER Tag"]
    label_ids = []
    previous_word_idx = None
    for word_idx in tokenized_input.word_ids():
        if word_idx is None or word_idx == previous_word_idx:
            # special/padding token, or a continuation piece of the same word
            label_ids.append(-100)
        else:
            # first piece of a new word carries that word's tag
            label_ids.append(label2id[tags[word_idx]])
        previous_word_idx = word_idx

    encoded = {key: value for key, value in tokenized_input.items()}
    encoded["labels"] = label_ids
    return encoded

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

In [9]:
# Turn each dataframe into a HF Dataset and tokenize it, one split at a time.
# All original columns are dropped so only the encoded fields remain.
train_dataset = Dataset.from_pandas(train_df)
train_dataset = train_dataset.map(encode_examples, remove_columns=train_dataset.column_names)

val_dataset = Dataset.from_pandas(val_df)
val_dataset = val_dataset.map(encode_examples, remove_columns=val_dataset.column_names)

# train_tuning = Dataset.from_pandas(train_tuning)
# val_tuning = Dataset.from_pandas(val_tuning)
# train_tuning = train_tuning.map(encode_examples, remove_columns = train_tuning.column_names)
# val_tuning = val_tuning.map(encode_examples, remove_columns = val_tuning.column_names)

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [10]:
# custom checkpoint callback
class SaveCheckpointCallback(TrainerCallback):
    """Save the model and tokenizer into a per-epoch folder after every epoch.

    Folders are written as /kaggle/working/checkpoints/epoch_<n>. The
    tokenizer is the notebook-level ``tokenizer`` object.
    """

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        # Nothing to write when no model was handed over; in multi-process
        # runs only the main process saves so workers do not overwrite each
        # other's files.
        if model is None or not state.is_world_process_zero:
            return

        # state.epoch is a float; round rather than truncate so that a value
        # just below a whole number is not filed under the previous epoch.
        epoch_dir = os.path.join('/kaggle/working/checkpoints', f"epoch_{round(state.epoch)}")
        os.makedirs(epoch_dir, exist_ok=True)
        model.save_pretrained(epoch_dir)
        tokenizer.save_pretrained(epoch_dir)

In [11]:
# Set up the model
# Loads the "bert-large-cased" checkpoint into a token-classification model with
# one output class per entry of label2id; the id<->label dictionaries are passed
# through to from_pretrained as well.
model = BertForTokenClassification.from_pretrained(
    "bert-large-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Optional: Send model to GPU if available
# Selects "cuda" when torch reports a GPU and "cpu" otherwise, moves the model
# to that device and prints the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Using device:", device)

Using device: cuda


In [13]:
# Define metric computation
def compute_metrics(p):
    """Return the seqeval F1 score for a (predictions, labels) pair.

    ``p`` unpacks into per-token class scores and the reference label ids.
    The predicted id per token is the argmax over axis 2. Positions whose
    reference label is -100 are dropped from both sequences before the ids
    are translated into tag strings through ``id2label``.
    """
    scores, reference_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    gold_tags = []
    predicted_tags = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # keep only the positions that carry a real label
        kept = [(pred, ref) for pred, ref in zip(predicted_row, reference_row) if ref != -100]
        gold_tags.append([id2label[ref] for _, ref in kept])
        predicted_tags.append([id2label[pred] for pred, _ in kept])

    # Using seqeval's f1 score
    return {"f1": f1_score(gold_tags, predicted_tags)}

In [None]:
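# Grid search (disabled, kept for reference).
# NOTE: to re-enable, import ParameterGrid (`from sklearn.model_selection import ParameterGrid`)
# and re-create `model` from the pretrained checkpoint inside the loop; as written, every
# parameter combination would continue training the same `model` object.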
# param_grid = {
#     'learning_rate': [2e-5, 6e-5, 1e-4],
#     'per_device_train_batch_size': [8, 16],
#     'num_train_epochs': [2],
#     'weight_decay': [0.01]
# }

# best_f1 = 0
# best_params = {}

# for params in ParameterGrid(param_grid):
#     print(f"Training with parameters: {params}")
#     model_dir = f"model_lr_{params['learning_rate']}_bs_{params['per_device_train_batch_size']}_epochs_{params['num_train_epochs']}_wd_{params['weight_decay']}"
#     training_args = TrainingArguments(
#         output_dir="./results",
#         eval_strategy="epoch",
#         save_strategy="epoch",
#         learning_rate=params['learning_rate'],
#         per_device_train_batch_size=params['per_device_train_batch_size'],
#         per_device_eval_batch_size=16, #evaluation batch size consistent
#         num_train_epochs=params['num_train_epochs'],
#         weight_decay=params['weight_decay'],
#         save_total_limit=1,
#         push_to_hub=False,
#         load_best_model_at_end=True,
#         metric_for_best_model="f1"  #use f1 as metric
#     )

#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=train_tuning,
#         eval_dataset=val_tuning,
#         compute_metrics=compute_metrics,
#         tokenizer=tokenizer
#     )

#     trainer.train()

#     eval_results = trainer.evaluate()
#     f1 = eval_results.get("eval_f1")
#     print(f1)
#     trainer.save_model(model_dir)

#     if f1 > best_f1:
#         best_f1 = f1
#         best_params = params
#         print(f"New best F1: {best_f1} with parameters: {best_params}")

#         #Saving the best model
#         trainer.save_model(f"best_model_f1_{best_f1:.4f}")

# print(f"Best hyperparameters: {best_params}")
# print(f"Best F1 score: {best_f1}")

In [14]:
# Training configuration: evaluate and checkpoint once per epoch, keeping at
# most one Trainer checkpoint under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    push_to_hub=False,
    # Explicitly disable logging integrations; this is the replacement the
    # library recommends for the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

# Trainer wired to the tokenized train/validation sets, the seqeval F1 metric
# and the per-epoch checkpoint callback (passed as an instance).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks=[SaveCheckpointCallback()],
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [17]:
# Train!
trainer.train()

# Save the model to a local directory (e.g., 'model')
trainer.save_model("/kaggle/working/bert_large_model")

# zip checkpoints
# Archives the /kaggle/working/checkpoints folder (the one SaveCheckpointCallback
# writes into) as /kaggle/working/checkpoints.zip; the archive path is the
# cell's displayed value.
shutil.make_archive('/kaggle/working/checkpoints', 'zip', '/kaggle/working/checkpoints')

Epoch,Training Loss,Validation Loss,F1
1,0.0477,0.093634,0.839624
2,0.0331,0.113291,0.838654
3,0.016,0.129469,0.84294


'/kaggle/working/checkpoints.zip'

In [19]:
# Sum the element counts of every parameter tensor of the model and print the
# total with thousands separators.
param_count = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {param_count:,}")

Total parameters: 332,547,089


In [20]:
from safetensors.torch import save_file

# Write the model's state dict to a single safetensors file in the current
# working directory. This is in addition to the trainer.save_model(...) export
# made right after training.
save_file(model.state_dict(), "bert_large.safetensors")

In [22]:
from IPython.display import FileLink
# Display a link to the saved weights file (relative path, resolved against the
# current working directory).
FileLink(r'bert_large.safetensors')

In [None]:
# # loading weights to the local machine
# # (architecture, checkpoint name and file name must match what was trained and saved above)
# from safetensors.torch import load_file
# import torch
# from transformers import BertForTokenClassification

# model = BertForTokenClassification.from_pretrained('bert-large-cased', num_labels=17)  # 17 = size of the label set printed above
# state_dict = load_file('bert_large.safetensors')
# model.load_state_dict(state_dict)

In [23]:
# Create a tokenization-only function for test set
def tokenize_test(example):
    """Tokenize a pre-split test sentence (no labels) and attach its word ids.

    The sentence under "Sentence" is padded/truncated to 128 tokens and the
    outputs stay as Python lists. The token-to-word mapping is stored in the
    returned encoding under "word_ids".
    """
    tokenizer_options = dict(
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,
        return_tensors=None,  # leave as lists for HF Dataset
    )
    encoding = tokenizer(example["Sentence"], **tokenizer_options)
    encoding["word_ids"] = encoding.word_ids()  # store the word_ids
    return encoding

In [38]:
# 1) Report any id that occurs more than once in the test set
is_repeated_id = test_df["id"].duplicated()
dupes = test_df["id"][is_repeated_id]
if len(dupes) > 0:
    print(f"Duplicate IDs found: {dupes.tolist()}")
else:
    print("No duplicate IDs.")

# 2) Find sentences that lose words when tokenized with max_length=128
too_long = []
for idx, sent in enumerate(test_df["Sentence"]):
    toks = tokenizer(sent, truncation=True, padding=False, max_length=128, is_split_into_words=True)
    # word indices of all real tokens (special tokens map to None)
    kept_word_indices = [w for w in toks.word_ids() if w is not None]
    # number of words retained = highest word index + 1 (0 when nothing is kept)
    max_word = max(kept_word_indices, default=-1) + 1
    if max_word < len(sent):
        too_long.append((test_df["id"].iloc[idx], len(sent), max_word))

if not too_long:
    print("No truncation issues (all sentences ≤128 tokens).")
else:
    print("Sentences being truncated (id, orig_len, kept_words):")
    for t in too_long[:5]:
        print(" ", t)

No duplicate IDs.
No truncation issues (all sentences ≤128 tokens).


In [39]:
# 1) Tokenize test set once, storing word_ids and keeping Sentence
def tokenize_with_word_ids(example):
    """Add tokenizer outputs and the token-to-word mapping to ``example``.

    The words under "Sentence" are tokenized (padded/truncated to 128 tokens).
    ``example`` is updated in place: first "word_ids" is set, then every field
    of the tokenizer output is copied in. The same object is returned, so its
    original entries (including "Sentence") are still present.
    """
    words = example["Sentence"]
    encoding = tokenizer(words, truncation=True, padding="max_length", max_length=128, is_split_into_words=True)
    # token -> word index mapping
    example["word_ids"] = encoding.word_ids()
    # copy the tokenized fields onto the example
    example.update(encoding)
    return example

# Build the test dataset; no column is dropped, so 'Sentence' and 'id' stay available
test_dataset = Dataset.from_pandas(test_df).map(
    tokenize_with_word_ids,
    remove_columns=[],
)

# 2) Prediction view: the same rows without the raw sentence and id columns
pred_dataset = test_dataset.remove_columns(["Sentence", "id"])
pred_dataset.set_format(type="torch")

# 3) Run predictions and take the highest-scoring label id per token
raw_preds = trainer.predict(pred_dataset)
preds = np.argmax(raw_preds.predictions, axis=2)

# 4) Align predictions to original words, padding if needed
final_preds = []
for sent, pred_row, word_ids in zip(test_dataset["Sentence"], preds, test_dataset["word_ids"]):
    # token positions that hold the first piece of each word
    first_piece_positions = [
        pos
        for pos, widx in enumerate(word_ids)
        if widx is not None and (pos == 0 or widx != word_ids[pos - 1])
    ]
    aligned = [id2label[pred_row[pos]] for pos in first_piece_positions]

    n_words = len(sent)
    # Fewer tags than words: fill up with 'O' (a non-positive count adds nothing)
    aligned.extend(["O"] * (n_words - len(aligned)))
    # More tags than words (shouldn't happen): cut back to the word count
    final_preds.append(aligned[:n_words])

# 5) Sanity-check lengths
mismatches = []
for ex, tags in zip(test_dataset, final_preds):
    if len(ex["Sentence"]) != len(tags):
        mismatches.append((ex["id"], len(ex["Sentence"]), len(tags)))

if not mismatches:
    print("All lengths match after padding!")
else:
    print("Still mismatches (should be none):", mismatches)

# 6) Build submission: one row per test id, tags serialized as the list's string form
submission_df = pd.DataFrame({
    "id": test_dataset["id"],
    "NER Tag": list(map(str, final_preds)),
})
submission_df.to_csv("submission.csv", index=False)
print("Submission file generated!")
# files.download("submission.csv")

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

All lengths match after padding!
Submission file generated!


In [42]:
# Write the submission frame again, this time to an explicit absolute path
# under /kaggle/working (the earlier cell wrote it to a relative path).
submission_df.to_csv("/kaggle/working/submission.csv", index=False)