In [None]:
!pip install transformers datasets seqeval --quiet

In [None]:
import os
# Switch off Weights & Biases logging before any Trainer is created.
# NOTE(review): the Trainer cell's recorded output reports this environment
# variable as deprecated and recommends `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from google.colab import files
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizerFast
import numpy as np
from transformers import BertForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import f1_score
import torch
from google.colab import files

In [None]:
# Read the train/test CSV files from the current working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# The list-valued columns are stored as stringified lists; turn them back into
# real Python lists (adjust column names if different).
for frame, list_columns in ((train_df, ("Sentence", "NER Tag")), (test_df, ("Sentence",))):
    for column in list_columns:
        frame[column] = frame[column].apply(ast.literal_eval)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (40000, 3)
Test shape: (5000, 2)


In [None]:
# Build the label vocabulary from the full labelled set BEFORE splitting.
# If it were built from the training split only, a rare tag that happens to
# land exclusively in the validation split would be missing from `label2id`
# and raise a KeyError when the validation set is encoded.
unique_labels = sorted({label for tags in train_df['NER Tag'] for label in tags})
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

# Split into train/validation (adjust test_size as preferred).
# NOTE: this rebinds `train_df` to the 90% training split, so re-running this
# cell without re-running the load cell would split the already-reduced frame.
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

print("Unique labels:", unique_labels)

Unique labels: ['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']


In [None]:
# Shared fast tokenizer for the "bert-base-cased" checkpoint; the encoding
# functions below call `.word_ids()` on its output to map tokens back to words.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

def encode_examples(example):
    """Tokenize one pre-split sentence and align its NER tags to the tokens.

    Args:
        example: A single dataset row with "Sentence" (list of words) and
            "NER Tag" (list of tag strings, one per word).

    Returns:
        dict: Every field produced by the tokenizer (padded/truncated to 128
        tokens) plus "labels", a list of the same length. The first token of
        each word carries that word's label id; special tokens, padding and
        continuation sub-tokens carry -100, the value `compute_metrics`
        filters out.

    Raises:
        KeyError: If a tag is not present in `label2id`.
    """
    # Plain Python lists are returned on purpose: `Dataset.map` stores them the
    # same way as tensors, so there is no need to build PyTorch tensors with a
    # fake batch dimension and squeeze it away again.
    tokenized_input = tokenizer(
        example["Sentence"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,  # input is already a list of words
    )

    tags = example["NER Tag"]
    label_ids = []
    previous_word_idx = None
    for word_idx in tokenized_input.word_ids():
        if word_idx is None or word_idx == previous_word_idx:
            # Special/padding token, or a later piece of an already-labelled word.
            label_ids.append(-100)
        else:
            label_ids.append(label2id[tags[word_idx]])
        previous_word_idx = word_idx

    encoded = dict(tokenized_input.items())
    encoded["labels"] = label_ids
    return encoded

In [None]:
# Wrap each split in a Hugging Face Dataset and encode it row by row. Every
# original column is dropped, so only the fields returned by
# `encode_examples` remain.
encoded_splits = []
for split_df in (train_df, val_df):
    raw_split = Dataset.from_pandas(split_df)
    encoded_splits.append(
        raw_split.map(encode_examples, remove_columns=raw_split.column_names)
    )
train_dataset, val_dataset = encoded_splits

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [None]:
# Set up the model: the "bert-base-cased" checkpoint with a token-classification
# head sized to the label vocabulary. The label maps are passed in so they are
# stored in the model config.
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,  # the trailing comma was missing here (SyntaxError)
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Optional: place the model on the GPU when one is visible, otherwise on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)
print("Using device:", device)

Using device: cuda


In [None]:
# Define metric computation
def compute_metrics(p):
    """Compute the seqeval F1 score for one evaluation pass.

    Args:
        p: A (predictions, labels) pair. The predictions are reduced with an
            argmax over axis 2; label positions equal to -100 are ignored.

    Returns:
        dict: {"f1": <seqeval F1 over the non-ignored positions>}.
    """
    logits, gold_ids = p
    pred_ids = np.argmax(logits, axis=2)

    # Gold tag strings per sentence, skipping ignored positions.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([id2label[gold] for gold in gold_row if gold != -100])

    # Predicted tag strings at exactly the positions that have a gold label.
    true_predictions = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept = [
            id2label[pred]
            for pred, gold in zip(pred_row, gold_row)
            if gold != -100
        ]
        true_predictions.append(kept)

    # Using seqeval's f1 score
    return {"f1": f1_score(true_labels, true_predictions)}

In [None]:
# Set training arguments.
# Evaluation and checkpointing both happen once per epoch, and only the most
# recent checkpoint is kept on disk (save_total_limit=1).
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    save_total_limit=1,
    push_to_hub=False,
    # Disable logging integrations explicitly instead of relying only on the
    # deprecated WANDB_DISABLED environment variable set in an earlier cell.
    report_to="none",
)

# Wire the model, the encoded splits and the metric function into a Trainer.
# NOTE(review): the recorded output shows a warning pointing at this call;
# check whether it is about the `tokenizer=` argument in the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [None]:
# Train!
# Runs fine-tuning with the arguments configured above (6 epochs, with an
# evaluation and a checkpoint after each epoch).
trainer.train()

# Save the model held by the trainer to the local directory "model_epoch6".
# NOTE(review): the arguments above do not request loading the best checkpoint,
# so this is presumably the state after the last epoch — confirm if that matters.
trainer.save_model("model_epoch6")

Epoch,Training Loss,Validation Loss,F1
1,0.1012,0.092147,0.830301
2,0.0772,0.086476,0.832721
3,0.06,0.087809,0.840006
4,0.0467,0.095723,0.840032
5,0.0352,0.102781,0.841566
6,0.0277,0.10833,0.842593


In [None]:
# Add up the element counts of every parameter tensor in the model.
param_count = 0
for weight in model.parameters():
    param_count += weight.numel()
print(f"Total parameters: {param_count:,}")

Total parameters: 107,732,753


In [None]:
# Create a tokenization-only function for test set
def tokenize_test(example):
    """Tokenize one unlabelled, pre-split sentence and attach its word ids.

    Not called by any of the visible cells; `tokenize_with_word_ids` below is
    the variant actually mapped over the test set.

    Args:
        example: A single row with a "Sentence" list of words.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens, plain lists)
        with an extra "word_ids" entry.
    """
    encoding = tokenizer(
        example["Sentence"],
        is_split_into_words=True,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors=None,  # leave as lists for HF Dataset
    )
    # Keep the token -> word mapping next to the model inputs.
    encoding["word_ids"] = encoding.word_ids()
    return encoding

In [2]:
# # 1) Check for duplicate IDs
# dupes = test_df["id"][test_df["id"].duplicated()]
# if len(dupes):
#     print(f"Duplicate IDs found: {dupes.tolist()}")
# else:
#     print("No duplicate IDs.")

# # 2) Check for sentences that get truncated by the tokenizer
# too_long = []
# for idx, sent in enumerate(test_df["Sentence"]):
#     toks = tokenizer(
#         sent,
#         truncation=True,
#         padding=False,
#         max_length=128,
#         is_split_into_words=True
#     )
#     # count actual word_ids (excluding special tokens)
#     wids = toks.word_ids()
#     # words retained = max word_idx + 1
#     max_word = max([w for w in wids if w is not None], default=-1) + 1
#     if max_word < len(sent):
#         too_long.append((test_df["id"].iloc[idx], len(sent), max_word))

# if too_long:
#     print("Sentences being truncated (id, orig_len, kept_words):")
#     for t in too_long[:5]:
#         print(" ", t)
# else:
#     print("No truncation issues (all sentences ≤128 tokens).")


In [None]:
# 1) Tokenize test set once, storing word_ids and keeping Sentence
def tokenize_with_word_ids(example):
    """Add tokenizer fields and word ids to a test row, keeping its columns.

    Args:
        example: A single row with a "Sentence" list of words. It is updated
            in place.

    Returns:
        The same `example`, now also holding "word_ids" and every field the
        tokenizer produced (padded/truncated to 128 tokens).
    """
    encoding = tokenizer(
        example["Sentence"],
        is_split_into_words=True,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    # The original "Sentence" entry stays in the row for the later length check.
    example["word_ids"] = encoding.word_ids()
    # Copy the tokenized fields into the row.
    example.update(encoding)
    return example

# Build the test dataset and tokenize it in one chain. No columns are removed,
# so 'Sentence' stays available for the alignment step.
test_dataset = Dataset.from_pandas(test_df).map(
    tokenize_with_word_ids,
    remove_columns=[],
)

# 2) Prepare a torch-formatted copy of the test set without the raw
#    'Sentence' and 'id' columns.
pred_dataset = test_dataset.remove_columns(["Sentence", "id"]).with_format("torch")

# 3) Run predictions and take the highest-scoring label id for each token.
raw_preds = trainer.predict(pred_dataset)
preds = raw_preds.predictions.argmax(axis=2)

# 4) Align predictions to original words.
# Each tag is written at the index of the word it belongs to, not appended in
# order. Appending shifts every later tag one position to the left whenever a
# word yields no sub-tokens at all (e.g. a word made only of characters the
# tokenizer strips), and the trailing 'O' padding would then hide the error.
final_preds = []
for sent, pred_row, word_ids in zip(test_dataset["Sentence"], preds, test_dataset["word_ids"]):
    # Default every word to 'O'; words lost to truncation (or with no
    # sub-tokens) keep this default, so the row always has len(sent) tags.
    aligned = ["O"] * len(sent)
    prev = None
    for idx, widx in enumerate(word_ids):
        # Use the first sub-token of each word; skip special/padding tokens.
        if widx is not None and widx != prev and widx < len(sent):
            aligned[widx] = id2label[int(pred_row[idx])]
        prev = widx

    final_preds.append(aligned)

# 5) Sanity-check lengths: collect (id, word count, tag count) for every row
#    whose prediction list does not match its sentence.
mismatches = []
for row, tags in zip(test_dataset, final_preds):
    n_words = len(row["Sentence"])
    if n_words != len(tags):
        mismatches.append((row["id"], n_words, len(tags)))

if not mismatches:
    print("All lengths match after padding!")
else:
    print("Still mismatches (should be none):", mismatches)

# 6) Build submission: one row per test id, with the tag list stored as its
#    string representation.
tag_strings = list(map(str, final_preds))
submission_df = pd.DataFrame({"id": test_dataset["id"], "NER Tag": tag_strings})
submission_df.to_csv("submission.csv", index=False)
print("Submission file generated!")
# Hand the file to the browser through the Colab helper.
files.download("submission.csv")

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

✅ All lengths match after padding!
Submission file generated!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Make submission dataframe; assume test_df has an "id" column.
# The tag lists are kept as Python lists here.
submission_columns = {"id": test_df["id"], "NER Tag": final_preds}
submission_df = pd.DataFrame(submission_columns)

In [None]:
# Store each tag list as its string representation, then write the CSV
# without the index column.
submission_df["NER Tag"] = submission_df["NER Tag"].map(str)
submission_df.to_csv("submission.csv", index=False)
print("Submission file generated!")

Submission file generated!


In [None]:
# Verify that every test sentence received exactly one tag per word.
mismatches = []

# zip() stops at the shorter input, so a missing prediction row would go
# unnoticed without an explicit row-count check.
if len(final_preds) != len(test_df):
    print(f"Row count differs: {len(test_df)} sentences vs {len(final_preds)} prediction rows")

for idx, (sent, pred) in enumerate(zip(test_df["Sentence"], final_preds)):
    if len(sent) != len(pred):
        # `idx` is a position, so use .iloc; plain [] is a label lookup and is
        # only equivalent while test_df keeps its default 0..n-1 index.
        mismatches.append((test_df["id"].iloc[idx], len(sent), len(pred)))

if mismatches:
    print(f"Found {len(mismatches)} mismatched entries:")
    for mid, slen, plen in mismatches:
        print(f" - ID {mid}: Sentence length = {slen}, Prediction length = {plen}")
else:
    print("All predictions match sentence lengths!")

✅ All predictions match sentence lengths!


In [None]:
# Download the submission CSV written by the previous cells, using the
# `google.colab.files` helper.
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>