In [1]:
# Install required libraries
!pip install transformers datasets seqeval

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import evaluate

In [5]:
# Load the CoNLL-2003 dataset.
# The hub repository ships a loading script (conll2003.py). Without
# trust_remote_code=True, load_dataset stops at an interactive [y/N] prompt,
# which blocks an unattended "Restart & Run All".
# NOTE: this runs code from the dataset repository; inspect it at
# https://hf.co/datasets/conll2003 before trusting it.
dataset = load_dataset("conll2003", trust_remote_code=True)

# Load the BERT tokenizer (cased checkpoint, so capitalisation is preserved)
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [22]:
# Preprocess the dataset
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and project word-level NER tags onto tokens.

    Args:
        examples: A batch with a "tokens" column (one list of words per
            example) and a "ner_tags" column (one tag id per word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list per example holding the word's tag id on the first token of each
        word and -100 everywhere else (positions with no word, and the
        remaining sub-tokens of a word).
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        is_split_into_words=True,
    )

    def _align(word_ids, word_tags):
        # Only the first token of each word carries that word's tag.
        aligned = []
        last_word = None
        for current_word in word_ids:
            starts_new_word = current_word is not None and current_word != last_word
            aligned.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [23]:
# Tokenize every split and attach the aligned labels. `batched=True` hands
# tokenize_and_align_labels a batch of examples per call, which is what its
# loop over examples["ner_tags"] expects.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [30]:
# Tag-id -> tag-name lookup, read from the training split's "ner_tags" feature
ner_tag_feature = dataset["train"].features["ner_tags"]
label_names = ner_tag_feature.feature.names


In [31]:
# Load the BERT model for token classification.
# The label mappings are built from `label_names` (defined in the previous
# cell and also used by compute_metrics) instead of repeating the feature
# lookup three times, so the model config and the metric decoding share one
# source for the tag names.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),  # Number of NER tags
    id2label=dict(enumerate(label_names)),
    label2id={label: i for i, label in enumerate(label_names)},
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Checkpoint on the same schedule as evaluation so the best epoch can be
    # restored by load_best_model_at_end.
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # "f1" is the key returned by compute_metrics; higher is better.
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
)




In [33]:
# Load the seqeval metric through the `evaluate` library; compute_metrics
# calls metric.compute(...) on sequences of label names.
metric = evaluate.load("seqeval")

In [34]:
# Function to compute metrics
def compute_metrics(p):
    """Turn raw model outputs into seqeval precision/recall/F1/accuracy.

    Args:
        p: A (predictions, labels) pair. The predictions are arg-maxed over
            axis 2 to get one label id per position; positions whose label is
            -100 are excluded from scoring.

    Returns:
        A dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's overall_* results.
    """
    scores_per_label, gold_ids = p
    predicted_ids = np.argmax(scores_per_label, axis=2)

    # Reference tag names, with ignored (-100) positions dropped.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([label_names[gold] for gold in gold_row if gold != -100])

    # Predicted tag names at exactly the positions kept above.
    true_predictions = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [label_names[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append(kept)

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }

In [35]:
# Define the Trainer.
# The tokenizer is passed as `processing_class`; the `tokenizer=` keyword is
# deprecated for Trainer.__init__ (the warning recorded under this cell points
# at this call).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [36]:
# Fine-tune the model using the settings in `training_args`. The call is the
# cell's last expression, so the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0471,0.039571,0.926995,0.940256,0.933578,0.989194
2,0.0068,0.037636,0.947086,0.948839,0.947961,0.990986
3,0.0073,0.037398,0.944064,0.951531,0.947783,0.9912


TrainOutput(global_step=2634, training_loss=0.05134493474344367, metrics={'train_runtime': 4036.6556, 'train_samples_per_second': 10.435, 'train_steps_per_second': 0.653, 'total_flos': 1.1007299854181376e+16, 'train_loss': 0.05134493474344367, 'epoch': 3.0})

In [37]:
# Evaluate the model on the validation split given to the Trainer as eval_dataset.
# NOTE(review): the recorded metrics match the epoch-2 row of the training log,
# which is consistent with load_best_model_at_end=True restoring the best-f1
# checkpoint - confirm.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.037635840475559235, 'eval_precision': 0.9470855031076768, 'eval_recall': 0.9488387748232918, 'eval_f1': 0.9479613282891972, 'eval_accuracy': 0.9909855535220591, 'eval_runtime': 89.6545, 'eval_samples_per_second': 36.25, 'eval_steps_per_second': 2.275, 'epoch': 3.0}


In [38]:
# Save the fine-tuned model and the tokenizer into the same directory, so both
# can later be reloaded from the single path "./ner-bert-model".
model.save_pretrained("./ner-bert-model")
tokenizer.save_pretrained("./ner-bert-model")

('./ner-bert-model/tokenizer_config.json',
 './ner-bert-model/special_tokens_map.json',
 './ner-bert-model/vocab.txt',
 './ner-bert-model/added_tokens.json',
 './ner-bert-model/tokenizer.json')

In [39]:
from transformers import pipeline

# Smoke test: reload the saved checkpoint from disk and tag a sample sentence
saved_model_dir = "./ner-bert-model"
ner_pipeline = pipeline("ner", model=saved_model_dir, tokenizer=saved_model_dir)
sample_sentence = "John works at Google in New York."
results = ner_pipeline(sample_sentence)
print(results)

Device set to use cuda:0


[{'entity': 'B-PER', 'score': 0.9963966, 'index': 1, 'word': 'John', 'start': 0, 'end': 4}, {'entity': 'B-ORG', 'score': 0.9926785, 'index': 4, 'word': 'Google', 'start': 14, 'end': 20}, {'entity': 'B-LOC', 'score': 0.9985232, 'index': 6, 'word': 'New', 'start': 24, 'end': 27}, {'entity': 'I-LOC', 'score': 0.9981046, 'index': 7, 'word': 'York', 'start': 28, 'end': 32}]
