In [20]:
!pip install transformers datasets seqeval evaluate --quiet

In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset,DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
import evaluate

In [22]:
dataset = load_dataset("rjac/kaggle-entity-annotated-corpus-ner-dataset")

# Access the training split
train_dataset = dataset["train"]

# Print the first example
print(train_dataset[0])

{'sentence_id': ' 1', 'tokens': ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0]}


In [23]:
# labels=list(set([label for example in train_dataset for label in example["ner_tags"]]))
# labels.sort()
# label2id={label:idx for idx,label in enumerate(labels)}
# id2label = {i: str(i) for i in range(17)}
# print("labels:",labels)
# print("label to id:",label2id)


In [24]:
labels = [
    "O",       # 0
    "B-PER",   # 1
    "I-PER",   # 2
    "B-ORG",   # 3
    "I-ORG",   # 4
    "B-LOC",   # 5
    "I-LOC",   # 6
    "B-MISC",  # 7
    "I-MISC",  # 8
    "B-TIME",  # 9
    "I-TIME",  # 10
    "B-DATE",  # 11
    "I-DATE",  # 12
    "B-MONEY", # 13
    "I-MONEY", # 14
    "B-PERCENT", # 15
    "I-PERCENT"  # 16
]
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

print("labels:", labels)
print("id2label:", id2label)


labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-TIME', 'I-TIME', 'B-DATE', 'I-DATE', 'B-MONEY', 'I-MONEY', 'B-PERCENT', 'I-PERCENT']
id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC', 9: 'B-TIME', 10: 'I-TIME', 11: 'B-DATE', 12: 'I-DATE', 13: 'B-MONEY', 14: 'I-MONEY', 15: 'B-PERCENT', 16: 'I-PERCENT'}


In [25]:
model_name="bert-base-cased"
tokenizer=AutoTokenizer.from_pretrained(model_name)
model=AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)
device="cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print("Using device:",device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cpu


In [26]:
def tokenize_and_align_labels(examples):
    tokenized_inputs=tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )
    labels=[]
    for i,label in enumerate(examples["ner_tags"]):
        word_ids=tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx=None
        label_ids=[]
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx!=previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx=word_idx
        labels.append(label_ids)
    
    tokenized_inputs["labels"]=labels
    return tokenized_inputs

In [27]:
tokenized_datasets=train_dataset.map(tokenize_and_align_labels,batched=True)
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids","attention_mask","labels"]
)

In [28]:
def compute_metrics(eval_pred):
    predictions,labels=eval_pred
    predictions=np.argmax(predictions,axis=2)

    true_predictions=[
        [id2label[p] for (p,l) in zip(prediction,label) if l!=-100] for prediction,label in zip(predictions,labels)
    ]
    true_labels=[
       [ id2label[l] for (p,l) in zip(prediction,label) if l!=-100] for prediction,label in zip(predictions,labels)
    ]

    return {
        "precision":precision_score(true_labels,true_predictions),
        "recall":recall_score(true_labels,true_predictions),
        "f1":f1_score(true_labels,true_predictions),
        "report":classification_report(true_labels,true_predictions)
    }


In [29]:
training_args=TrainingArguments(
    output_dir="./ner_results",
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

In [30]:
dataset=dataset["train"].train_test_split(test_size=0.1,seed=42)
train_dataset=dataset["train"]
validation_dataset=dataset["test"]

tokenized_datasets=dataset.map(tokenize_and_align_labels,batched=True)

trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )

Map: 100%|██████████| 43163/43163 [00:07<00:00, 5881.26 examples/s]
  trainer=Trainer(


In [33]:
trainer.train()



Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [31]:
def ner_inference(sentence,model,tokenizer,id2label):
    model.eval()
    tokenized_input=tokenizer(sentence,return_tensors="pt",is_split_into_words=False).to(model.device)
    outputs=model(**tokenized_input)
    predicted_label_ids=outputs.logits.argmax(-1)[0].cpu().numpy()

    tokens=tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"][0])
    word_ids=tokenized_input.word_ids(0)

    ner_results=[]
    current_word=""
    current_label=None

    for token,word_idx,label_id in zip(tokens,word_ids,predicted_label_ids):
        if word_idx is None:
            continue
        label=id2label[label_id]
        if word_idx!=word_ids[word_ids.index(word_idx)-1] if word_ids.index(word_idx)>0 else None:
            if current_word:
                ner_results.append((current_word,current_label))
            current_word=token
            current_label=label
        else:
            if token.startswith("##"):
                current_word+=token[2:]
            else:
                current_word+=token
    if current_word:
        ner_results.append((current_word,current_label))

    return ner_results


In [32]:
sentence1="Monika chaulagain works at Google in New York"
print("NER output:",ner_inference(sentence1,model,tokenizer,id2label))

NER output: [('Mon', 'I-ORG'), ('##ika', 'B-DATE'), ('ch', 'B-DATE'), ('##aul', 'I-PER'), ('##aga', 'I-PER'), ('##in', 'I-MONEY'), ('works', 'I-TIME'), ('at', 'I-ORG'), ('Google', 'B-DATE'), ('in', 'I-TIME'), ('New', 'B-LOC'), ('York', 'I-ORG')]
