In [None]:
!pip install datasets

In [None]:
!pip install transformers[torch]

In [None]:
!pip install accelerate==0.28.0

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import torch
from datasets import load_dataset

In [11]:
device = 'cuda'

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=9
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
datasets = load_dataset("conll2003", trust_remote_code=True)

In [13]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )
    labels = []
    
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special token
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # First subword
            else:
                label_ids.append(-100)  # Other subwords
            previous_word_idx = word_idx
            
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [14]:
tokenized_datasets = datasets.map(
    tokenize_and_align_labels, batched=True
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [15]:
training_args = TrainingArguments(
    output_dir="./ner-model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5512,0.318035
2,0.2494,0.236161
3,0.1757,0.217799


TrainOutput(global_step=2634, training_loss=0.2963402638764841, metrics={'train_runtime': 333.7579, 'train_samples_per_second': 126.208, 'train_steps_per_second': 7.892, 'total_flos': 1376049275709696.0, 'train_loss': 0.2963402638764841, 'epoch': 3.0})

In [39]:
def predict_entities(text, model, tokenizer, id2label, label_map):
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    
    inputs = {
        "input_ids": torch.tensor([input_ids]),
        "attention_mask": torch.tensor([[1] * len(input_ids)])
    }
    
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
    
    results = []
    i = 0
    while i < len(tokens):
        token = tokens[i]
        
        if token in tokenizer.all_special_tokens:
            i += 1
            continue
        
        pred_id = predictions[i]
        label = id2label[pred_id]
        entity_type = label_map.get(label, label)
        
        word = token.replace("##", "")
        start_idx = i
        i += 1
        
        while i < len(tokens) and tokens[i].startswith("##"):
            word += tokens[i].replace("##", "")
            i += 1
        
        word_labels = [id2label[predictions[j]] for j in range(start_idx, i)]
        entity_labels = [label_map.get(label, label) for label in word_labels]
        
        word_entity = entity_labels[0]
        
        results.append((word, word_entity))
    
    return results

In [38]:
label_map = {
    "LABEL_0": "O",
    "LABEL_1": "B-MISC",
    "LABEL_2": "I-MISC",
    "LABEL_3": "B-PER",
    "LABEL_4": "I-PER",
    "LABEL_5": "B-ORG",
    "LABEL_6": "I-ORG",
    "LABEL_7": "B-LOC",
    "LABEL_8": "I-LOC"
}

id2label = model.config.id2label

model = AutoModelForTokenClassification.from_pretrained("./ner-model/checkpoint-2500")
tokenizer = AutoTokenizer.from_pretrained("./ner-model/checkpoint-2500")

text = "Apple Inc. is based in Cupertino, California."
entities = predict_entities(text, model, tokenizer, id2label, label_map)

for word, label in entities:
    if label != "O":
        print(f"{word}: {label}")

Apple: B-ORG
Inc: I-PER
Cupertino: B-ORG
California: B-ORG
