This is an additional notebook containing the fully combined code to perform this task.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [77]:
import re


def get_formatted_data(file_path):
    with open(file_path, "r", encoding="UTF-8") as file:
        lines = file.readlines()

    formatted_data = []
    i = 0
    while i < len(lines):
        sentence = lines[i].strip()
        if sentence:
            i += 1
            mountains = lines[i].strip().split(", ")
            labels = [] 
            if mountains[0] != "":  # No mountains name in sentence
                for mountain in mountains:
                    match = re.search(re.escape(mountain), sentence)
                    if match:
                        start_idx = match.start()
                        labels.append([start_idx, len(mountain), "MOUNT"])
    
            formatted_data.append({
                "text": sentence,
                "label": labels
            })
        i += 1
    
    return formatted_data

In [78]:
data = get_formatted_data('data/generated_dataset.txt')

In [79]:
for entry in data:
    print(entry)

{'text': 'The journey to Denali is a test of endurance and skill.', 'label': [[15, 6, 'MOUNT']]}
{'text': 'Climbers often dream of reaching the summit of Kilimanjaro.', 'label': [[47, 11, 'MOUNT']]}
{'text': 'The Alps are a mesmerizing mountain range stretching across Europe.', 'label': [[4, 4, 'MOUNT']]}
{'text': 'In Nepal, Annapurna offers both beauty and danger to those who attempt to climb it.', 'label': [[10, 9, 'MOUNT']]}
{'text': 'The Rockies are known for their stunning landscapes and diverse wildlife.', 'label': [[4, 7, 'MOUNT']]}
{'text': 'K2, also called Mount Godwin-Austen, is one of the most treacherous peaks in the world.', 'label': [[0, 2, 'MOUNT'], [16, 19, 'MOUNT']]}
{'text': 'Mount Fuji in Japan is famous for its symmetrical shape and cultural significance.', 'label': [[0, 10, 'MOUNT']]}
{'text': 'Many photographers capture the beauty of the Andes in South America.', 'label': [[45, 5, 'MOUNT']]}
{'text': 'At sunrise, the snow-capped peak of Mount Rainier glows with a 

In [80]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split


train_data, test_data = train_test_split(data, test_size=0.15, random_state=42)

In [107]:
from collections import Counter
train_counts = Counter()
test_counts = Counter()
for item in train_data:
    train_counts[len(item['label'])] += 1
for item in test_data:
    test_counts[len(item['label'])] += 1

In [109]:
print("Mountain distribution in train sentences:", train_counts)
print("Mountain distribution in test sentences:", test_counts)

Mountain distribution in train sentences: Counter({0: 250, 1: 164, 2: 5})
Mountain distribution in test sentences: Counter({0: 42, 1: 30, 2: 2})


In [110]:
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [111]:
label_list = ["O", "B-mountain", "I-mountain"]
label2id = dict(map(lambda i: (label_list[i], i), range(3)))
id2label = dict(map(lambda i: (i, label_list[i]), range(3)))

In [112]:
def tokenize_data(dataset):
    tokenized_data = []
    
    for sample in dataset:
        text = sample["text"]
        entities = sample["label"]
        tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))
        labels = [0] * len(tokens)
        
        for start, length, _ in entities:
            prefix_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text[:start])))
            start_token = len(prefix_tokens) - 1
            
            entity_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text[start:start+length])))
            to_ignore = entity_tokens.count("[SEP]") + entity_tokens.count("[CLS]")
            end_token = start_token + len(entity_tokens) - 1 - to_ignore
            
            labels[start_token] = 1  # B-MOUNT
            for idx in range(start_token+1, end_token+1):
                labels[idx] = 2  # I-MOUNT

        tokens_ids = tokenizer.convert_tokens_to_ids(tokens)   
        tokenized_data.append({
            'input_ids': tokens_ids,
            'labels': labels
        })
    return tokenized_data

In [113]:
tokenized_train_data = tokenize_data(train_data)
tokenized_test_data = tokenize_data(test_data)

In [117]:
example = tokenized_test_data[1]
input_ids = example["input_ids"]
labels = example["labels"]
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print("Ids:", input_ids)
print("Tokens:", tokens)
print("Labels:", labels)

Ids: [101, 31192, 77639, 10129, 153, 25733, 31417, 10106, 63270, 19602, 10119, 103675, 11759, 33627, 10850, 10114, 10474, 79038, 14042, 119, 102]
Tokens: ['[CLS]', 'Gang', '##kha', '##r', 'P', '##uen', '##sum', 'in', 'Bhutan', 'remains', 'un', '##cl', '##im', '##bed', 'due', 'to', 'its', 'sacred', 'status', '.', '[SEP]']
Labels: [0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [118]:
import evaluate

seqeval = evaluate.load("seqeval")

In [119]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [120]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-multilingual-cased", id2label=id2label, label2id=label2id, finetuning_task="ner"
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [121]:
train_ds = Dataset.from_pandas(pd.DataFrame(data=tokenized_train_data))
test_ds = Dataset.from_pandas(pd.DataFrame(data=tokenized_test_data))

In [123]:
training_args = TrainingArguments(
    output_dir="mountain_ner_model",
    num_train_epochs=6,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.094548,0.707317,0.852941,0.773333,0.980017
2,No log,0.133794,0.72973,0.794118,0.760563,0.977519
3,No log,0.136903,0.852941,0.852941,0.852941,0.981682
4,No log,0.137077,0.852941,0.852941,0.852941,0.981682
5,No log,0.137676,0.852941,0.852941,0.852941,0.981682
6,No log,0.137897,0.852941,0.852941,0.852941,0.981682


{'mountain': {'precision': 0.7073170731707317, 'recall': 0.8529411764705882, 'f1': 0.7733333333333334, 'number': 34}, 'overall_precision': 0.7073170731707317, 'overall_recall': 0.8529411764705882, 'overall_f1': 0.7733333333333334, 'overall_accuracy': 0.9800166527893422}
{'mountain': {'precision': 0.7297297297297297, 'recall': 0.7941176470588235, 'f1': 0.7605633802816901, 'number': 34}, 'overall_precision': 0.7297297297297297, 'overall_recall': 0.7941176470588235, 'overall_f1': 0.7605633802816901, 'overall_accuracy': 0.97751873438801}
{'mountain': {'precision': 0.8529411764705882, 'recall': 0.8529411764705882, 'f1': 0.8529411764705882, 'number': 34}, 'overall_precision': 0.8529411764705882, 'overall_recall': 0.8529411764705882, 'overall_f1': 0.8529411764705882, 'overall_accuracy': 0.9816819317235637}
{'mountain': {'precision': 0.8529411764705882, 'recall': 0.8529411764705882, 'f1': 0.8529411764705882, 'number': 34}, 'overall_precision': 0.8529411764705882, 'overall_recall': 0.8529411764

TrainOutput(global_step=318, training_loss=0.001313672031996385, metrics={'train_runtime': 427.8977, 'train_samples_per_second': 5.875, 'train_steps_per_second': 0.743, 'total_flos': 13152309633000.0, 'train_loss': 0.001313672031996385, 'epoch': 6.0})

In [131]:
from transformers import pipeline

classifier = pipeline("ner", model=model, tokenizer=tokenizer) # "Liubavaa/mountain_ner_model")

In [132]:
text = "The highest mountain on Earth is Everest in the Himalayas of Asia, whose summit is 8,850 m (29,035 ft) above mean sea level."
classifier(text)

[{'entity': 'B-mountain',
  'score': 0.8549887,
  'index': 7,
  'word': 'Everest',
  'start': 33,
  'end': 40},
 {'entity': 'B-mountain',
  'score': 0.5293795,
  'index': 10,
  'word': 'Himalaya',
  'start': 48,
  'end': 56}]

In [133]:
trainer.evaluate()

{'mountain': {'precision': 0.7073170731707317, 'recall': 0.8529411764705882, 'f1': 0.7733333333333334, 'number': 34}, 'overall_precision': 0.7073170731707317, 'overall_recall': 0.8529411764705882, 'overall_f1': 0.7733333333333334, 'overall_accuracy': 0.9800166527893422}


{'eval_loss': 0.09454839676618576,
 'eval_precision': 0.7073170731707317,
 'eval_recall': 0.8529411764705882,
 'eval_f1': 0.7733333333333334,
 'eval_accuracy': 0.9800166527893422,
 'eval_runtime': 2.3082,
 'eval_samples_per_second': 32.059,
 'eval_steps_per_second': 4.332,
 'epoch': 6.0}