So for this problem I also used pretty straightforward approach where I used ChatGPT to generate a lot of different textes about mountains in all around the world. Then I used doccano to maniually label all texts and saved labeled data. Then I proceeded to use DistilBert for my NER to detect mountain names in the text. To use it correctly, and for it to be more efficient i've manually done some hyperparams tuning. Although it helps a lot to find a balance between over and underfitting, the model still lack some tuning and maybe larger dataset.

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, Trainer, TrainingArguments, pipeline
from torch.nn import CrossEntropyLoss

import json
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class MountainDataset(Dataset):
    def __init__(self, texts, labels, max_length=512):
        self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")

        # Create an array to hold the labels
        label_array = torch.ones(self.max_length, dtype=torch.long) * -100  # Use -100 to ignore index during loss computation
        label_array[:len(labels)] = torch.tensor(labels, dtype=torch.long)

        item = {key: val.squeeze() for key, val in encoding.items()}  # Remove extra dimension
        item['labels'] = label_array

        return item

    def __len__(self):
        return len(self.texts)

In [3]:
def load_data(filename):
    """Load and process the data from a JSONL file for NER.

    Args:
        filename (str): The path to the JSONL file.

    Returns:
        list: A list of texts (sentences).
        list: A corresponding list of word-level labels.
    """
    texts = []
    labels = []

    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            text = data['text']
            words = text.split()
            word_labels = [0] * text.count(" ")  # Initialize all labels to 0

            for label in data['label']:
                start, end, label_type = label
                # Find the word index for start position
                start_index = text[:start].count(' ')
                end_index = text[:end].count(' ')

                # Update labels for words within the entity
                for i in range(start_index, end_index):
                    word_labels[i] = 1 if label_type == "Mountain" else 0

            texts.append(text)
            labels.append(word_labels)

    return texts, labels

In [4]:
def pad_labels(labels, max_length, pad_value=-100):
    """Pad the label sequences to a maximum length.

    Args:
        labels (list): The list of label sequences.
        max_length (int): The maximum length to pad to.
        pad_value (int): The value used for padding.

    Returns:
        torch.Tensor: A tensor of padded label sequences.
    """
    padded_labels = []
    for label in labels:
        # Check if the label sequence is longer than max_length
        if len(label) > max_length:
            # Truncate the label sequence if it's too long
            padded_label = label[:max_length]
        else:
            # Pad the label sequence if it's too short
            padded_label = label + [pad_value] * (max_length - len(label))
        padded_labels.append(padded_label)

    return torch.tensor(padded_labels)





In [13]:
# Example data (you'll replace this with your own dataset)
texts, labels = load_data("labeled_dataset.jsonl")
padded_labels = pad_labels(labels, 256)

# Create Custom trainer which will properly count class weights
labels_flattened = np.array([label for sublist in padded_labels for label in sublist if label != -100])
class_weights = compute_class_weight('balanced', classes=np.unique(labels_flattened), y=labels_flattened)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
loss_fct = CrossEntropyLoss(weight=class_weights_tensor)
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss



# Create the dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, padded_labels, test_size=0.2, random_state=42)
dataset = MountainDataset(train_texts, train_labels)
val_dataset = MountainDataset(val_texts, val_labels)

# Load the model
model = DistilBertForTokenClassification.from_pretrained('./results/checkpoint-105', num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    weight_decay=0.05,
    learning_rate=0.001,
    logging_dir='./logs',
    logging_steps=50,
    evaluation_strategy="epoch",
    # eval_steps=20,
    save_strategy="epoch",
    # gradient_accumulation_steps=2
)


# Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

  label_array[:len(labels)] = torch.tensor(labels, dtype=torch.long)


Epoch,Training Loss,Validation Loss
1,No log,0.562008
2,No log,0.829908
3,0.307200,0.738857
4,0.307200,0.676184
5,0.354300,0.641367


  label_array[:len(labels)] = torch.tensor(labels, dtype=torch.long)
  label_array[:len(labels)] = torch.tensor(labels, dtype=torch.long)
  label_array[:len(labels)] = torch.tensor(labels, dtype=torch.long)
  label_array[:len(labels)] = torch.tensor(labels, dtype=torch.long)


TrainOutput(global_step=105, training_loss=0.3411435672215053, metrics={'train_runtime': 694.4754, 'train_samples_per_second': 0.605, 'train_steps_per_second': 0.151, 'total_flos': 54874302013440.0, 'train_loss': 0.3411435672215053, 'epoch': 5.0})

In [14]:
model.save_pretrained('./saved_model')

Some testing. As we can see only words "the Alps" is labeled as label_1. So program can see some mountains and detect them in text. However, it needs to be said that algorithm still gives a lot of wrong results and needs to be improved.

In [39]:
# Testing the results
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForTokenClassification.from_pretrained('./results/checkpoint-21')
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Example text
example_text = "My favourite mountains are the Alps also in terms of different processors I think \"Elbrus\" are the worst"

# Get predictions
predictions = ner_pipeline(example_text)

# Process and print predictions
for prediction in predictions:
    print(prediction)

{'entity': 'LABEL_0', 'score': 0.854722, 'index': 1, 'word': 'my', 'start': 0, 'end': 2}
{'entity': 'LABEL_0', 'score': 0.9617946, 'index': 2, 'word': 'favourite', 'start': 3, 'end': 12}
{'entity': 'LABEL_0', 'score': 0.96299255, 'index': 3, 'word': 'mountains', 'start': 13, 'end': 22}
{'entity': 'LABEL_0', 'score': 0.8687076, 'index': 4, 'word': 'are', 'start': 23, 'end': 26}
{'entity': 'LABEL_1', 'score': 0.7748349, 'index': 5, 'word': 'the', 'start': 27, 'end': 30}
{'entity': 'LABEL_1', 'score': 0.8525415, 'index': 6, 'word': 'alps', 'start': 31, 'end': 35}
{'entity': 'LABEL_0', 'score': 0.94497156, 'index': 7, 'word': 'also', 'start': 36, 'end': 40}
{'entity': 'LABEL_0', 'score': 0.9702717, 'index': 8, 'word': 'in', 'start': 41, 'end': 43}
{'entity': 'LABEL_0', 'score': 0.9781503, 'index': 9, 'word': 'terms', 'start': 44, 'end': 49}
{'entity': 'LABEL_0', 'score': 0.8332724, 'index': 10, 'word': 'of', 'start': 50, 'end': 52}
{'entity': 'LABEL_0', 'score': 0.9561972, 'index': 11, 'wo