I. DATASET CREATION

IMPORT

In [1]:
import wikipediaapi
import re
import pandas as pd
from collections import Counter
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertForTokenClassification
from transformers import AdamW
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import lr_scheduler
import torch.nn as nn
import nltk
from nltk.corpus import stopwords
import random

Initializing the Wikipedia API and the list of mountains

In [2]:
# Upload file
df = pd.read_csv('Mountain.csv')

# Counting the number of unique mountain names
num_mountains = df['Mountain'].nunique()
print(f"Number of unique mountain names: {num_mountains}")

Number of unique mountain names: 1621


In [3]:
# Initialize Wikipedia API with User-Agent
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent="Mozilla/5.0 (compatible; MyNLPApp/1.0; +https://example.com)"
)

# Path to the mountain file
file_path = 'Mountain.csv'

# Loading data from a CSV file
df_mountains = pd.read_csv(file_path)

# Extract the first 1200 mountain names from the 'Mountain' column
mountain_names_600 = df_mountains['Mountain'].head(1200).tolist()

# Виведемо кількість витягнутих назв гір для перевірки
print(f"Number of mountains drawn: {len(mountain_names_600)}")
print(f"Top 10 mountains: {mountain_names_600[:10]}")


Number of mountains drawn: 1200
Top 10 mountains: ['Mount Everest', 'K2', 'Kangchenjunga', 'Lhotse', 'Makalu', 'Cho Oyu', 'Dhaulagiri', 'Manaslu', 'Nanga Parbat', 'Annapurna']


Extract the text from Wikipedia pages

In [4]:
# Function for downloading article texts
def fetch_mountain_texts(mountain_names):
    texts = {}
    for name in mountain_names:
        page = wiki_wiki.page(name)
        if page.exists():
            texts[name] = page.text[:500]  # Limit the text to the first 500 characters
    return texts

# Loading texts
mountain_texts_600 = fetch_mountain_texts(mountain_names_600)

In [5]:
# Print a preview for multiple mountains
mountain_texts_preview_600 = {name: text[:300] for name, text in mountain_texts_600.items()}
len(mountain_texts_600), mountain_texts_preview_600

(1131,
 {'Mount Everest': "Mount Everest, known locally as Sagarmatha or Qomolangma,  is Earth's highest mountain above sea level, located in the Mahalangur Himal sub-range of the Himalayas. The China–Nepal border runs across its summit point. Its elevation (snow height) of 8,848.86 m (29,031 ft 8+1⁄2 in) was most recently es",
  'K2': 'K2, at 8,611 metres (28,251 ft) above sea level, is the second-highest mountain on Earth, after Mount Everest at 8,849 metres (29,032 ft). It lies in the Karakoram range, partially in the Gilgit-Baltistan region of Pakistan-administered Kashmir and partially in the China-administered Trans-Karakoram',
  'Kangchenjunga': 'Kangchenjunga, also spelled Kanchenjunga, Kanchanjanghā and Khangchendzonga, is the third-highest mountain in the world. Its summit lies at 8,586 m (28,169 ft) in a section of the Himalayas, the Kangchenjunga Himal, which is bounded in the west by the Tamur River, in the north by the Lhonak River an',
  'Lhotse': "Lhotse (Nepali: ल्होत्

In [5]:
# List of mountains that we will search for in the texts for markup
mountain_names_lower = [name.lower() for name in mountain_names_600]

Definition of "main" mountain on Wikipedia page (actually the step is not extremely important, but wanted to try)

In [6]:
# Function to determine the main mountain in the text
def get_main_mountain(text, mountain_names):
    mountain_counts = Counter()
    text_lower = text.lower()
    
    for mountain in mountain_names:
        count = text_lower.count(mountain)  # Count the number of mentions of each mountain
        if count > 0:
            mountain_counts[mountain] = count
            
    if mountain_counts:
        main_mountain = mountain_counts.most_common(1)[0][0]  # Picking the mountain with the most mentions
        return main_mountain
    return None

# Function for marking up texts taking into account the main mountain
def annotate_mountains_by_main(texts, mountain_names):
    annotated_data = []
    for name, text in texts.items():
        main_mountain = get_main_mountain(text, mountain_names)  # Determine the main mountain
        if main_mountain:
            sentences = re.split(r'(?<=[.!?]) +', text)  # Split the text into sentences
            for sentence in sentences:
                found_mountains = [mountain for mountain in mountain_names if mountain in sentence.lower()]
                if found_mountains:
                    annotated_data.append({
                        'sentence': sentence,
                        'main_mountain': main_mountain,
                        'mentioned_mountains': found_mountains
                    })
    return annotated_data

Saving our dataset

In [7]:
# Text markup
annotated_dataset_by_main = annotate_mountains_by_main(mountain_texts_600, mountain_names_lower)

# Convert to DataFrame for saving
df_annotated_main = pd.DataFrame(annotated_dataset_by_main)

# Output the number of sentences in the dataset
print(f"Number of sentences in the dataset before adding synthetic data: {df_annotated_main.shape[0]}")

# Saving the dataset in CSV format
output_path = 'annotated_mountain_dataset.csv'
df_annotated_main.to_csv(output_path, index=False)

# Show the first few lines
df_annotated_main.head(20)

Number of sentences in the dataset before adding synthetic data: 1958


Unnamed: 0,sentence,main_mountain,mentioned_mountains
0,"Mount Everest, known locally as Sagarmatha or ...",mount everest,"[mount everest, olan]"
1,"Its elevation (snow height) of 8,848.86 m (29,...",mount everest,[mount everest]
2,"K2, at 8,611 metres (28,251 ft) above sea leve...",k2,"[mount everest, k2]"
3,"It lies in the Karakoram range, partially in t...",k2,[k2]
4,"Kangchenjunga, also spelled Kanchenjunga, Kanc...",kangchenjunga,[kangchenjunga]
5,"Its summit lies at 8,586 m (28,169 ft) in a se...",kangchenjunga,[kangchenjunga]
6,"Lhotse (Nepali: ल्होत्से, romanized: L'hōtsē [...",lhotse,"[mount everest, k2, kangchenjunga, lhotse]"
7,"At an elevation of 8,516 metres (27,940 ft) ab...",lhotse,"[lhotse, nuptse]"
8,"Makalu (Nepali: मकालु हिमाल, romanized: Makālu...",makalu,[makalu]
9,It is located in the Mahalangur Himalayas 19 k...,makalu,[mount everest]


Adding synthetic data

In [8]:
# Uploading synthetic data
df_synthetic = pd.read_csv('synthetic_data.csv', header=None, names=['sentence'])

# Filter the lines to exclude those containing only "[" or "]"
df_synthetic = df_synthetic[~df_synthetic['sentence'].str.contains(r'^\[|\]$', regex=True)]

# Create a column 'main_mountain' for synthetic data
df_synthetic['main_mountain'] = df_synthetic['sentence'].apply(lambda x: x.split()[1] if len(x.split()) > 1 else x)

# Combine the main dataset with synthetic data
df_combined = pd.concat([df_annotated_main, df_synthetic], ignore_index=True)

# Fill NaN values in 'mentioned_mountains' with empty lists
df_combined['mentioned_mountains'] = df_combined['mentioned_mountains'].fillna('[]')

# Save the combined dataset to a new CSV file
df_combined.to_csv('combined_annotated_dataset.csv', index=False)
print("Combined dataset has been saved to 'combined_annotated_dataset.csv'")

# Output the size of the new merged dataset for verification
print(f"Size of the merged dataset: {df_combined.shape}")

Combined dataset has been saved to 'combined_annotated_dataset.csv'
Size of the merged dataset: (2992, 3)


In [9]:
df_combined.head(10)

Unnamed: 0,sentence,main_mountain,mentioned_mountains
0,"Mount Everest, known locally as Sagarmatha or ...",mount everest,"[mount everest, olan]"
1,"Its elevation (snow height) of 8,848.86 m (29,...",mount everest,[mount everest]
2,"K2, at 8,611 metres (28,251 ft) above sea leve...",k2,"[mount everest, k2]"
3,"It lies in the Karakoram range, partially in t...",k2,[k2]
4,"Kangchenjunga, also spelled Kanchenjunga, Kanc...",kangchenjunga,[kangchenjunga]
5,"Its summit lies at 8,586 m (28,169 ft) in a se...",kangchenjunga,[kangchenjunga]
6,"Lhotse (Nepali: ल्होत्से, romanized: L'hōtsē [...",lhotse,"[mount everest, k2, kangchenjunga, lhotse]"
7,"At an elevation of 8,516 metres (27,940 ft) ab...",lhotse,"[lhotse, nuptse]"
8,"Makalu (Nepali: मकालु हिमाल, romanized: Makālu...",makalu,[makalu]
9,It is located in the Mahalangur Himalayas 19 k...,makalu,[mount everest]


Preparing our dataset

In [10]:
def prepare_ner_dataset_fixed_multiword(df, mountain_names):
    """
    Prepare a dataset for training a NER model taking into account multi-word mountain names.

    Returns a list of sentences tokenized into words and the corresponding labels.
    """
    sentences = []
    labels = []
    
    for _, row in df.iterrows():
        sentence = row['sentence'].split()  # tokenization
        main_mountain = row['main_mountain'].lower()  # Main mountain in the article
        
        sentence_labels = ['O'] * len(sentence)  # Initialize the label list with the value 'O'
        
        # Check all the long-form mountain names in the sentence
        for mountain in mountain_names:
            mountain_tokens = mountain.split()  # Break the name of the mountain into words
            mountain_len = len(mountain_tokens)
            
            # Check each substring in the sentence
            for i in range(len(sentence) - mountain_len + 1):
                window = sentence[i:i + mountain_len]  # A substring of a sentence as long as the name of a mountain
                window_lower = [word.lower().strip(string.punctuation) for word in window]
                
                if window_lower == mountain_tokens:
                    sentence_labels[i] = 'B-MOUNTAIN'  # Beginning of mountain name
                    for j in range(1, mountain_len):
                        sentence_labels[i + j] = 'I-MOUNTAIN'  # Continuation of the mountain name
        
        sentences.append(sentence)
        labels.append(sentence_labels)
    
    return sentences, labels


In [11]:
# Preparing the dataset
sentences_fixed_multiword, labels_fixed_multiword = prepare_ner_dataset_fixed_multiword(df_combined, mountain_names_lower)

STOP WORDS

In [12]:
# Let's output an example sentence before removing stop words
print("Before removing stop words:")
print(list(zip(sentences_fixed_multiword[10], labels_fixed_multiword[10])))

Before removing stop words:
[('One', 'O'), ('of', 'O'), ('the', 'O'), ('eight-thousanders,', 'O'), ('Makalu', 'B-MOUNTAIN'), ('is', 'O'), ('an', 'O'), ('isolated', 'O'), ('peak', 'O'), ('shaped', 'O'), ('like', 'O'), ('a', 'O'), ('four-sided', 'O'), ('pyramid.', 'O'), ('Makalu', 'B-MOUNTAIN'), ('has', 'O'), ('two', 'O'), ('notable', 'O'), ('subsidiary', 'O'), ('peaks.', 'O')]


In [13]:
# Load stop words and remove them from the dataset
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(sentences, labels):
    """
    Removes stop words for class O tokens only.
    """
    new_sentences = []
    new_labels = []
    
    for sentence, label in zip(sentences, labels):
        reduced_sentence = []
        reduced_label = []
        
        for word, l in zip(sentence, label):
            if l == 'O' and word.lower() in stop_words:
                continue  # Skip the token if it is a class O stop word
            reduced_sentence.append(word)
            reduced_label.append(l)
        
        new_sentences.append(reduced_sentence)
        new_labels.append(reduced_label)
    
    return new_sentences, new_labels

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marko\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Apply the function to the dataset
sentences_reduced, labels_reduced = remove_stopwords(sentences_fixed_multiword, labels_fixed_multiword)

In [15]:
# Let's output an example sentence after removing stop words
print("\nAfter removing stop words:")
print(list(zip(sentences_reduced[10], labels_reduced[10])))


After removing stop words:
[('One', 'O'), ('eight-thousanders,', 'O'), ('Makalu', 'B-MOUNTAIN'), ('isolated', 'O'), ('peak', 'O'), ('shaped', 'O'), ('like', 'O'), ('four-sided', 'O'), ('pyramid.', 'O'), ('Makalu', 'B-MOUNTAIN'), ('two', 'O'), ('notable', 'O'), ('subsidiary', 'O'), ('peaks.', 'O')]


Deleting sentences with only O label tokens. (no I-MOUNTAIN or B-MOUNTAIN tokens)

In [16]:
def filter_sentences_no_mountains(sentences, labels):
    """
    Removes sentences where all tokens have the label 'O' (i.e. no mention of mountain names).
    """
    filtered_sentences = []
    filtered_labels = []
    
    for sentence, label in zip(sentences, labels):
        if any(l != 'O' for l in label):  # Check if there is at least one label other than 'O'
            filtered_sentences.append(sentence)
            filtered_labels.append(label)
    
    return filtered_sentences, filtered_labels

# Apply the function to the reduced dataset
filtered_sentences, filtered_labels = filter_sentences_no_mountains(sentences_reduced, labels_reduced)

# Output the number of sentences after filtering
print(f"Number of sentences after filtering: {len(filtered_sentences)}")

# Example sentence after filtering
print("Example sentence after filtering:")
print(list(zip(filtered_sentences[0], filtered_labels[0])))

Number of sentences after filtering: 2700
Example sentence after filtering:
[('Mount', 'B-MOUNTAIN'), ('Everest,', 'I-MOUNTAIN'), ('known', 'O'), ('locally', 'O'), ('Sagarmatha', 'O'), ('Qomolangma,', 'O'), ("Earth's", 'O'), ('highest', 'O'), ('mountain', 'O'), ('sea', 'O'), ('level,', 'O'), ('located', 'O'), ('Mahalangur', 'O'), ('Himal', 'O'), ('sub-range', 'O'), ('Himalayas.', 'O')]


Trying to fis disbalance by removing 0.5 of all O tokens to reach 70/30 balance (70 - O tokens, 30 I-MOUNTAIN/B-MOUNTAIN tokens)

In [17]:
def reduce_o_tokens(sentences, labels, reduction_ratio=0.5):
    """
    Selectively reduces the number of class O tokens in sentences.

    Parameters:
    - sentences: list of sentences (tokenized into words)
    - labels: list of labels for each sentence
    - reduction_ratio: fraction of class O tokens to remove

    Returns new lists of sentences and labels.
    """
    new_sentences = []
    new_labels = []
    
    for sentence, label in zip(sentences, labels):
        o_indices = [i for i, l in enumerate(label) if l == 'O']
        num_to_remove = int(len(o_indices) * reduction_ratio)
        
        if num_to_remove > 0:
            indices_to_remove = set(random.sample(o_indices, num_to_remove))
        else:
            indices_to_remove = set() 
        
        reduced_sentence = [word for i, word in enumerate(sentence) if i not in indices_to_remove]
        reduced_label = [l for i, l in enumerate(label) if i not in indices_to_remove]
        
        new_sentences.append(reduced_sentence)
        new_labels.append(reduced_label)
    
    return new_sentences, new_labels

# Apply the function to the dataset
final_sentences_reduced, final_labels_reduced = reduce_o_tokens(filtered_sentences, filtered_labels, reduction_ratio=0.5)

print(f"Number of sentences after reducing class O tokens: {len(final_sentences_reduced)}")

Number of sentences after reducing class O tokens: 2700


In [18]:
# Combine words and tags into strings
final_sentences_str = [' '.join(sentence) for sentence in final_sentences_reduced]
final_labels_str = [' '.join(label) for label in final_labels_reduced]

# Creating DataFrame
df = pd.DataFrame({
    'sentence': final_sentences_str,
    'label': final_labels_str
})

# Saving DataFrame in CSV file
df.to_csv('final_dataset.csv', index=False)

print("File successfully saved as 'final_dataset.csv'")

Файл успішно збережено як 'final_dataset.csv'


In [19]:
# Uploading dataset
df = pd.read_csv('final_dataset.csv')

# Split the strings back into token lists
final_sentences = [sentence.split() for sentence in df['sentence']]
final_labels = [label.split() for label in df['label']]

II. TRAINING OUR MODEL

In [20]:
# Splitting the dataset into training and test samples (80% training, 20% test)
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    final_sentences, final_labels, test_size=0.2, random_state=42
)

# Check the number of sentences in each sample
print(f"Number of training sentences: {len(train_sentences)}")
print(f"Number of test sentences: {len(test_sentences)}")

Number of training sentences: 2160
Number of test sentences: 540


In [21]:
# Load the tokenizer and BERT model
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=3)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Map labels to numeric values
label_map = {'O': 0, 'B-MOUNTAIN': 1, 'I-MOUNTAIN': 2}

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

def tokenize_and_prepare_data(sentences, labels, tokenizer, max_len=128):
    input_ids = []
    attention_masks = []
    label_ids = []

    for sent, label in zip(sentences, labels):
        # Tokenize the sentence and reduce it to a fixed length max_len
        encoded_dict = tokenizer.encode_plus(
            sent,
            is_split_into_words=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

        # Bring labels to the same length as tokens (pad labels with a value of -100)
        label_id = [label_map[l] for l in label] + [-100] * (max_len - len(label))
        label_ids.append(torch.tensor(label_id[:max_len]))

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), torch.stack(label_ids)

# Training data preparation
train_inputs, train_masks, train_labels = tokenize_and_prepare_data(train_sentences, train_labels, tokenizer)

# Preparing test data
test_inputs, test_masks, test_labels = tokenize_and_prepare_data(test_sentences, test_labels, tokenizer)

# Creating DataLoader for training and test samples
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

len(train_dataloader), len(test_dataloader)

(135, 34)

In [23]:
# Installing the AdamW optimizer for BERT
optimizer = AdamW(model.parameters(), lr=3e-5)

# Setting the number of epochs
epochs = 8

# Calculate the number of each class in the training labels
all_train_labels = [label for labels in train_labels for label in labels]
label_counts = Counter(all_train_labels)

# Initialize weights with large initial weights for classes that are not in the sample
class_weights = {0: 1.0, 1: 1.0, 2: 1.0}

# Boosting weights for rare classes
class_weights[1] *= 4
class_weights[2] *= 4

# Specify the weights as a tensor
weights_tensor = torch.tensor([class_weights[0], class_weights[1], class_weights[2]], dtype=torch.float)



# Loss function with weights for classes
loss_fn = nn.CrossEntropyLoss(weight=weights_tensor, ignore_index=-100)

# Training device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [24]:
#Training cycle
for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_loss = 0

    for batch in train_dataloader:
        batch_input_ids, batch_attention_mask, batch_labels = [b.to(device) for b in batch]
        
        optimizer.zero_grad()  # Reset gradients
        
        # Forward pass
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs.loss  # Losses
        
        loss.backward()  # Backward pass
        optimizer.step()  # Update settings
        
        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Avg_loss: {avg_loss:.4f}")

Epoch 1/8, Avg_loss: 0.5154
Epoch 2/8, Avg_loss: 0.3236
Epoch 3/8, Avg_loss: 0.2449
Epoch 4/8, Avg_loss: 0.1967
Epoch 5/8, Avg_loss: 0.1484
Epoch 6/8, Avg_loss: 0.1120
Epoch 7/8, Avg_loss: 0.0827
Epoch 8/8, Avg_loss: 0.0642


In [25]:
# Set the model to evaluation mode
model.eval()

true_labels = []
pred_labels = []

# Evaluate the model on test data
with torch.no_grad():
    for batch in test_dataloader:
        batch_input_ids, batch_attention_mask, batch_labels = [b.to(device) for b in batch]
        
        # Prediction without saving gradients
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits  # Expected values
        
        predictions = torch.argmax(logits, dim=-1)  # Get indices with maximum value (probability)
        
        # Collect real and predicted labels
        for i in range(batch_labels.size(0)):  # Go through each sentence in the batch
            true_label = batch_labels[i].cpu().numpy()
            pred_label = predictions[i].cpu().numpy()
            
            # Ignore labels -100 (padding)
            valid_indices = true_label != -100
            true_labels.extend(true_label[valid_indices])
            pred_labels.extend(pred_label[valid_indices])

# Output the classification report
print(classification_report(true_labels, pred_labels, target_names=['O', 'B-MOUNTAIN', 'I-MOUNTAIN']))

              precision    recall  f1-score   support

           O       0.90      0.94      0.92      3324
  B-MOUNTAIN       0.77      0.69      0.73       745
  I-MOUNTAIN       0.81      0.68      0.74       624

    accuracy                           0.87      4693
   macro avg       0.82      0.77      0.79      4693
weighted avg       0.86      0.87      0.86      4693



In [26]:
# Saving the trained model and tokenizer
model.save_pretrained("saved_model")
tokenizer.save_pretrained("saved_model")

print("The model and tokenizer have been successfully saved!")

The model and tokenizer have been successfully saved!
