### Import necessary libraries


In [42]:
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, random_split 
from tqdm import tqdm 
import torch
import pandas as pd

### Assuming a predefined set of entity types

In [43]:
entity_types = ["O", "B-MOUNTAIN", "I-MOUNTAIN"]

### Set num_labels

In [44]:
num_labels = len(entity_types)

### Load pre-trained BERT model and tokenizer

In [45]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Define batch_size

In [46]:
batch_size = 32

### Define learning rate

In [47]:
learning_rate = 5e-5

### Convert CSV Dataset to train_dataset_sample

In [48]:
file_path = '../data/labeled_mountain_dataset.csv'
df = pd.read_csv(file_path)

train_dataset_sample = []
current_sentence = []
current_entities = []

for index, row in df.iterrows():
    word = row['Word']
    tag = row['Tag']
    
    if pd.isna(word) or pd.isna(tag):
        if current_sentence:
            train_dataset_sample.append({
                "text": " ".join(current_sentence),
                "labels": {"entities": current_entities}
            })
            current_sentence = []
            current_entities = []
    else:
        current_sentence.append(word)
        
        complete_sentence = " ".join(current_sentence)
        
        start_pos = len(complete_sentence) - len(word)
        end_pos = len(complete_sentence)
        
        if tag == "B-MOUNTAIN":
            current_entities.append((start_pos, end_pos, "MOUNTAIN"))
        elif tag == "I-MOUNTAIN" and current_entities:
            last_entity = current_entities[-1]
            current_entities[-1] = (last_entity[0], end_pos, last_entity[2])

if current_sentence:
    train_dataset_sample.append({
        "text": " ".join(current_sentence),
        "labels": {"entities": current_entities}
    })

print(train_dataset_sample)


[{'text': 'Mount Everest is the highest mountain in the world located in the Himalayas', 'labels': {'entities': [(0, 13, 'MOUNTAIN')]}}, {'text': 'Kilimanjaro is known for its snow-capped peak despite being near the equator', 'labels': {'entities': [(0, 11, 'MOUNTAIN')]}}, {'text': 'The Andes stretch along the western coast of South America creating a dramatic landscape', 'labels': {'entities': [(0, 9, 'MOUNTAIN')]}}, {'text': 'Mount Fuji is a symbol of Japan and one of the most photographed mountains', 'labels': {'entities': [(0, 10, 'MOUNTAIN')]}}, {'text': 'K2 is notorious for its difficulty and deadly reputation among mountaineers', 'labels': {'entities': [(0, 2, 'MOUNTAIN')]}}, {'text': 'Denali in Alaska offers stunning views and is the tallest peak in North America', 'labels': {'entities': [(0, 6, 'MOUNTAIN')]}}, {'text': 'The Matterhorn is famous for its pyramid shape and attracts climbers from around the world', 'labels': {'entities': [(0, 14, 'MOUNTAIN')]}}, {'text': 'Mount El

### Tokenize and format data

In [67]:
def tokenize_and_format_data(dataset, tokenizer):
    tokenized_data = []
    for sample in dataset:
        text = sample["text"]
        entities = sample["labels"]["entities"]

        # Tokenize the input text using the BERT tokenizer
        tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))
        # Initialize labels for each token as 'O' (Outside)
        labels = ['O'] * len(tokens)

        # Update labels for entity spans
        for start, end, entity_type in entities:
            # Tokenize the prefix to get the correct offset
            prefix_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text[:start])))
            start_token = len(prefix_tokens)

            # Tokenize the entity to get its length
            entity_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text[start:end])))
            end_token = start_token + len(entity_tokens) - 1

            # Check if start_token and end_token are within bounds
            if start_token < len(labels):
                labels[start_token] = "B-MOUNTAIN"
            else:
                print(f"Warning: start_token {start_token} is out of range for text: '{text}'")
                continue

            # Ensure end_token does not exceed the length of labels
            for i in range(start_token + 1, min(end_token + 1, len(labels))):
                labels[i] = "I-MOUNTAIN"

        # Convert tokens and labels to input IDs and label IDs
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        label_ids = [entity_types.index(label) for label in labels]

        # Pad input_ids and label_ids to the maximum sequence length
        padding_length = tokenizer.model_max_length - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * padding_length
        label_ids += [entity_types.index('O')] * padding_length

        tokenized_data.append({
            'input_ids': input_ids,
            'labels': label_ids
        })

    # Convert tokenized data to PyTorch dataset
    dataset = TensorDataset(
        torch.tensor([item['input_ids'] for item in tokenized_data]),
        torch.tensor([item['labels'] for item in tokenized_data])
    )
    return dataset

### Prepare data for fine-tuning

In [68]:
train_data = tokenize_and_format_data(train_dataset_sample, tokenizer)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

### Fine-tune the model

In [69]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_epochs = 15 

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_dataloader, desc="Training"):
        inputs, labels = batch
        # Unpack the tuple
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

Training:   0%|          | 0/4 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Training: 100%|██████████| 4/4 [07:46<00:00, 116.61s/it]
Training: 100%|██████████| 4/4 [07:01<00:00, 105.50s/it]
Training: 100%|██████████| 4/4 [06:38<00:00, 99.54s/it] 
Training: 100%|██████████| 4/4 [06:46<00:00, 101.71s/it]
Training: 100%|██████████| 4/4 [06:57<00:00, 104.45s/it]
Training: 100%|██████████| 4/4 [06:44<00:00, 101.23s/it]
Training: 100%|██████████| 4/4 [06:49<00:00, 102.45s/it]
Training: 100%|██████████| 4/4 [07:09<00:00, 107.28s/it]
Training: 100%|██████████| 4/4 [06:50<00:00, 102.66s/it]
Training: 100%|██████████| 4/4 [06:57<00:00, 104.29s/it]
Training: 100%|██████████| 4/4 [06:46<00:00, 101.61s/it]
Training: 100%|██████████| 4/4 [06:45<00:00, 101.29s/it]
Training: 100%|██████████| 4/4 [06:49<00:00, 102.34s/it]
Training: 1

### Save the fine-tuned model for later use

In [70]:
model.save_pretrained('fine_tuned_ner_model')

### Inference

In [75]:
# Load the fine-tuned model and tokenizer
model = BertForTokenClassification.from_pretrained('fine_tuned_ner_model')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to perform inference
def infer_entities(text):
    # Tokenize the input text
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=tokenizer.model_max_length)
    
    # Ensure the model is in evaluation mode
    model.eval()

    with torch.no_grad():
        # Get the predictions
        outputs = model(**tokens)
        logits = outputs.logits

    # Get the predicted labels
    predicted_ids = torch.argmax(logits, dim=2)

    # Convert predicted IDs to labels
    predicted_labels = [entity_types[id.item()] for id in predicted_ids[0]]

    # Post-process to get the tokens with their corresponding labels
    tokenized_text = tokenizer.tokenize(text)
    result = []

    for token, label in zip(tokenized_text, predicted_labels):
        result.append((token, label))

    return result

# Example usage
text_to_infer =  "Mount Everest is the highest mountain in the world, located in the Himalayas Kilimanjaro is known for its snow-capped peak despite being near the equator The Andes stretch along the western coast of South America, creating a dramatic landscape Mount Fuji is a symbol of Japan and one of the most photographed mountains"
predicted_entities = infer_entities(text_to_infer)

# Display the results
for token, label in predicted_entities:
    if label != "O":
        print(f"{token}: {label}")

the: I-MOUNTAIN
located: B-MOUNTAIN
known: I-MOUNTAIN
creating: B-MOUNTAIN
