#  NER demo

In [1]:
# import libraries
import random
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForTokenClassification, BertTokenizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# global configuration constants
TEST_SIZE = 0.2
BATCH_SIZE = 32
MAX_LENGTH = 128

# load the dataset from its CSV file and pick out the token / label columns
ner_df = pd.read_csv("Dataset.csv")
tokens, labels = ner_df["Words"], ner_df["Labels"]

# build the test split: hold out TEST_SIZE of the rows, then keep half of that hold-out
_, inference_df = train_test_split(
    ner_df,
    test_size=TEST_SIZE,
    random_state=101,
)
_, test_df = train_test_split(
    inference_df,
    test_size=0.5,
    random_state=101,
)

# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Current device: {device}\n")

# load the fine-tuned token-classification model and move it to the selected device
model_path = "models/"
loaded_ner_model = BertForTokenClassification.from_pretrained(model_path)
loaded_ner_model = loaded_ner_model.to(device)

# label map for encoding entities: the B- and I- tags of one entity type share a class id
label_map = {
    "O": 1,
    "B-mountain": 2,
    "I-mountain": 2,
    "B-country": 3,
    "I-country": 3,
    "B-location": 4,
    "I-location": 4,
    "B-scientist": 5,
    "I-scientist": 5,
    "B-astronomicalobject": 6,
    "I-astronomicalobject": 6,
    "B-organisation": 7,
    "I-organisation": 7,
    "B-award": 8,
    "I-award": 8,
    "B-misc": 9,
    "I-misc": 9,
    "B-academicjournal": 10,
    "I-academicjournal": 10,
    "B-university": 11,
    "I-university": 11,
    "B-person": 12,
    "I-person": 12,
    "B-chemicalcompound": 13,
    "I-chemicalcompound": 13,
    "B-protein": 14,
    "I-protein": 14,
    "B-event": 15,
    "I-event": 15,
    "B-enzyme": 16,
    "I-enzyme": 16,
    "B-discipline": 17,
    "I-discipline": 17,
    "B-theory": 18,
    "I-theory": 18,
    "B-chemicalelement": 19,
    "I-chemicalelement": 19,
}

# BERT tokenizer used to split sentences into word pieces
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Current device: cuda



In [2]:
def preprocess_random_single_sentence(sentence):
    """
    Tokenizes a single sentence with the module-level BERT tokenizer and returns model-ready tensors.

    The sentence is padded or truncated to exactly MAX_LENGTH positions. Special tokens
    ([CLS]/[SEP]) are not added, so every non-padding position corresponds to one
    word piece of the sentence itself.

    Arguments:
    - sentence (str): The input sentence to be preprocessed.

    Returns:
    - input_tensor (torch.Tensor): Tensor containing tokenized input IDs (PyTorch tensor with a leading batch dimension of 1).
    - attention_mask (torch.Tensor): Tensor containing the attention mask (1 for real tokens, 0 for padding).
    """
    # Call the tokenizer directly: `encode_plus` is deprecated in favour of `__call__`,
    # which accepts the same keyword arguments and returns the same encoding for one string.
    tokenized_input = tokenizer(
        sentence,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
        add_special_tokens=False)

    return tokenized_input["input_ids"], tokenized_input["attention_mask"]

def inference_single_sentence(model, sentence_tensors):
    """
    Performs inference on a single sentence using the provided model and preprocessed sentence tensors.

    Arguments:
    - model (torch.nn.Module): The pre-trained model for inference. It is called with
      `input_ids` and `attention_mask` keyword arguments and must return an object
      exposing a `logits` attribute.
    - sentence_tensors (tuple): Tuple containing input_ids and attention_mask tensors
      (expected with a leading batch dimension of 1).

    Returns:
    - predicted_labels (torch.Tensor): 1-D tensor with the predicted class id of every
      position whose attention mask is non-zero (padding positions are dropped).
    """
    input_ids, attention_mask = sentence_tensors

    # Send the inputs to wherever the model's weights live instead of relying on a
    # module-level device, so the function also works when the two differ.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else input_ids.device
    input_ids = input_ids.to(target_device)
    attention_mask = attention_mask.to(target_device)

    # No gradients are needed for prediction.
    with torch.inference_mode():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # The class axis is the last of three; pick the highest-scoring class per position.
    predicted_labels = torch.argmax(outputs.logits, dim=2)

    # Remove only the batch dimension (a bare squeeze() would also collapse a
    # length-1 sequence axis), then keep the non-padding positions.
    mask = attention_mask.squeeze(0) > 0
    predicted_labels = predicted_labels.squeeze(0)[mask]

    return predicted_labels

def get_entities_from_predictions(sentence, predicted_labels):
    """
    Returns the word-piece tokens of a sentence that have a predicted label.

    Despite the name, nothing is filtered by label value here: the sentence is
    re-tokenized with the module-level tokenizer and the first
    len(predicted_labels) tokens are returned, so the result can be paired
    position by position with the predictions.

    Arguments:
    - sentence (str): The original sentence.
    - predicted_labels (torch.Tensor): Predicted labels for the tokens in the sentence.
      Only its length is used.

    Returns:
    - entities (list): Word-piece tokens of the sentence, one per predicted label.
    """
    tokenized_sentence = tokenizer.tokenize(sentence)

    # The tokenizer does not truncate here, so cut the token list to the number of predictions.
    return tokenized_sentence[:len(predicted_labels)]

def _join_wordpieces(pieces):
    """Rebuilds text from word pieces: '##' pieces extend the previous word, others start a new one."""
    words = []
    for piece in pieces:
        is_continuation = piece.startswith('##')
        # Strip the marker only where it is a prefix, never inside a token.
        fragment = piece[2:] if is_continuation else piece
        if is_continuation and words:
            words[-1] += fragment
        else:
            words.append(fragment)
    return ' '.join(words)


def extract_class_2_words(ziped_list):
    """
    Extracts words belonging to class 2 (mountains) from a list of token-class pairs.

    Each maximal run of consecutive class-2 tokens becomes one name. Within a run,
    pieces prefixed with '##' are glued to the preceding piece, while pieces without
    the prefix start a new word separated by a space, so multi-word names keep
    their spaces (e.g. 'mount', 'ever', '##est' -> 'mount everest').

    Arguments:
    - ziped_list (list): List of tuples containing token-class pairs.

    Returns:
    - output_list (list): List of mountain names extracted from the token-class pairs,
      in order of appearance. Empty when no token has class 2.
    """
    mountain_class = 2
    output_list = []
    current_run = []

    for sub_token, token_class in ziped_list:
        if token_class == mountain_class:
            current_run.append(sub_token)
        elif current_run:
            # A non-mountain token closes the run collected so far.
            output_list.append(_join_wordpieces(current_run))
            current_run = []

    # The input may end while a run is still open.
    if current_run:
        output_list.append(_join_wordpieces(current_run))

    return output_list

def get_mountains_names_from_text(sentence, model):
    """
    Runs the full pipeline on one sentence and returns the mountain names found in it.

    Arguments:
    - sentence (str): The input sentence.
    - model: The pre-trained BERT model used for inference.

    Returns:
    - mountains (list): List of identified mountain names in the input sentence.
    """
    # sentence -> (input_ids, attention_mask) -> predicted class ids
    label_tensor = inference_single_sentence(
        model,
        preprocess_random_single_sentence(sentence),
    )

    # pair each token of the sentence with its predicted class id
    word_pieces = get_entities_from_predictions(sentence, label_tensor)
    class_ids = label_tensor.cpu().tolist()
    piece_class_pairs = list(zip(word_pieces, class_ids))

    # keep only the runs of tokens predicted as mountains
    return extract_class_2_words(piece_class_pairs)

## Demo interface
###### You can rerun the cell below to randomly select a sentence. The BERT NER model will attempt to find all mountains in this text.

In [3]:
# Draw one sentence at random from the test split
candidate_sentences = test_df["Words"].tolist()
sentence = random.choice(candidate_sentences)

# Run the NER model on it and collect the predicted mountain names
mountains = get_mountains_names_from_text(sentence, model=loaded_ner_model)

# Show the sentence followed by what the model found
print(f"Sentence: {sentence}")
print(f"\nFounded mountains: {mountains}")

Sentence: There is some information about earthquakes in Aristotle ' s Meteorology , in Naturalis Historia by Pliny the Elder , and in Strabo ' s Geographica

Founded mountains: []
