In [1]:
from datasets import load_dataset
import pandas as pd
from simpletransformers.ner import NERModel
import torch
from transformers import BertTokenizer
import json
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
animal_classes = {
    "butterfly": "B-BUTTERFLY",
    "cat": "B-CAT",
    "cow": "B-COW",
    "dog": "B-DOG",
    "elephant": "B-ELEPHANT",
    "hen": "B-HEN",
    "horse": "B-HORSE",
    "sheep": "B-SHEEP",
    "spider": "B-SPIDER",
    "squirrel": "B-SQUIRREL"
}


In [2]:
raw_dataset = load_dataset("wikiann", "en")  #load dataset

In [4]:
def prepare_data(dataset):
    data = []
    for i, example in enumerate(dataset):
        for j, token in enumerate(example["tokens"]):
            token_lower = token.lower()
            if token_lower in animal_classes:
                label = animal_classes[token_lower]  # Assign a specific animal class
            else:
                label = "O"  # Not animal
            data.append([i, token, label])
    return pd.DataFrame(data, columns=["sentence_id", "words", "labels"])

In [6]:
def filter_relevant_examples(dataset):
    filtered_data = []
    for example in dataset:
        tokens = example["tokens"]
        if any(token.lower() in animal_classes for token in tokens):  # checking to see if there's an animal
            filtered_data.append(example)
    return filtered_data

In [5]:
train_data = prepare_data(raw_dataset["train"])
val_data = prepare_data(raw_dataset["validation"])

In [7]:
filtered_train_data = filter_relevant_examples(raw_dataset["train"])
filtered_val_data = filter_relevant_examples(raw_dataset["validation"])

In [8]:
print(f"Train examples: {len(filtered_train_data)}")
print(f"Validation examples: {len(filtered_val_data)}")

Train examples: 35
Validation examples: 12


In [4]:
with open('wiki_animal.json', 'r') as file: #Import wiki_animal
    data = []
    for line in file:
        try:
            # Convert each string into a JSON object
            data.append(json.loads(line))
        except json.JSONDecodeError:
            print("Error when reading a string:", line)

# Now you can work with this list
print(data[0]['TEXT'])

[['the', "'", '10th', 'edition', 'of', 'Systema Naturae', "'", 'is', 'a', 'book', 'written', 'by', 'Carl Linnaeus', 'and', 'published', 'in', 'two', 'volumes', 'in', '1758', 'and', '1759', 'which', 'marks', 'the', 'starting', 'point', 'of', 'zoological nomenclature', '.'], ['before', '1758', 'most', 'biological', 'catalogues', 'had', 'used', 'polynomial', 'names', 'for', 'the', 'taxa', 'included', 'including', 'earlier', 'editions', 'of', 'Systema Naturae', '.'], ['the', 'first', 'work', 'to', 'consistently', 'apply', 'binomial', 'nomenclature', 'across', 'the', 'animal', 'kingdom', 'was', 'the', '10th', 'edition', 'of', 'Systema Naturae', '.'], ['the', 'International Commission on Zoological Nomenclature', 'therefore', 'chose', '1', 'January', '1758', 'as', 'the', '"', 'starting', 'point', '"', 'for', 'zoological', 'nomenclature', 'and', 'asserted', 'that', 'the', '10th', 'edition', 'of', 'Systema Naturae', 'was', 'to', 'be', 'treated', 'as', 'if', 'published', 'on', 'that', 'date', '

In [5]:
def label_text(text, animal_classes, sentence_id):
    words = text.split()  # Split the text into words
    labels = []
    
    for word in words:
        word_lower = word.lower().strip('.,!?')  # Lowercase the word and remove punctuation marks
        
        # If the word is in animal_classes, add a label
        if word_lower in animal_classes:
            labels.append((word, animal_classes[word_lower]))
        else:
            # Otherwise, label O (not an entity)
            labels.append((word, "O"))
    
    # Return data with added sentence_id
    return [{'sentence_id': sentence_id, 'words': words, 'labels': [label[1] for label in labels]}]


In [6]:
def label_animal_mentions(data, animal_classes):
    labeled_data = []
    sentence_id = 0  # Initial index for proposals
    
    for entry in data:
        text = entry['TEXT']  # Getting the text from your data
        # Marking up each sentence
        for sentence in text:
            labeled_sentence = label_text(' '.join(sentence), animal_classes, sentence_id)
            labeled_data.extend(labeled_sentence)  # Add the marked up sentence to the final list
            sentence_id += 1  # Increase the sentence_id for the following sentence
    
    return labeled_data

In [7]:
labeled_data = label_animal_mentions(data, animal_classes)

In [9]:
def count_class_occurrences(labeled_data):
    # Create a list of labels (only classes, excluding “O”)
    labels = [label for sentence in labeled_data for label in sentence['labels'] if label != "O"]
    
    # Count the number of occurrences of each class
    label_counts = Counter(labels)
    
    return label_counts


In [10]:
label_counts = count_class_occurrences(labeled_data)

In [11]:
label_counts

Counter({'B-HORSE': 1349,
         'B-CAT': 696,
         'B-DOG': 650,
         'B-SPIDER': 494,
         'B-SQUIRREL': 431,
         'B-BUTTERFLY': 331,
         'B-ELEPHANT': 278,
         'B-SHEEP': 50,
         'B-COW': 30,
         'B-HEN': 15})

In [13]:
flattened_data = []

# Add a sentence_id to each word
for sentence in labeled_data:
    sentence_id = sentence['sentence_id']
    words = sentence['words']
    labels = sentence['labels']
    
    for word, label in zip(words, labels):
        flattened_data.append([sentence_id, word, label])

In [14]:
df = pd.DataFrame(flattened_data, columns=["sentence_id", "words", "labels"])

In [28]:
def filter_sentences_with_animal_classes(labeled_data, animal_classes):
    filtered_data = []
    for sentence_id, group in labeled_data.groupby('sentence_id'):  
        labels = group['labels'].tolist()
        
        # Check if there is at least one word with an animal label (not “O”)
        if any(label != "O" and label in animal_classes.values() for label in labels):
            filtered_data.append(group)
    
    # Merge filtered sentences into DataFrame
    return pd.concat(filtered_data)

In [32]:
filtered_data = filter_sentences_with_animal_classes(df, animal_classes)

In [36]:
train_data, val_data = train_test_split(filtered_data, test_size=0.2, random_state=42)

In [39]:
labels_list = ['O', 'B-BUTTERFLY', 'B-CAT', 'B-COW', 'B-DOG', 'B-ELEPHANT', 'B-HEN', 'B-HORSE', 'B-SHEEP', 'B-SPIDER', 'B-SQUIRREL']

In [40]:
model = NERModel(
    "bert", "bert-base-cased",labels=labels_list, use_cuda=False
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
model.train_model(train_data, eval_df=val_data, output_dir='out3/') #train model

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:20<00:00,  2.51s/it]
Epoch 1 of 1:   0%|                                                                              | 0/1 [00:00<?, ?it/s]
[Aning Epoch 1 of 1:   0%|                                                                    | 0/469 [00:00<?, ?it/s]
[Achs 1/1. Running Loss:    2.8378:   0%|                                                     | 0/469 [00:00<?, ?it/s]
[Achs 1/1. Running Loss:    2.8378:   0%|                                             | 1/469 [00:02<20:33,  2.63s/it]
[Achs 1/1. Running Loss:    2.8154:   0%|                                             | 1/469 [00:03<20:33,  2.63s/it]
[Achs 1/1. Running Loss:    2.8154:   0%|▏                                            | 2/469 [00:04<17:37,  2.26s/it]
[Achs 1/1. Running Loss:    2.7953:   0%|▏                                            | 2/469 [00:05<17:37,  2.26s/it]
[Achs 1/1. Running Loss:    2.7953:   1

(469, 0.09862155382392872)

In [43]:
test_sentences = [
    "A big elephants walked through the forest.", #test
    "There was a dog in the backyard."
]

predictions, _ = model.predict(test_sentences)

for sentence, preds in zip(test_sentences, predictions):
    print(f"Sentence: {sentence}")
    print(preds)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:18<00:00, 18.19s/it]
Running Prediction: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.92it/s]

Sentence: A big elephants walked through the forest.
[{'A': 'O'}, {'big': 'O'}, {'elephants': 'O'}, {'walked': 'O'}, {'through': 'O'}, {'the': 'O'}, {'forest.': 'O'}]
Sentence: There was a dog in the backyard.
[{'There': 'O'}, {'was': 'O'}, {'a': 'O'}, {'dog': 'B-DOG'}, {'in': 'O'}, {'the': 'O'}, {'backyard.': 'O'}]





In [68]:
from transformers import BertTokenizer

# Downloading the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


In [69]:
def tokenize_and_preserve_labels(data, tokenizer):
    tokenized_data = []

    for entry in data:
        sentence_id = entry["sentence_id"]
        words = entry["words"]
        labels = entry["labels"]

        tokenized_words = []
        tokenized_labels = []
        
        for word, label in zip(words, labels):
            subwords = tokenizer.tokenize(word)  # Applying subword tokenization
            tokenized_words.extend(subwords)
            tokenized_labels.extend([label] * len(subwords))  # We assign the same label to each subword

        tokenized_data.append({
            "sentence_id": sentence_id,
            "tokens": tokenized_words,
            "labels": tokenized_labels
        })

    return tokenized_data


In [72]:
tokenized_data = tokenize_and_preserve_labels(labeled_data, tokenizer) #Tokenization

In [85]:
def filter_sentences_with_animals(tokenized_data, animal_classes):
    filtered_data = [
        entry for entry in tokenized_data if any(label in animal_classes for label in entry["labels"])
    ]
    return filtered_data

animal_classes = {"B-CAT", "B-DOG", "B-ELEPHANT", "B-BUTTERFLY", "B-SHEEP", "B-COW", "B-HORSE", "B-HEN", "B-SPIDER", "B-SQUIRREL"}
filtered_tokenized_data = filter_sentences_with_animals(tokenized_data, animal_classes)

In [127]:
print(f"Before filtration: {len(tokenized_data)}")
print(f"After filtration: {len(filtered_tokenized_data)}")


Before filtration: 101807
After filtration: 3747


In [87]:
#Converts data to a format compatible with BERT for NER.
def convert_to_bert_format(tokenized_data, tokenizer, label2id, max_length=128):
    attention_masks = []
    label_ids = []

    for entry in tokenized_data:
        tokens = entry["tokens"]
        labels = entry["labels"]

        # Tokenization and conversion to BERT format
        encoding = tokenizer(tokens, 
                             is_split_into_words=True,
                             padding="max_length",
                             truncation=True,
                             max_length=max_length,
                             return_tensors="pt")

        # Converting labels to numeric format
        label_ids_seq = [label2id[label] for label in labels]
        label_ids_seq = label_ids_seq[:max_length] + [label2id["O"]] * (max_length - len(label_ids_seq))

        input_ids.append(encoding["input_ids"].squeeze(0))
        attention_masks.append(encoding["attention_mask"].squeeze(0))
        label_ids.append(torch.tensor(label_ids_seq))

    return torch.stack(input_ids), torch.stack(attention_masks), torch.stack(label_ids)

# Data preparation
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Label dictionary definition
labels_list = ["O", "B-CAT", "B-DOG", "B-ELEPHANT", "B-BUTTERFLY", "B-SHEEP", "B-COW", "B-HORSE", "B-HEN", "B-SPIDER", "B-SQUIRREL"]
label2id = {label: i for i, label in enumerate(labels_list)}

# Convert data to BERT-compatible format
input_ids, attention_masks, label_ids = convert_to_bert_format(filtered_tokenized_data, tokenizer, label2id)

print("input_ids shape:", input_ids.shape)
print("attention_masks shape:", attention_masks.shape)
print("label_ids shape:", label_ids.shape)


input_ids shape: torch.Size([3747, 128])
attention_masks shape: torch.Size([3747, 128])
label_ids shape: torch.Size([3747, 128])


In [128]:
from torch.utils.data import Dataset, DataLoader

class NERDataset(Dataset):
    def __init__(self, input_ids, attention_masks, label_ids):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.label_ids = label_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_masks[idx],
            "labels": self.label_ids[idx]
        }

# Create Dataset and DataLoader
train_dataset = NERDataset(input_ids, attention_masks, label_ids)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

print(f"Number of batches: {len(train_dataloader)}")

Number of batches: 1


In [89]:
from transformers import BertForTokenClassification

num_labels = len(label2id)

model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [90]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)




In [91]:
from tqdm import tqdm

num_epochs = 3  #model preparations and fit

model.train()
for epoch in range(num_epochs):
    loop = tqdm(train_dataloader, leave=True)
    total_loss = 0

    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

    print(f"Средний loss за эпоху {epoch+1}: {total_loss / len(train_dataloader)}")


Epoch 1: 100%|██████████████████████████████████████████████████████████| 469/469 [10:24<00:00,  1.33s/it, loss=0.0199]


Средний loss за эпоху 1: 0.06134394314219512


Epoch 2: 100%|██████████████████████████████████████████████████████████| 469/469 [10:05<00:00,  1.29s/it, loss=0.0199]


Средний loss за эпоху 2: 0.018675477944934037


Epoch 3: 100%|██████████████████████████████████████████████████████████| 469/469 [10:15<00:00,  1.31s/it, loss=0.0158]

Средний loss за эпоху 3: 0.014640580418694423





In [92]:
model.save_pretrained("ner_model") #save model

In [93]:
tokenizer.save_pretrained("ner_model")

('ner_model\\tokenizer_config.json',
 'ner_model\\special_tokens_map.json',
 'ner_model\\vocab.txt',
 'ner_model\\added_tokens.json')

In [94]:
def predict_entities(text, model, tokenizer, label2id):
    model.eval()  # Switching to prediction mode
    
    # Tokenize the input text
    encoding = tokenizer(text, return_tensors="pt", padding=True, truncation=True, is_split_into_words=True)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # We get the predictions of the model
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits  
    predictions = torch.argmax(logits, dim=2)  # Getting indexes of predicted labels

    # Convert indexes to labels
    id2label = {v: k for k, v in label2id.items()} 
    predicted_labels = [id2label[label_id] for label_id in predictions[0].cpu().numpy()]

    # Output the result
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    for token, label in zip(tokens, predicted_labels):
        print(f"{token}: {label}")

In [98]:
test_sentence = ["this", "is", "a", "beautiful", "elephanteand "]
predict_entities(test_sentence, model, tokenizer, label2id)

[CLS]: O
this: O
is: O
a: O
beautiful: B-ELEPHANT
elephant: O
##ean: O
##d: O
[SEP]: O


In [100]:
test_sentence = ["this", "is", "a", "beautiful", "hot-dog "]
predict_entities(test_sentence, model, tokenizer, label2id)

[CLS]: O
this: O
is: O
a: O
beautiful: O
hot: O
-: B-DOG
dog: O
[SEP]: O


In [129]:
test_sentences = [
    ["the", "brown", "dog", "runs", "in", "the", "park"],
    ["a", "huge", "elephant", "is", "walking", "slowly"],
    ["the", "butterfly", "has", "colorful", "wings"],
    ["there", "is", "a", "cow", "in", "the", "field"], 
    ["the", "spider", "is", "weaving", "a", "web"], 
]

for sentence in test_sentences:
    print(f"\n📌 Test proposal: {' '.join(sentence)}")
    predict_entities(sentence, model, tokenizer, label2id)


📌 Test proposal: the brown dog runs in the park
[CLS]: O
the: O
brown: B-DOG
dog: O
runs: O
in: O
the: O
park: O
[SEP]: O

📌 Test proposal: a huge elephant is walking slowly
[CLS]: O
a: O
huge: B-ELEPHANT
elephant: O
is: O
walking: O
slowly: O
[SEP]: O

📌 Test proposal: the butterfly has colorful wings
[CLS]: O
the: B-BUTTERFLY
butterfly: O
has: O
colorful: O
wings: O
[SEP]: O

📌 Test proposal: there is a cow in the field
[CLS]: O
there: O
is: O
a: B-COW
cow: O
in: O
the: O
field: O
[SEP]: O

📌 Test proposal: the spider is weaving a web
[CLS]: O
the: B-SPIDER
spider: O
is: O
weaving: O
a: O
web: O
[SEP]: O


In [123]:
text_input = "There is a SPIDER in the picture."

In [124]:
def predict_animal_from_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1)
    return  predictions

    # Get the animal tags
    labels = ner_tokenizer.convert_ids_to_tokens(predictions[0].tolist())
    animals = [label for label in labels if label.startswith("B-")]
    return animals


In [125]:
predict_animal_from_text(text_input)

tensor([[0, 0, 0, 9, 0, 0, 0, 0, 0, 0]])