# Named Entity Recognition

In [None]:
# Mount Google Drive at /content/drive so that the dataset and model paths used
# later in this notebook (both under /content/drive/MyDrive) can be resolved.
# The google.colab package means this cell is meant to run inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Set variables

In [None]:
# Per-device training batch size (passed to TrainingArguments further down).
batch_size = 4
# Folder that holds the train/dev/test CSV files read by loadData.
data_directory = "/content/drive/MyDrive/NER/copious_ner/"
# Identifier of the pretrained checkpoint handed to from_pretrained for both
# the tokenizer and the token-classification model.
model_src = "NoYo25/BiodivBERT"

## Install and import necessary libraries

In [None]:
!pip install transformers
!pip install datasets
!pip install seqeval
!pip install accelerate -U

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import os
import shutil
import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
#from datasets import load_metric
from evaluate import load
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

In [None]:
# Turn off the Weights & Biases logging integration before the Trainer is built.
# NOTE(review): transformers reports this environment variable as deprecated
# (removal announced for v5) and recommends the `report_to` option instead.
os.environ["WANDB_DISABLED"] = "true"

## Convert the dataset

In [None]:
# Directory against which loadData resolves the CSV file names below.
root_data_dir = data_directory

# NOTE(review): not referenced by any visible cell — confirm it is still needed.
biodivner_dataset = "train"
# File names of the three dataset splits, relative to root_data_dir.
train_csv_file_path = "train_copious_ner.csv"
val_csv_file_path = "dev_copious_ner.csv"
test_csv_file_path = "test_copious_ner.csv"

In [None]:
def loadData(csv_file_path):
  """Read one dataset split from ``root_data_dir`` and forward-fill empty cells.

  Args:
    csv_file_path: File name (or relative path) of the CSV inside
      ``root_data_dir``. The file is decoded as latin1.

  Returns:
    A pandas DataFrame in which every missing cell has been replaced by the
    closest preceding non-missing value of the same column.
  """
  dataset_path = os.path.join(root_data_dir, csv_file_path)
  data = pd.read_csv(dataset_path, encoding="latin1")
  # DataFrame.ffill() is the supported spelling; fillna(method="ffill") is
  # deprecated and emits a FutureWarning on current pandas releases.
  return data.ffill()

In [None]:
# Load the raw train / dev / test splits; convert() later groups their rows
# into one entry per sentence.
data = loadData(train_csv_file_path)
val_data = loadData(val_csv_file_path)
test_data = loadData(test_csv_file_path)

  data = data.fillna(method="ffill")
  data = data.fillna(method="ffill")
  data = data.fillna(method="ffill")


In [None]:
import re

def convert(orig):
    """Collapse a one-row-per-token frame into one row per sentence.

    Args:
        orig: DataFrame (or anything ``pd.DataFrame`` accepts) with the columns
            'Sentence Number', 'Token' and 'Label'. 'Sentence Number' may hold
            strings containing a number or values that are already numeric.

    Returns:
        A DataFrame with two list-valued columns, 'tokens' and 'labels', holding
        the tokens and tags of each sentence. Rows are ordered by the first
        appearance of each sentence number in ``orig``.

    Raises:
        ValueError: If a 'Sentence Number' value contains no digits.
    """
    # Work on a private copy so the caller's frame is never modified and the
    # function can safely be re-run on the same input.
    df = pd.DataFrame(orig).copy()

    def _sentence_id(value):
        # str() lets already-numeric ids through the same regex path.
        match = re.search(r'\d+', str(value))
        if match is None:
            raise ValueError(f"No sentence number found in {value!r}")
        return int(match.group())

    # Reduce 'Sentence Number' to its integer part.
    df['Sentence Number'] = df['Sentence Number'].apply(_sentence_id)

    # Remember the input row order: groupby sorts by sentence number, which is
    # not necessarily the order in which sentences appear in the file.
    df['Original Order'] = range(len(df))

    # Collect 'Token' and 'Label' into per-sentence lists, then restore the
    # first-appearance order and keep only the two renamed list columns.
    grouped = (
        df.groupby('Sentence Number')
        .agg({'Token': list, 'Label': list, 'Original Order': 'first'})
        .reset_index()
        .sort_values(by='Original Order')
        .drop(columns=['Original Order', 'Sentence Number'])
        .rename(columns={'Token': 'tokens', 'Label': 'labels'})
    )
    return grouped

#print(grouped)


In [None]:
# Regroup every split into one row per sentence ('tokens' / 'labels' lists).
train_df = convert(data)
val_df = convert(val_data)
test_df = convert(test_data)

In [None]:
# Quick look at the grouped training sentences (pandas elides the middle rows).
print(train_df)

                                                tokens  \
0    [48, ##8, SM, ##IT, ##H, ., LA, ##KE, LA, ##NA...   
1    [GE, ##OL, ##O, ##GI, ##C, R, ##EC, ##ON, ##NA...   
2    [48, ##4, ##SM, ##IT, ##H, ., THE, S, ##U, ##L...   
3    [GE, ##OL, ##O, ##GI, ##C, R, ##EC, ##ON, ##NA...   
4    [47, ##4, SM, ##IT, ##H, ., certain, special, ...   
..                                                 ...   
525  [2, ##f, ), 0, CA, ##LD, ##ER, ##ON, ., usuall...   
526  [65, ##2, PR, ##OC, ##EE, ##DI, ##NG, ##S, OF,...   
527  [MA, ##MM, ##AL, ##S, O, ##P, THE, P, ##H, ##I...   
528  [O, ##BS, ##TE, ##TR, ##IC, ##S, IN, THE, P, #...   
529  [THE, H, ##AB, ##IT, ##U, ##AL, US, ##E, OF, O...   

                                                labels  
0    [O, O, O, O, O, O, B-GeographicalLocation, I-G...  
1    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
2    [O, O, O, O, O, O, O, B-GeographicalLocation, ...  
3    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-G...  
4    [O, O, O, O, 

In [None]:
# Wrap each split in a `datasets.Dataset` so it can be tokenized with .map()
# and passed to the Trainer.
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

## Tokenize the dataset

In [None]:
# Tag inventory; a tag's position in this list is its class id.
label_list = [
    'B-Taxon',
    'I-Taxon',
    'B-GeographicalLocation',
    'I-GeographicalLocation',
    'B-Habitat',
    'I-Habitat',
    'O',
]
# Lookups in both directions between tag strings and class ids.
label2id = {tag: idx for idx, tag in enumerate(label_list)}
id2label = dict(enumerate(label_list))
print(label2id)
print(id2label)

{'B-Taxon': 0, 'I-Taxon': 1, 'B-GeographicalLocation': 2, 'I-GeographicalLocation': 3, 'B-Habitat': 4, 'I-Habitat': 5, 'O': 6}
{0: 'B-Taxon', 1: 'I-Taxon', 2: 'B-GeographicalLocation', 3: 'I-GeographicalLocation', 4: 'B-Habitat', 5: 'I-Habitat', 6: 'O'}


In [None]:
# Short task name; it is used to build the Trainer output directory name.
task = "ner"
model_checkpoint = model_src

# Load the tokenizer published with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and expand word-level tags to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "labels" (one parallel list of tag strings per sentence).

    Returns:
        The tokenizer output with an added "labels" entry: one list of class
        ids per sentence, aligned with the produced sub-tokens. Positions
        without a source word (word id ``None``) get -100, the value that the
        metric code in this notebook filters out.

    Raises:
        KeyError: If a tag is not present in ``label2id``.
    """
    # True: every sub-token of a word inherits the word's tag.
    # False: only the first sub-token is labelled, the rest get -100.
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), max_length=512, truncation=True, is_split_into_words=True)

    outside_tag = 'O'
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                tag = label[word_idx]
                # A digit zero is treated as a mistyped outside tag. It must map
                # to the id of 'O'; the literal id 0 is 'B-Taxon' in label2id.
                if tag == '0':
                    tag = outside_tag
                if word_idx != previous_word_idx or label_all_tokens:
                    label_ids.append(label2id[tag])
                else:
                    label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Tokenize every split and attach the aligned label ids, processing in batches.
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
dev_tokenized_datasets = dev_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/530 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

## Fine-tuning the model on the dataset

In [None]:
# Load the pretrained checkpoint with a token-classification head; the head's
# weights are newly initialised and the label maps are stored in the config.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

args = TrainingArguments(
    f"test-{task}",                  # output directory for checkpoints
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    # NOTE(review): newer transformers releases rename this option to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    num_train_epochs=3,
    # Explicitly disable external logging integrations instead of relying only
    # on the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

# Pads inputs and labels of each batch to a common length.
data_collator = DataCollatorForTokenClassification(tokenizer)
# seqeval computes entity-level precision / recall / F1; loaded through
# `evaluate.load` (the earlier `datasets.load_metric` call is retired).
metric = load("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: Pair ``(predictions, labels)`` where ``predictions`` holds per-class
            scores whose class axis is axis 2, and ``labels`` holds the gold
            class ids with -100 at positions to skip.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the metric's overall_* results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Bundle the model, hyper-parameters, tokenized splits, batch collation and
# metric function into one Trainer; the dev split is used for evaluation.
# NOTE(review): the captured warning on this call suggests one of these
# arguments is deprecated in the installed transformers release — confirm.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets,
    eval_dataset=dev_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at NoYo25/BiodivBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

  trainer = Trainer(


Remove any previously saved model directory so that the newly trained model is written to a clean location.

In [None]:
# Folder the fine-tuned model is saved to; anything left there from an earlier
# run is deleted first.
directory_path = "/content/drive/MyDrive/NER/nermodel/"

if not os.path.exists(directory_path):
    print(f"Directory '{directory_path}' does not exist.")
else:
    # Best effort: a failed cleanup is reported but does not stop the notebook.
    try:
        shutil.rmtree(directory_path)
        print(f"Directory '{directory_path}' removed successfully.")
    except Exception as e:
        print(f"Error removing directory '{directory_path}': {e}")

Directory '/content/drive/MyDrive/NER/nermodel/' removed successfully.


In [None]:
# Fine-tune the model, score the final weights on the dev split and save the
# model to directory_path.
# `os` is already imported and WANDB_DISABLED is already set in the setup cells
# at the top of the notebook, so neither is repeated here.
trainer.train()
eval_metrics = trainer.evaluate()
trainer.save_model(directory_path)

# Keep the final dev-set metrics visible as the cell output.
eval_metrics

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5706,0.434138,0.035996,0.043956,0.039579,0.836173
2,0.2424,0.307298,0.429762,0.495879,0.460459,0.884911
3,0.2609,0.223571,0.520442,0.646978,0.576852,0.920168


  _warn_prf(average, modifier, msg_start, len(result))


## Get precision, f1-score, and recall for each entity group

In [None]:
# Score the fine-tuned model on the test split.
predictions, labels, _ = trainer.predict(test_tokenized_datasets)
# Pick the highest-scoring class id for every position.
predictions = np.argmax(predictions, axis=2)

# Drop the positions labelled -100 and translate the remaining class ids back
# to tag strings, once for the predictions and once for the gold labels.
true_predictions = [
    [label_list[pred_id] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [label_list[gold_id] for _pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]

# The metric output contains one entry per entity type plus overall_* scores.
results = metric.compute(references=true_labels, predictions=true_predictions)
results

{'GeographicalLocation': {'precision': np.float64(0.6237864077669902),
  'recall': np.float64(0.7099447513812155),
  'f1': np.float64(0.6640826873385013),
  'number': np.int64(362)},
 'Habitat': {'precision': np.float64(0.21621621621621623),
  'recall': np.float64(0.14035087719298245),
  'f1': np.float64(0.1702127659574468),
  'number': np.int64(57)},
 'Taxon': {'precision': np.float64(0.39652173913043476),
  'recall': np.float64(0.5861182519280206),
  'f1': np.float64(0.4730290456431535),
  'number': np.int64(389)},
 'overall_precision': np.float64(0.4814453125),
 'overall_recall': np.float64(0.6101485148514851),
 'overall_f1': np.float64(0.5382096069868996),
 'overall_accuracy': 0.9213073279792898}