# Evaluation of each checkpoint of the fine-tuned NER model

## Evaluation Script

In [2]:
import torch
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score

# Load tokenizer and model
def load_finetuned_model(model_path, model_id, num_labels=3):
    """Load a locally stored token-classification model together with its tokenizer.

    Args:
        model_path: Directory passed to both ``from_pretrained`` calls. Loading
            uses ``local_files_only=True``, so nothing is fetched from the Hub.
        model_id: Accepted for interface compatibility only; this function
            never reads it.
        num_labels: Number of labels forwarded to the model loader (default 3).
            The label maps passed alongside it are always the fixed O/B/I maps.

    Returns:
        A ``(model, tokenizer, device)`` tuple. ``device`` is ``"cuda"`` when
        ``torch.cuda.is_available()`` is true and ``"cpu"`` otherwise, and the
        model has already been moved onto it.
    """
    tag_names = ["O", "B", "I"]
    index_to_tag = dict(enumerate(tag_names))
    tag_to_index = {tag: index for index, tag in index_to_tag.items()}

    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    # Reuse EOS for padding when the tokenizer ships without a pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # NOTE(review): the outputs recorded in this notebook warn that
    # 'score.weight' / 'score.bias' were newly initialized at load time;
    # confirm the checkpoint directory really provides the classification head.
    model = AutoModelForTokenClassification.from_pretrained(
        model_path,
        num_labels=num_labels,
        id2label=index_to_tag,
        label2id=tag_to_index,
        local_files_only=True,  # never look on the Hugging Face Hub
    )

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    model.to(device)

    return model, tokenizer, device

# Predict function
def predict_ner(tokens, model, tokenizer, device):
    """Predict one O/B/I tag per input word by majority vote over its sub-tokens.

    Args:
        tokens: The pre-split words of a single sentence.
        model: Callable invoked as ``model(**inputs)``; its result must expose
            ``.logits``. Only the first sequence of the batch is read.
        tokenizer: Callable tokenizer invoked with ``is_split_into_words=True``;
            its result must expose ``word_ids()`` and ``items()``.
        device: Device the encoded tensors are moved to before the forward pass.

    Returns:
        A list of tag strings where position ``i`` is the prediction for
        ``tokens[i]``. The list covers word indices ``0`` up to the highest word
        index the tokenizer reported, so it can be shorter than ``tokens`` when
        trailing words were truncated away. A word inside that range that
        produced no sub-tokens is tagged ``"O"`` so later predictions stay
        aligned with their words. Ties in the vote go to the lowest label id.
    """
    # Nothing to tag; avoid calling the tokenizer/model on an empty sentence.
    if len(tokens) == 0:
        return []

    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True, padding=True)
    # word_ids() must be read before the encoding is turned into a plain dict.
    word_ids = inputs.word_ids()
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = logits.argmax(dim=-1)[0].cpu().tolist()

    id2label = {0: "O", 1: "B", 2: "I"}

    # Collect sub-token predictions keyed by the word they belong to. Keying by
    # word id (instead of appending whenever the id changes) keeps the output
    # index-aligned with `tokens` even if some word yields no sub-tokens.
    votes_by_word = {}
    for position, word_id in enumerate(word_ids):
        if word_id is None:  # special tokens / padding
            continue
        votes_by_word.setdefault(word_id, []).append(predictions[position])

    if not votes_by_word:
        return []

    word_preds = []
    for word_index in range(max(votes_by_word) + 1):
        votes = votes_by_word.get(word_index)
        if not votes:
            # Word produced no sub-tokens: emit the neutral tag as a placeholder.
            word_preds.append("O")
            continue
        # sorted() makes the tie-break explicit: the lowest label id wins.
        winner = max(sorted(set(votes)), key=votes.count)
        word_preds.append(id2label[winner])

    return word_preds

# Evaluation pipeline
def evaluate_model(model_path, model_id="meta-llama/Llama-3.2-3B-Instruct", num_samples=None,
                   output_csv="finetuned_species_ner_metrics.csv"):
    """Evaluate a saved NER model on the ``test`` split of ``spyysalo/species_800``.

    Loads the model with ``load_finetuned_model``, tags every sentence with
    ``predict_ner``, computes seqeval precision / recall / F1 plus the
    classification report, prints them and writes a one-row CSV summary.

    Args:
        model_path: Local directory of the model/checkpoint to evaluate.
        model_id: Forwarded unchanged to ``load_finetuned_model``.
        num_samples: If truthy, evaluate only the first ``num_samples`` examples
            (capped at the split size). ``None`` or ``0`` evaluates everything.
        output_csv: Path of the CSV summary. The default is shared by every
            call, so pass a distinct path per checkpoint to keep all results.

    Returns:
        None. Results are printed and written to ``output_csv``.
    """
    model, tokenizer, device = load_finetuned_model(model_path, model_id)

    dataset = load_dataset("spyysalo/species_800")["test"]
    if num_samples:
        # Cap at the split size so an oversized request does not raise in select().
        dataset = dataset.select(range(min(num_samples, len(dataset))))

    label_list = ["O", "B", "I"]
    y_true, y_pred = [], []

    print(f"Evaluating on {len(dataset)} samples...")

    for example in tqdm(dataset):
        tokens = example["tokens"]
        true_labels = [label_list[i] for i in example["ner_tags"]]
        pred_labels = predict_ner(tokens, model, tokenizer, device)

        # Score every gold token. Words the model never saw (e.g. cut off by
        # truncation) count as "O" predictions instead of being dropped from
        # the evaluation, which would hide missed entities.
        missing = len(true_labels) - len(pred_labels)
        if missing > 0:
            pred_labels = pred_labels + ["O"] * missing
        y_true.append(true_labels)
        y_pred.append(pred_labels[:len(true_labels)])

    # Metrics
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)

    # Output
    print("\n--- Final Evaluation Metrics ---")
    print(f"Precision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1 Score:  {f1:.3f}")
    print("\nDetailed Report:\n", report)

    # Save summary; model_path is recorded so the row identifies its checkpoint.
    pd.DataFrame([{
        "model_path": model_path,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }]).to_csv(output_csv, index=False)

### Checkpoint 360

In [2]:
# Evaluate the saved checkpoint-360 directory.
evaluate_model(
    model_path="./NER_finetuned_models/species_ner_model_llama_3-2-3B/checkpoint-360",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B-Instruct and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Could not load bitsandbytes native library: /usr/lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /root/envs/mahshid_thesis/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda126.so)
Traceback (most recent call last):
  File "/root/envs/mahshid_thesis/lib/python3.11/site-packages/bitsandbytes/cextension.py", line 85, in <module>
    lib = get_native_library()
          ^^^^^^^^^^^^^^^^^^^^
  File "/root/envs/mahshid_thesis/lib/python3.11/site-packages/bitsandbytes/cextension.py", line 72, in get_native_library
    dll = ct.cdll.LoadLibrary(str(binary_path))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/envs/mahshid_thesis/lib/python3.11/ctype

Evaluating on 1631 samples...


100%|███████████████████████████████████████| 1631/1631 [01:11<00:00, 22.67it/s]



--- Final Evaluation Metrics ---
Precision: 0.462
Recall:    0.501
F1 Score:  0.480

Detailed Report:
               precision    recall  f1-score   support

           _       0.46      0.50      0.48       767

   micro avg       0.46      0.50      0.48       767
   macro avg       0.46      0.50      0.48       767
weighted avg       0.46      0.50      0.48       767



In [3]:
# Evaluate the saved checkpoint-720 directory.
evaluate_model(
    model_path="./NER_finetuned_models/species_ner_model_llama_3-2-3B/checkpoint-720",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B-Instruct and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating on 1631 samples...


100%|███████████████████████████████████████| 1631/1631 [01:04<00:00, 25.41it/s]



--- Final Evaluation Metrics ---
Precision: 0.469
Recall:    0.536
F1 Score:  0.500

Detailed Report:
               precision    recall  f1-score   support

           _       0.47      0.54      0.50       767

   micro avg       0.47      0.54      0.50       767
   macro avg       0.47      0.54      0.50       767
weighted avg       0.47      0.54      0.50       767



In [4]:
# Evaluate the model saved in the top-level fine-tuned model directory.
evaluate_model(
    model_path="./NER_finetuned_models/species_ner_finetuned_model_llama_3-2-3B",
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B-Instruct and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating on 1631 samples...


100%|███████████████████████████████████████| 1631/1631 [01:03<00:00, 25.52it/s]



--- Final Evaluation Metrics ---
Precision: 0.462
Recall:    0.501
F1 Score:  0.480

Detailed Report:
               precision    recall  f1-score   support

           _       0.46      0.50      0.48       767

   micro avg       0.46      0.50      0.48       767
   macro avg       0.46      0.50      0.48       767
weighted avg       0.46      0.50      0.48       767



In [5]:
# Evaluate the saved checkpoint-1080 directory.
evaluate_model(
    model_path="./NER_finetuned_models/species_ner_model_llama_3-2-3B/checkpoint-1080",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B-Instruct and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating on 1631 samples...


100%|███████████████████████████████████████| 1631/1631 [01:04<00:00, 25.20it/s]



--- Final Evaluation Metrics ---
Precision: 0.444
Recall:    0.437
F1 Score:  0.440

Detailed Report:
               precision    recall  f1-score   support

           _       0.44      0.44      0.44       767

   micro avg       0.44      0.44      0.44       767
   macro avg       0.44      0.44      0.44       767
weighted avg       0.44      0.44      0.44       767



In [3]:
# Evaluate the saved checkpoint-180 directory.
evaluate_model(
    model_path="./NER_finetuned_models/species_ner_model_llama_3-2-3B/checkpoint-180",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B-Instruct and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Could not load bitsandbytes native library: /usr/lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /root/envs/mahshid_thesis/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda126.so)
Traceback (most recent call last):
  File "/root/envs/mahshid_thesis/lib/python3.11/site-packages/bitsandbytes/cextension.py", line 85, in <module>
    lib = get_native_library()
          ^^^^^^^^^^^^^^^^^^^^
  File "/root/envs/mahshid_thesis/lib/python3.11/site-packages/bitsandbytes/cextension.py", line 72, in get_native_library
    dll = ct.cdll.LoadLibrary(str(binary_path))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/envs/mahshid_thesis/lib/python3.11/ctype

Evaluating on 1631 samples...


100%|███████████████████████████████████████| 1631/1631 [01:03<00:00, 25.68it/s]



--- Final Evaluation Metrics ---
Precision: 0.387
Recall:    0.399
F1 Score:  0.393

Detailed Report:
               precision    recall  f1-score   support

           _       0.39      0.40      0.39       767

   micro avg       0.39      0.40      0.39       767
   macro avg       0.39      0.40      0.39       767
weighted avg       0.39      0.40      0.39       767



In [4]:
# Evaluate the saved checkpoint-540 directory.
evaluate_model(
    model_path="./NER_finetuned_models/species_ner_model_llama_3-2-3B/checkpoint-540",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B-Instruct and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating on 1631 samples...


100%|███████████████████████████████████████| 1631/1631 [01:04<00:00, 25.33it/s]



--- Final Evaluation Metrics ---
Precision: 0.441
Recall:    0.421
F1 Score:  0.431

Detailed Report:
               precision    recall  f1-score   support

           _       0.44      0.42      0.43       767

   micro avg       0.44      0.42      0.43       767
   macro avg       0.44      0.42      0.43       767
weighted avg       0.44      0.42      0.43       767



In [5]:
# Evaluate the saved checkpoint-900 directory.
evaluate_model(
    model_path="./NER_finetuned_models/species_ner_model_llama_3-2-3B/checkpoint-900",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B-Instruct and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating on 1631 samples...


100%|███████████████████████████████████████| 1631/1631 [01:03<00:00, 25.67it/s]



--- Final Evaluation Metrics ---
Precision: 0.438
Recall:    0.426
F1 Score:  0.432

Detailed Report:
               precision    recall  f1-score   support

           _       0.44      0.43      0.43       767

   micro avg       0.44      0.43      0.43       767
   macro avg       0.44      0.43      0.43       767
weighted avg       0.44      0.43      0.43       767

