<a href="https://colab.research.google.com/github/Jaquelinedops/Benchmarking_NER/blob/main/Benchmarking_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install seqeval



In [2]:

import tensorflow_datasets as tfds
import tensorflow as tf
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from seqeval.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from collections import Counter
import pandas as pd
# Load the conll2003 dataset
ds, info = tfds.load(
    'conll2003',
    split=['train', 'test'], # The splits are 'train', 'validation', and 'test'
    with_info=True,
    as_supervised=False # Set to True if you want a (features, label) tuple
)



In [3]:
train_ds = ds[0]
test_ds = ds[1]

In [4]:
# Use this to check for CUDA availability and move all related calls inside
print("Is CUDA available?:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))
    print("Total VRAM:", round(torch.cuda.get_device_properties(0).total_memory / 1024**3, 2), "GB")
else:
    print("CUDA not available. GPU not detected.")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Is CUDA available?: True
GPU Name: Tesla T4
Total VRAM: 14.74 GB
Using device: cuda


In [5]:
try:
    dataset_tfds, info = tfds.load(
        "conll2003",
        split='test',
        as_supervised=False,
        with_info=True
    )
    test_ds = dataset_tfds
    label_list = info.features['ner'].feature.names
    print("Dataset 'conll2003' loaded successfully.")
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit()

# Dicionário de modelos para o benchmark
modelos = {
    "XML-RoBERTa": "Davlan/xlm-roberta-base-ner-hrl",
    "BERT-Davlan": "Davlan/bert-base-multilingual-cased-ner-hrl",
    "BERT": "dslim/bert-base-NER",
    "RoBERTa": "Jean-Baptiste/roberta-large-ner-english",
}

# --- Seção 2: Função para Avaliação ---

def avaliar_modelo_ner(model_name, dataset_para_teste, label_list, device):
    """
    Carrega um modelo e seu tokenizer, roda o pipeline de NER
    no dataset de teste e retorna um relatório de classificação detalhado
    e o score Cohen's Kappa.
    """
    print(f"\n--- Avaliando o Modelo: {model_name} ---")

    # Carrega o tokenizer e o modelo
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)

    # Cria o pipeline de NER, passando o dispositivo explicitamente
    device_id = 0 if device.type == 'cuda' else -1
    ner_pipeline = pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
        device=device_id
    )

    # Listas para armazenar as entidades preditas e as verdadeiras
    # We will need two sets of lists: one for seqeval and one flattened for Cohen's Kappa
    verdades_seqeval = []
    predicoes_seqeval = []

    verdades_flattened = []
    predicoes_flattened = []

    # Processa cada exemplo no dataset de teste
    for exemplo in dataset_para_teste:
        tokens = [t.decode('utf-8') for t in exemplo['tokens'].numpy().tolist()]
        tags_verdadeiras_ids = exemplo['ner'].numpy().tolist()

        # Map the true tags from numbers to names for seqeval
        tags_verdadeiras_str = [label_list[tag_id] for tag_id in tags_verdadeiras_ids]

        texto = " ".join(tokens)
        preds = ner_pipeline(texto)

        # Format predictions for seqeval and flatten for Cohen's Kappa
        preds_seq_level = ['O'] * len(tokens)
        preds_kappa_level = ['O'] * len(tokens) # We'll start with 'O' tags

        # Logic to map the aggregated pipeline predictions to the token-level format
        # This part of your code seems complex and potentially brittle.
        # A more robust approach might be to not use the 'simple' aggregation
        # and process token-level predictions directly from the model output,
        # but for now, we'll try to keep your logic and ensure it provides
        # a flattened list for Kappa.

        current_char_idx = 0
        for pred in preds:
            word = pred['word']
            entity_type = pred['entity_group']

            # This is your token mapping logic. Let's assume it works for now.
            token_start_index = -1
            temp_text = ""
            for i, token in enumerate(tokens):
                if word.startswith(temp_text + token):
                    if token_start_index == -1:
                        token_start_index = i
                    temp_text += token
                    if temp_text == word:
                        break
                else:
                    temp_text = ""
                    token_start_index = -1

            if token_start_index != -1:
                preds_seq_level[token_start_index] = f"B-{entity_type}"
                preds_kappa_level[token_start_index] = f"B-{entity_type}" # Save for Kappa
                for i in range(token_start_index + 1, len(tokens)):
                    if word.endswith("".join(tokens[token_start_index:i+1])):
                        preds_seq_level[i] = f"I-{entity_type}"
                        preds_kappa_level[i] = f"I-{entity_type}" # Save for Kappa
                    else:
                        break

        verdades_seqeval.append(tags_verdadeiras_str)
        predicoes_seqeval.append(preds_seq_level)

        verdades_flattened.extend(tags_verdadeiras_str)
        predicoes_flattened.extend(preds_kappa_level)

    # Generate the detailed classification report
    relatorio = classification_report(verdades_seqeval, predicoes_seqeval, digits=5, output_dict=True)

    # Calculate Cohen's Kappa
    kappa = cohen_kappa_score(verdades_flattened, predicoes_flattened)

    return relatorio, kappa

# --- Seção 3: Execução do Benchmark e Exibição dos Resultados ---

# Detectar o dispositivo disponível (GPU ou CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
relatorio_final = []
for nome_modelo, model_id in modelos.items():
    relatorio, kappa_score = avaliar_modelo_ner(model_id, test_ds, label_list, device)
    df_relatorio = pd.DataFrame(relatorio).transpose()
    df_relatorio = df_relatorio.round(4)
    print(f"\n=====================================================")
    print(f"RELATÓRIO DE AVALIAÇÃO PARA O MODELO: {nome_modelo}")
    print(df_relatorio)
    print("=====================================================")
    print("=====================================================")
    print(f"Cohen's Kappa Score: {kappa_score:.4f}")
    print("=====================================================\n")
    relatorio_final.append(relatorio)

Dataset 'conll2003' loaded successfully.
Using device: cuda

--- Avaliando o Modelo: Davlan/xlm-roberta-base-ner-hrl ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0
  return forward_call(*args, **kwargs)
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
  _warn_prf(average, modifier, msg_start, len(result))



RELATÓRIO DE AVALIAÇÃO PARA O MODELO: XML-RoBERTa
              precision  recall  f1-score  support
LOC              0.9578  0.7614    0.8484   1668.0
MISC             0.0000  0.0000    0.0000    702.0
ORG              0.8925  0.5250    0.6611   1661.0
PER              0.9202  0.2566    0.4014   1617.0
micro avg        0.9285  0.4527    0.6087   5648.0
macro avg        0.6926  0.3858    0.4777   5648.0
weighted avg     0.8088  0.4527    0.5599   5648.0
Cohen's Kappa Score: 0.4547


--- Avaliando o Modelo: Davlan/bert-base-multilingual-cased-ner-hrl ---


Device set to use cuda:0
  return forward_call(*args, **kwargs)
  _warn_prf(average, modifier, msg_start, len(result))



RELATÓRIO DE AVALIAÇÃO PARA O MODELO: BERT-Davlan
              precision  recall  f1-score  support
LOC              0.9432  0.7560    0.8393   1668.0
MISC             0.0000  0.0000    0.0000    702.0
ORG              0.8874  0.5647    0.6902   1661.0
PER              0.9367  0.2746    0.4247   1617.0
micro avg        0.9215  0.4680    0.6207   5648.0
macro avg        0.6918  0.3988    0.4885   5648.0
weighted avg     0.8077  0.4680    0.5724   5648.0
Cohen's Kappa Score: 0.4688


--- Avaliando o Modelo: dslim/bert-base-NER ---


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
  return forward_call(*args, **kwargs)



RELATÓRIO DE AVALIAÇÃO PARA O MODELO: BERT
              precision  recall  f1-score  support
LOC              0.9457  0.6781    0.7898   1668.0
MISC             0.8665  0.5271    0.6554    702.0
ORG              0.8927  0.5111    0.6501   1661.0
PER              0.9425  0.1824    0.3057   1617.0
micro avg        0.9162  0.4683    0.6198   5648.0
macro avg        0.9118  0.4747    0.6003   5648.0
weighted avg     0.9194  0.4683    0.5934   5648.0
Cohen's Kappa Score: 0.4693


--- Avaliando o Modelo: Jean-Baptiste/roberta-large-ner-english ---


Device set to use cuda:0
  return forward_call(*args, **kwargs)
  _warn_prf(average, modifier, msg_start, len(result))



RELATÓRIO DE AVALIAÇÃO PARA O MODELO: RoBERTa
              precision  recall  f1-score  support
LOC                 0.0     0.0       0.0   1668.0
MISC                0.0     0.0       0.0    702.0
ORG                 0.0     0.0       0.0   1661.0
PER                 0.0     0.0       0.0   1617.0
micro avg           0.0     0.0       0.0   5648.0
macro avg           0.0     0.0       0.0   5648.0
weighted avg        0.0     0.0       0.0   5648.0
Cohen's Kappa Score: -0.0000

