<a href="https://colab.research.google.com/github/Jaquelinedops/Benchmarking_NER/blob/main/Benchmarking_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install seqeval



In [2]:
import warnings
from collections import Counter

import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import torch
from seqeval.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# Ignore every Python warning raised from this point on (the filter is
# process-wide); presumably meant to keep library warnings out of the outputs.
warnings.filterwarnings('ignore')


In [3]:
# Report whether a CUDA GPU is visible to PyTorch and, if so, its name and
# total memory in GiB.
cuda_ok = torch.cuda.is_available()
print("Is CUDA available?:", cuda_ok)
if not cuda_ok:
    print("CUDA not available. GPU not detected.")
else:
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print("GPU Name:", torch.cuda.get_device_name(0))
    print("Total VRAM:", round(gpu_total_bytes / 1024**3, 2), "GB")

# Device used by the rest of the notebook: the GPU when present, else the CPU.
device = torch.device('cuda') if cuda_ok else torch.device('cpu')
print(f"Using device: {device}")

Is CUDA available?: True
GPU Name: Tesla T4
Total VRAM: 14.74 GB
Using device: cuda


In [None]:

# Load the test split of the conll2003 dataset together with its metadata.
try:
    dataset_tfds, info = tfds.load(
        "conll2003",
        split='test',
        as_supervised=False,
        with_info=True
    )
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Re-raise instead of calling exit(): exit() asks the interpreter/kernel to
    # shut down and hides the traceback, whereas re-raising stops the run at
    # this cell and shows what went wrong.
    raise
else:
    test_ds = dataset_tfds
    # Names of the NER tags, indexed by the integer ids stored in the dataset.
    label_list = info.features['ner'].feature.names
    print("Dataset 'conll2003' loaded successfully.")

# Models included in the benchmark: display name -> Hugging Face Hub model id.
# The first key was misspelled "XML-RoBERTa"; the checkpoint is XLM-RoBERTa.
modelos = {
    "XLM-RoBERTa": "Davlan/xlm-roberta-base-ner-hrl",
    "BERT-Davlan": "Davlan/bert-base-multilingual-cased-ner-hrl",
    "BERT": "dslim/bert-base-NER",
    # Currently disabled; uncomment to add it to the benchmark.
    #"RoBERTa": "Jean-Baptiste/roberta-large-ner-english",
}

# --- Seção 2: Função para Avaliação ---

def _alinhar_entidades_a_tokens(tokens, entidades):
    """Project character-level entity predictions onto tokens as BIO tags.

    Args:
        tokens: List of token strings. The text that produced ``entidades``
            must be exactly ``" ".join(tokens)``, because token character
            spans are reconstructed from that layout.
        entidades: Iterable of dicts with the keys ``entity_group``, ``start``
            and ``end`` (character offsets into the joined text, end-exclusive),
            as returned by an aggregated token-classification pipeline.

    Returns:
        A list with one tag per token: ``B-<type>`` for the first token an
        entity overlaps, ``I-<type>`` for the following ones, ``'O'`` otherwise.

    Raises:
        ValueError: If an entity has no character offsets, since it cannot be
            aligned reliably without them.
    """
    tags = ['O'] * len(tokens)

    # Character span [start, end) of each token inside " ".join(tokens).
    spans = []
    cursor = 0
    for token in tokens:
        spans.append((cursor, cursor + len(token)))
        cursor += len(token) + 1  # +1 skips the joining space

    for entidade in entidades:
        inicio = entidade.get('start')
        fim = entidade.get('end')
        if inicio is None or fim is None:
            raise ValueError(
                "NER prediction without character offsets; "
                "offsets are required to align predictions to tokens."
            )

        prefixo = 'B'
        for i, (tok_inicio, tok_fim) in enumerate(spans):
            if tok_inicio >= fim:
                break  # tokens are in text order; nothing further can overlap
            if tok_fim <= inicio:
                continue  # token ends before the entity starts
            if tags[i] != 'O':
                continue  # token already claimed by an earlier prediction
            tags[i] = f"{prefixo}-{entidade['entity_group']}"
            prefixo = 'I'

    return tags


def avaliar_modelo_ner(model_name, dataset_para_teste, label_list, device):
    """Evaluate one token-classification model on a tokenized NER dataset.

    Loads the model and tokenizer, runs the NER pipeline on each example
    (tokens joined by single spaces), and aligns the predicted entities back
    to the original tokens through their character offsets.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        dataset_para_teste: Iterable of examples. ``exemplo['tokens']`` and
            ``exemplo['ner']`` must support ``.numpy().tolist()`` and yield
            UTF-8 byte strings and integer tag ids respectively.
        label_list: Sequence mapping integer tag ids to BIO tag names.
        device: Device the pipeline should run on. It is forwarded to
            ``pipeline`` unchanged, so the function also works on CPU.

    Returns:
        Tuple ``(relatorio, kappa)``: the seqeval classification report as a
        dict, and the token-level Cohen's kappa between gold and predicted tags.
    """
    print(f"\n--- Avaliando o Modelo: {model_name} ---")

    # Load the tokenizer and the model; the pipeline moves the model to `device`.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    # Honour the caller's device instead of hard-coding GPU 0.
    ner_pipeline = pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
        device=device
    )

    # Per-sentence gold and predicted tag sequences (the format seqeval expects).
    verdades_seqeval = []
    predicoes_seqeval = []

    for exemplo in dataset_para_teste:
        tokens = [t.decode('utf-8') for t in exemplo['tokens'].numpy().tolist()]
        tags_verdadeiras_ids = exemplo['ner'].numpy().tolist()

        # Map the gold tags from ids to names for seqeval.
        tags_verdadeiras_str = [label_list[tag_id] for tag_id in tags_verdadeiras_ids]

        texto = " ".join(tokens)
        # Nothing to predict for an empty sentence; skip the model call.
        preds = ner_pipeline(texto) if texto.strip() else []

        verdades_seqeval.append(tags_verdadeiras_str)
        predicoes_seqeval.append(_alinhar_entidades_a_tokens(tokens, preds))

    # Entity-level report.
    relatorio = classification_report(verdades_seqeval, predicoes_seqeval, digits=5, output_dict=True)

    # Cohen's kappa is computed over the flattened token-level tags.
    verdades_flattened = [tag for frase in verdades_seqeval for tag in frase]
    predicoes_flattened = [tag for frase in predicoes_seqeval for tag in frase]
    kappa = cohen_kappa_score(verdades_flattened, predicoes_flattened)

    return relatorio, kappa

# --- Seção 3: Execução do Benchmark e Exibição dos Resultados ---

# Detect the available device (GPU or CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

temp = {}
# One entry per evaluated model. This must be a list: the loop appends to it,
# and a dict has no append().
relatorio_final = []
for nome_modelo, model_id in modelos.items():
    relatorio, kappa_score = avaliar_modelo_ner(model_id, test_ds, label_list, device)

    temp = {
        'nome_modelo': nome_modelo,
        'relatorio': relatorio,
        'kappa_score': kappa_score
    }
    relatorio_final.append(temp)

    # Build the table from the classification report itself (one row per
    # label/average, one column per metric), not from the wrapper dict.
    df_relatorio = pd.DataFrame(relatorio).transpose()
    df_relatorio = df_relatorio.round(4)

    print(f"\n=====================================================")
    print(f"RELATÓRIO DE AVALIAÇÃO PARA O MODELO: {nome_modelo}")
    print(df_relatorio)
    print("=====================================================")
    print(f"Cohen's Kappa Score: {kappa_score:.4f}")
    print("=====================================================\n")



Dataset 'conll2003' loaded successfully.
Using device: cuda

--- Avaliando o Modelo: Davlan/xlm-roberta-base-ner-hrl ---


Device set to use cuda:0
  return forward_call(*args, **kwargs)
