# 🔎 Análise de Consenso entre LLMs

## 1) Setup e Configuração

In [118]:
import sys
from loguru import logger
import pandas as pd
from src.api.schemas.experiment import ExperimentRequest

# Console layout: green HH:mm:ss timestamp, level name left-aligned to 8 columns,
# then the message; level and message are wrapped in loguru's <level> color tag.
LOG_FORMAT = (
    "<green>{time:HH:mm:ss}</green>"
    " | <level>{level: <8}</level>"
    " | <level>{message}</level>"
)

# Reset loguru's handlers before registering the stdout sink, so re-running this
# cell does not stack duplicate sinks. Only INFO and above is emitted.
logger.remove()
logger.add(sys.stdout, level="INFO", format=LOG_FORMAT)

logger.success("‚úì Setup completo")

[32m21:40:49[0m | [32m[1mSUCCESS [0m | [32m[1m‚úì Setup completo[0m


In [119]:
import json
from pathlib import Path

# Name of the experiment definition to analyse (file stem under ../experiments).
experiment = "base_experiment_local"

# Experiment definitions are resolved relative to the notebook's working directory.
config_path = Path(f"../experiments/{experiment}.json")

# Decode the JSON explicitly as UTF-8 instead of relying on the platform default
# encoding (e.g. cp1252 on Windows), which would silently corrupt accented
# characters in text fields of the configuration.
with open(config_path, "r", encoding="utf-8") as f:
    config_dict = json.load(f)

# Validate the raw dictionary by instantiating the Pydantic request model.
EXPERIMENT_CONFIG = ExperimentRequest(**config_dict)

### - Modelos e prompt

In [120]:
from src.api.services.prompt_factory import get_prompt_template

# Models configured for this experiment.
DEFAULT_MODELS = EXPERIMENT_CONFIG.models

# Build the prompt template from the configured prompt type and custom prompt.
# (A previous throw-away assignment of the prompt *type* to PROMPT_TEMPLATE was
# removed: it was overwritten immediately and briefly bound the wrong kind of value.)
PROMPT_TEMPLATE = get_prompt_template(
    EXPERIMENT_CONFIG.prompt_type,
    EXPERIMENT_CONFIG.custom_prompt,
)

### - Configurações de consenso

In [121]:
# Consensus settings used by this analysis (defined inline in the notebook).
consensus_cfg = dict(
    threshold=0.8,
    strategy="majority_vote",
    no_consensus_strategy="flag_for_review",
)

# Unpack the settings into notebook-level names; each lookup falls back to the
# same default value should a key ever be removed from the dictionary above.
consensus_threshold, consensus_strategy, no_consensus_strategy = (
    consensus_cfg.get(key, fallback)
    for key, fallback in (
        ("threshold", 0.8),
        ("strategy", "majority_vote"),
        ("no_consensus_strategy", "flag_for_review"),
    )
)

### - Configurações de dataset

In [122]:
# Dataset section of the experiment configuration, unpacked into notebook-level names.
dataset_cfg = EXPERIMENT_CONFIG.dataset_config

dataset_split, combine_splits, sample_size, random_state = (
    dataset_cfg.split,
    dataset_cfg.combine_splits,
    dataset_cfg.sample_size,
    dataset_cfg.random_state,
)

### - Configurações de cache

In [123]:
# Cache section of the experiment configuration: on/off flag and cache directory.
cache_cfg = EXPERIMENT_CONFIG.cache

cache_enabled, cache_dir = cache_cfg.enabled, cache_cfg.dir

### - Resultados

In [124]:
# Results section of the experiment configuration, unpacked into notebook-level names.
results_cfg = EXPERIMENT_CONFIG.results

save_intermediate, intermediate, results_dir = (
    results_cfg.save_intermediate,
    results_cfg.intermediate,
    results_cfg.dir,
)

In [None]:
# Run to analyse: results are stored under <results dir>/<dataset_name>/<specific_date>.
# NOTE(review): the outputs saved in this notebook were produced for "agnews";
# confirm that the dataset name below matches the run you intend to analyse.
dataset_name = "yelp_reviews"
specific_date = "2025-12-27"

# Derive the base directory from the configuration object rather than from the
# `results_dir` variable itself: that name is rebound later in the notebook (export
# section), so reading it here would make this cell produce a wrong, nested path
# when re-run after the export cell.
results_dir = Path(results_cfg.dir)
results_dataset_path = results_dir.joinpath(dataset_name, specific_date)

## 2) Carregar dados

### - Dataset

In [None]:
from src.utils.data_loader import load_hf_dataset

# Load the texts, the category list and the ground-truth labels for the dataset.
texts, categories, ground_truth = load_hf_dataset(
    dataset_global_config=dataset_cfg,
    cache_dir=cache_dir,
    dataset_name=dataset_name,
)

# Short summary of what was loaded ("Sim"/"Não" reports whether ground truth is available).
for summary_line in (
    f"Textos: {len(texts)}",
    f"Categorias: {categories}",
    f"Ground truth: {'Sim' if ground_truth else 'N√£o'}",
):
    logger.info(summary_line)

[32m21:40:49[0m | [1mINFO    [0m | [1mCarregando dataset: agnews[0m
[32m21:40:49[0m | [1mINFO    [0m | [1mCombinando splits: ['train', 'test'][0m
[32m21:40:54[0m | [1mINFO    [0m | [1m  ‚úì train: 510400 exemplos[0m
[32m21:40:59[0m | [1mINFO    [0m | [1m  ‚úì test: 127600 exemplos[0m
[32m21:40:59[0m | [1mINFO    [0m | [1mTotal combinado: 638000 exemplos[0m
[32m21:40:59[0m | [1mINFO    [0m | [1mCategorias extra√≠das automaticamente: [0, 1, 2, 3][0m
[32m21:40:59[0m | [1mINFO    [0m | [1mAmostra reduzida para 5000 exemplos (seed=42)[0m
[32m21:40:59[0m | [1mINFO    [0m | [1mColuna de texto: text[0m
[32m21:40:59[0m | [1mINFO    [0m | [1mGround truth carregado da coluna 'label'[0m
[32m21:40:59[0m | [1mINFO    [0m | [1mTextos: 5000[0m
[32m21:40:59[0m | [1mINFO    [0m | [1mCategorias: [0, 1, 2, 3][0m
[32m21:40:59[0m | [1mINFO    [0m | [1mGround truth: Sim[0m


### - Anotações

In [None]:
# Read the per-model annotations saved for this run and attach the ground-truth
# labels as an extra column (aligned by row position).
annotations_csv = results_dataset_path / "annotations.csv"
df_annotations = pd.read_csv(annotations_csv).assign(ground_truth=ground_truth)
df_annotations

Unnamed: 0,text_id,text,deepseek-r1-8b_rep1,deepseek-r1-8b_consensus,deepseek-r1-8b_consensus_score,deepseek-r1-8b_annotation_time_sec,gemma3-4b_rep1,gemma3-4b_consensus,gemma3-4b_consensus_score,gemma3-4b_annotation_time_sec,...,mistral-7b_annotation_time_sec,llama2-7b_rep1,llama2-7b_consensus,llama2-7b_consensus_score,llama2-7b_annotation_time_sec,llama3.1-8b_rep1,llama3.1-8b_consensus,llama3.1-8b_consensus_score,llama3.1-8b_annotation_time_sec,ground_truth
0,0,"""Family appeals for release of UK hostage"",""Th...",0,0,1.0,112.647519,2,2,1.0,175.541295,...,35.421645,2,2,1.0,67.424837,0,0,1.0,148.524105,0
1,1,"""Overcoming the piracy stigma in China"",""SHANG...",3,3,1.0,4.562144,2,2,1.0,0.440606,...,0.112560,2,2,1.0,0.115293,0,0,1.0,0.255689,3
2,2,"""Toshiba inspires breakthrough in hard disk st...",3,3,1.0,3.133863,2,2,1.0,0.419972,...,0.101397,2,2,1.0,0.111904,2,2,1.0,0.236625,3
3,3,"""Oracle moves to monthly patching schedule"",""W...",3,3,1.0,4.155114,2,2,1.0,0.426328,...,0.100936,2,2,1.0,0.108230,2,2,1.0,0.239776,3
4,4,"""Liquor Inhaler Debuts Alcohol-Free in NYC"",""N...",3,3,1.0,2.812132,2,2,1.0,0.424579,...,0.115799,2,2,1.0,0.121997,2,2,1.0,0.249027,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4995,"""Afghan Forces Catch Mullah Omar's Security Ch...",0,0,1.0,2.088234,1,1,1.0,0.435211,...,0.111510,2,2,1.0,0.114405,0,0,1.0,0.251697,0
4996,4996,"""World's pop stars band together for Myanmar's...",0,0,1.0,2.262565,0,0,1.0,0.423527,...,0.103464,2,2,1.0,0.112732,0,0,1.0,0.249706,0
4997,4997,"""Greek Orthodox leader dies in helicopter cras...",0,0,1.0,3.459361,0,0,1.0,0.405576,...,0.104014,2,2,1.0,0.109907,2,2,1.0,0.232801,0
4998,4998,"""Internet Explorer Loses More Market Share (Zi...",2,2,1.0,3.824344,2,2,1.0,0.400595,...,0.097065,2,2,1.0,0.105969,2,2,1.0,0.229967,3


### - Métricas

In [128]:
# Read the per-model metrics table saved alongside the annotations of this run.
metrics_csv = results_dataset_path / "model_metrics.csv"
df_metrics = pd.read_csv(metrics_csv)

df_metrics

Unnamed: 0,model,accuracy,f1_weighted,precision_weighted,recall_weighted,coverage,error_rate,invalid_predictions_rate
0,deepseek-r1-8b,0.8098,0.807279,0.82395,0.8098,1.0,0.1902,0.0
1,llama3.1-8b,0.6704,0.651208,0.741999,0.6704,0.9998,0.3296,0.0002
2,gemma3-4b,0.5852,0.530539,0.719842,0.5852,1.0,0.4148,0.0
3,mistral-7b,0.5344,0.501298,0.587932,0.5344,1.0,0.4656,0.0
4,llama2-7b,0.2742,0.194814,0.386184,0.2742,0.9982,0.7258,0.0018


## 3) Calcular Consenso

In [129]:
from src.llm_annotation_system.consensus.consensus_calculator import ConsensusCalculator
from src.llm_annotation_system.consensus.consensus_evaluator import ConsensusEvaluator

# Consensus calculator configured with the threshold and strategy chosen above.
consensus_calculator = ConsensusCalculator(
    default_strategy=consensus_strategy,
    consensus_threshold=consensus_threshold,
)

# Evaluator that uses the calculator; it is given the run folder as output directory.
analyzer = ConsensusEvaluator(
    output_dir=results_dataset_path,
    calculator=consensus_calculator,
    categories=categories,
)

# Annotations plus the consensus columns used in the rest of the notebook
# (e.g. 'consensus_score' and 'consensus_level').
df_with_consensus = analyzer.compute_consensus(df_annotations)

[32m21:40:59[0m | [1mINFO    [0m | [1mExecutando c√°lculo de consenso interno...[0m
[32m21:40:59[0m | [1mINFO    [0m | [1mCalculando consenso...[0m
[32m21:40:59[0m | [32m[1mSUCCESS [0m | [32m[1mConsenso calculado:[0m
[32m21:40:59[0m | [1mINFO    [0m | [1m  Alto (‚â•80%): 2459 (49.2%)[0m
[32m21:40:59[0m | [1mINFO    [0m | [1m  M√©dio (60-80%): 1585 (31.7%)[0m
[32m21:40:59[0m | [1mINFO    [0m | [1m  Baixo (<60%): 956 (19.1%)[0m
[32m21:40:59[0m | [1mINFO    [0m | [1m  Problem√°ticos: 771 (15.4%)[0m
[32m21:40:59[0m | [1mINFO    [0m | [1m  Itens que precisam de revis√£o: 0 (0.0%)[0m
[32m21:40:59[0m | [32m[1mSUCCESS [0m | [32m[1mC√°lculo de consenso finalizado.[0m


### - Estatísticas

In [130]:
# Mean, median and standard deviation of the consensus score, logged as percentages.
consensus_scores = df_with_consensus['consensus_score']

logger.info("\nüìä Estat√≠sticas de Consenso:")
logger.info(f"  M√©dia: {consensus_scores.mean():.2%}")
logger.info(f"  Mediana: {consensus_scores.median():.2%}")
logger.info(f"  Desvio padr√£o: {consensus_scores.std():.2%}")

[32m21:40:59[0m | [1mINFO    [0m | [1m
üìä Estat√≠sticas de Consenso:[0m
[32m21:40:59[0m | [1mINFO    [0m | [1m  M√©dia: 68.70%[0m
[32m21:40:59[0m | [1mINFO    [0m | [1m  Mediana: 60.00%[0m
[32m21:40:59[0m | [1mINFO    [0m | [1m  Desvio padr√£o: 18.94%[0m


### - Distribuição por nível

In [131]:
# Number of items per consensus level, logged with each level's share of the dataset.
levels = df_with_consensus['consensus_level'].value_counts()
total_items = len(df_with_consensus)

logger.info("Distribui√ß√£o por n√≠vel:")
for level_name, n_items in levels.items():
    logger.info(f"  {level_name}: {n_items} ({n_items/total_items:.1%})")

[32m21:40:59[0m | [1mINFO    [0m | [1mDistribui√ß√£o por n√≠vel:[0m
[32m21:40:59[0m | [1mINFO    [0m | [1m  high: 2459 (49.2%)[0m
[32m21:40:59[0m | [1mINFO    [0m | [1m  medium: 1585 (31.7%)[0m
[32m21:40:59[0m | [1mINFO    [0m | [1m  low: 956 (19.1%)[0m


### - Visualizar consenso

In [132]:
from src.llm_annotation_system.consensus.consensus_visualizer import ConsensusVisualizer

# Plot helper for the consensus analysis; it is given the run folder as its output
# directory (presumably where the generated charts are saved — confirm in ConsensusVisualizer).
visualizer = ConsensusVisualizer(output_dir=results_dataset_path)

In [133]:
# Chart built from the consensus data and the per-level counts computed above.
visualizer.plot_score_and_levels(levels=levels, df_with_consensus=df_with_consensus)

‚úì Gr√°fico salvo: score_and_levels.html


## 4) Análise Detalhada de Consenso

### - Gerando Report

In [134]:
# Build the consensus report; later cells read 'pairwise_agreement', 'cohens_kappa',
# 'problematic_cases', 'fleiss_kappa' and 'fleiss_interpretation' from it.
report = analyzer.generate_consensus_report(df=df_with_consensus)

logger.success("‚úì Relat√≥rio gerado")

[32m21:40:59[0m | [1mINFO    [0m | [1mGerando relat√≥rio completo de consenso...[0m
[32m21:40:59[0m | [1mINFO    [0m | [1mFleiss' Kappa: 0.263 (Fraco)[0m
[32m21:41:00[0m | [1mINFO    [0m | [1mCasos problem√°ticos: 956[0m
[32m21:41:00[0m | [32m[1mSUCCESS [0m | [32m[1mRelat√≥rio de consenso gerado com sucesso.[0m
[32m21:41:00[0m | [32m[1mSUCCESS [0m | [32m[1m‚úì Relat√≥rio gerado[0m


### - Pairwise_agreement

In [135]:
# Heatmap of the pairwise agreement between models, taken from the consensus report.
logger.info("\nüìä Gerando heatmap de concord√¢ncia...")
visualizer.plot_agreement_heatmap(
    title='Matriz de Concord√¢ncia entre Modelos',
    agreement_df=report['pairwise_agreement'],
)

[32m21:41:00[0m | [1mINFO    [0m | [1m
üìä Gerando heatmap de concord√¢ncia...[0m


‚úì Heatmap salvo: agreement_heatmap.html


### - Cohen's Kappa

In [136]:
# Heatmap of the Cohen's kappa table taken from the consensus report.
logger.info("\nüìä Gerando heatmap de Cohens_Kappa...")
visualizer.plot_kappa_heatmap(kappa_df=report['cohens_kappa'])

[32m21:41:00[0m | [1mINFO    [0m | [1m
üìä Gerando heatmap de Cohens_Kappa...[0m


‚úì Heatmap salvo: kappa_heatmap.html


### - Casos problemáticos

In [137]:
# Problematic cases listed in the report: show them when there are any,
# otherwise log that none were found (the key may be missing or the table empty).
problematic = report.get('problematic_cases')
if problematic is None or len(problematic) == 0:
    logger.success("\n‚úì Nenhum caso problem√°tico identificado")
else:
    logger.warning(f"\n‚ö†Ô∏è  {len(problematic)} casos problem√°ticos identificados")
    display(problematic)

‚ö†Ô∏è  956 casos problem√°ticos identificados[0m


Unnamed: 0,text_id,text,consensus_score,annotations,entropy
0,0,"""Family appeals for release of UK hostage"",""Th...",0.4,"{0: 2, 2: 2, 1: 1}",1.521928
1,1,"""Overcoming the piracy stigma in China"",""SHANG...",0.4,"{3: 2, 2: 2, 0: 1}",1.521928
2,14,"""Al-Qaida may be hiding messages in computer f...",0.4,"{0: 2, 3: 2, 2: 1}",1.521928
3,15,"""Leave it to Mets to also ax the candidates"",""...",0.4,"{1: 1, 2: 2, 3: 1, 0: 1}",1.921928
4,16,"""Call for Demonstrations Falls on Deaf Ears"",""...",0.4,"{0: 2, 2: 1, 1: 1, 3: 1}",1.921928
...,...,...,...,...,...
951,4973,"""Kiwis ruffle Aussie feathers?"",""They may be r...",0.4,"{0: 2, 1: 1, 3: 1, 2: 1}",1.921928
952,4990,"""Sharon presses on with Gaza plan"",""JERUSALEM ...",0.4,"{0: 2, 2: 2, 3: 1}",1.521928
953,4994,"""Italians, Canadians gather to honour living l...",0.4,"{0: 2, 1: 1, 3: 1, 2: 1}",1.921928
954,4995,"""Afghan Forces Catch Mullah Omar's Security Ch...",0.4,"{0: 2, 1: 2, 2: 1}",1.521928


## 5) Validação com Ground Truth

In [138]:
# Evaluate the consensus data against the ground truth; the evaluator returns the
# accuracy, a classification report and a confusion matrix.
accuracy, cls_report, cm = analyzer.evaluate_ground_truth(df_with_consensus=df_with_consensus)

# Plot the confusion matrix using the dataset categories.
visualizer.plot_confusion_matrix(categories=categories, cm=cm)

[32m21:41:00[0m | [32m[1mSUCCESS [0m | [32m[1m
üéØ Accuracy: 74.32%[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m
Classification Report:[0m
{'0': {'precision': 0.8797250859106529, 'recall': 0.6295081967213115, 'f1-score': 0.7338748208313426, 'support': 1220.0}, '1': {'precision': 0.9349064279902359, 'recall': 0.9418032786885245, 'f1-score': 0.9383421804818293, 'support': 1220.0}, '2': {'precision': 0.5446182152713891, 'recall': 0.9344909234411997, 'f1-score': 0.6881720430107527, 'support': 1267.0}, '3': {'precision': 0.8494475138121547, 'recall': 0.4756380510440835, 'f1-score': 0.6098165592464055, 'support': 1293.0}, 'accuracy': 0.7432, 'macro avg': {'precision': 0.8021743107461081, 'recall': 0.7453601124737799, 'f1-score': 0.7425514008925825, 'support': 5000.0}, 'weighted avg': {'precision': 0.8004434722134102, 'recall': 0.7432, 'f1-score': 0.7401023062404591, 'support': 5000.0}}


‚úì Matriz de confus√£o salva em: C:\Users\gabri\Documents\GitHub\llm-annotation\data\results\agnews\2025-12-27\graphics\confusion_matrix.html


## 6) Exportar Resultados

In [139]:
import json

# Create the output directory for the exported summary files.
summary_dir = results_dataset_path.joinpath("summary")
summary_dir.mkdir(parents=True, exist_ok=True)
# Kept for backward compatibility: the final summary cell reports `results_dir`.
results_dir = summary_dir

# Run parameters recorded in the summary (this analysis uses a single repetition
# per model and no alternative parameter sets).
num_repetitions = 1
use_alternative_params = False

# One flag for "ground truth is available", used both in the dataset section and to
# decide whether the validation section is written, so the two can never disagree.
has_ground_truth = bool(ground_truth)

# Full annotated dataset
df_with_consensus.to_csv(summary_dir / 'dataset_anotado_completo.csv', index=False)
logger.info(f"‚úì Salvos: {len(df_with_consensus)} registros")

# Split on the configured consensus threshold (the same value given to the
# ConsensusCalculator) instead of repeating the literal 0.8.
consensus_scores = df_with_consensus['consensus_score']

# High confidence: consensus score at or above the threshold
high_conf = df_with_consensus[consensus_scores >= consensus_threshold]
high_conf.to_csv(summary_dir / 'alta_confianca.csv', index=False)
logger.info(f"‚úì Alta confian√ßa: {len(high_conf)} registros")

# Needs review: consensus score below the threshold
low_conf = df_with_consensus[consensus_scores < consensus_threshold]
low_conf.to_csv(summary_dir / 'necessita_revisao.csv', index=False)
logger.info(f"‚úì Necessita revis√£o: {len(low_conf)} registros")

# JSON summary of the experiment
summary = {
    'dataset': {
        'name': dataset_name,
        'total_texts': len(texts),
        'categories': categories,
        'has_ground_truth': has_ground_truth
    },
    'config': {
        'models': DEFAULT_MODELS,
        'total_models': len(DEFAULT_MODELS),
        'use_alternative_params': use_alternative_params,
        'num_repetitions': num_repetitions,
        'total_annotations': len(texts) * len(DEFAULT_MODELS) * num_repetitions
    },
    'results': {
        'consensus_mean': float(consensus_scores.mean()),
        'consensus_median': float(consensus_scores.median()),
        'high_consensus': int((df_with_consensus['consensus_level'] == 'high').sum()),
        'medium_consensus': int((df_with_consensus['consensus_level'] == 'medium').sum()),
        'low_consensus': int((df_with_consensus['consensus_level'] == 'low').sum()),
    },
    'metrics': {
        'fleiss_kappa': float(report['fleiss_kappa']),
        'fleiss_interpretation': report['fleiss_interpretation']
    }
}

if has_ground_truth:
    summary['validation'] = {
        'classification_report': cls_report
    }

# Explicit encoding so the file does not depend on the platform default.
with open(summary_dir / 'sumario_experimento.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

logger.success("\n‚úì Resultados exportados com sucesso!")

[32m21:41:00[0m | [1mINFO    [0m | [1m‚úì Salvos: 5000 registros[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m‚úì Alta confian√ßa: 2459 registros[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m‚úì Necessita revis√£o: 2541 registros[0m
[32m21:41:00[0m | [32m[1mSUCCESS [0m | [32m[1m
‚úì Resultados exportados com sucesso![0m


## 7) Resumo Final

In [140]:
# Values reused by several lines of the final summary.
banner = "=" * 80
n_models = len(DEFAULT_MODELS)
mean_consensus = df_with_consensus['consensus_score'].mean()

# Header
logger.info("\n" + banner)
logger.success("RESUMO DO EXPERIMENTO")
logger.info(banner)

# Dataset
logger.info(f"\nüìä Dataset: {dataset_name}")
logger.info(f"  Textos: {len(texts)}")
logger.info(f"  Categorias: {len(categories)}")

# Configuration (single repetition, no alternative parameters in this analysis)
logger.info("\nü§ñ Configura√ß√£o:")
logger.info(f"  Modelos base: {n_models}")
logger.info(f"  Total modelos: {n_models}")
logger.info(f"  Alternative params: {False}")
logger.info(f"  Repeti√ß√µes: {1}")

# Consensus
logger.info("\nüìà Consenso:")
logger.info(f"  M√©dia: {mean_consensus:.2%}")
logger.info(f"  Fleiss' Kappa: {report['fleiss_kappa']:.3f} ({report['fleiss_interpretation']})")

# Validation is only reported when ground truth is available.
if ground_truth:
    logger.info("\nüéØ Valida√ß√£o:")
    logger.info(f"  Accuracy: {accuracy:.2%}")

logger.info(f"\nüìÅ Arquivos gerados em: {results_dir}/")

logger.success("\n‚úÖ An√°lise completa!")

[32m21:41:00[0m | [1mINFO    [0m | [1m
[32m21:41:00[0m | [32m[1mSUCCESS [0m | [32m[1mRESUMO DO EXPERIMENTO[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m
üìä Dataset: agnews[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m  Textos: 5000[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m  Categorias: 4[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m
ü§ñ Configura√ß√£o:[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m  Modelos base: 5[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m  Total modelos: 5[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m  Alternative params: False[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m  Repeti√ß√µes: 1[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m
üìà Consenso:[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m  M√©dia: 68.70%[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m  Fleiss' Kappa: 0.263 (Fraco)[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m
üéØ Valida√ß√£o:[0m
[32m21:41:00[0m | [1mINFO    [0m | [1m  Accuracy: 74.32%[0m
[32m2