# 🤖 Análise de Consenso entre LLMs
## Notebook Refatorado com Alternative Params

Este notebook usa:
- Componentes modulares
- Logging com loguru
- Integração com HuggingFace
- **Alternative params** para testar variações

## 1️⃣ Setup e Configuração

In [1]:
# Imports
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from loguru import logger
import os

# Configure logging: clear the handlers loguru already has, then attach a
# single stdout sink with a compact, colourised format at INFO level.
logger.remove()
logger.add(
    sys.stdout,
    format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
    level="INFO"
)

# Paths
# NOTE(review): this path setup is disabled. Later cells import from `src.*`,
# and a recorded run of the annotator cell failed with
# "ModuleNotFoundError: No module named 'prompts'" — confirm how the project
# root is expected to reach sys.path before relying on Restart & Run All.
# project_root = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path.cwd().parents[1]
# sys.path.insert(0, str(project_root / "src"))

# The check mark was mis-encoded (mojibake) in the original message.
logger.success("✓ Setup completo")

[32m14:49:12[0m | [32m[1mSUCCESS [0m | [32m[1m‚úì Setup completo[0m


## 2️⃣ Carregar Dataset do HuggingFace

In [2]:
from src.utils.data_loader import load_hf_dataset, load_hf_dataset_as_dataframe, list_available_datasets

# Log every dataset name returned by list_available_datasets().
# The accented character in the header message was mis-encoded (mojibake).
logger.info("Datasets disponíveis:")
for dataset in list_available_datasets():
    logger.info(f"  - {dataset}")

[32m14:49:13[0m | [1mDatasets dispon√≠veis:[0m
[32m14:49:13[0m | [1m  - agnews[0m
[32m14:49:13[0m | [1m  - mpqa[0m
[32m14:49:13[0m | [1m  - webkb[0m
[32m14:49:13[0m | [1m  - ohsumed[0m
[32m14:49:13[0m | [1m  - acm[0m
[32m14:49:13[0m | [1m  - yelp_2013[0m
[32m14:49:13[0m | [1m  - dblp[0m
[32m14:49:13[0m | [1m  - books[0m
[32m14:49:13[0m | [1m  - reut90[0m
[32m14:49:13[0m | [1m  - wos11967[0m
[32m14:49:13[0m | [1m  - twitter[0m
[32m14:49:13[0m | [1m  - trec[0m
[32m14:49:13[0m | [1m  - wos5736[0m
[32m14:49:13[0m | [1m  - sst1[0m
[32m14:49:13[0m | [1m  - pang_movie[0m
[32m14:49:13[0m | [1m  - movie_review[0m
[32m14:49:13[0m | [1m  - vader_movie[0m
[32m14:49:13[0m | [1m  - subj[0m
[32m14:49:13[0m | [1m  - sst2[0m
[32m14:49:13[0m | [1m  - yelp_reviews[0m
[32m14:49:13[0m | [1m  - 20ng[0m
[32m14:49:13[0m | [1m  - medline[0m


In [3]:
# Load the dataset
dataset_name = "ohsumed"  # Adjust as needed

# load_hf_dataset returns the texts, the category list and the ground-truth
# labels; ground_truth is tested for truthiness below and in later cells.
texts, categories, ground_truth = load_hf_dataset(dataset_name)

logger.info(f"Textos: {len(texts)}")
logger.info(f"Categorias: {categories}")
# 'Não' was mis-encoded (mojibake) in the original message.
logger.info(f"Ground truth: {'Sim' if ground_truth else 'Não'}")

[32m14:49:13[0m | [1mCarregando dataset: ohsumed[0m
[32m14:49:13[0m | [1mCombinando splits: ['train', 'test'][0m
[32m14:49:21[0m | [1m  ‚úì train: 164718 exemplos[0m
[32m14:49:28[0m | [1m  ‚úì test: 18302 exemplos[0m
[32m14:49:28[0m | [1mTotal combinado: 183020 exemplos[0m
[32m14:49:28[0m | [1mCategorias extra√≠das automaticamente: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22][0m
[32m14:49:28[0m | [1mColuna de texto: text[0m
[32m14:49:28[0m | [1mGround truth carregado da coluna 'label'[0m
[32m14:49:28[0m | [1mTextos: 183020[0m
[32m14:49:28[0m | [1mCategorias: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22][0m
[32m14:49:28[0m | [1mGround truth: Sim[0m


In [4]:
# Preview the first three texts, each truncated to 100 characters, followed by
# the matching label when ground truth is available.
logger.info("\nAmostra dos textos:")
has_labels = bool(ground_truth)
preview = texts[:3]
for i, text in enumerate(preview):
    logger.info(f"{i+1}. {text[:100]}...")
    if has_labels:
        logger.info(f"   Label: {ground_truth[i]}")

[32m14:49:52[0m | [1m
Amostra dos textos:[0m
[32m14:49:52[0m | [1m1. Evaluation of women with possible appendicitis using technetium-99m leukocyte scan, The authors eval...[0m
[32m14:49:52[0m | [1m   Label: 5[0m
[32m14:49:52[0m | [1m2. Cause of death in an emergency department, A retrospective review was done of 601 consecutive emerge...[0m
[32m14:49:52[0m | [1m   Label: 13[0m
[32m14:49:52[0m | [1m3. Intermittent obstruction of an incarcerated hiatal hernia with a total thoracic stomach, A case of i...[0m
[32m14:49:52[0m | [1m   Label: 5[0m


In [5]:
# Load the same dataset again, this time as a DataFrame plus its category list.
# NOTE(review): the recorded output shows this repeats the full train/test load
# done by load_hf_dataset above, and it rebinds `categories` — confirm both
# loaders return the same category list.
df, categories = load_hf_dataset_as_dataframe(dataset_name)

[32m14:49:55[0m | [1mCarregando dataset: ohsumed[0m
[32m14:49:55[0m | [1mCombinando splits: ['train', 'test'][0m
[32m14:50:02[0m | [1m  ‚úì train: 164718 exemplos[0m
[32m14:50:10[0m | [1m  ‚úì test: 18302 exemplos[0m
[32m14:50:10[0m | [1mTotal combinado: 183020 exemplos[0m
[32m14:50:10[0m | [1mCategorias extra√≠das automaticamente: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22][0m
[32m14:50:10[0m | [1mColuna de texto: text[0m
[32m14:50:10[0m | [1mGround truth carregado da coluna 'label'[0m
[32m14:50:10[0m | [1mDataFrame criado com 183020 linhas[0m


In [6]:
# Display the DataFrame; the recorded output shows pandas eliding the middle rows.
df

Unnamed: 0,text,label
0,Evaluation of women with possible appendicitis...,5
1,"Cause of death in an emergency department, A r...",13
2,Intermittent obstruction of an incarcerated hi...,5
3,Excitatory amino acids in the developing brain...,9
4,Anencephaly: clinical determination of brain d...,9
...,...,...
183015,"Descartes before the horse: I clone, therefore...",1
183016,Nosocomial infection and pseudoinfection from ...,0
183017,"Minerals in hair, serum, and urine of healthy ...",14
183018,Reporting of communicable diseases by universi...,0


In [7]:
# Display the category list returned by the DataFrame loader.
categories

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22]

## 3️⃣ Configurar Modelos LLM

### Opção A: Usar apenas parâmetros padrão (temp=0)

In [None]:
from src.llm_annotation_system.annotation.llm_annotator import LLMAnnotator
from src.experiments.base_experiment import DEFAULT_MODELS

# Bind the base model list to `models`. The optional alternative-params cell
# and the later analysis, export and summary cells all refer to `models`,
# which no other cell defines — without this they raise NameError on a fresh
# kernel.
models = DEFAULT_MODELS

# Initialise WITHOUT alternative params
annotator = LLMAnnotator(
    models=models,
    categories=categories,
    api_keys=None,
    use_langchain_cache=True,
    use_alternative_params=False  # temp=0 only
)

# The check mark was mis-encoded (mojibake) in the original message.
logger.success(f"✓ Annotator inicializado com {len(annotator.models)} modelos")

ModuleNotFoundError: No module named 'prompts'

### Opção B: Usar alternative params (temp=0, 0.3, 0.5)

**Atenção**: Isso cria 9 modelos (3 base + 6 variações)

In [None]:
# Uncomment to use alternative params.
# NOTE(review): `models` must be bound to the base model list before this
# runs (e.g. models = DEFAULT_MODELS); no other visible cell defines it.

# annotator = LLMAnnotator(
#     models=models,
#     categories=categories,
#     api_keys=None,
#     use_langchain_cache=True,
#     use_alternative_params=True  # Expands to 9 models
# )

# logger.success(f"✓ Annotator com alternative params: {len(annotator.models)} modelos")
# logger.info(f"  Modelos expandidos: {annotator.models}")

## 4️⃣ Executar Anotação

### Testar anotação

In [None]:
# Smoke test: annotate a small sample before committing to the whole dataset.
# The original cell passed every text to annotate_single, so the "test" cost
# as many annotations as the full run in the next cell.
num_repetitions = 3
test_sample_size = 10  # number of texts used for the dry run
test_texts = texts[:test_sample_size]

# Estimate: one annotation per (text, model, repetition) combination
total_annotations = len(test_texts) * len(annotator.models) * num_repetitions
logger.info(f"Total de anotações: {total_annotations}")
logger.warning(f"  Modelos: {len(annotator.models)}")
logger.warning(f"  Textos: {len(test_texts)}")
logger.warning(f"  Repetições: {num_repetitions}")

# Annotate the sample. The result is kept in its own variable so a sample run
# can never be mistaken for the full-dataset annotations used downstream.
# NOTE(review): assumes annotate_single accepts any list of texts — confirm.
df_annotations_test = annotator.annotate_single(
    texts=test_texts,
    num_repetitions=num_repetitions
)

logger.success("✓ Anotações completas")
display(df_annotations_test.head())

### Anotando dataset completo

In [None]:
# Parameters
num_repetitions = 3

# Estimate: one annotation per (text, model, repetition) combination
total_annotations = len(texts) * len(annotator.models) * num_repetitions
# Accented characters in the messages below were mis-encoded (mojibake).
logger.info(f"Total de anotações: {total_annotations}")
logger.warning(f"  Modelos: {len(annotator.models)}")
logger.warning(f"  Textos: {len(texts)}")
logger.warning(f"  Repetições: {num_repetitions}")

# Annotate every text in the dataset
df_annotations = annotator.annotate_dataset(
    texts=texts,
    num_repetitions=num_repetitions
)

logger.success("✓ Anotações completas")
display(df_annotations.head())

## 5️⃣ Calcular Consenso

In [None]:
# Compute consensus from the annotations
df_with_consensus = annotator.calculate_consensus(df_annotations)

# Summary statistics of the 'consensus_score' column.
# Accented characters and the emoji were mis-encoded (mojibake) in the
# original messages.
logger.info("\n📊 Estatísticas de Consenso:")
logger.info(f"  Média: {df_with_consensus['consensus_score'].mean():.2%}")
logger.info(f"  Mediana: {df_with_consensus['consensus_score'].median():.2%}")
logger.info(f"  Desvio padrão: {df_with_consensus['consensus_score'].std():.2%}")

# Row count and share per consensus level; `levels` is reused by the plot cell
levels = df_with_consensus['consensus_level'].value_counts()
logger.info("\nDistribuição por nível:")
for level, count in levels.items():
    logger.info(f"  {level}: {count} ({count/len(df_with_consensus):.1%})")

In [None]:
# Visualisations: score histogram (left) and case counts per level (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Histogram of consensus scores
axes[0].hist(df_with_consensus['consensus_score'], bins=20, edgecolor='black')
axes[0].set_xlabel('Consensus Score')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribuição de Scores de Consenso')

# Bars per level. value_counts() orders levels by frequency, so a positional
# colour list would paint whichever level is most common green. Tie each
# colour to a level name and plot in a fixed order instead.
# NOTE(review): the green/orange/red -> high/medium/low mapping is assumed
# from the level names used by the export cell — confirm.
level_colors = {'high': 'green', 'medium': 'orange', 'low': 'red'}
level_counts = df_with_consensus['consensus_level'].value_counts()
ordered_levels = [lvl for lvl in level_colors if lvl in level_counts.index]
# Keep any unexpected level visible rather than silently dropping it
ordered_levels += [lvl for lvl in level_counts.index if lvl not in level_colors]
level_counts = level_counts.reindex(ordered_levels)
bar_colors = [level_colors.get(lvl, 'gray') for lvl in ordered_levels]

level_counts.plot(kind='bar', ax=axes[1], color=bar_colors)
axes[1].set_xlabel('Nível de Consenso')
axes[1].set_ylabel('Contagem')
axes[1].set_title('Casos por Nível de Consenso')
axes[1].tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

## 6️⃣ Análise de Alternative Params

**Nota**: Esta seção só funciona se `use_alternative_params=True`

In [None]:
# Only meaningful when the annotator was built with alternative params
if annotator.use_alternative_params:
    logger.info("📊 Analisando impacto dos alternative params...")

    # The original referenced `models`, which no cell defines; the annotator
    # in this notebook is built from DEFAULT_MODELS, so use that directly.
    base_models = list(DEFAULT_MODELS)
    score_suffix = '_consensus_score'

    # Group expanded model names under their base model
    for base_model in base_models:
        # NOTE(review): prefix matching also catches any other base model whose
        # name starts with this one — confirm the variation naming scheme.
        variations = [m for m in annotator.models if m.startswith(base_model)]

        logger.info(f"\n{base_model}:")

        for var in variations:
            score_col = f'{var}{score_suffix}'
            if score_col in df_with_consensus.columns:
                score = df_with_consensus[score_col].mean()
                logger.info(f"  {var}: {score:.2%} consenso interno")

    # Score columns to compare: every '_alt' variation plus the base models.
    # The original condition relied on and/or precedence and could also match
    # a column named exactly like a model; requiring the suffix up front keeps
    # only per-model score columns.
    consensus_cols = [
        col for col in df_with_consensus.columns
        if col.endswith(score_suffix)
        and ('_alt' in col or col[:-len(score_suffix)] in base_models)
    ]

    if consensus_cols:
        means = [df_with_consensus[col].mean() for col in consensus_cols]
        labels = [col[:-len(score_suffix)] for col in consensus_cols]

        # Create the figure only when there is something to plot, so no empty
        # axes are left open
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.bar(labels, means)
        ax.set_ylabel('Consenso Interno Médio')
        ax.set_title('Consenso por Variação de Parâmetros')
        ax.tick_params(axis='x', rotation=45)
        plt.tight_layout()
        plt.show()

else:
    logger.warning("Alternative params não foi usado. Para análise detalhada, reinicialize com use_alternative_params=True")

## 7️⃣ Análise Detalhada de Consenso

In [None]:
from consensus_analyzer_refactored import ConsensusAnalyzer

# Initialise the analyzer with the category list
analyzer = ConsensusAnalyzer(categories)

# Annotator columns: names containing '_consensus' but not '_score'
consensus_cols = [col for col in df_with_consensus.columns if '_consensus' in col and '_score' not in col]

logger.info(f"Analisando {len(consensus_cols)} anotadores")

# Generate the report; later cells read 'fleiss_kappa',
# 'fleiss_interpretation', 'pairwise_agreement' and 'problematic_cases' from it
report = analyzer.generate_consensus_report(
    df=df_with_consensus,
    annotator_cols=consensus_cols,
    output_dir="./results"
)

# The check mark and accent were mis-encoded (mojibake) in the original message.
logger.success("✓ Relatório gerado")

In [None]:
# Agreement metrics.
# Accented characters and the emoji were mis-encoded (mojibake) in the
# original messages.
logger.info("\n📊 Métricas de Concordância:")
logger.info(f"  Fleiss' Kappa: {report['fleiss_kappa']:.3f} ({report['fleiss_interpretation']})")

# Interpretation bands: >0.8 excellent, >0.6 good, >0.4 moderate, else weak
kappa = report['fleiss_kappa']
if kappa > 0.8:
    logger.success("Concordância excelente!")
elif kappa > 0.6:
    logger.info("Concordância boa")
elif kappa > 0.4:
    logger.warning("Concordância moderada")
else:
    logger.warning("Concordância fraca")

In [None]:
# Pairwise agreement matrix taken from the report
agreement_df = report['pairwise_agreement']

plt.figure(figsize=(12, 10))
sns.heatmap(agreement_df, annot=True, fmt='.2f', cmap='YlGnBu', cbar_kws={'label': 'Agreement'})
# The accented character in the title was mis-encoded (mojibake).
plt.title('Matriz de Concordância Par a Par')
plt.tight_layout()
plt.show()

In [None]:
# Problematic cases; the key may be absent from the report, hence .get()
problematic = report.get('problematic_cases')
# Symbols and accents were mis-encoded (mojibake) in the original messages.
if problematic is not None and len(problematic) > 0:
    logger.warning(f"\n⚠️  {len(problematic)} casos problemáticos identificados")
    display(problematic.head())
else:
    logger.success("\n✓ Nenhum caso problemático identificado")

## 8️⃣ Validação com Ground Truth

In [None]:
# Validate the consensus label against the reference labels, when available
if ground_truth:
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    # Attach the ground truth; pandas raises if its length differs from the frame
    df_with_consensus['ground_truth'] = ground_truth

    y_true = df_with_consensus['ground_truth']
    y_pred = df_with_consensus['most_common_annotation']

    # Accuracy (also read by the export and summary cells)
    accuracy = accuracy_score(y_true, y_pred)

    logger.success(f"\n🎯 Accuracy: {accuracy:.2%}")

    # Classification report
    logger.info("\nClassification Report:")
    print(classification_report(y_true, y_pred))

    # Confusion matrix. Passing labels=categories fixes the row/column order
    # to the same list used for the tick labels below; without it sklearn
    # orders by the labels actually present, which can shift the ticks.
    cm = confusion_matrix(y_true, y_pred, labels=categories)

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=categories, yticklabels=categories)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix vs Ground Truth')
    plt.tight_layout()

    # Create the output directory first; savefig does not create missing folders
    plots_dir = Path('./results')
    plots_dir.mkdir(parents=True, exist_ok=True)
    plt.savefig(plots_dir / 'confusion_vs_ground_truth.png', dpi=150)
    plt.show()

else:
    logger.info("\n⚠️  Ground truth não disponível - pulando validação")

## 9️⃣ Exportar Resultados

In [None]:
import json
from pathlib import Path

# Create the output directory
results_dir = Path('./results/final')
results_dir.mkdir(parents=True, exist_ok=True)

# Rows scoring at or above this threshold are exported as high confidence;
# the rest go to the review file.
high_confidence_threshold = 0.8

# Full annotated dataset
df_with_consensus.to_csv(results_dir / 'dataset_anotado_completo.csv', index=False)
logger.info(f"✓ Salvos: {len(df_with_consensus)} registros")

# High confidence
high_conf = df_with_consensus[df_with_consensus['consensus_score'] >= high_confidence_threshold]
high_conf.to_csv(results_dir / 'alta_confianca.csv', index=False)
logger.info(f"✓ Alta confiança: {len(high_conf)} registros")

# Needs review
low_conf = df_with_consensus[df_with_consensus['consensus_score'] < high_confidence_threshold]
low_conf.to_csv(results_dir / 'necessita_revisao.csv', index=False)
logger.info(f"✓ Necessita revisão: {len(low_conf)} registros")

# JSON summary
summary = {
    'dataset': {
        'name': dataset_name,
        'total_texts': len(texts),
        'categories': categories,
        # Same truthiness test as the `if ground_truth:` checks elsewhere, so
        # an empty label list is not reported as available ground truth
        'has_ground_truth': bool(ground_truth)
    },
    'config': {
        # The original referenced `models`, which no cell defines; the
        # annotator in this notebook is built from DEFAULT_MODELS.
        'models': list(DEFAULT_MODELS),
        'total_models': len(annotator.models),
        'use_alternative_params': annotator.use_alternative_params,
        'num_repetitions': num_repetitions,
        'total_annotations': len(texts) * len(annotator.models) * num_repetitions
    },
    'results': {
        'consensus_mean': float(df_with_consensus['consensus_score'].mean()),
        'consensus_median': float(df_with_consensus['consensus_score'].median()),
        'high_consensus': int((df_with_consensus['consensus_level'] == 'high').sum()),
        'medium_consensus': int((df_with_consensus['consensus_level'] == 'medium').sum()),
        'low_consensus': int((df_with_consensus['consensus_level'] == 'low').sum()),
    },
    'metrics': {
        'fleiss_kappa': float(report['fleiss_kappa']),
        'fleiss_interpretation': report['fleiss_interpretation']
    }
}

# `accuracy` is bound by the validation cell only when ground truth exists
if ground_truth:
    summary['validation'] = {
        'accuracy': float(accuracy)
    }

with open(results_dir / 'sumario_experimento.json', 'w') as f:
    json.dump(summary, f, indent=2)

logger.success("\n✓ Resultados exportados com sucesso!")

## 🔟 Resumo Final

In [None]:
# Final recap of dataset, configuration, consensus and validation results.
# Emoji and accented characters were mis-encoded (mojibake) in the original
# messages; f-prefixes were dropped from strings that had no placeholders.
logger.info("\n" + "="*80)
logger.success("RESUMO DO EXPERIMENTO")
logger.info("="*80)

logger.info(f"\n📊 Dataset: {dataset_name}")
logger.info(f"  Textos: {len(texts)}")
logger.info(f"  Categorias: {len(categories)}")

logger.info("\n🤖 Configuração:")
# The original referenced `models`, which no cell defines; the annotator in
# this notebook is built from DEFAULT_MODELS.
logger.info(f"  Modelos base: {len(DEFAULT_MODELS)}")
logger.info(f"  Total modelos: {len(annotator.models)}")
logger.info(f"  Alternative params: {annotator.use_alternative_params}")
logger.info(f"  Repetições: {num_repetitions}")

logger.info("\n📈 Consenso:")
logger.info(f"  Média: {df_with_consensus['consensus_score'].mean():.2%}")
logger.info(f"  Fleiss' Kappa: {report['fleiss_kappa']:.3f} ({report['fleiss_interpretation']})")

# `accuracy` is bound by the validation cell only when ground truth exists
if ground_truth:
    logger.info("\n🎯 Validação:")
    logger.info(f"  Accuracy: {accuracy:.2%}")

logger.info(f"\n📁 Arquivos gerados em: {results_dir}/")

cache_stats = annotator.get_cache_stats()
logger.info(f"\n💾 Cache: {cache_stats['total_entries']} entradas")

logger.success("\n✅ Análise completa!")