# 🤖 Análise de Consenso entre LLMs
## Notebook Refatorado

Este notebook usa a estrutura refatorada com:
- Componentes modulares
- Logging com loguru
- Integração com HuggingFace

## 1️⃣ Setup e Configuração

In [2]:
# Imports
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from loguru import logger

# Configure logging: remove the handlers loguru currently has and add a
# single colourised stdout sink whose minimum level is INFO.
logger.remove()
logger.add(
    sys.stdout,
    format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
    level="INFO"
)

# Paths: if the working-directory path mentions "notebooks", its parent is
# taken as the project root; otherwise the working directory is the root.
# NOTE(review): this is a substring match on the whole path, so any ancestor
# directory whose name contains "notebooks" also triggers the parent branch.
project_root = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()

# Import roots, listed from lowest to highest priority (each one is pushed to
# the front of sys.path, so the last entry ends up first).
# Later cells use `from src.utils.data_loader import ...`, which needs the
# project root itself, and `from llm_annotation_system.annotation... import`,
# which needs `src`; the three package directories alone do not make either
# of those imports resolvable.
_import_roots = (
    project_root,
    project_root / 'src',
    project_root / 'src' / 'llm_annotation_system',
    project_root / 'src' / 'config',
    project_root / 'src' / 'utils',
)
for _import_root in _import_roots:
    _entry = str(_import_root)
    # Drop any earlier copy so re-running the cell does not accumulate
    # duplicates and the resulting priority order is always the same.
    if _entry in sys.path:
        sys.path.remove(_entry)
    sys.path.insert(0, _entry)

logger.success("‚úì Setup completo")

[32m14:59:29[0m | [32m[1mSUCCESS [0m | [32m[1m‚úì Setup completo[0m


## 2️⃣ Carregar Dataset do HuggingFace

In [None]:
from src.utils.data_loader import load_hf_dataset, list_available_datasets

# Log the name of every dataset the loader reports as available
logger.info("Datasets dispon√≠veis:")
available_datasets = list_available_datasets()
for available_name in available_datasets:
    logger.info(f"  - {available_name}")

ModuleNotFoundError: No module named 'data_loader'

In [None]:
# Load the dataset
dataset_name = "agnews"  # Adjust as needed

loaded_dataset = load_hf_dataset(dataset_name)
texts, categories, ground_truth = loaded_dataset

# Short summary of what was loaded
for summary_line in (
    f"Textos: {len(texts)}",
    f"Categorias: {categories}",
    f"Ground truth: {'Sim' if ground_truth else 'N√£o'}",
):
    logger.info(summary_line)

In [None]:
# Preview the first three texts (truncated to 100 characters), together with
# their reference label when ground truth is available
logger.info("\nAmostra dos textos:")
for position, sample_text in enumerate(texts[:3], start=1):
    logger.info(f"{position}. {sample_text[:100]}...")
    if not ground_truth:
        continue
    logger.info(f"   Label: {ground_truth[position - 1]}")

## 3️⃣ Configurar Modelos LLM

In [None]:
from llm_annotation_system.annotation.llm_annotator import LLMAnnotator

# Models to query (adjust to whatever is available)
models = ["llama3-8b", "mistral-7b", "qwen2-7b"]

# Constructor arguments, gathered in one place before building the annotator
annotator_options = dict(
    models=models,
    categories=categories,
    api_keys=None,  # Or configure with your own keys
    use_langchain_cache=True,
)
annotator = LLMAnnotator(**annotator_options)

logger.success("‚úì Annotator inicializado")

## 4️⃣ Executar Anotação

In [None]:
# Parameters
num_repetitions = 3

# Workload estimate: texts x models x repetitions
total_annotations = num_repetitions * len(models) * len(texts)
logger.info(f"Total de anota√ß√µes: {total_annotations}")

# Run the annotation over every text
df_annotations = annotator.annotate_dataset(texts=texts, num_repetitions=num_repetitions)

logger.success("‚úì Anota√ß√µes completas")
annotation_preview = df_annotations.head()
display(annotation_preview)

## 5️⃣ Calcular Consenso

In [None]:
# Compute consensus from the raw annotations
df_with_consensus = annotator.calculate_consensus(df_annotations)

# Summary statistics of the consensus score
logger.info("\nüìä Estat√≠sticas de Consenso:")
consensus_scores = df_with_consensus['consensus_score']
logger.info(f"  M√©dia: {consensus_scores.mean():.2%}")
logger.info(f"  Mediana: {consensus_scores.median():.2%}")
logger.info(f"  Desvio padr√£o: {consensus_scores.std():.2%}")

# Number of rows per consensus level, with its share of all rows
levels = df_with_consensus['consensus_level'].value_counts()
logger.info("\nDistribui√ß√£o por n√≠vel:")
for level_name in levels.index:
    level_count = levels[level_name]
    logger.info(f"  {level_name}: {level_count} ({level_count/len(df_with_consensus):.1%})")

In [None]:
# Visualisations: consensus-score histogram (left) and cases per level (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Histogram of consensus scores
axes[0].hist(df_with_consensus['consensus_score'], bins=20, edgecolor='black')
axes[0].set_xlabel('Consensus Score')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribui√ß√£o de Scores de Consenso')

# Bars per level.
# `levels` comes from value_counts(), which orders by frequency, so a fixed
# colour list would paint whichever level is most common green. Colours are
# therefore bound to level names and the bars drawn in high/medium/low order.
level_colors = {'high': 'green', 'medium': 'orange', 'low': 'red'}
plot_order = [name for name in level_colors if name in levels.index]
# Keep any level outside the known three visible, drawn in a neutral colour.
plot_order += [name for name in levels.index if name not in level_colors]
levels.reindex(plot_order).plot(
    kind='bar',
    ax=axes[1],
    color=[level_colors.get(name, 'gray') for name in plot_order],
)
axes[1].set_xlabel('N√≠vel de Consenso')
axes[1].set_ylabel('Contagem')
axes[1].set_title('Casos por N√≠vel de Consenso')
axes[1].tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

## 6️⃣ Análise Detalhada

In [None]:
from consensus_analyzer_refactored import ConsensusAnalyzer

# Build the analyzer for the dataset's categories
analyzer = ConsensusAnalyzer(categories)

# Consensus columns: names containing '_consensus' but not '_score'
consensus_cols = []
for column_name in df_with_consensus.columns:
    if '_score' in column_name:
        continue
    if '_consensus' in column_name:
        consensus_cols.append(column_name)

# Generate the report, passing ./results as the output directory
report_options = dict(
    df=df_with_consensus,
    annotator_cols=consensus_cols,
    output_dir="./results",
)
report = analyzer.generate_consensus_report(**report_options)

logger.success("‚úì Relat√≥rio gerado")

In [None]:
# Metrics
logger.info("\nüìä M√©tricas de Dist√¢ncia:")
logger.info(f"  Fleiss' Kappa: {report['fleiss_kappa']:.3f} ({report['fleiss_interpretation']})")

# Interpretation: the first band whose lower bound kappa exceeds decides the
# message and its log level; anything at or below 0.4 is reported as weak.
kappa = report['fleiss_kappa']
kappa_bands = (
    (0.8, logger.success, "Concord√¢ncia excelente!"),
    (0.6, logger.info, "Concord√¢ncia boa"),
    (0.4, logger.warning, "Concord√¢ncia moderada"),
)
for lower_bound, emit, verdict in kappa_bands:
    if kappa > lower_bound:
        emit(verdict)
        break
else:
    logger.warning("Concord√¢ncia fraca")

In [None]:
# Pairwise agreement matrix taken from the report, drawn as an annotated heatmap
agreement_df = report['pairwise_agreement']

heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='YlGnBu',
    cbar_kws={'label': 'Agreement'},
)

plt.figure(figsize=(10, 8))
sns.heatmap(agreement_df, **heatmap_options)
plt.title('Matriz de Concord√¢ncia Par a Par')
plt.tight_layout()
plt.show()

In [None]:
# Problematic cases: the report entry may be missing or empty
problematic = report.get('problematic_cases')
if problematic is None or len(problematic) == 0:
    logger.success("\n‚úì Nenhum caso problem√°tico identificado")
else:
    logger.warning(f"\n‚ö†Ô∏è  {len(problematic)} casos problem√°ticos identificados")
    display(problematic.head())

## 7️⃣ Validação com Ground Truth

In [None]:
# Validate the most common annotation against the reference labels; skipped
# entirely when no ground truth was loaded.
if ground_truth:
    from pathlib import Path

    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    # Attach the reference labels to the consensus frame
    df_with_consensus['ground_truth'] = ground_truth

    y_true = df_with_consensus['ground_truth']
    y_pred = df_with_consensus['most_common_annotation']

    # Overall accuracy (kept as a notebook-level name: later cells read it)
    accuracy = accuracy_score(y_true, y_pred)

    logger.success(f"\nüéØ Accuracy: {accuracy:.2%}")

    # Per-class precision/recall/F1
    logger.info("\nClassification Report:")
    print(classification_report(y_true, y_pred))

    # Confusion matrix
    # NOTE(review): the tick labels below assume the matrix rows/columns line
    # up one-to-one with `categories` -- confirm that the label values and
    # their order match.
    cm = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=categories, yticklabels=categories)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix vs Ground Truth')
    plt.tight_layout()
    # Ensure the target directory exists so savefig does not fail on a
    # missing ./results folder.
    Path('./results').mkdir(parents=True, exist_ok=True)
    plt.savefig('./results/confusion_vs_ground_truth.png', dpi=150)
    plt.show()

    # Accuracy broken down by consensus level (levels with no rows are skipped)
    logger.info("\nAccuracy por n√≠vel de consenso:")
    for level in ['high', 'medium', 'low']:
        df_level = df_with_consensus[df_with_consensus['consensus_level'] == level]
        if len(df_level) > 0:
            acc_level = accuracy_score(
                df_level['ground_truth'],
                df_level['most_common_annotation']
            )
            logger.info(f"  {level}: {acc_level:.2%} ({len(df_level)} casos)")
else:
    logger.info("\n‚ö†Ô∏è  Ground truth n√£o dispon√≠vel - pulando valida√ß√£o")

## 8️⃣ Exportar Resultados

In [None]:
import json
from pathlib import Path

# Create the output directory
results_dir = Path('./results/final')
results_dir.mkdir(parents=True, exist_ok=True)

# Consensus score from which a row counts as high confidence; the same value
# splits the "high confidence" and "needs review" exports below.
HIGH_CONFIDENCE_THRESHOLD = 0.8

# Save the full annotated dataset
df_with_consensus.to_csv(results_dir / 'dataset_anotado_completo.csv', index=False)
logger.info(f"‚úì Salvos: {len(df_with_consensus)} registros")

# High-confidence rows
high_conf = df_with_consensus[df_with_consensus['consensus_score'] >= HIGH_CONFIDENCE_THRESHOLD]
high_conf.to_csv(results_dir / 'alta_confianca.csv', index=False)
logger.info(f"‚úì Alta confian√ßa: {len(high_conf)} registros")

# Rows that need manual review
low_conf = df_with_consensus[df_with_consensus['consensus_score'] < HIGH_CONFIDENCE_THRESHOLD]
low_conf.to_csv(results_dir / 'necessita_revisao.csv', index=False)
logger.info(f"‚úì Necessita revis√£o: {len(low_conf)} registros")

# JSON summary of the experiment
summary = {
    'dataset': {
        'name': dataset_name,
        'total_texts': len(texts),
        'categories': categories,
        # Same truthiness test as the validation guard below, so an empty
        # ground truth is not reported as present while validation is skipped.
        'has_ground_truth': bool(ground_truth)
    },
    'config': {
        'models': models,
        'num_repetitions': num_repetitions,
        'total_annotations': total_annotations
    },
    'results': {
        'consensus_mean': float(df_with_consensus['consensus_score'].mean()),
        'consensus_median': float(df_with_consensus['consensus_score'].median()),
        'consensus_std': float(df_with_consensus['consensus_score'].std()),
        'high_consensus': int((df_with_consensus['consensus_level'] == 'high').sum()),
        'medium_consensus': int((df_with_consensus['consensus_level'] == 'medium').sum()),
        'low_consensus': int((df_with_consensus['consensus_level'] == 'low').sum()),
        'problematic': int(df_with_consensus['is_problematic'].sum())
    },
    'metrics': {
        'fleiss_kappa': float(report['fleiss_kappa']),
        'fleiss_interpretation': report['fleiss_interpretation']
    }
}

# `accuracy` is only defined when the validation cell ran with ground truth
if ground_truth:
    summary['validation'] = {
        'accuracy': float(accuracy)
    }

# Explicit encoding so the file does not depend on the platform default
with open(results_dir / 'sumario_experimento.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

logger.success("\n‚úì Resultados exportados com sucesso!")

## 9️⃣ Resumo Final

In [None]:
# Banner
separator = "=" * 80
logger.info("\n" + separator)
logger.success("RESUMO DO EXPERIMENTO")
logger.info(separator)

# Dataset, configuration and consensus figures, logged in display order
for summary_line in (
    f"\nüìä Dataset: {dataset_name}",
    f"  Textos: {len(texts)}",
    f"  Categorias: {len(categories)}",
    f"\nü§ñ Configura√ß√£o:",
    f"  Modelos: {len(models)}",
    f"  Repeti√ß√µes: {num_repetitions}",
    f"  Total de anota√ß√µes: {total_annotations}",
    f"\nüìà Consenso:",
    f"  M√©dia: {df_with_consensus['consensus_score'].mean():.2%}",
    f"  Fleiss' Kappa: {report['fleiss_kappa']:.3f} ({report['fleiss_interpretation']})",
):
    logger.info(summary_line)

# Validation figures are only reported when ground truth was loaded
if ground_truth:
    logger.info(f"\nüéØ Valida√ß√£o:")
    logger.info(f"  Accuracy: {accuracy:.2%}")

# Files written by the export step
logger.info(f"\nüìÅ Arquivos gerados:")
for generated_file in (
    f"  {results_dir}/dataset_anotado_completo.csv",
    f"  {results_dir}/alta_confianca.csv",
    f"  {results_dir}/necessita_revisao.csv",
    f"  {results_dir}/sumario_experimento.json",
):
    logger.info(generated_file)

# Number of cache entries reported by the annotator
cache_stats = annotator.get_cache_stats()
logger.info(f"\nüíæ Cache: {cache_stats['total_entries']} entradas")

logger.success("\n‚úÖ An√°lise completa!")