# 🤖 Análise de Consenso entre LLMs - Anotação sem LLM Hacking

## 1) Setup e Configuração

In [1]:
import sys
from loguru import logger
from src.api.schemas.experiment import ExperimentRequest

# One compact, colourised "time | level | message" layout shared by every cell.
LOG_FORMAT = " | ".join(
    (
        "<green>{time:HH:mm:ss}</green>",
        "<level>{level: <8}</level>",
        "<level>{message}</level>",
    )
)

# Remove the handlers registered so far, then log to stdout from INFO upwards.
logger.remove()
logger.add(sys.stdout, level="INFO", format=LOG_FORMAT)

logger.success("‚úì Setup completo")

[32m14:12:39[0m | [32m[1mSUCCESS [0m | [32m[1m‚úì Setup completo[0m


In [3]:
import json
from pathlib import Path

# Name of the experiment definition to run; resolved to ../experiments/<name>.json.
experiment = "time_experiment_local"

config_path = Path(f"../experiments/{experiment}.json")

# Decode explicitly as UTF-8. Without `encoding`, open() falls back to the platform's
# locale encoding (e.g. cp1252 on Windows), which corrupts non-ASCII text in the file.
with open(config_path, "r", encoding="utf-8") as f:
    config_dict = json.load(f)

# Validate the raw dictionary by instantiating the Pydantic request model.
EXPERIMENT_CONFIG = ExperimentRequest(**config_dict)

### - Modelos e prompt

In [4]:
from src.api.services.prompt_factory import get_prompt_template

# Models listed in the experiment configuration.
DEFAULT_MODELS = EXPERIMENT_CONFIG.models

# Build the prompt template from the configured prompt type and custom prompt.
# PROMPT_TEMPLATE is bound exactly once, to the resolved template.
PROMPT_TEMPLATE = get_prompt_template(
    EXPERIMENT_CONFIG.prompt_type,
    EXPERIMENT_CONFIG.custom_prompt,
)

### - Configurações de anotação

In [5]:
# Annotation section of the experiment configuration.
annotation_cfg = EXPERIMENT_CONFIG.annotation

# Strategy settings forwarded to the annotator calls further down.
model_strategy, rep_strategy = (
    annotation_cfg.model_strategy,
    annotation_cfg.rep_strategy,
)

# Repetitions per LLM and the alternative-parameters switch.
num_repetitions = annotation_cfg.num_repetitions_per_llm
use_alternative_params = annotation_cfg.use_alternative_params

### - Configurações de consenso

In [6]:
# Consensus settings are declared inline here instead of being read from
# EXPERIMENT_CONFIG.
consensus_cfg = dict(
    threshold=0.8,
    strategy="majority_vote",
    no_consensus_strategy="flag_for_review",
)

# Each lookup carries the same default as the mapping above, so removing a key
# from `consensus_cfg` leaves the effective value unchanged.
consensus_threshold = consensus_cfg.get("threshold", 0.8)
consensus_strategy = consensus_cfg.get("strategy", "majority_vote")
no_consensus_strategy = consensus_cfg.get("no_consensus_strategy", "flag_for_review")

### - Configurações de dataset

In [7]:
# Dataset section of the experiment configuration; the object itself is also
# handed to the dataset loaders as `dataset_global_config`.
dataset_cfg = EXPERIMENT_CONFIG.dataset_config

# Split selection.
dataset_split, combine_splits = dataset_cfg.split, dataset_cfg.combine_splits

# Sampling controls.
sample_size, random_state = dataset_cfg.sample_size, dataset_cfg.random_state

### - Configurações de cache

In [8]:
# Cache section of the experiment configuration: on/off flag and directory.
cache_cfg = EXPERIMENT_CONFIG.cache
cache_enabled, cache_dir = cache_cfg.enabled, cache_cfg.dir

### - Resultados

In [9]:
# Results section of the experiment configuration.
results_cfg = EXPERIMENT_CONFIG.results

# Output directory handed to the annotator.
results_dir = results_cfg.dir

# Intermediate-save settings forwarded to `annotate_dataset`.
save_intermediate, intermediate = (
    results_cfg.save_intermediate,
    results_cfg.intermediate,
)

## 2) Carregar Dataset

In [10]:
from src.utils.data_loader import load_hf_dataset, load_hf_dataset_as_dataframe, list_available_datasets, get_dataset_info


# Log every dataset name reported by the loader, one per line.
logger.info("Datasets dispon√≠veis:")
for dataset_id in list_available_datasets():
    logger.info(f"  - {dataset_id}")

[32m14:12:40[0m | [1mDatasets dispon√≠veis:[0m
[32m14:12:40[0m | [1m  - agnews[0m
[32m14:12:40[0m | [1m  - mpqa[0m
[32m14:12:40[0m | [1m  - webkb[0m
[32m14:12:40[0m | [1m  - ohsumed[0m
[32m14:12:40[0m | [1m  - acm[0m
[32m14:12:40[0m | [1m  - yelp_2013[0m
[32m14:12:40[0m | [1m  - dblp[0m
[32m14:12:40[0m | [1m  - books[0m
[32m14:12:40[0m | [1m  - reut90[0m
[32m14:12:40[0m | [1m  - wos11967[0m
[32m14:12:40[0m | [1m  - twitter[0m
[32m14:12:40[0m | [1m  - trec[0m
[32m14:12:40[0m | [1m  - wos5736[0m
[32m14:12:40[0m | [1m  - sst1[0m
[32m14:12:40[0m | [1m  - pang_movie[0m
[32m14:12:40[0m | [1m  - movie_review[0m
[32m14:12:40[0m | [1m  - vader_movie[0m
[32m14:12:40[0m | [1m  - subj[0m
[32m14:12:40[0m | [1m  - sst2[0m
[32m14:12:40[0m | [1m  - yelp_reviews[0m
[32m14:12:40[0m | [1m  - 20ng[0m
[32m14:12:40[0m | [1m  - medline[0m


In [11]:
# Key of the dataset to annotate; adjust as needed.
dataset_name = "sst1"

# The loader returns the texts, the category list and the ground-truth labels.
texts, categories, ground_truth = load_hf_dataset(
    dataset_name=dataset_name,
    dataset_global_config=dataset_cfg,
    cache_dir=cache_dir,
)

ground_truth_status = 'Sim' if ground_truth else 'N√£o'
logger.info(f"Textos: {len(texts)}")
logger.info(f"Categorias: {categories}")
logger.info(f"Ground truth: {ground_truth_status}")

[32m14:12:40[0m | [1mCarregando dataset: sst1[0m
[32m14:12:40[0m | [1mCombinando splits: ['train', 'test'][0m
[32m14:12:47[0m | [1m  ‚úì train: 106695 exemplos[0m
[32m14:12:54[0m | [1m  ‚úì test: 11855 exemplos[0m
[32m14:12:54[0m | [1mTotal combinado: 118550 exemplos[0m
[32m14:12:54[0m | [1mCategorias extra√≠das automaticamente: [0, 1, 2, 3, 4][0m
[32m14:12:54[0m | [1mAmostra reduzida para 20 exemplos (seed=42)[0m
[32m14:12:54[0m | [1mColuna de texto: text[0m
[32m14:12:54[0m | [1mGround truth carregado da coluna 'label'[0m
[32m14:12:54[0m | [1mTextos: 20[0m
[32m14:12:54[0m | [1mCategorias: [0, 1, 2, 3, 4][0m
[32m14:12:54[0m | [1mGround truth: Sim[0m


In [12]:
# Preview the first three texts (truncated to 100 characters), each followed by
# its label when ground truth is available.
logger.info("Amostra dos textos:")
for position, sample_text in enumerate(texts[:3], start=1):
    logger.info(f"{position}. {sample_text[:100]}...")
    if not ground_truth:
        continue
    logger.info(f"   Label: {ground_truth[position - 1]}")

[32m14:12:54[0m | [1mAmostra dos textos:[0m
[32m14:12:54[0m | [1m1. impostor has a handful of thrilling moments and a couple of good performances , but the movie does n...[0m
[32m14:12:54[0m | [1m   Label: 1[0m
[32m14:12:54[0m | [1m2. the acting in pauline and paulette is good all round , but what really sets the film apart is debrau...[0m
[32m14:12:54[0m | [1m   Label: 4[0m
[32m14:12:54[0m | [1m3. mr. polanski is in his element here : alone , abandoned , but still consoled by his art , which is m...[0m
[32m14:12:54[0m | [1m   Label: 3[0m


In [13]:
# Load the dataset again with the same arguments, this time through the DataFrame
# loader. Note that `categories` is rebound here, replacing the value returned by
# the earlier `load_hf_dataset` call.
df, categories = load_hf_dataset_as_dataframe(
    dataset_name=dataset_name,
    cache_dir=cache_dir,
    dataset_global_config=dataset_cfg
)

[32m14:12:54[0m | [1mCarregando dataset: sst1[0m
[32m14:12:54[0m | [1mCombinando splits: ['train', 'test'][0m
[32m14:13:00[0m | [1m  ‚úì train: 106695 exemplos[0m
[32m14:13:07[0m | [1m  ‚úì test: 11855 exemplos[0m
[32m14:13:07[0m | [1mTotal combinado: 118550 exemplos[0m
[32m14:13:07[0m | [1mCategorias extra√≠das automaticamente: [0, 1, 2, 3, 4][0m
[32m14:13:07[0m | [1mAmostra reduzida para 20 exemplos (seed=42)[0m
[32m14:13:07[0m | [1mColuna de texto: text[0m
[32m14:13:07[0m | [1mGround truth carregado da coluna 'label'[0m
[32m14:13:07[0m | [1mDataFrame criado com 20 linhas[0m


## 3) Configurar Modelos LLM

### Opção A: Usar apenas parâmetros padrão (temp=0)

In [14]:
from src.llm_annotation_system.annotation.llm_annotator import LLMAnnotator

# Build the annotator from the settings collected in the configuration cells.
annotator = LLMAnnotator(
    dataset_name=dataset_name,
    categories=categories,
    models=DEFAULT_MODELS,
    prompt_template=PROMPT_TEMPLATE,
    use_alternative_params=use_alternative_params,
    use_langchain_cache=cache_enabled,
    cache_dir=cache_dir,
    results_dir=results_dir,
)

model_count = len(annotator.models)
logger.success(f"‚úì Annotator inicializado com {model_count} modelos")

[32m14:13:09[0m | [1mCache carregado: 17361 entradas[0m
[32m14:13:09[0m | [1mCache LangChain DESATIVADO explicitamente[0m
[32m14:13:09[0m | [1mTemplate do prompt preparado[0m


Ollama params: {}
Ollama params: {}
Ollama params: {}
Ollama params: {}


[32m14:13:11[0m | [1mLLMAnnotator inicializado[0m
[32m14:13:11[0m | [1mModelos: 5 | Categorias: 5[0m
[32m14:13:11[0m | [32m[1m‚úì Annotator inicializado com 5 modelos[0m


## 4) Executar Anotação

In [15]:
# Display the DataFrame returned by `load_hf_dataset_as_dataframe`.
df

Unnamed: 0,text,label,label_description
0,impostor has a handful of thrilling moments an...,1,negative sentiment
1,the acting in pauline and paulette is good all...,4,very positive sentiment
2,"mr. polanski is in his element here : alone , ...",3,positive sentiment
3,a cop story that understands the medium amazin...,3,positive sentiment
4,the most horrific movie experience i 've had s...,0,very negative sentiment
5,"massoud 's story is an epic , but also a trage...",2,neutral sentiment
6,the actors do n't inhabit their roles -- they ...,0,very negative sentiment
7,more of a career curio than a major work .\n,2,neutral sentiment
8,"there 's a lot of good material here , but the...",2,neutral sentiment
9,if you saw benigni 's pinocchio at a public pa...,1,negative sentiment


In [16]:
# Inspect the prompt template held by the annotator's annotation engine.
annotator.annotation_engine.template

'You are an expert data annotator with extensive experience in text classification tasks.\n\nYour task is to classify the following text into one of the predefined categories with high precision.\n\n**Instructions:**\n1. Read the text carefully and understand its context\n2. Consider the nuances and implicit meanings\n3. Select the most appropriate category based on the content\n4. Be consistent with your classification criteria\n5. If the text is ambiguous, choose the most likely category based on dominant features\n\n**Available Categories:**\n- 0: very negative sentiment\n- 1: negative sentiment\n- 2: neutral sentiment\n- 3: positive sentiment\n- 4: very positive sentiment\n\n**Text to classify (Opinion / Sentiment):**\n{text}\n\n**Important Guidelines:**\n- Provide ONLY the category number as your response\n- Do not include explanations\n- Be objective and avoid bias\n- Consider edge cases carefully\n- Maintain consistency across similar texts\n\n**Your classification to the opinio

### - Testar anotação única

In [18]:
# Par√¢metros de anota√ß√£o
text = texts[0]
model = "deepseek-r1-distill-llama-8b"

total_annotation = len(text) * num_repetitions
logger.warning(f"  Modelo: {model}")
logger.warning(f"  Texto: {len(text)}")
logger.warning(f"  Repeti√ß√µes: {num_repetitions}")

annotations = await annotator.annotate_single(
        text=text,
        model=model,
        num_repetitions=3,
        use_cache=cache_enabled,
        rep_strategy=rep_strategy
    )

logger.success("‚úì Anota√ß√£o completa")

annotations

[32m14:13:11[0m | [33m[1m  Modelo: deepseek-r1-distill-llama-8b[0m
[32m14:13:11[0m | [33m[1m  Texto: 115[0m
[32m14:13:11[0m | [33m[1m  Repeti√ß√µes: 1[0m
[32m14:13:18[0m | [32m[1m‚úì Anota√ß√£o completa[0m


[2, 1, 1]

### - Anotando dataset completo

In [19]:
df_annotations = await annotator.annotate_dataset(
    texts=texts,
    num_repetitions=num_repetitions,
    use_cache=cache_enabled,
    save_intermediate=save_intermediate,
    intermediate=intermediate,
    model_strategy=model_strategy,
    rep_strategy=rep_strategy
)

logger.success("‚úì Anota√ß√µes completas")
df_annotations

[32m14:13:18[0m | [1mIniciando anota√ß√£o[0m
[32m14:13:18[0m | [1mTextos: 20 | Modelos: 5 | Repeti√ß√µes: 1[0m
[32m14:13:18[0m | [1mTotal de anota√ß√µes: 100[0m
Anotando: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [08:32<00:00, 25.64s/it]
[32m14:21:50[0m | [32m[1m‚úì Anota√ß√µes completas[0m


Unnamed: 0,text_id,text,deepseek-r1-distill-llama-8b_rep1,deepseek-r1-distill-llama-8b_consensus,deepseek-r1-distill-llama-8b_consensus_score,deepseek-r1-distill-llama-8b_annotation_time_sec,qwen3-8b_rep1,qwen3-8b_consensus,qwen3-8b_consensus_score,qwen3-8b_annotation_time_sec,...,llama2-7b_consensus_score,llama2-7b_annotation_time_sec,mistral-7b_rep1,mistral-7b_consensus,mistral-7b_consensus_score,mistral-7b_annotation_time_sec,llama3.1-8b_rep1,llama3.1-8b_consensus,llama3.1-8b_consensus_score,llama3.1-8b_annotation_time_sec
0,0,impostor has a handful of thrilling moments an...,1,1,1.0,3.435346,1,1,1.0,10.724097,...,1.0,3.956189,1,1,1.0,3.990847,2,2,1.0,4.54966
1,1,the acting in pauline and paulette is good all...,3,3,1.0,4.176737,3,3,1.0,10.41509,...,1.0,3.204977,3,3,1.0,4.058478,2,2,1.0,4.841386
2,2,"mr. polanski is in his element here : alone , ...",3,3,1.0,5.978358,3,3,1.0,15.906592,...,1.0,3.941715,3,3,1.0,2.791691,2,2,1.0,4.139649
3,3,a cop story that understands the medium amazin...,4,4,1.0,3.937454,4,4,1.0,11.799683,...,1.0,4.735499,3,3,1.0,4.011998,3,3,1.0,4.119874
4,4,the most horrific movie experience i 've had s...,0,0,1.0,7.08056,0,0,1.0,9.694489,...,1.0,3.476061,1,1,1.0,2.779737,1,1,1.0,4.093793
5,5,"massoud 's story is an epic , but also a trage...",1,1,1.0,5.691549,2,2,1.0,16.106826,...,1.0,3.444676,3,3,1.0,2.773523,2,2,1.0,4.119426
6,6,the actors do n't inhabit their roles -- they ...,1,1,1.0,5.15018,0,0,1.0,11.109907,...,1.0,3.463867,1,1,1.0,2.815481,1,1,1.0,3.839775
7,7,more of a career curio than a major work .\n,1,1,1.0,4.433974,1,1,1.0,10.56248,...,1.0,2.932793,1,1,1.0,2.054406,2,2,1.0,2.98529
8,8,"there 's a lot of good material here , but the...",3,3,1.0,4.640043,1,1,1.0,22.385393,...,1.0,2.186939,1,1,1.0,0.368188,2,2,1.0,2.970219
9,9,if you saw benigni 's pinocchio at a public pa...,1,1,1.0,6.402905,0,0,1.0,15.425891,...,1.0,2.182142,0,0,1.0,0.358037,1,1,1.0,2.961752


### - Métricas por modelo

In [20]:
# Attach the reference labels to the annotations frame (None when there is no
# ground truth), then compute the per-model metrics against that column.
df_annotations["ground_truth"] = ground_truth or None
df_metrics = annotator.evaluate_model_metrics(
    df_annotations,
    output_csv=True,
    ground_truth_col="ground_truth",
)

[32m14:21:50[0m | [1mCalculando m√©tricas por modelo...[0m
[32m14:21:50[0m | [1mTotal de linhas avaliadas: 20[0m
[32m14:21:50[0m | [1mM√©tricas para deepseek-r1-distill-llama-8b: Acc=0.4000, F1=0.3267, Prec=0.2833, Rec=0.4000, Cov=1.0000[0m
[32m14:21:50[0m | [1mM√©tricas para qwen3-8b: Acc=0.5500, F1=0.5107, Prec=0.5375, Rec=0.5500, Cov=1.0000[0m
[32m14:21:50[0m | [1mM√©tricas para llama2-7b: Acc=0.1000, F1=0.0182, Prec=0.0100, Rec=0.1000, Cov=1.0000[0m
[32m14:21:50[0m | [1mM√©tricas para mistral-7b: Acc=0.4000, F1=0.3727, Prec=0.3675, Rec=0.4000, Cov=1.0000[0m
[32m14:21:50[0m | [1mM√©tricas para llama3.1-8b: Acc=0.4500, F1=0.3591, Prec=0.3200, Rec=0.4500, Cov=1.0000[0m
[32m14:21:50[0m | [32m[1mM√©tricas por modelo salvas em: C:\Users\gabri\Documents\GitHub\llm-annotation\data\results\sst1\model_metrics.csv[0m
[32m14:21:50[0m | [32m[1m‚úì M√©tricas calculadas com sucesso[0m


## 5) Calcular Consenso

In [21]:
from src.llm_annotation_system.consensus.consensus_calculator import ConsensusCalculator
from src.llm_annotation_system.consensus.consensus_evaluator import ConsensusEvaluator

# Consensus calculator configured with the notebook's threshold and strategy.
consensus_calculator = ConsensusCalculator(
    default_strategy=consensus_strategy,
    consensus_threshold=consensus_threshold,
)

# The evaluator wraps the calculator and uses the annotator's results directory
# as its output directory.
analyzer = ConsensusEvaluator(
    calculator=consensus_calculator,
    categories=categories,
    output_dir=annotator.results_dir,
)

df_with_consensus = analyzer.compute_consensus(df_annotations)

[32m14:21:50[0m | [1mExecutando c√°lculo de consenso interno...[0m
[32m14:21:50[0m | [1mCalculando consenso...[0m
[32m14:21:50[0m | [32m[1mConsenso calculado:[0m
[32m14:21:50[0m | [1m  Alto (‚â•80%): 0 (0.0%)[0m
[32m14:21:50[0m | [1m  M√©dio (60-80%): 12 (60.0%)[0m
[32m14:21:50[0m | [1m  Baixo (<60%): 8 (40.0%)[0m
[32m14:21:50[0m | [1m  Problem√°ticos: 6 (30.0%)[0m
[32m14:21:50[0m | [1m  Itens que precisam de revis√£o: 0 (0.0%)[0m
[32m14:21:50[0m | [32m[1mC√°lculo de consenso finalizado.[0m


### - Estatísticas

In [22]:
# Mean, median and standard deviation of the per-text consensus score.
logger.info("\nüìä Estat√≠sticas de Consenso:")
consensus_scores = df_with_consensus['consensus_score']
logger.info(f"  M√©dia: {consensus_scores.mean():.2%}")
logger.info(f"  Mediana: {consensus_scores.median():.2%}")
logger.info(f"  Desvio padr√£o: {consensus_scores.std():.2%}")

[32m14:21:50[0m | [1m
üìä Estat√≠sticas de Consenso:[0m
[32m14:21:50[0m | [1m  M√©dia: 52.00%[0m
[32m14:21:50[0m | [1m  Mediana: 60.00%[0m
[32m14:21:50[0m | [1m  Desvio padr√£o: 10.05%[0m


### - Distribuição por nível

In [23]:
# Count of texts per consensus level, logged with each level's share of the total.
levels = df_with_consensus['consensus_level'].value_counts()
total_rows = len(df_with_consensus)

logger.info("Distribui√ß√£o por n√≠vel:")
for level_name, level_count in levels.items():
    share = level_count / total_rows
    logger.info(f"  {level_name}: {level_count} ({share:.1%})")

[32m14:21:50[0m | [1mDistribui√ß√£o por n√≠vel:[0m
[32m14:21:50[0m | [1m  medium: 12 (60.0%)[0m
[32m14:21:50[0m | [1m  low: 8 (40.0%)[0m


### - Visualizar consenso

In [24]:
from src.llm_annotation_system.consensus.consensus_visualizer import ConsensusVisualizer

# The visualizer uses the annotator's results directory as its output directory.
visualizer = ConsensusVisualizer(output_dir=annotator.results_dir)

In [25]:
# Plot the consensus scores together with the per-level counts computed earlier.
visualizer.plot_score_and_levels(
    df_with_consensus=df_with_consensus,
    levels=levels
)

‚úì Gr√°fico salvo: score_and_levels.html


## 7) Análise Detalhada de Consenso

### - Gerando Report

In [26]:
# Build the consensus report; later cells read 'pairwise_agreement',
# 'cohens_kappa', 'problematic_cases' and the Fleiss' kappa entries from it.
report = analyzer.generate_consensus_report(df=df_with_consensus)
logger.success("‚úì Relat√≥rio gerado")

[32m14:21:52[0m | [1mGerando relat√≥rio completo de consenso...[0m
[32m14:21:52[0m | [1mFleiss' Kappa: 0.047 (Muito Fraco)[0m
[32m14:21:52[0m | [1mCasos problem√°ticos: 8[0m
[32m14:21:52[0m | [32m[1mRelat√≥rio de consenso gerado com sucesso.[0m
[32m14:21:52[0m | [32m[1m‚úì Relat√≥rio gerado[0m


### - Pairwise_agreement

In [27]:
# Heatmap of the pairwise agreement table taken from the report.
logger.info("\nüìä Gerando heatmap de concord√¢ncia...")
agreement_matrix = report['pairwise_agreement']
visualizer.plot_agreement_heatmap(
    title='Matriz de Concord√¢ncia entre Modelos',
    agreement_df=agreement_matrix,
)

[32m14:21:52[0m | [1m
üìä Gerando heatmap de concord√¢ncia...[0m


‚úì Heatmap salvo: agreement_heatmap.html


### - Cohen's Kappa

In [28]:
# Heatmap of the Cohen's kappa table taken from the report.
logger.info("\nüìä Gerando heatmap de Cohens_Kappa...")
kappa_matrix = report['cohens_kappa']
visualizer.plot_kappa_heatmap(kappa_df=kappa_matrix)

[32m14:21:52[0m | [1m
üìä Gerando heatmap de Cohens_Kappa...[0m


‚úì Heatmap salvo: kappa_heatmap.html


### - Casos problemáticos

In [29]:
# Show the problematic cases from the report, if it contains any.
problematic = report.get('problematic_cases')
has_problematic_cases = problematic is not None and len(problematic) > 0

if not has_problematic_cases:
    logger.success("\n‚úì Nenhum caso problem√°tico identificado")
else:
    logger.warning(f"\n‚ö†Ô∏è  {len(problematic)} casos problem√°ticos identificados")
    display(problematic)

[32m14:21:52[0m | [33m[1m
‚ö†Ô∏è  8 casos problem√°ticos identificados[0m


Unnamed: 0,text_id,text,consensus_score,annotations,entropy
0,4,the most horrific movie experience i 've had s...,0.4,"{0: 2, 4: 1, 1: 2}",1.521928
1,5,"massoud 's story is an epic , but also a trage...",0.4,"{1: 1, 2: 2, 4: 1, 3: 1}",1.921928
2,8,"there 's a lot of good material here , but the...",0.4,"{3: 1, 1: 2, 4: 1, 2: 1}",1.921928
3,9,if you saw benigni 's pinocchio at a public pa...,0.4,"{1: 2, 0: 2, 4: 1}",1.521928
4,12,meant to reduce blake 's philosophy into a tra...,0.4,"{1: 2, 4: 1, 2: 2}",1.521928
5,13,deliberately and skillfully uses ambiguity to ...,0.4,"{3: 2, 4: 1, 2: 2}",1.521928
6,17,what they 're doing is a matter of plumbing ar...,0.4,"{1: 2, 4: 1, 2: 2}",1.521928
7,19,"the results , if not memorable , are at least ...",0.4,"{3: 2, 4: 1, 2: 2}",1.521928


## 8) Validação com Ground Truth

In [30]:
# Validate the consensus against ground truth when it exists; otherwise just log
# that validation is skipped. `accuracy`, `cls_report` and `cm` are only defined
# on the ground-truth path.
if not ground_truth:
    logger.info("‚ö†Ô∏è Ground truth n√£o dispon√≠vel ‚Äì pulando valida√ß√£o")
else:
    accuracy, cls_report, cm = analyzer.evaluate_ground_truth(
        df_with_consensus=df_with_consensus,
    )
    visualizer.plot_confusion_matrix(categories=categories, cm=cm)

[32m14:21:52[0m | [32m[1m
üéØ Accuracy: 50.00%[0m
[32m14:21:52[0m | [1m
Classification Report:[0m


{'0': {'precision': 1.0, 'recall': 0.6666666666666666, 'f1-score': 0.8, 'support': 3.0}, '1': {'precision': 0.5, 'recall': 0.8333333333333334, 'f1-score': 0.625, 'support': 6.0}, '2': {'precision': 0.5, 'recall': 0.16666666666666666, 'f1-score': 0.25, 'support': 6.0}, '3': {'precision': 0.4, 'recall': 0.6666666666666666, 'f1-score': 0.5, 'support': 3.0}, '4': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2.0}, 'accuracy': 0.5, 'macro avg': {'precision': 0.48, 'recall': 0.4666666666666667, 'f1-score': 0.43499999999999994, 'support': 20.0}, 'weighted avg': {'precision': 0.51, 'recall': 0.5, 'f1-score': 0.4575, 'support': 20.0}}


‚úì Matriz de confus√£o salva em: C:\Users\gabri\Documents\GitHub\llm-annotation\data\results\sst1\graphics\confusion_matrix.html


## 9) Exportar Resultados

In [31]:
import json
from pathlib import Path

# Export directory: "<annotator results dir>/summary".
# NOTE(review): this rebinds the notebook-level `results_dir` that was read from the
# experiment config and passed to LLMAnnotator; re-running the annotator cell after
# this one would hand it the summary directory instead. The final summary cell logs
# this rebound value.
results_dir = annotator.results_dir.joinpath("summary")
results_dir.mkdir(parents=True, exist_ok=True)

# Full annotated dataset.
df_with_consensus.to_csv(results_dir / 'dataset_anotado_completo.csv', index=False)
logger.info(f"‚úì Salvos: {len(df_with_consensus)} registros")

# Split on the configured `consensus_threshold` (the value given to
# ConsensusCalculator) instead of repeating the literal, so the exported files
# cannot drift from the threshold used for the consensus itself.
consensus_scores = df_with_consensus['consensus_score']

# High confidence: score at or above the threshold.
high_conf = df_with_consensus[consensus_scores >= consensus_threshold]
high_conf.to_csv(results_dir / 'alta_confianca.csv', index=False)
logger.info(f"‚úì Alta confian√ßa: {len(high_conf)} registros")

# Needs review: score below the threshold.
low_conf = df_with_consensus[consensus_scores < consensus_threshold]
low_conf.to_csv(results_dir / 'necessita_revisao.csv', index=False)
logger.info(f"‚úì Necessita revis√£o: {len(low_conf)} registros")

# JSON summary of the experiment.
summary = {
    'dataset': {
        'name': dataset_name,
        'total_texts': len(texts),
        'categories': categories,
        # Truthiness, matching the `if ground_truth:` checks used by the other
        # cells (an empty label list counts as "no ground truth").
        'has_ground_truth': bool(ground_truth)
    },
    'config': {
        'models': DEFAULT_MODELS,
        'total_models': len(annotator.models),
        'use_alternative_params': annotator.use_alternative_params,
        'num_repetitions': num_repetitions,
        'total_annotations': len(texts) * len(annotator.models) * num_repetitions
    },
    'results': {
        'consensus_mean': float(consensus_scores.mean()),
        'consensus_median': float(consensus_scores.median()),
        'high_consensus': int((df_with_consensus['consensus_level'] == 'high').sum()),
        'medium_consensus': int((df_with_consensus['consensus_level'] == 'medium').sum()),
        'low_consensus': int((df_with_consensus['consensus_level'] == 'low').sum()),
    },
    'metrics': {
        'fleiss_kappa': float(report['fleiss_kappa']),
        'fleiss_interpretation': report['fleiss_interpretation']
    }
}

# `cls_report` only exists when the validation cell ran its ground-truth branch.
if ground_truth:
    summary['validation'] = {
        'classification_report': cls_report
    }

# Write with an explicit encoding so the file does not depend on the platform locale.
with open(results_dir / 'sumario_experimento.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

logger.success("\n‚úì Resultados exportados com sucesso!")

[32m14:21:52[0m | [1m‚úì Salvos: 20 registros[0m
[32m14:21:52[0m | [1m‚úì Alta confian√ßa: 0 registros[0m
[32m14:21:52[0m | [1m‚úì Necessita revis√£o: 20 registros[0m
[32m14:21:52[0m | [32m[1m
‚úì Resultados exportados com sucesso![0m


## 10) Resumo Final

In [32]:
# Final recap of the run: dataset, configuration, consensus, validation, output
# location and cache size, logged section by section.
separator = "=" * 80
logger.info("\n" + separator)
logger.success("RESUMO DO EXPERIMENTO")
logger.info(separator)

# Dataset
for line in (
    f"\nüìä Dataset: {dataset_name}",
    f"  Textos: {len(texts)}",
    f"  Categorias: {len(categories)}",
):
    logger.info(line)

# Configuration
for line in (
    "\nü§ñ Configura√ß√£o:",
    f"  Modelos base: {len(DEFAULT_MODELS)}",
    f"  Total modelos: {len(annotator.models)}",
    f"  Alternative params: {annotator.use_alternative_params}",
    f"  Repeti√ß√µes: {num_repetitions}",
):
    logger.info(line)

# Consensus
mean_consensus = df_with_consensus['consensus_score'].mean()
for line in (
    "\nüìà Consenso:",
    f"  M√©dia: {mean_consensus:.2%}",
    f"  Fleiss' Kappa: {report['fleiss_kappa']:.3f} ({report['fleiss_interpretation']})",
):
    logger.info(line)

# Validation (`accuracy` is only defined when ground truth was available)
if ground_truth:
    logger.info("\nüéØ Valida√ß√£o:")
    logger.info(f"  Accuracy: {accuracy:.2%}")

logger.info(f"\nüìÅ Arquivos gerados em: {results_dir}/")

cache_stats = annotator.get_cache_stats()
logger.info(f"\nüíæ Cache: {cache_stats['total_entries']} entradas")

logger.success("\n‚úÖ An√°lise completa!")

[32m14:21:52[0m | [1m
[32m14:21:52[0m | [32m[1mRESUMO DO EXPERIMENTO[0m
[32m14:21:52[0m | [1m
üìä Dataset: sst1[0m
[32m14:21:52[0m | [1m  Textos: 20[0m
[32m14:21:52[0m | [1m  Categorias: 5[0m
[32m14:21:52[0m | [1m
ü§ñ Configura√ß√£o:[0m
[32m14:21:52[0m | [1m  Modelos base: 5[0m
[32m14:21:52[0m | [1m  Total modelos: 5[0m
[32m14:21:52[0m | [1m  Alternative params: False[0m
[32m14:21:52[0m | [1m  Repeti√ß√µes: 1[0m
[32m14:21:52[0m | [1m
üìà Consenso:[0m
[32m14:21:52[0m | [1m  M√©dia: 52.00%[0m
[32m14:21:52[0m | [1m  Fleiss' Kappa: 0.047 (Muito Fraco)[0m
[32m14:21:52[0m | [1m
üéØ Valida√ß√£o:[0m
[32m14:21:52[0m | [1m  Accuracy: 50.00%[0m
[32m14:21:52[0m | [1m
üìÅ Arquivos gerados em: C:\Users\gabri\Documents\GitHub\llm-annotation\data\results\sst1\summary/[0m
[32m14:21:52[0m | [1m
üíæ Cache: 17361 entradas[0m
[32m14:21:52[0m | [32m[1m
‚úÖ An√°lise completa![0m
