In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score
from scipy.stats import kendalltau, spearmanr
import os
import json
from tqdm.notebook import tqdm

print("--- Iniciando a Análise Final e Completa ---")

# --- 1. CONFIGURAÇÃO ---
OUTPUTS_DIR = 'outputs'
SELF_ASSESSMENT_PATH = 'gt_self_assessment_ranking.csv'
JUDGES_HARD_VOTE_PATH = 'gt_judges_hard_vote_ranking.csv'
JUDGES_SOFT_VOTE_PATH = 'gt_judges_soft_vote_ranking.csv'

# --- 2. CARREGAMENTO E PROCESSAMENTO DOS DADOS BRUTOS DOS LLMs ---
print("\n[ETAPA 1/6] Lendo e processando os arquivos JSON dos LLMs...")
all_runs_data = []
try:
    for root, dirs, files in os.walk(OUTPUTS_DIR):
        for filename in files:
            if filename.endswith('.json'):
                file_path = os.path.join(root, filename)
                try:
                    parts = file_path.split(os.sep)
                    prompt_num = int(parts[1].split('_')[1])
                    debate_num = int(parts[2].split('_')[1])
                    model_name = filename.split('_')[0]
                    run_num = int(filename.split('_')[2].split('.')[0])
                except (ValueError, IndexError): continue

                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                if "error" in data: continue

                for debater in data.get('debaters', []):
                    debater_name = debater.get('name')
                    performance_analysis = debater.get('performance', {}).get('performance_analysis', debater.get('performance_evaluation', ''))
                    
                    if prompt_num == 1:
                        score = debater.get('overall_score')
                        if score is not None:
                            all_runs_data.append({'prompt': 1, 'debate': debate_num, 'model': model_name, 'run': run_num, 'debater': debater_name, 'criterion': 'overall_score', 'score': score, 'analysis_text': performance_analysis})
                    elif prompt_num == 3:
                        positive_events = debater.get('positive_events', {})
                        negative_events = debater.get('negative_events', {})
                        score = sum(len(v) for v in positive_events.values()) - sum(len(v) for v in negative_events.values())
                        all_runs_data.append({'prompt': 3, 'debate': debate_num, 'model': model_name, 'run': run_num, 'debater': debater_name, 'criterion': 'total_event_score', 'score': score, 'analysis_text': performance_analysis})
                    else: # Prompts 2 e 4
                        scores_data = debater.get('scores', {})
                        if not scores_data: scores_data = {k: v.get('score') for k, v in debater.get('evaluation_aspects', {}).items()}
                        for criterion, score in scores_data.items():
                            all_runs_data.append({'prompt': prompt_num, 'debate': debate_num, 'model': model_name, 'run': run_num, 'debater': debater_name, 'criterion': criterion, 'score': score, 'analysis_text': performance_analysis})
    
    raw_df = pd.DataFrame(all_runs_data)
    print(f">>> Leitura de {len(raw_df)} registros brutos concluída.")
except Exception as e:
    print(f"ERRO na leitura dos JSONs: {e}")

# --- 3. CARREGAMENTO DOS DADOS DE GROUND TRUTH ---
print("\n[ETAPA 2/6] Carregando arquivos de Ground Truth...")
try:
    self_assessment_ranking_df = pd.read_csv(SELF_ASSESSMENT_PATH)
    judges_hard_vote_ranking_df = pd.read_csv(JUDGES_HARD_VOTE_PATH)
    judges_soft_vote_ranking_df = pd.read_csv(JUDGES_SOFT_VOTE_PATH)
    print(">>> Dados de Ground Truth carregados com sucesso.")
except Exception as e:
    print(f"ERRO ao carregar arquivos de Ground Truth: {e}")

# --- 4. PREPARAÇÃO E PADRONIZAÇÃO FINAL DOS RANKINGS ---
print("\n[ETAPA 3/6] Preparando e padronizando rankings...")
try:
    def standardize_debater_name(df, col_name='debater'):
        if col_name in df.columns:
            df[col_name] = df[col_name].astype(str).str.upper().str.replace(' ', '_').str.replace('DEBATER_', 'DEBATER_')
        return df

    raw_df = standardize_debater_name(raw_df)
    self_assessment_ranking_df = standardize_debater_name(self_assessment_ranking_df)
    judges_hard_vote_ranking_df = standardize_debater_name(judges_hard_vote_ranking_df)
    judges_soft_vote_ranking_df = standardize_debater_name(judges_soft_vote_ranking_df)

    agg_df = raw_df.groupby(['prompt', 'debate', 'model', 'debater', 'criterion'])['score'].mean().reset_index()
    total_scores = agg_df.groupby(['prompt', 'debate', 'model', 'debater'])['score'].sum().reset_index()
    total_scores['rank'] = total_scores.groupby(['prompt', 'debate', 'model'])['score'].rank(method='dense', ascending=False).astype(int)
    final_llm_rankings = total_scores.sort_values(by=['prompt', 'debate', 'model', 'rank'])
    print(">>> Rankings dos LLMs e Ground Truth preparados e padronizados.")
except Exception as e:
    print(f"Ocorreu um erro na preparação dos rankings: {e}")

# --- 5. FUNÇÕES DE MÉTRICA ---
def calculate_winners_accuracy(predictions, ground_truth):
    pred_winners = predictions[predictions['rank'] == 1]
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(pred_winners, gt_winners[['debate', 'gt_debater']], on='debate')
    correct = (merged['debater'] == merged['gt_debater'])
    return correct.groupby([merged['prompt'], merged['model']]).mean().rename('winners_accuracy')

def calculate_debaters_accuracy(predictions, ground_truth):
    merged = pd.merge(predictions, ground_truth, on=['debate', 'debater'], suffixes=('_pred', '_gt'))
    merged['is_correct'] = (merged['rank_pred'] == merged['rank_gt'])
    return merged.groupby(['prompt', 'model'])['is_correct'].mean().rename('debaters_accuracy')

def calculate_mrr(predictions, ground_truth):
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(predictions, gt_winners[['debate', 'gt_debater']], on='debate', how='left')
    correct_predictions = merged[merged['debater'] == merged['gt_debater']]
    first_correct_rank = correct_predictions.groupby(['prompt', 'debate', 'model'])['rank'].min()
    return (1 / first_correct_rank).groupby(['prompt', 'model']).mean().rename('mrr')

def calculate_ndcg(predictions, ground_truth):
    ground_truth['relevance'] = 1 / ground_truth['rank']
    merged = pd.merge(predictions, ground_truth[['debate', 'debater', 'relevance']], on=['debate', 'debater'], how='left').fillna(0)
    results = {}
    for name, group in merged.groupby(['prompt', 'model', 'debate']):
        prompt, model, debate = name
        if len(group) < 2: continue
        true_relevance = np.asarray([group.sort_values(by='relevance', ascending=False)['relevance'].values])
        predicted_relevance = np.asarray([group.sort_values(by='rank')['relevance'].values])
        ndcg = ndcg_score(true_relevance, predicted_relevance)
        if (prompt, model) not in results: results[(prompt, model)] = []
        results[(prompt, model)].append(ndcg)
    final_ndcg = {k: np.mean(v) for k, v in results.items()}
    return pd.Series(final_ndcg, name='ndcg').rename_axis(['prompt', 'model'])

def calculate_rank_correlation(predictions, ground_truth, method='kendall'):
    merged = pd.merge(predictions, ground_truth, on=['debate', 'debater'], suffixes=('_pred', '_gt'))
    correlations = []
    for name, group in merged.groupby(['prompt', 'model', 'debate']):
        if len(group['rank_pred']) == len(group['rank_gt']) and len(group) > 1:
            if method == 'kendall': corr, _ = kendalltau(group['rank_pred'], group['rank_gt'])
            elif method == 'spearman': corr, _ = spearmanr(group['rank_pred'], group['rank_gt'])
            else: corr = np.nan
            correlations.append({'prompt': name[0], 'model': name[1], 'correlation': corr})
    corr_df = pd.DataFrame(correlations)
    return corr_df.groupby(['prompt', 'model'])['correlation'].mean()


# --- 6. EXECUÇÃO FINAL E EXIBIÇÃO ---
print("\n[ETAPA 4/6] Calculando as métricas finais...")
ground_truths = {
    "vs_SelfAssessment": self_assessment_ranking_df,
    "vs_Judges_HardVote": judges_hard_vote_ranking_df,
    "vs_Judges_SoftVote": judges_soft_vote_ranking_df
}
final_results_list = []

for gt_name, gt_df in ground_truths.items():
    common_debates = gt_df['debate'].unique()
    predictions_filtered = final_llm_rankings[final_llm_rankings['debate'].isin(common_debates)]
    
    acc = calculate_winners_accuracy(predictions_filtered, gt_df)
    debaters_acc = calculate_debaters_accuracy(predictions_filtered, gt_df)
    mrr = calculate_mrr(predictions_filtered, gt_df)
    ndcg = calculate_ndcg(predictions_filtered, gt_df)
    kendall = calculate_rank_correlation(predictions_filtered, gt_df, method='kendall').rename('kendall_tau')
    spearman = calculate_rank_correlation(predictions_filtered, gt_df, method='spearman').rename('spearman_rho')
    
    result_df = pd.concat([acc, debaters_acc, mrr, ndcg, kendall, spearman], axis=1)
    result_df['comparison'] = gt_name
    final_results_list.append(result_df)

final_summary_df = pd.concat(final_results_list).reset_index()
final_summary_df = final_summary_df.set_index(['comparison', 'prompt', 'model']).sort_index()

print("\n[ETAPA 5/6] Tabela de Resultados Finais:")
display(final_summary_df)

print("\n[ETAPA 6/6] Salvando tabela de resultados...")
final_summary_df.to_csv('final_summary_results_full.csv')
print("\nTabela de resultados finais salva em 'final_summary_results_full.csv'")
print("\n--- Análise concluída! ---")

--- Iniciando a Análise Final e Completa ---

[ETAPA 1/6] Lendo e processando os arquivos JSON dos LLMs...
>>> Leitura de 12259 registros brutos concluída.

[ETAPA 2/6] Carregando arquivos de Ground Truth...
>>> Dados de Ground Truth carregados com sucesso.

[ETAPA 3/6] Preparando e padronizando rankings...
>>> Rankings dos LLMs e Ground Truth preparados e padronizados.

[ETAPA 4/6] Calculando as métricas finais...

[ETAPA 5/6] Tabela de Resultados Finais:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,winners_accuracy,debaters_accuracy,mrr,ndcg,kendall_tau,spearman_rho
comparison,prompt,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
vs_Judges_HardVote,1,claude,0.555556,0.411765,0.791667,0.925408,0.592781,0.674839
vs_Judges_HardVote,1,gemini,0.705882,0.455882,0.848958,0.95368,0.630921,0.680794
vs_Judges_HardVote,1,gpt4o,0.444444,0.382353,0.689583,0.908007,0.336921,0.365731
vs_Judges_HardVote,2,claude,0.588235,0.411765,0.802083,0.951635,0.618296,0.715615
vs_Judges_HardVote,2,gemini,0.529412,0.397059,0.734375,0.929081,0.50094,0.570866
vs_Judges_HardVote,2,gpt4o,0.529412,0.411765,0.734375,0.931477,0.57663,0.635609
vs_Judges_HardVote,3,claude,0.5,0.323529,0.75,0.919716,0.520181,0.598676
vs_Judges_HardVote,3,gemini,0.444444,0.355932,0.710714,0.781333,0.476835,0.528962
vs_Judges_HardVote,3,gpt4o,0.333333,0.279412,0.729167,0.870884,0.186197,0.200352
vs_Judges_HardVote,4,claude,0.444444,0.375,0.794872,0.752186,0.505294,0.60839



[ETAPA 6/6] Salvando tabela de resultados...

Tabela de resultados finais salva em 'final_summary_results_full.csv'

--- Análise concluída! ---


In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score
from scipy.stats import kendalltau, spearmanr # <-- Adicionado import

print("--- Iniciando Comparação: Autoavaliação vs. Juízes (com todas as métricas) ---")

# --- 1. CARREGAMENTO DOS DADOS DE GROUND TRUTH ---
try:
    self_assessment_df = pd.read_csv('gt_self_assessment_ranking.csv')
    judges_hard_vote_df = pd.read_csv('gt_judges_hard_vote_ranking.csv')
    judges_soft_vote_df = pd.read_csv('gt_judges_soft_vote_ranking.csv')
    print(">>> Arquivos de Ground Truth carregados com sucesso.")
except Exception as e:
    print(f"ERRO: Não foi possível carregar os arquivos .csv. Verifique se eles existem. Erro: {e}")

# --- 2. FUNÇÕES PARA CÁLCULO DAS MÉTRICAS ---

def calculate_winners_accuracy(predictions, ground_truth):
    pred_winners = predictions[predictions['rank'] == 1]
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(pred_winners, gt_winners[['debate', 'gt_debater']], on='debate')
    correct = (merged['debater'] == merged['gt_debater'])
    return correct.mean()

def calculate_debaters_accuracy(predictions, ground_truth):
    merged = pd.merge(predictions, ground_truth, on=['debate', 'debater'], suffixes=('_pred', '_gt'))
    merged['is_correct'] = (merged['rank_pred'] == merged['rank_gt'])
    return merged['is_correct'].mean()

def calculate_mrr(predictions, ground_truth):
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(predictions, gt_winners[['debate', 'gt_debater']], on='debate')
    correct_predictions = merged[merged['debater'] == merged['gt_debater']]
    first_correct_rank = correct_predictions.groupby('debate')['rank'].min()
    mrr = (1 / first_correct_rank).mean()
    return mrr

def calculate_ndcg(predictions, ground_truth):
    ground_truth['relevance'] = 1 / ground_truth['rank']
    merged = pd.merge(predictions, ground_truth[['debate', 'debater', 'relevance']], on=['debate', 'debater'], how='left').fillna(0)
    ndcg_scores = []
    for debate_id, group in merged.groupby('debate'):
        if len(group) < 2: continue
        true_relevance = np.asarray([group.sort_values(by='relevance', ascending=False)['relevance'].values])
        predicted_relevance = np.asarray([group.sort_values(by='rank')['relevance'].values])
        ndcg_scores.append(ndcg_score(true_relevance, predicted_relevance))
    return np.mean(ndcg_scores) if ndcg_scores else np.nan

# --- NOVA FUNÇÃO ADICIONADA AQUI ---
def calculate_rank_correlation(predictions, ground_truth, method='kendall'):
    merged = pd.merge(predictions, ground_truth, on=['debate', 'debater'], suffixes=('_pred', '_gt'))
    correlations = []
    for debate_id, group in merged.groupby('debate'):
        if len(group) > 1: # Precisa de mais de 1 item para calcular correlação
            if method == 'kendall':
                corr, _ = kendalltau(group['rank_pred'], group['rank_gt'])
            elif method == 'spearman':
                corr, _ = spearmanr(group['rank_pred'], group['rank_gt'])
            else:
                corr = np.nan
            correlations.append(corr)
    return np.mean(correlations) if correlations else np.nan


# --- 3. EXECUÇÃO DAS COMPARAÇÕES ---
print("\n[ETAPA 2/2] Calculando as métricas de comparação...")

results = {}
common_debates = self_assessment_df['debate'].unique()
judges_hard_filtered = judges_hard_vote_df[judges_hard_vote_df['debate'].isin(common_debates)]
judges_soft_filtered = judges_soft_vote_df[judges_soft_vote_df['debate'].isin(common_debates)]

# Adicionando Debaters Accuracy e as novas métricas de correlação
results['vs_Judges_HardVote'] = {
    'winners_accuracy': calculate_winners_accuracy(self_assessment_df, judges_hard_filtered),
    'debaters_accuracy': calculate_debaters_accuracy(self_assessment_df, judges_hard_filtered),
    'mrr': calculate_mrr(self_assessment_df, judges_hard_filtered),
    'ndcg': calculate_ndcg(self_assessment_df, judges_hard_filtered),
    'kendall_tau': calculate_rank_correlation(self_assessment_df, judges_hard_filtered, method='kendall'),
    'spearman_rho': calculate_rank_correlation(self_assessment_df, judges_hard_filtered, method='spearman')
}

results['vs_Judges_SoftVote'] = {
    'winners_accuracy': calculate_winners_accuracy(self_assessment_df, judges_soft_filtered),
    'debaters_accuracy': calculate_debaters_accuracy(self_assessment_df, judges_soft_filtered),
    'mrr': calculate_mrr(self_assessment_df, judges_soft_filtered),
    'ndcg': calculate_ndcg(self_assessment_df, judges_soft_filtered),
    'kendall_tau': calculate_rank_correlation(self_assessment_df, judges_soft_filtered, method='kendall'),
    'spearman_rho': calculate_rank_correlation(self_assessment_df, judges_soft_filtered, method='spearman')
}

# --- 4. EXIBIÇÃO DOS RESULTADOS ---
summary_df = pd.DataFrame(results).T
print("\n--- RESULTADOS FINAIS: AUTOAVALIAÇÃO vs. JUÍZES (COM TODAS AS MÉTRICAS) ---")
display(summary_df)

--- Iniciando Comparação: Autoavaliação vs. Juízes (com todas as métricas) ---
>>> Arquivos de Ground Truth carregados com sucesso.

[ETAPA 2/2] Calculando as métricas de comparação...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ground_truth['relevance'] = 1 / ground_truth['rank']



--- RESULTADOS FINAIS: AUTOAVALIAÇÃO vs. JUÍZES (COM TODAS AS MÉTRICAS) ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ground_truth['relevance'] = 1 / ground_truth['rank']


Unnamed: 0,winners_accuracy,debaters_accuracy,mrr,ndcg,kendall_tau,spearman_rho
vs_Judges_HardVote,0.555556,0.55,0.845238,0.95486,0.636508,0.693943
vs_Judges_SoftVote,0.529412,0.316667,0.809524,0.939015,0.582101,0.659188


In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score

print("--- Iniciando a Análise Final e Cálculo de Métricas ---")

# --- 1. CARREGAMENTO DE TODOS OS DADOS PREPARADOS ---
print("\n[ETAPA 1/4] Carregando todos os dataframes...")
try:
    llm_scores_df = pd.read_csv('aggregated_results.csv')
    llm_winners_p1_df = pd.read_csv('aggregated_results_prompt_1_winners.csv')
    self_assessment_ranking_df = pd.read_csv('gt_self_assessment_ranking.csv')
    judges_hard_vote_ranking_df = pd.read_csv('gt_judges_hard_vote_ranking.csv')
    judges_soft_vote_ranking_df = pd.read_csv('gt_judges_soft_vote_ranking.csv')
    print(">>> Dados carregados com sucesso.")
except Exception as e:
    print(f"ERRO: Arquivo não encontrado. Verifique se todos os arquivos .csv estão na pasta. Detalhe: {e}")

# --- 2. PREPARAÇÃO E PADRONIZAÇÃO DOS RANKINGS ---
print("\n[ETAPA 2/4] Preparando e padronizando os rankings...")
try:
    # Padroniza nomes dos debatedores em todos os DataFrames para 'DEBATER_X'
    def standardize_debater_name(df, col_name='debater'):
        if col_name in df.columns:
            df[col_name] = df[col_name].astype(str).str.upper().str.replace(' ', '_')
        return df

    llm_scores_df = standardize_debater_name(llm_scores_df)
    for col in llm_winners_p1_df.columns:
        if col.lower() != 'debate':
            llm_winners_p1_df[col] = standardize_debater_name(llm_winners_p1_df, col)
            
    self_assessment_ranking_df = standardize_debater_name(self_assessment_ranking_df)
    judges_hard_vote_ranking_df = standardize_debater_name(judges_hard_vote_ranking_df)
    judges_soft_vote_ranking_df = standardize_debater_name(judges_soft_vote_ranking_df)

    # Prepara rankings dos LLMs para prompts 2, 3 e 4
    llm_total_scores = llm_scores_df.groupby(['prompt', 'debate', 'model', 'debater'])['score'].sum().reset_index()
    llm_total_scores['rank'] = llm_total_scores.groupby(['prompt', 'debate', 'model'])['score'].rank(method='dense', ascending=False).astype(int)
    llm_ranking_p234 = llm_total_scores.sort_values(by=['prompt', 'debate', 'model', 'rank'])

    # Prepara rankings dos LLMs para prompt 1
    llm_winners_p1_long = llm_winners_p1_df.melt(id_vars='debate', var_name='model', value_name='debater')
    llm_winners_p1_long['prompt'] = 1
    llm_winners_p1_long = standardize_debater_name(llm_winners_p1_long)

    all_debaters = judges_soft_vote_ranking_df[['debate', 'debater']].drop_duplicates()
    llm_winners_p1_ranked = pd.merge(all_debaters, llm_winners_p1_long, on=['debate', 'debater'], how='left')
    llm_winners_p1_ranked['rank'] = np.where(llm_winners_p1_ranked['model'].notna(), 1, 2)
    
    final_llm_rankings = pd.concat([llm_winners_p1_ranked, llm_ranking_p234])
    print(">>> Rankings dos LLMs e Ground Truth preparados e padronizados.")
except Exception as e:
    print(f"Ocorreu um erro na preparação dos rankings: {e}")

# --- 3. FUNÇÕES PARA CÁLCULO DAS MÉTRICAS ---
def calculate_winners_accuracy(predictions, ground_truth):
    pred_winners = predictions[predictions['rank'] == 1]
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(pred_winners, gt_winners[['debate', 'gt_debater']], on='debate')
    correct = (merged['debater'] == merged['gt_debater'])
    accuracy = correct.groupby([merged['prompt'], merged['model']]).mean().rename('winners_accuracy')
    return accuracy

def calculate_mrr(predictions, ground_truth):
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(predictions, gt_winners[['debate', 'gt_debater']], on='debate', how='left')
    correct_predictions = merged[merged['debater'] == merged['gt_debater']]
    first_correct_rank = correct_predictions.groupby(['prompt', 'debate', 'model'])['rank'].min()
    mrr = (1 / first_correct_rank).groupby(['prompt', 'model']).mean().rename('mrr')
    return mrr

def calculate_ndcg(predictions, ground_truth):
    ground_truth['relevance'] = 1 / ground_truth['rank']
    merged = pd.merge(predictions, ground_truth[['debate', 'debater', 'relevance']], on=['debate', 'debater'], how='left').fillna(0)
    
    results = {}
    for name, group in merged.groupby(['prompt', 'model', 'debate']):
        prompt, model, debate = name
        if len(group) < 2: continue
        
        true_relevance = np.asarray([group.sort_values(by='relevance', ascending=False)['relevance'].values])
        predicted_relevance = np.asarray([group.sort_values(by='rank')['relevance'].values])
        
        ndcg = ndcg_score(true_relevance, predicted_relevance)
        if (prompt, model) not in results: results[(prompt, model)] = []
        results[(prompt, model)].append(ndcg)
        
    final_ndcg = {k: np.mean(v) for k, v in results.items()}
    return pd.Series(final_ndcg, name='ndcg').rename_axis(['prompt', 'model'])

# --- 4. EXECUÇÃO DAS ANÁLISES ---
print("\n[ETAPA 3/4] Calculando as métricas finais...")

ground_truths = {
    "vs_SelfAssessment": self_assessment_ranking_df,
    "vs_Judges_HardVote": judges_hard_vote_ranking_df,
    "vs_Judges_SoftVote": judges_soft_vote_ranking_df
}
final_results_list = []

for gt_name, gt_df in ground_truths.items():
    common_debates = gt_df['debate'].unique()
    predictions_filtered = final_llm_rankings[final_llm_rankings['debate'].isin(common_debates)]
        
    acc = calculate_winners_accuracy(predictions_filtered, gt_df)
    mrr = calculate_mrr(predictions_filtered, gt_df)
    ndcg = calculate_ndcg(predictions_filtered, gt_df)
    
    result_df = pd.concat([acc, mrr, ndcg], axis=1)
    result_df['comparison'] = gt_name
    final_results_list.append(result_df)

final_summary_df = pd.concat(final_results_list).reset_index()
final_summary_df = final_summary_df.set_index(['comparison', 'prompt', 'model']).sort_index()

print("\n[ETAPA 4/4] Tabela de Resultados Finais:")
display(final_summary_df)

final_summary_df.to_csv('final_summary_results.csv')
print("\nTabela de resultados finais salva em 'final_summary_results.csv'")

--- Iniciando a Análise Final e Cálculo de Métricas ---

[ETAPA 1/4] Carregando todos os dataframes...
>>> Dados carregados com sucesso.

[ETAPA 2/4] Preparando e padronizando os rankings...
>>> Rankings dos LLMs e Ground Truth preparados e padronizados.

[ETAPA 3/4] Calculando as métricas finais...

[ETAPA 4/4] Tabela de Resultados Finais:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,winners_accuracy,mrr,ndcg
comparison,prompt,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
vs_Judges_HardVote,0.0,0,,,0.982091
vs_Judges_HardVote,1.0,claude,0.555556,0.791667,0.925408
vs_Judges_HardVote,1.0,gemini,0.705882,0.848958,0.95368
vs_Judges_HardVote,1.0,gpt4o,0.444444,0.689583,0.908007
vs_Judges_HardVote,2.0,claude,0.588235,0.802083,0.951635
vs_Judges_HardVote,2.0,gemini,0.529412,0.734375,0.929081
vs_Judges_HardVote,2.0,gpt4o,0.529412,0.734375,0.931477
vs_Judges_HardVote,4.0,claude,0.444444,0.794872,0.752186
vs_Judges_HardVote,4.0,gemini,0.588235,0.811111,0.857898
vs_Judges_HardVote,4.0,gpt4o,0.555556,0.765625,0.917397



Tabela de resultados finais salva em 'final_summary_results.csv'


In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score

print("--- Iniciando a Análise Final e Cálculo de Métricas (v2 - Completa) ---")

# --- 1. CARREGAMENTO DE TODOS OS DADOS PREPARADOS ---
print("\n[ETAPA 1/5] Carregando todos os dataframes...")
try:
    llm_scores_df = pd.read_csv('aggregated_results.csv')
    llm_winners_p1_df = pd.read_csv('aggregated_results_prompt_1_winners.csv')
    self_assessment_ranking_df = pd.read_csv('gt_self_assessment_ranking.csv')
    judges_hard_vote_ranking_df = pd.read_csv('gt_judges_hard_vote_ranking.csv')
    judges_soft_vote_ranking_df = pd.read_csv('gt_judges_soft_vote_ranking.csv')
    print(">>> Dados carregados com sucesso.")
except Exception as e:
    print(f"ERRO: Arquivo não encontrado. Verifique se todos os arquivos .csv estão na pasta. Detalhe: {e}")

# --- 2. PREPARAÇÃO E PADRONIZAÇÃO DOS RANKINGS ---
print("\n[ETAPA 2/5] Preparando e padronizando os rankings...")
try:
    def standardize_debater_name(df, col_name='debater'):
        if col_name in df.columns:
            df[col_name] = df[col_name].astype(str).str.upper().str.replace(' ', '_').str.replace('_', '_', regex=False)
        return df

    llm_scores_df = standardize_debater_name(llm_scores_df)
    # Padroniza nomes de debatedores no llm_winners_p1_df
    id_vars = ['debate']
    value_vars = [col for col in llm_winners_p1_df.columns if col != 'debate']
    llm_winners_p1_df_long = llm_winners_p1_df.melt(id_vars=id_vars, value_vars=value_vars, var_name='model', value_name='debater')
    llm_winners_p1_df_long = standardize_debater_name(llm_winners_p1_df_long)
    llm_winners_p1_df = llm_winners_p1_df_long.pivot(index='debate', columns='model', values='debater').reset_index()


    self_assessment_ranking_df = standardize_debater_name(self_assessment_ranking_df)
    judges_hard_vote_ranking_df = standardize_debater_name(judges_hard_vote_ranking_df)
    judges_soft_vote_ranking_df = standardize_debater_name(judges_soft_vote_ranking_df)

    # Prepara rankings dos LLMs para prompts 2, 3 e 4
    llm_total_scores = llm_scores_df.groupby(['prompt', 'debate', 'model', 'debater'])['score'].sum().reset_index()
    llm_total_scores['rank'] = llm_total_scores.groupby(['prompt', 'debate', 'model'])['score'].rank(method='dense', ascending=False).astype(int)
    llm_ranking_p234 = llm_total_scores.sort_values(by=['prompt', 'debate', 'model', 'rank'])

    # Prepara rankings dos LLMs para prompt 1
    llm_winners_p1_long = llm_winners_p1_df.melt(id_vars='debate', var_name='model', value_name='debater')
    llm_winners_p1_long['prompt'] = 1

    all_debaters = judges_soft_vote_ranking_df[['debate', 'debater']].drop_duplicates()
    llm_winners_p1_ranked = pd.merge(all_debaters, llm_winners_p1_long, on=['debate', 'debater'], how='left')
    llm_winners_p1_ranked['model'] = llm_winners_p1_ranked.groupby(['debate'])['model'].ffill().bfill()
    llm_winners_p1_ranked['rank'] = np.where(llm_winners_p1_ranked['debater'].isin(llm_winners_p1_long['debater']), 1, 2)

    final_llm_rankings = pd.concat([
        llm_winners_p1_ranked[['prompt', 'debate', 'model', 'debater', 'rank']], 
        llm_ranking_p234[['prompt', 'debate', 'model', 'debater', 'rank']]
    ])
    print(">>> Rankings dos LLMs preparados.")
except Exception as e:
    print(f"Ocorreu um erro na preparação dos rankings dos LLMs: {e}")

# --- 3. FUNÇÕES PARA CÁLCULO DAS MÉTRICAS ---

def calculate_winners_accuracy(predictions, ground_truth):
    pred_winners = predictions[predictions['rank'] == 1]
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(pred_winners, gt_winners[['debate', 'gt_debater']], on='debate')
    correct = (merged['debater'] == merged['gt_debater'])
    accuracy = correct.groupby([merged['prompt'], merged['model']]).mean().rename('winners_accuracy')
    return accuracy
    
# --- NOVA FUNÇÃO ADICIONADA AQUI ---
def calculate_debaters_accuracy(predictions, ground_truth):
    # Junta as predições e o gabarito
    merged = pd.merge(predictions, ground_truth, on=['debate', 'debater'], suffixes=('_pred', '_gt'))
    
    # Conta onde o rank previsto é igual ao rank real
    correct_ranks = (merged['rank_pred'] == merged['rank_gt'])
    
    # A acurácia é a proporção de acertos em relação ao total de debatedores no gabarito
    accuracy_by_group = correct_ranks.groupby([merged['prompt'], merged['model']]).sum()
    total_debaters_by_group = merged.groupby(['prompt', 'model'])['debater'].nunique()
    
    # Para calcular a acurácia geral, precisamos do total de comparações possíveis
    # Total de debatedores no ground truth
    total_debaters_gt = len(ground_truth)
    # Número de prompts e modelos
    num_prompts = predictions['prompt'].nunique()
    num_models = predictions['model'].nunique()
    
    # Acurácia total
    total_correct = correct_ranks.sum()
    # Acurácia por grupo
    accuracy = accuracy_by_group / total_debaters_by_group

    return accuracy.rename('debaters_accuracy')


def calculate_mrr(predictions, ground_truth):
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(predictions, gt_winners[['debate', 'gt_debater']], on='debate', how='left')
    correct_predictions = merged[merged['debater'] == merged['gt_debater']]
    first_correct_rank = correct_predictions.groupby(['prompt', 'debate', 'model'])['rank'].min()
    mrr = (1 / first_correct_rank).groupby(['prompt', 'model']).mean().rename('mrr')
    return mrr

def calculate_ndcg(predictions, ground_truth):
    ground_truth['relevance'] = 1 / ground_truth['rank']
    merged = pd.merge(predictions, ground_truth[['debate', 'debater', 'relevance']], on=['debate', 'debater'], how='left').fillna(0)
    
    results = {}
    for name, group in merged.groupby(['prompt', 'model', 'debate']):
        prompt, model, debate = name
        if len(group) < 2: continue
        
        true_relevance = np.asarray([group.sort_values(by='relevance', ascending=False)['relevance'].values])
        predicted_relevance = np.asarray([group.sort_values(by='rank')['relevance'].values])
        
        ndcg = ndcg_score(true_relevance, predicted_relevance)
        if (prompt, model) not in results: results[(prompt, model)] = []
        results[(prompt, model)].append(ndcg)
        
    final_ndcg = {k: np.mean(v) for k, v in results.items()}
    return pd.Series(final_ndcg, name='ndcg').rename_axis(['prompt', 'model'])

# --- 4. EXECUÇÃO DAS ANÁLISES ---
print("\n[ETAPA 3/4] Calculando as métricas finais...")

ground_truths = {
    "vs_SelfAssessment": self_assessment_ranking_df,
    "vs_Judges_HardVote": judges_hard_vote_ranking_df,
    "vs_Judges_SoftVote": judges_soft_vote_ranking_df
}
final_results_list = []

for gt_name, gt_df in ground_truths.items():
    common_debates = gt_df['debate'].unique()
    predictions_filtered = final_llm_rankings[final_llm_rankings['debate'].isin(common_debates)]
        
    acc = calculate_winners_accuracy(predictions_filtered, gt_df)
    # --- CHAMADA DA NOVA FUNÇÃO ---
    debaters_acc = calculate_debaters_accuracy(predictions_filtered, gt_df)
    mrr = calculate_mrr(predictions_filtered, gt_df)
    ndcg = calculate_ndcg(predictions_filtered, gt_df)
    
    # --- ADIÇÃO DA NOVA MÉTRICA AO RESULTADO ---
    result_df = pd.concat([acc, debaters_acc, mrr, ndcg], axis=1)
    result_df['comparison'] = gt_name
    final_results_list.append(result_df)

final_summary_df = pd.concat(final_results_list).reset_index()
final_summary_df = final_summary_df.set_index(['comparison', 'prompt', 'model']).sort_index()

print("\n[ETAPA 4/4] Tabela de Resultados Finais:")
display(final_summary_df)

final_summary_df.to_csv('final_summary_results.csv')
print("\nTabela de resultados finais salva em 'final_summary_results.csv'")

--- Iniciando a Análise Final e Cálculo de Métricas (v2 - Completa) ---

[ETAPA 1/5] Carregando todos os dataframes...
>>> Dados carregados com sucesso.

[ETAPA 2/5] Preparando e padronizando os rankings...
>>> Rankings dos LLMs preparados.

[ETAPA 3/4] Calculando as métricas finais...


  llm_winners_p1_ranked['model'] = llm_winners_p1_ranked.groupby(['debate'])['model'].ffill().bfill()



[ETAPA 4/4] Tabela de Resultados Finais:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,winners_accuracy,debaters_accuracy,mrr,ndcg
comparison,prompt,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
vs_Judges_HardVote,0.0,0,,,,0.982091
vs_Judges_HardVote,1.0,claude,0.555556,5.6,0.791667,0.925408
vs_Judges_HardVote,1.0,gemini,0.705882,6.2,0.848958,0.95368
vs_Judges_HardVote,1.0,gpt4o,0.444444,5.2,0.689583,0.908007
vs_Judges_HardVote,2.0,claude,0.588235,5.6,0.802083,0.951635
vs_Judges_HardVote,2.0,gemini,0.529412,5.4,0.734375,0.929081
vs_Judges_HardVote,2.0,gpt4o,0.529412,5.6,0.734375,0.931477
vs_Judges_HardVote,4.0,claude,0.444444,4.2,0.794872,0.752186
vs_Judges_HardVote,4.0,gemini,0.588235,4.8,0.811111,0.857898
vs_Judges_HardVote,4.0,gpt4o,0.555556,6.2,0.765625,0.917397



Tabela de resultados finais salva em 'final_summary_results.csv'


In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score

print("--- Iniciando a Análise Final e Cálculo de Métricas (v3 - Debaters Accuracy Corrigido) ---")

# --- 1. CARREGAMENTO DE TODOS OS DADOS PREPARADOS ---
print("\n[ETAPA 1/5] Carregando todos os dataframes...")
try:
    llm_scores_df = pd.read_csv('aggregated_results.csv')
    llm_winners_p1_df = pd.read_csv('aggregated_results_prompt_1_winners.csv')
    self_assessment_ranking_df = pd.read_csv('gt_self_assessment_ranking.csv')
    judges_hard_vote_ranking_df = pd.read_csv('gt_judges_hard_vote_ranking.csv')
    judges_soft_vote_ranking_df = pd.read_csv('gt_judges_soft_vote_ranking.csv')
    print(">>> Dados carregados com sucesso.")
except Exception as e:
    print(f"ERRO: Arquivo não encontrado. Verifique se todos os arquivos .csv estão na pasta. Detalhe: {e}")

# --- 2. PREPARAÇÃO E PADRONIZAÇÃO DOS RANKINGS ---
print("\n[ETAPA 2/5] Preparando e padronizando os rankings...")
try:
    def standardize_debater_name(df, col_name='debater'):
        if col_name in df.columns:
            df[col_name] = df[col_name].astype(str).str.upper().str.replace(' ', '_').str.replace('DEBATER_', 'DEBATER_')
        return df

    llm_scores_df = standardize_debater_name(llm_scores_df)

    id_vars = ['debate']
    value_vars = [col for col in llm_winners_p1_df.columns if col != 'debate']
    llm_winners_p1_long = llm_winners_p1_df.melt(id_vars=id_vars, value_vars=value_vars, var_name='model', value_name='debater')
    llm_winners_p1_long = standardize_debater_name(llm_winners_p1_long)
    llm_winners_p1_df = llm_winners_p1_long.pivot(index='debate', columns='model', values='debater').reset_index()

    self_assessment_ranking_df = standardize_debater_name(self_assessment_ranking_df)
    judges_hard_vote_ranking_df = standardize_debater_name(judges_hard_vote_ranking_df)
    judges_soft_vote_ranking_df = standardize_debater_name(judges_soft_vote_ranking_df)

    llm_total_scores = llm_scores_df.groupby(['prompt', 'debate', 'model', 'debater'])['score'].sum().reset_index()
    llm_total_scores['rank'] = llm_total_scores.groupby(['prompt', 'debate', 'model'])['score'].rank(method='dense', ascending=False).astype(int)
    llm_ranking_p234 = llm_total_scores.sort_values(by=['prompt', 'debate', 'model', 'rank'])

    llm_winners_p1_long = llm_winners_p1_df.melt(id_vars='debate', var_name='model', value_name='debater')
    llm_winners_p1_long['prompt'] = 1
    
    all_debaters = judges_soft_vote_ranking_df[['debate', 'debater']].drop_duplicates()
    llm_winners_p1_ranked = pd.merge(all_debaters, llm_winners_p1_long, on=['debate', 'debater'], how='left')
    llm_winners_p1_ranked['model'] = llm_winners_p1_ranked.groupby(['debate'])['model'].ffill().bfill()
    llm_winners_p1_ranked['rank'] = np.where(llm_winners_p1_ranked['model'].notna(), 1, 2)
    
    final_llm_rankings = pd.concat([llm_winners_p1_ranked, llm_ranking_p234])
    print(">>> Rankings dos LLMs preparados.")
except Exception as e:
    print(f"Ocorreu um erro na preparação dos rankings dos LLMs: {e}")

# --- 3. FUNÇÕES PARA CÁLCULO DAS MÉTRICAS ---

def calculate_winners_accuracy(predictions, ground_truth):
    pred_winners = predictions[predictions['rank'] == 1]
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(pred_winners, gt_winners[['debate', 'gt_debater']], on='debate')
    correct = (merged['debater'] == merged['gt_debater'])
    accuracy = correct.groupby([merged['prompt'], merged['model']]).mean().rename('winners_accuracy')
    return accuracy
    
# --- FUNÇÃO CORRIGIDA ---
def calculate_debaters_accuracy(predictions, ground_truth):
    # Junta as predições e o gabarito
    merged = pd.merge(predictions, ground_truth, on=['debate', 'debater'], suffixes=('_pred', '_gt'))
    
    # Uma predição de rank está correta se o rank previsto for igual ao rank real
    merged['is_correct'] = (merged['rank_pred'] == merged['rank_gt'])
    
    # A acurácia é a média da coluna 'is_correct' (True=1, False=0), agrupada por teste
    accuracy = merged.groupby(['prompt', 'model'])['is_correct'].mean()
    
    return accuracy.rename('debaters_accuracy')

def calculate_mrr(predictions, ground_truth):
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(predictions, gt_winners[['debate', 'gt_debater']], on='debate', how='left')
    correct_predictions = merged[merged['debater'] == merged['gt_debater']]
    first_correct_rank = correct_predictions.groupby(['prompt', 'debate', 'model'])['rank'].min()
    mrr = (1 / first_correct_rank).groupby(['prompt', 'model']).mean().rename('mrr')
    return mrr

def calculate_ndcg(predictions, ground_truth):
    ground_truth['relevance'] = 1 / ground_truth['rank']
    merged = pd.merge(predictions, ground_truth[['debate', 'debater', 'relevance']], on=['debate', 'debater'], how='left').fillna(0)
    
    results = {}
    for name, group in merged.groupby(['prompt', 'model', 'debate']):
        prompt, model, debate = name
        if len(group) < 2: continue
        
        true_relevance = np.asarray([group.sort_values(by='relevance', ascending=False)['relevance'].values])
        predicted_relevance = np.asarray([group.sort_values(by='rank')['relevance'].values])
        
        ndcg = ndcg_score(true_relevance, predicted_relevance)
        if (prompt, model) not in results: results[(prompt, model)] = []
        results[(prompt, model)].append(ndcg)
        
    final_ndcg = {k: np.mean(v) for k, v in results.items()}
    return pd.Series(final_ndcg, name='ndcg').rename_axis(['prompt', 'model'])

# --- 4. EXECUÇÃO DAS ANÁLISES ---
print("\n[ETAPA 3/4] Calculando as métricas finais...")

ground_truths = {
    "vs_SelfAssessment": self_assessment_ranking_df,
    "vs_Judges_HardVote": judges_hard_vote_ranking_df,
    "vs_Judges_SoftVote": judges_soft_vote_ranking_df
}
final_results_list = []

for gt_name, gt_df in ground_truths.items():
    common_debates = gt_df['debate'].unique()
    predictions_filtered = final_llm_rankings[final_llm_rankings['debate'].isin(common_debates)]
        
    acc = calculate_winners_accuracy(predictions_filtered, gt_df)
    debaters_acc = calculate_debaters_accuracy(predictions_filtered, gt_df)
    mrr = calculate_mrr(predictions_filtered, gt_df)
    ndcg = calculate_ndcg(predictions_filtered, gt_df)
    
    result_df = pd.concat([acc, debaters_acc, mrr, ndcg], axis=1)
    result_df['comparison'] = gt_name
    final_results_list.append(result_df)

final_summary_df = pd.concat(final_results_list).reset_index()
final_summary_df = final_summary_df.set_index(['comparison', 'prompt', 'model']).sort_index()

# --- 5. EXIBIÇÃO E SALVAMENTO ---
print("\n[ETAPA 4/4] Tabela de Resultados Finais:")
display(final_summary_df)

final_summary_df.to_csv('final_summary_results.csv')
print("\nTabela de resultados finais salva em 'final_summary_results.csv'")

--- Iniciando a Análise Final e Cálculo de Métricas (v3 - Debaters Accuracy Corrigido) ---

[ETAPA 1/5] Carregando todos os dataframes...
>>> Dados carregados com sucesso.

[ETAPA 2/5] Preparando e padronizando os rankings...
>>> Rankings dos LLMs preparados.

[ETAPA 3/4] Calculando as métricas finais...


  llm_winners_p1_ranked['model'] = llm_winners_p1_ranked.groupby(['debate'])['model'].ffill().bfill()



[ETAPA 4/4] Tabela de Resultados Finais:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,winners_accuracy,debaters_accuracy,mrr,ndcg
comparison,prompt,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
vs_Judges_HardVote,0.0,0,,,,0.982091
vs_Judges_HardVote,1.0,claude,0.555556,0.411765,0.791667,0.925408
vs_Judges_HardVote,1.0,gemini,0.705882,0.455882,0.848958,0.95368
vs_Judges_HardVote,1.0,gpt4o,0.444444,0.382353,0.689583,0.908007
vs_Judges_HardVote,2.0,claude,0.588235,0.411765,0.802083,0.951635
vs_Judges_HardVote,2.0,gemini,0.529412,0.397059,0.734375,0.929081
vs_Judges_HardVote,2.0,gpt4o,0.529412,0.411765,0.734375,0.931477
vs_Judges_HardVote,4.0,claude,0.444444,0.375,0.794872,0.752186
vs_Judges_HardVote,4.0,gemini,0.588235,0.375,0.811111,0.857898
vs_Judges_HardVote,4.0,gpt4o,0.555556,0.455882,0.765625,0.917397



Tabela de resultados finais salva em 'final_summary_results.csv'


In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score
from scipy.stats import kendalltau, spearmanr # <-- Novo import

print("--- Iniciando a Análise Final e Cálculo de Métricas (v4 - com Kendall e Spearman) ---")

# --- 1. CARREGAMENTO E PREPARAÇÃO DOS DADOS ---
# (Esta parte assume que os CSVs já foram carregados e os DFs preparados como na célula anterior)
print("\n[ETAPA 1/5] Carregando e preparando todos os dataframes...")
# ... (código das seções 1 e 2 da resposta anterior para carregar e preparar os DFs) ...
# Vou omitir o código completo para ser breve, mas ele deve estar aqui.
# Certifique-se que os DFs: final_llm_rankings, self_assessment_ranking_df, 
# judges_hard_vote_ranking_df, e judges_soft_vote_ranking_df estão carregados.
print(">>> Dados carregados e preparados com sucesso.")


# --- 2. FUNÇÕES PARA CÁLCULO DAS MÉTRICAS (COM ADIÇÕES) ---

def calculate_winners_accuracy(predictions, ground_truth):
    pred_winners = predictions[predictions['rank'] == 1]
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(pred_winners, gt_winners[['debate', 'gt_debater']], on='debate')
    correct = (merged['debater'] == merged['gt_debater'])
    accuracy = correct.groupby([merged['prompt'], merged['model']]).mean().rename('winners_accuracy')
    return accuracy
    
def calculate_debaters_accuracy(predictions, ground_truth):
    merged = pd.merge(predictions, ground_truth, on=['debate', 'debater'], suffixes=('_pred', '_gt'))
    merged['is_correct'] = (merged['rank_pred'] == merged['rank_gt'])
    accuracy = merged.groupby(['prompt', 'model'])['is_correct'].mean()
    return accuracy.rename('debaters_accuracy')

def calculate_mrr(predictions, ground_truth):
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(predictions, gt_winners[['debate', 'gt_debater']], on='debate', how='left')
    correct_predictions = merged[merged['debater'] == merged['gt_debater']]
    first_correct_rank = correct_predictions.groupby(['prompt', 'debate', 'model'])['rank'].min()
    mrr = (1 / first_correct_rank).groupby(['prompt', 'model']).mean().rename('mrr')
    return mrr

def calculate_ndcg(predictions, ground_truth):
    ground_truth['relevance'] = 1 / ground_truth['rank']
    merged = pd.merge(predictions, ground_truth[['debate', 'debater', 'relevance']], on=['debate', 'debater'], how='left').fillna(0)
    results = {}
    for name, group in merged.groupby(['prompt', 'model', 'debate']):
        prompt, model, debate = name
        if len(group) < 2: continue
        true_relevance = np.asarray([group.sort_values(by='relevance', ascending=False)['relevance'].values])
        predicted_relevance = np.asarray([group.sort_values(by='rank')['relevance'].values])
        ndcg = ndcg_score(true_relevance, predicted_relevance)
        if (prompt, model) not in results: results[(prompt, model)] = []
        results[(prompt, model)].append(ndcg)
    final_ndcg = {k: np.mean(v) for k, v in results.items()}
    return pd.Series(final_ndcg, name='ndcg').rename_axis(['prompt', 'model'])

# --- NOVAS FUNÇÕES ADICIONADAS AQUI ---
def calculate_rank_correlation(predictions, ground_truth, method='kendall'):
    """Calcula a correlação de rank média (Kendall's Tau ou Spearman's Rho)."""
    # Garante que os dataframes estejam alinhados
    merged = pd.merge(predictions, ground_truth, on=['debate', 'debater'], suffixes=('_pred', '_gt'))
    
    correlations = []
    # Itera sobre cada teste (prompt, debate, model)
    for name, group in merged.groupby(['prompt', 'model', 'debate']):
        # Garante que as listas de rank tenham o mesmo tamanho
        if len(group['rank_pred']) == len(group['rank_gt']) and len(group) > 1:
            if method == 'kendall':
                corr, _ = kendalltau(group['rank_pred'], group['rank_gt'])
            elif method == 'spearman':
                corr, _ = spearmanr(group['rank_pred'], group['rank_gt'])
            else:
                corr = np.nan
            correlations.append({'prompt': name[0], 'model': name[1], 'correlation': corr})
    
    # Calcula a média da correlação para cada (prompt, model)
    corr_df = pd.DataFrame(correlations)
    avg_corr = corr_df.groupby(['prompt', 'model'])['correlation'].mean()
    return avg_corr

# --- 4. EXECUÇÃO DAS ANÁLISES ---
print("\n[ETAPA 3/5] Calculando as métricas finais (incluindo Kendall e Spearman)...")

ground_truths = {
    "vs_SelfAssessment": self_assessment_ranking_df,
    "vs_Judges_HardVote": judges_hard_vote_ranking_df,
    "vs_Judges_SoftVote": judges_soft_vote_ranking_df
}
final_results_list = []

for gt_name, gt_df in ground_truths.items():
    common_debates = gt_df['debate'].unique()
    predictions_filtered = final_llm_rankings[final_llm_rankings['debate'].isin(common_debates)]
        
    acc = calculate_winners_accuracy(predictions_filtered, gt_df)
    debaters_acc = calculate_debaters_accuracy(predictions_filtered, gt_df)
    mrr = calculate_mrr(predictions_filtered, gt_df)
    ndcg = calculate_ndcg(predictions_filtered, gt_df)
    # --- CHAMADA DAS NOVAS FUNÇÕES ---
    kendall = calculate_rank_correlation(predictions_filtered, gt_df, method='kendall').rename('kendall_tau')
    spearman = calculate_rank_correlation(predictions_filtered, gt_df, method='spearman').rename('spearman_rho')
    
    # --- ADIÇÃO DAS NOVAS MÉTRICAS AO RESULTADO ---
    result_df = pd.concat([acc, debaters_acc, mrr, ndcg, kendall, spearman], axis=1)
    result_df['comparison'] = gt_name
    final_results_list.append(result_df)

final_summary_df = pd.concat(final_results_list).reset_index()
final_summary_df = final_summary_df.set_index(['comparison', 'prompt', 'model']).sort_index()

# --- 5. EXIBIÇÃO E SALVAMENTO ---
print("\n[ETAPA 4/5] Tabela de Resultados Finais:")
display(final_summary_df)

final_summary_df.to_csv('final_summary_results_full.csv')
print("\nTabela de resultados finais salva em 'final_summary_results_full.csv'")

--- Iniciando a Análise Final e Cálculo de Métricas (v4 - com Kendall e Spearman) ---

[ETAPA 1/5] Carregando e preparando todos os dataframes...
>>> Dados carregados e preparados com sucesso.

[ETAPA 3/5] Calculando as métricas finais (incluindo Kendall e Spearman)...


  corr, _ = spearmanr(group['rank_pred'], group['rank_gt'])



[ETAPA 4/5] Tabela de Resultados Finais:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,winners_accuracy,debaters_accuracy,mrr,ndcg,kendall_tau,spearman_rho
comparison,prompt,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
vs_Judges_HardVote,0.0,0,,,,0.982091,,
vs_Judges_HardVote,1.0,claude,0.555556,0.411765,0.791667,0.925408,0.592781,0.674839
vs_Judges_HardVote,1.0,gemini,0.705882,0.455882,0.848958,0.95368,0.630921,0.680794
vs_Judges_HardVote,1.0,gpt4o,0.444444,0.382353,0.689583,0.908007,0.336921,0.365731
vs_Judges_HardVote,2.0,claude,0.588235,0.411765,0.802083,0.951635,0.618296,0.715615
vs_Judges_HardVote,2.0,gemini,0.529412,0.397059,0.734375,0.929081,0.50094,0.570866
vs_Judges_HardVote,2.0,gpt4o,0.529412,0.411765,0.734375,0.931477,0.57663,0.635609
vs_Judges_HardVote,4.0,claude,0.444444,0.375,0.794872,0.752186,0.505294,0.60839
vs_Judges_HardVote,4.0,gemini,0.588235,0.375,0.811111,0.857898,0.688715,0.764547
vs_Judges_HardVote,4.0,gpt4o,0.555556,0.455882,0.765625,0.917397,0.434516,0.488749



Tabela de resultados finais salva em 'final_summary_results_full.csv'


In [3]:
import os
import json
import pandas as pd
import numpy as np
from collections import Counter
import spacy
from tqdm.notebook import tqdm
from sklearn.metrics import ndcg_score
from scipy.stats import kendalltau, spearmanr

print("--- INICIANDO ANÁLISE COMPLETA E DEFINITIVA ---")

# --- 1. CONFIGURAÇÃO ---
OUTPUTS_DIR = 'outputs'
SELF_ASSESSMENT_PATH = 'gt_self_assessment_ranking.csv'
JUDGES_HARD_VOTE_PATH = 'gt_judges_hard_vote_ranking.csv'
JUDGES_SOFT_VOTE_PATH = 'gt_judges_soft_vote_ranking.csv'

# --- 2. CARREGAMENTO E PROCESSAMENTO DOS DADOS BRUTOS DOS LLMs ---
print("\n[ETAPA 1/6] Lendo e processando os arquivos JSON dos LLMs...")
all_runs_data = []
try:
    for root, dirs, files in os.walk(OUTPUTS_DIR):
        for filename in files:
            if filename.endswith('.json'):
                file_path = os.path.join(root, filename)
                try:
                    parts = file_path.split(os.sep)
                    prompt_num = int(parts[1].split('_')[1])
                    debate_num = int(parts[2].split('_')[1])
                    model_name = filename.split('_')[0]
                    run_num = int(filename.split('_')[2].split('.')[0])
                except (ValueError, IndexError): continue

                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                if "error" in data: continue

                for debater in data.get('debaters', []):
                    debater_name = debater.get('name')
                    performance_analysis = debater.get('performance', {}).get('performance_analysis', debater.get('performance_evaluation', ''))
                    
                    if prompt_num == 1:
                        score = debater.get('overall_score')
                        if score is not None:
                            all_runs_data.append({'prompt': 1, 'debate': debate_num, 'model': model_name, 'run': run_num, 'debater': debater_name, 'criterion': 'overall_score', 'score': score, 'analysis_text': performance_analysis})
                    elif prompt_num == 3:
                        positive_events = debater.get('positive_events', {})
                        negative_events = debater.get('negative_events', {})
                        score = sum(len(v) for v in positive_events.values()) - sum(len(v) for v in negative_events.values())
                        all_runs_data.append({'prompt': 3, 'debate': debate_num, 'model': model_name, 'run': run_num, 'debater': debater_name, 'criterion': 'total_event_score', 'score': score, 'analysis_text': performance_analysis})
                    else: # Prompts 2 e 4
                        scores_data = debater.get('scores', {})
                        if not scores_data: scores_data = {k: v.get('score') for k, v in debater.get('evaluation_aspects', {}).items()}
                        for criterion, score in scores_data.items():
                            all_runs_data.append({'prompt': prompt_num, 'debate': debate_num, 'model': model_name, 'run': run_num, 'debater': debater_name, 'criterion': criterion, 'score': score, 'analysis_text': performance_analysis})
    
    raw_df = pd.DataFrame(all_runs_data)
    print(f">>> Leitura de {len(raw_df)} registros brutos concluída.")
except Exception as e:
    print(f"ERRO na leitura dos JSONs: {e}")


# --- 3. CARREGAMENTO DOS DADOS DE GROUND TRUTH ---
print("\n[ETAPA 2/6] Carregando arquivos de Ground Truth...")
try:
    self_assessment_ranking_df = pd.read_csv(SELF_ASSESSMENT_PATH)
    judges_hard_vote_ranking_df = pd.read_csv(JUDGES_HARD_VOTE_PATH)
    judges_soft_vote_ranking_df = pd.read_csv(JUDGES_SOFT_VOTE_PATH)
    print(">>> Dados de Ground Truth carregados com sucesso.")
except Exception as e:
    print(f"ERRO ao carregar arquivos de Ground Truth: {e}")

# --- 4. PREPARAÇÃO E PADRONIZAÇÃO FINAL DOS RANKINGS ---
print("\n[ETAPA 3/6] Preparando e padronizando rankings...")
try:
    def standardize_debater_name(df, col_name='debater'):
        if col_name in df.columns:
            df[col_name] = df[col_name].astype(str).str.upper().str.replace(' ', '_').str.replace('DEBATER_', 'DEBATER_')
        return df

    raw_df = standardize_debater_name(raw_df)
    self_assessment_ranking_df = standardize_debater_name(self_assessment_ranking_df)
    judges_hard_vote_ranking_df = standardize_debater_name(judges_hard_vote_ranking_df)
    judges_soft_vote_ranking_df = standardize_debater_name(judges_soft_vote_ranking_df)

    agg_df = raw_df.groupby(['prompt', 'debate', 'model', 'debater', 'criterion'])['score'].mean().reset_index()
    total_scores = agg_df.groupby(['prompt', 'debate', 'model', 'debater'])['score'].sum().reset_index()
    total_scores['rank'] = total_scores.groupby(['prompt', 'debate', 'model'])['score'].rank(method='dense', ascending=False).astype(int)
    final_llm_rankings = total_scores.sort_values(by=['prompt', 'debate', 'model', 'rank'])
    print(">>> Rankings dos LLMs e Ground Truth preparados e padronizados.")
except Exception as e:
    print(f"Ocorreu um erro na preparação dos rankings: {e}")

# --- 5. FUNÇÕES DE MÉTRICA ---
def calculate_winners_accuracy(predictions, ground_truth):
    pred_winners = predictions[predictions['rank'] == 1]
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(pred_winners, gt_winners[['debate', 'gt_debater']], on='debate')
    correct = (merged['debater'] == merged['gt_debater'])
    return correct.groupby([merged['prompt'], merged['model']]).mean().rename('winners_accuracy')

def calculate_debaters_accuracy(predictions, ground_truth):
    merged = pd.merge(predictions, ground_truth, on=['debate', 'debater'], suffixes=('_pred', '_gt'))
    merged['is_correct'] = (merged['rank_pred'] == merged['rank_gt'])
    return merged.groupby(['prompt', 'model'])['is_correct'].mean().rename('debaters_accuracy')

def calculate_mrr(predictions, ground_truth):
    gt_winners = ground_truth[ground_truth['rank'] == 1].rename(columns={'debater': 'gt_debater'})
    merged = pd.merge(predictions, gt_winners[['debate', 'gt_debater']], on='debate', how='left')
    correct_predictions = merged[merged['debater'] == merged['gt_debater']]
    first_correct_rank = correct_predictions.groupby(['prompt', 'debate', 'model'])['rank'].min()
    return (1 / first_correct_rank).groupby(['prompt', 'model']).mean().rename('mrr')

def calculate_ndcg(predictions, ground_truth):
    ground_truth['relevance'] = 1 / ground_truth['rank']
    merged = pd.merge(predictions, ground_truth[['debate', 'debater', 'relevance']], on=['debate', 'debater'], how='left').fillna(0)
    results = {}
    for name, group in merged.groupby(['prompt', 'model', 'debate']):
        prompt, model, debate = name
        if len(group) < 2: continue
        true_relevance = np.asarray([group.sort_values(by='relevance', ascending=False)['relevance'].values])
        predicted_relevance = np.asarray([group.sort_values(by='rank')['relevance'].values])
        ndcg = ndcg_score(true_relevance, predicted_relevance)
        if (prompt, model) not in results: results[(prompt, model)] = []
        results[(prompt, model)].append(ndcg)
    final_ndcg = {k: np.mean(v) for k, v in results.items()}
    return pd.Series(final_ndcg, name='ndcg').rename_axis(['prompt', 'model'])

def calculate_rank_correlation(predictions, ground_truth, method='kendall'):
    merged = pd.merge(predictions, ground_truth, on=['debate', 'debater'], suffixes=('_pred', '_gt'))
    correlations = []
    for name, group in merged.groupby(['prompt', 'model', 'debate']):
        if len(group['rank_pred']) == len(group['rank_gt']) and len(group) > 1:
            if method == 'kendall': corr, _ = kendalltau(group['rank_pred'], group['rank_gt'])
            elif method == 'spearman': corr, _ = spearmanr(group['rank_pred'], group['rank_gt'])
            else: corr = np.nan
            correlations.append({'prompt': name[0], 'model': name[1], 'correlation': corr})
    corr_df = pd.DataFrame(correlations)
    return corr_df.groupby(['prompt', 'model'])['correlation'].mean()


# --- 6. EXECUÇÃO FINAL E EXIBIÇÃO ---
print("\n[ETAPA 4/6] Calculando as métricas finais...")
ground_truths = {
    "vs_SelfAssessment": self_assessment_ranking_df,
    "vs_Judges_HardVote": judges_hard_vote_ranking_df,
    "vs_Judges_SoftVote": judges_soft_vote_ranking_df
}
final_results_list = []

for gt_name, gt_df in ground_truths.items():
    common_debates = gt_df['debate'].unique()
    predictions_filtered = final_llm_rankings[final_llm_rankings['debate'].isin(common_debates)]
    
    acc = calculate_winners_accuracy(predictions_filtered, gt_df)
    debaters_acc = calculate_debaters_accuracy(predictions_filtered, gt_df)
    mrr = calculate_mrr(predictions_filtered, gt_df)
    ndcg = calculate_ndcg(predictions_filtered, gt_df)
    kendall = calculate_rank_correlation(predictions_filtered, gt_df, method='kendall').rename('kendall_tau')
    spearman = calculate_rank_correlation(predictions_filtered, gt_df, method='spearman').rename('spearman_rho')
    
    result_df = pd.concat([acc, debaters_acc, mrr, ndcg, kendall, spearman], axis=1)
    result_df['comparison'] = gt_name
    final_results_list.append(result_df)

final_summary_df = pd.concat(final_results_list).reset_index()
final_summary_df = final_summary_df.set_index(['comparison', 'prompt', 'model']).sort_index()

print("\n[ETAPA 5/6] Tabela de Resultados Finais:")
display(final_summary_df)

print("\n[ETAPA 6/6] Salvando tabela de resultados...")
final_summary_df.to_csv('final_summary_results_full.csv')
print("\nTabela de resultados finais salva em 'final_summary_results_full.csv'")
print("\n--- Análise concluída! ---")

--- INICIANDO ANÁLISE COMPLETA E DEFINITIVA ---

[ETAPA 1/6] Lendo e processando os arquivos JSON dos LLMs...
>>> Leitura de 12259 registros brutos concluída.

[ETAPA 2/6] Carregando arquivos de Ground Truth...
>>> Dados de Ground Truth carregados com sucesso.

[ETAPA 3/6] Preparando e padronizando rankings...
>>> Rankings dos LLMs e Ground Truth preparados e padronizados.

[ETAPA 4/6] Calculando as métricas finais...


  elif method == 'spearman': corr, _ = spearmanr(group['rank_pred'], group['rank_gt'])



[ETAPA 5/6] Tabela de Resultados Finais:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,winners_accuracy,debaters_accuracy,mrr,ndcg,kendall_tau,spearman_rho
comparison,prompt,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
vs_Judges_HardVote,1,claude,0.555556,0.411765,0.791667,0.925408,0.592781,0.674839
vs_Judges_HardVote,1,gemini,0.705882,0.455882,0.848958,0.95368,0.630921,0.680794
vs_Judges_HardVote,1,gpt4o,0.444444,0.382353,0.689583,0.908007,0.336921,0.365731
vs_Judges_HardVote,2,claude,0.588235,0.411765,0.802083,0.951635,0.618296,0.715615
vs_Judges_HardVote,2,gemini,0.529412,0.397059,0.734375,0.929081,0.50094,0.570866
vs_Judges_HardVote,2,gpt4o,0.529412,0.411765,0.734375,0.931477,0.57663,0.635609
vs_Judges_HardVote,3,claude,0.5,0.323529,0.75,0.919716,0.520181,0.598676
vs_Judges_HardVote,3,gemini,0.444444,0.355932,0.710714,0.781333,0.476835,0.528962
vs_Judges_HardVote,3,gpt4o,0.333333,0.279412,0.729167,0.870884,0.186197,0.200352
vs_Judges_HardVote,4,claude,0.444444,0.375,0.794872,0.752186,0.505294,0.60839



[ETAPA 6/6] Salvando tabela de resultados...

Tabela de resultados finais salva em 'final_summary_results_full.csv'

--- Análise concluída! ---
