# CÓDIGO PARA COMPARAÇÃO SELF ASSESSMENT X AVALIAÇÃO DOS JURADOS

## ACURACIA WINNERS

In [1]:
# Importing necessary libraries
import pandas as pd
import json

# Placeholder paths for JSON input and CSV outputs
input_json_path = "aggregated_debate_results.json"
soft_voting_csv_path = "soft_voting.csv"
hard_voting_csv_path = "hard_voting.csv"

# Load JSON data
with open(input_json_path, "r") as file:
    data = json.load(file)

# Prepare lists for each CSV
soft_voting_rows = []
hard_voting_rows = []

# Process the JSON data to extract rows for both CSVs
for debate in data:
    debate_id = debate["debate_id"]
    for entry in debate["soft_voting_ranking"]:
        soft_voting_rows.append(
            {"debate_id": debate_id, "position": entry["position"], "name": entry["name"], "score": entry["score"]}
        )
    for entry in debate["hard_voting_ranking"]:
        hard_voting_rows.append(
            {"debate_id": debate_id, "position": entry["position"], "name": entry["name"], "score": entry["score"]}
        )

# Create DataFrames
soft_voting_df = pd.DataFrame(soft_voting_rows)
hard_voting_df = pd.DataFrame(hard_voting_rows)

# Save to CSV files
soft_voting_df.to_csv(soft_voting_csv_path, index=False)
hard_voting_df.to_csv(hard_voting_csv_path, index=False)

print(f"Soft voting CSV saved to: {soft_voting_csv_path}")
print(f"Hard voting CSV saved to: {hard_voting_csv_path}")


Soft voting CSV saved to: soft_voting.csv
Hard voting CSV saved to: hard_voting.csv


In [2]:
import pandas as pd

def evaluate_predictions(ground_truth_path, predictions_path, excluded_debates, voting_type):
    """
    Avalia a precisão das predições com base nos vencedores previstos e reais.
    
    Args:
        ground_truth_path (str): Caminho para o arquivo CSV de ground truth (self-assessment).
        predictions_path (str): Caminho para o arquivo CSV das predições (hard ou soft voting).
        excluded_debates (list): Lista de IDs de debates a serem excluídos.
        voting_type (str): Tipo de votação ('hard' ou 'soft') para indicar o tipo de predictions.
    
    Returns:
        None: Exibe a precisão calculada e detalhes.
    """
    # Load datasets
    ground_truth_df = pd.read_csv(ground_truth_path)
    predictions_df = pd.read_csv(predictions_path)

    # Exclude specific debates
    ground_truth_df = ground_truth_df[~ground_truth_df['debate_id'].isin(excluded_debates)]
    predictions_df = predictions_df[~predictions_df['debate_id'].isin(excluded_debates)]

    # Rename columns to ensure consistency
    ground_truth_df = ground_truth_df.rename(columns={
        'debater_name': 'name',
        'debater_position': 'position'
    })

    # Merge datasets on debate_id
    merged_df = pd.merge(
        ground_truth_df[['debate_id', 'name', 'position']],
        predictions_df[['debate_id', 'name', 'position']],
        on='debate_id',
        suffixes=('_true', '_pred')
    )

    # Identify ground truth winners
    ground_truth_winners = merged_df[merged_df['position_true'] == 1]

    # Initialize list for accuracy calculations and details
    correct_predictions = []
    debate_details = []

    for debate_id, group in ground_truth_winners.groupby('debate_id'):
        # Get the names of the ground truth winners
        true_winners = set(group['name_true'])
        
        # Get the predicted winners for this debate_id
        predicted_winners = set(
            merged_df[
                (merged_df['debate_id'] == debate_id) & 
                (merged_df['position_pred'] == 1)
            ]['name_pred']
        )
        
        # Save the details for display
        debate_details.append({
            'debate_id': debate_id,
            'true_winners': true_winners,
            'predicted_winners': predicted_winners
        })
        
        # If there is an intersection between true and predicted winners, it's a correct prediction
        if true_winners.intersection(predicted_winners):
            correct_predictions.append(1)
        else:
            correct_predictions.append(0)

    # Calculate accuracy
    accuracy = sum(correct_predictions) / len(correct_predictions)
    print(f'Accuracy of winners ({voting_type} voting): {accuracy * 100:.2f}%')

    # Print the expected and predicted winners for each debate
    for detail in debate_details:
        print(f"Debate ID: {detail['debate_id']}")
        print(f"Expected winners: {', '.join(detail['true_winners'])}")
        print(f"Predicted winners: {', '.join(detail['predicted_winners'])}")
        print('-' * 40)

# File paths
hard_voting_path = 'hard_voting.csv'
soft_voting_path = 'soft_voting.csv'
self_assessment_path = 'rankings_self_assessment_.csv'

# Excluded debates
excluded_debates = [10, 17]

# Evaluate for hard voting
print("Evaluating Hard Voting:")
evaluate_predictions(self_assessment_path, hard_voting_path, excluded_debates, 'hard')

# Evaluate for soft voting
print("\nEvaluating Soft Voting:")
evaluate_predictions(self_assessment_path, soft_voting_path, excluded_debates, 'soft')




Evaluating Hard Voting:
Accuracy of winners (hard voting): 71.43%
Debate ID: 1
Expected winners: Debater 2
Predicted winners: Debater 2
----------------------------------------
Debate ID: 2
Expected winners: Debater 4, Debater 1
Predicted winners: Debater 5
----------------------------------------
Debate ID: 3
Expected winners: Debater 4
Predicted winners: Debater 3
----------------------------------------
Debate ID: 5
Expected winners: Debater 3
Predicted winners: Debater 5
----------------------------------------
Debate ID: 6
Expected winners: Debater 1
Predicted winners: Debater 1
----------------------------------------
Debate ID: 7
Expected winners: Debater 2
Predicted winners: Debater 2
----------------------------------------
Debate ID: 8
Expected winners: Debater 1
Predicted winners: Debater 1
----------------------------------------
Debate ID: 9
Expected winners: Debater 1
Predicted winners: Debater 4, Debater 1
----------------------------------------
Debate ID: 11
Expected w

## ACURACIA DEBATERS

In [3]:
import pandas as pd

def calculate_ranking_accuracy(ground_truth_df, predictions_df, excluded_debates=[10, 17]):
    """
    Calcula a acurácia do ranking comparando as previsões de rankings com os rankings reais (ground truth).

    Parameters:
    - ground_truth_df: DataFrame com os rankings reais (hard ou soft voting).
    - predictions_df: DataFrame com as previsões dos modelos (ex: self-assessment).
    - excluded_debates: Lista de debates a serem excluídos da análise (default: [10, 17]).

    Returns:
    - ranking_accuracy: Acurácia do ranking (em porcentagem).
    """

    # Filtra os debates a serem excluídos
    ground_truth_df = ground_truth_df[~ground_truth_df['debate_id'].isin(excluded_debates)]
    predictions_df = predictions_df[~predictions_df['debate_id'].isin(excluded_debates)]

    # Lista para armazenar as acurácias de ranking de cada debate
    correct_rankings = []
    debate_details = []

    # Variáveis para acumular o número total de acertos e debatedores
    total_correct = 0
    total_debaters = 0

    # Para cada debate, compara o ranking completo dos debatedores
    for debate_id in ground_truth_df['debate_id'].unique():
        # Filtra os debatedores do debate real e do modelo
        true_ranking = ground_truth_df[ground_truth_df['debate_id'] == debate_id][['name', 'position']]
        predicted_ranking = predictions_df[predictions_df['debate_id'] == debate_id][['debater_name', 'debater_position']]

        # Ordena os debatedores conforme suas posições no ranking (ground truth e previsão)
        true_ranking_sorted = true_ranking.sort_values(by='position').reset_index(drop=True)
        predicted_ranking_sorted = predicted_ranking.sort_values(by='debater_position').reset_index(drop=True)

        # Salva os detalhes do debate para exibição
        debate_details.append({
            'debate_id': debate_id,
            'true_ranking': true_ranking_sorted['name'].tolist(),
            'predicted_ranking': predicted_ranking_sorted['debater_name'].tolist()
        })

        # Inicializa o contador de acertos considerando empates
        correct_rankings_count = 0

        # Compara os rankings completos (considerando a ordem exata de posições)
        for i in range(len(true_ranking_sorted)):
            if true_ranking_sorted.iloc[i]['name'] == predicted_ranking_sorted.iloc[i]['debater_name']:
                correct_rankings_count += 1

        # Calcula a fração de debatedores classificados corretamente neste debate
        correct_rankings.append(correct_rankings_count / len(true_ranking_sorted))

        # Acumula os valores totais
        total_correct += correct_rankings_count
        total_debaters += len(true_ranking_sorted)

        # Exibe informações do debate
        print(f"Debate ID: {debate_id}")
        print(f"Ranking esperado: {', '.join(true_ranking_sorted['name'])}")
        print(f"Ranking previsto: {', '.join(predicted_ranking_sorted['debater_name'])}")
        print(f"{correct_rankings_count} acertos de {len(true_ranking_sorted)} debatedores.\n")

    # Calcula a acurácia média de ranking
    ranking_accuracy = sum(correct_rankings) / len(correct_rankings) if correct_rankings else 0

    # Exibe a acurácia do ranking
    print(f'Acurácia de ranking: {ranking_accuracy * 100:.2f}%')

    # Exibe o número total de acertos dividido pelo número total de debatedores
    print(f"Total de acertos: {total_correct} de {total_debaters} debatedores ({(total_correct / total_debaters) * 100:.2f}%)")

    return ranking_accuracy

# Função principal para avaliar hard e soft voting
def evaluate_voting(ground_truth_path, predictions_path, excluded_debates, voting_type):
    print(f"\n--- Avaliando {voting_type.capitalize()} Voting ---\n")

    # Carregar os datasets
    ground_truth_df = pd.read_csv(ground_truth_path)
    predictions_df = pd.read_csv(predictions_path)

    # Ajustar nomes das colunas para uniformizar
    ground_truth_df.rename(columns={'position': 'position', 'name': 'name'}, inplace=True)
    predictions_df.rename(columns={'debater_position': 'debater_position', 'debater_name': 'debater_name'}, inplace=True)

    # Calcular acurácia do ranking
    calculate_ranking_accuracy(ground_truth_df, predictions_df, excluded_debates)

# Caminhos para os arquivos
hard_voting_path = "hard_voting.csv"
soft_voting_path = "soft_voting.csv"
self_assessment_path = "rankings_self_assessment_.csv"

# Excluir debates
excluded_debates = [10, 17]

# Avaliar para hard voting
evaluate_voting(hard_voting_path, self_assessment_path, excluded_debates, 'hard')

# Avaliar para soft voting
evaluate_voting(soft_voting_path, self_assessment_path, excluded_debates, 'soft')



--- Avaliando Hard Voting ---

Debate ID: 1
Ranking esperado: Debater 2, Debater 3, Debater 1, Debater 4
Ranking previsto: Debater 2, Debater 4, Debater 1, Debater 3
2 acertos de 4 debatedores.

Debate ID: 2
Ranking esperado: Debater 5, Debater 4, Debater 1, Debater 2, Debater 3
Ranking previsto: Debater 1, Debater 4, Debater 5, Debater 2, Debater 3
3 acertos de 5 debatedores.

Debate ID: 3
Ranking esperado: Debater 3, Debater 4, Debater 1, Debater 2, Debater 5
Ranking previsto: Debater 4, Debater 3, Debater 1, Debater 2, Debater 5
3 acertos de 5 debatedores.

Debate ID: 5
Ranking esperado: Debater 5, Debater 3, Debater 1, Debater 2, Debater 4
Ranking previsto: Debater 3, Debater 5, Debater 1, Debater 2, Debater 4
3 acertos de 5 debatedores.

Debate ID: 6
Ranking esperado: Debater 1, Debater 2, Debater 3, Debater 4
Ranking previsto: Debater 1, Debater 2, Debater 3, Debater 4
4 acertos de 4 debatedores.

Debate ID: 7
Ranking esperado: Debater 2, Debater 3, Debater 1, Debater 4
Ranking 

## MRR

In [8]:
import pandas as pd

def calculate_mrr(hard_voting_df, soft_voting_df, self_assessment_df, excluded_debates=[10, 17]):
    """
    Calcula o Mean Reciprocal Rank (MRR) comparando as previsões de rankings com os rankings reais (ground truth).
    
    Parameters:
    - hard_voting_df: DataFrame com as previsões dos votos duros (hard voting).
    - soft_voting_df: DataFrame com as previsões dos votos suaves (soft voting).
    - self_assessment_df: DataFrame com as autoavaliações (previsões).
    - excluded_debates: Lista de debates a serem excluídos da análise (default: [10, 17]).
    
    Returns:
    - mrr_hard: Mean Reciprocal Rank (em decimal) para hard voting.
    - mrr_soft: Mean Reciprocal Rank (em decimal) para soft voting.
    """
    
    # Filtra os debates a serem excluídos
    hard_voting_df = hard_voting_df[~hard_voting_df['debate_id'].isin(excluded_debates)]
    soft_voting_df = soft_voting_df[~soft_voting_df['debate_id'].isin(excluded_debates)]
    self_assessment_df = self_assessment_df[~self_assessment_df['debate_id'].isin(excluded_debates)]

    # Lista para armazenar os Reciprocal Ranks de cada debate
    reciprocal_ranks_hard = []
    reciprocal_ranks_soft = []

    # Para cada debate, calcula o Reciprocal Rank para hard e soft voting
    for debate_id in self_assessment_df['debate_id'].unique():
        # Filtra os debatedores do debate real (hard e soft voting)
        true_ranking_hard = hard_voting_df[hard_voting_df['debate_id'] == debate_id][['name', 'position']]
        true_ranking_soft = soft_voting_df[soft_voting_df['debate_id'] == debate_id][['name', 'position']]
        
        # Filtra os rankings de predições (self-assessment)
        predicted_ranking = self_assessment_df[self_assessment_df['debate_id'] == debate_id][['debater_name', 'debater_position']]

        # Ordena os debatedores conforme suas posições no ranking (ground truth e previsões)
        true_ranking_hard_sorted = true_ranking_hard.sort_values(by='position').reset_index(drop=True)
        true_ranking_soft_sorted = true_ranking_soft.sort_values(by='position').reset_index(drop=True)
        predicted_ranking_sorted = predicted_ranking.sort_values(by='debater_position').reset_index(drop=True)

        # Lista de vencedores no ground truth (pode haver empate)
        true_winners_hard = set(true_ranking_hard_sorted[true_ranking_hard_sorted['position'] == 1]['name'])
        true_winners_soft = set(true_ranking_soft_sorted[true_ranking_soft_sorted['position'] == 1]['name'])

        # Calcula o Reciprocal Rank para hard voting
        rr_hard = 0
        for rank, debater in enumerate(predicted_ranking_sorted['debater_name'], start=1):
            if debater in true_winners_hard:
                rr_hard = 1 / rank
                break
        
        # Calcula o Reciprocal Rank para soft voting
        rr_soft = 0
        for rank, debater in enumerate(predicted_ranking_sorted['debater_name'], start=1):
            if debater in true_winners_soft:
                rr_soft = 1 / rank
                break
        
        # Armazena os RRs dos debates
        reciprocal_ranks_hard.append(rr_hard)
        reciprocal_ranks_soft.append(rr_soft)

        # Exibe informações do debate
        print(f"Debate ID: {debate_id}")
        print(f"Vencedores esperados (Hard Voting): {', '.join(true_winners_hard)}")
        print(f"Vencedores esperados (Soft Voting): {', '.join(true_winners_soft)}")
        print(f"Ranking previsto (Self Assessment): {', '.join(predicted_ranking_sorted['debater_name'])}")
        print(f"Reciprocal Rank (Hard Voting): {rr_hard:.3f}")
        print(f"Reciprocal Rank (Soft Voting): {rr_soft:.3f}\n")

    # Calcula o MRR para hard e soft voting
    mrr_hard = sum(reciprocal_ranks_hard) / len(reciprocal_ranks_hard) if reciprocal_ranks_hard else 0
    mrr_soft = sum(reciprocal_ranks_soft) / len(reciprocal_ranks_soft) if reciprocal_ranks_soft else 0
    
    # Exibe o MRR
    print(f"Mean Reciprocal Rank (MRR) - Hard Voting: {mrr_hard:.3f}")
    print(f"Mean Reciprocal Rank (MRR) - Soft Voting: {mrr_soft:.3f}")

    return mrr_hard, mrr_soft


In [9]:
import pandas as pd

# Carregar os arquivos CSV
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
hard_voting_df = pd.read_csv('hard_voting.csv')
soft_voting_df = pd.read_csv('soft_voting.csv')

# Chamar a função para calcular o MRR
mrr_hard, mrr_soft = calculate_mrr(hard_voting_df, soft_voting_df, self_assessment_df)

# Exibir os resultados
print(f"MRR - Hard Voting: {mrr_hard:.3f}")
print(f"MRR - Soft Voting: {mrr_soft:.3f}")


Debate ID: 1
Vencedores esperados (Hard Voting): Debater 2
Vencedores esperados (Soft Voting): Debater 2
Ranking previsto (Self Assessment): Debater 2, Debater 4, Debater 1, Debater 3
Reciprocal Rank (Hard Voting): 1.000
Reciprocal Rank (Soft Voting): 1.000

Debate ID: 2
Vencedores esperados (Hard Voting): Debater 5
Vencedores esperados (Soft Voting): Debater 5
Ranking previsto (Self Assessment): Debater 1, Debater 4, Debater 5, Debater 2, Debater 3
Reciprocal Rank (Hard Voting): 0.333
Reciprocal Rank (Soft Voting): 0.333

Debate ID: 3
Vencedores esperados (Hard Voting): Debater 3
Vencedores esperados (Soft Voting): Debater 3
Ranking previsto (Self Assessment): Debater 4, Debater 3, Debater 1, Debater 2, Debater 5
Reciprocal Rank (Hard Voting): 0.500
Reciprocal Rank (Soft Voting): 0.500

Debate ID: 5
Vencedores esperados (Hard Voting): Debater 5
Vencedores esperados (Soft Voting): Debater 5
Ranking previsto (Self Assessment): Debater 3, Debater 5, Debater 1, Debater 2, Debater 4
Recipr

## NDCG

In [2]:
from sklearn.metrics import ndcg_score
import pandas as pd
import numpy as np

def calculate_ndcg(ground_truth_df, predictions_df, k=None, exclude_debates=None):
    """
    Calcula o Mean nDCG para os rankings previstos, utilizando os scores do GT e posições previstas pelos modelos.

    Parameters:
    - ground_truth_df: DataFrame contendo os dados reais (ground truth).
      Deve conter as colunas: 'debate_id', 'position', 'name', 'score'.
    - predictions_df: DataFrame contendo as previsões de rankings.
      Deve conter as colunas: 'debate_id', 'debater_name', 'debater_position', 'debater_score'.
    - k: Inteiro opcional para limitar o cálculo ao top-k. Default: None.
    - exclude_debates: Lista de IDs de debates a serem excluídos. Default: None.

    Returns:
    - mean_ndcg: nDCG médio (float).
    """
    # Filtrar debates, se necessário
    if exclude_debates:
        ground_truth_df = ground_truth_df[~ground_truth_df['debate_id'].isin(exclude_debates)]
        predictions_df = predictions_df[~predictions_df['debate_id'].isin(exclude_debates)]

    ndcg_scores = []

    for debate_id in ground_truth_df['debate_id'].unique():
        ground_truth = ground_truth_df[ground_truth_df['debate_id'] == debate_id]
        predictions = predictions_df[predictions_df['debate_id'] == debate_id]

        # Garantir consistência
        ground_truth = ground_truth.sort_values(by='name').reset_index(drop=True)
        predictions = predictions.sort_values(by='debater_name').reset_index(drop=True)

        if len(ground_truth) != len(predictions):
            raise ValueError(f"Debate {debate_id} possui tamanhos inconsistentes entre ground truth e previsões.")

        # Relevâncias: usar os scores do GT e ordenar conforme as posições previstas
        true_relevance = ground_truth['score'].values
        pred_order = predictions.sort_values(by='debater_position')['debater_name'].values

        # Reordenar relevâncias do GT com base nas posições previstas
        reordered_true_relevance = [true_relevance[list(ground_truth['name']).index(name)] for name in pred_order]

        # Debugging: Print values to ensure they are correct
        print(f"Debate ID: {debate_id}")
        print(f"True Relevance (original): {true_relevance}")
        print(f"Predicted Order: {pred_order}")
        print(f"Reordered True Relevance: {reordered_true_relevance}")

        ndcg = ndcg_score([true_relevance], [reordered_true_relevance], k=k)
        ndcg_scores.append(ndcg)

        print(f"nDCG for Debate {debate_id}: {ndcg:.3f}\n")

    mean_ndcg = np.mean(ndcg_scores) if ndcg_scores else 0

    # Debugging: Print final results
    print(f"All nDCG Scores: {ndcg_scores}")
    print(f"Mean nDCG: {mean_ndcg:.3f}")
    return mean_ndcg


# Carregar os arquivos CSV
hard_voting_df = pd.read_csv('hard_voting.csv')
soft_voting_df = pd.read_csv('soft_voting.csv')
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')

# Renomear as colunas para uniformizar
#hard_voting_df.rename(columns={'position': 'debater_position', 'name': 'debater_name'}, inplace=True)
#soft_voting_df.rename(columns={'position': 'debater_position', 'name': 'debater_name'}, inplace=True)

# Excluir debates 10 e 17
exclude_debates = [10, 17]

# Calcular o nDCG para Hard Voting
print("\nCalculating nDCG for Hard Voting...")
mean_ndcg_hard = calculate_ndcg(hard_voting_df, self_assessment_df,exclude_debates=exclude_debates)

# Calcular o nDCG para Soft Voting
print("\nCalculating nDCG for Soft Voting...")
mean_ndcg_soft = calculate_ndcg(soft_voting_df,self_assessment_df, exclude_debates=exclude_debates)

# Exibir os resultados
print(f"\nMean nDCG - Hard Voting: {mean_ndcg_hard:.3f}")
print(f"Mean nDCG - Soft Voting: {mean_ndcg_soft:.3f}")




Calculating nDCG for Hard Voting...
Debate ID: 1
True Relevance (original): [0 3 2 0]
Predicted Order: ['Debater 2' 'Debater 4' 'Debater 1' 'Debater 3']
Reordered True Relevance: [np.int64(3), np.int64(0), np.int64(0), np.int64(2)]
nDCG for Debate 1: 0.546

Debate ID: 2
True Relevance (original): [0 0 0 2 5]
Predicted Order: ['Debater 1' 'Debater 4' 'Debater 5' 'Debater 2' 'Debater 3']
Reordered True Relevance: [np.int64(0), np.int64(2), np.int64(5), np.int64(0), np.int64(0)]
nDCG for Debate 2: 0.491

Debate ID: 3
True Relevance (original): [1 0 3 2 0]
Predicted Order: ['Debater 4' 'Debater 3' 'Debater 2' 'Debater 1' 'Debater 5']
Reordered True Relevance: [np.int64(2), np.int64(3), np.int64(0), np.int64(1), np.int64(0)]
nDCG for Debate 3: 0.600

Debate ID: 5
True Relevance (original): [0 0 2 0 4]
Predicted Order: ['Debater 3' 'Debater 5' 'Debater 2' 'Debater 1' 'Debater 4']
Reordered True Relevance: [np.int64(2), np.int64(4), np.int64(0), np.int64(0), np.int64(0)]
nDCG for Debate 5: 0

## COHENS KAPPA

In [7]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score

def calculate_cohens_kappa(ground_truth_df, predictions_df, exclude_debates=None):
    """
    Calcula o Cohen's Kappa para as posições dos debatedores.

    Parameters:
    - ground_truth_df: DataFrame contendo os dados reais (ground truth).
      Deve conter as colunas: 'debate_id', 'debater', 'position'.
    - predictions_df: DataFrame contendo as previsões de rankings.
      Deve conter as colunas: 'debate_id', 'debater', 'position'.
    - exclude_debates: Lista de IDs de debates a serem excluídos. Default: None.

    Returns:
    - kappa_scores: Dicionário com o Cohen's Kappa para cada debate.
    - mean_kappa: Média do Cohen's Kappa entre todos os debates.
    """
    # Filtrar debates, se necessário
    if exclude_debates:
        ground_truth_df = ground_truth_df[~ground_truth_df['debate_id'].isin(exclude_debates)]
        predictions_df = predictions_df[~predictions_df['debate_id'].isin(exclude_debates)]

    kappa_scores = {}

    for debate_id in ground_truth_df['debate_id'].unique():
        # Filtrar dados do debate atual
        ground_truth = ground_truth_df[ground_truth_df['debate_id'] == debate_id]
        predictions = predictions_df[predictions_df['debate_id'] == debate_id]

        # Garantir consistência na ordenação por debatedor
        ground_truth = ground_truth.sort_values(by='name').reset_index(drop=True)
        predictions = predictions.sort_values(by='name').reset_index(drop=True)

        if len(ground_truth) != len(predictions):
            raise ValueError(f"Debate {debate_id} possui tamanhos inconsistentes entre ground truth e previsões.")

        # Extração das posições
        y_true = ground_truth['position'].values
        y_pred = predictions['position'].values

        # Calcular o Cohen's Kappa
        kappa = cohen_kappa_score(y_true, y_pred)
        kappa_scores[debate_id] = kappa

    # Calcular a média do Kappa
    mean_kappa = sum(kappa_scores.values()) / len(kappa_scores) if kappa_scores else 0

    return kappa_scores, mean_kappa


In [10]:
# Carregar os arquivos CSV
hard_voting_df = pd.read_csv('hard_voting.csv')
soft_voting_df = pd.read_csv('soft_voting.csv')
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')

# Renomear as colunas para uniformizar
self_assessment_df.rename(columns={'debater_position': 'position', 'debater_name': 'name'}, inplace=True)

# Excluir debates 10 e 17
exclude_debates = [10, 17]

# Chamada da função para calcular Cohen's Kappa
kappa_scores_hard, mean_cohens_hard = calculate_cohens_kappa(hard_voting_df, self_assessment_df, exclude_debates=exclude_debates)
kappa_scores_soft, mean_cohens_soft = calculate_cohens_kappa(soft_voting_df, self_assessment_df, exclude_debates=exclude_debates)

# Debugging: Verificar o retorno das variáveis
print(f"Kappa Scores (Hard Voting): {kappa_scores_hard}")
print(f"Kappa Scores (Soft Voting): {kappa_scores_soft}")
print(f"Mean Cohen's Kappa (Hard Voting): {mean_cohens_hard}")
print(f"Mean Cohen's Kappa (Soft Voting): {mean_cohens_soft}")

# Exibir os resultados de forma correta
print(f"\nMean Cohen's Kappa - Hard Voting: {mean_cohens_hard:.3f}")
print(f"Mean Cohen's Kappa - Soft Voting: {mean_cohens_soft:.3f}")


Kappa Scores (Hard Voting): {np.int64(1): np.float64(0.19999999999999996), np.int64(2): np.float64(-0.25), np.int64(3): np.float64(0.0), np.int64(5): np.float64(0.2857142857142857), np.int64(6): np.float64(0.5555555555555556), np.int64(7): np.float64(0.33333333333333337), np.int64(8): np.float64(0.5), np.int64(9): np.float64(0.19999999999999996), np.int64(11): np.float64(-0.0714285714285714), np.int64(12): np.float64(0.33333333333333337), np.int64(13): np.float64(0.5), np.int64(14): np.float64(0.25), np.int64(16): np.float64(0.31818181818181823), np.int64(18): np.float64(0.33333333333333337)}
Kappa Scores (Soft Voting): {np.int64(1): np.float64(0.19999999999999996), np.int64(2): np.float64(-0.25), np.int64(3): np.float64(0.0), np.int64(5): np.float64(0.0), np.int64(6): np.float64(0.33333333333333337), np.int64(7): np.float64(-0.33333333333333326), np.int64(8): np.float64(0.5), np.int64(9): np.float64(0.0), np.int64(11): np.float64(-0.25), np.int64(12): np.float64(0.33333333333333337), 