# CÓDIGO PARA COMPARAÇÃO SELF ASSESSMENT X AVALIAÇÃO DOS JURADOS

## ACURÁCIA WINNERS

In [2]:
# Importing necessary libraries
import pandas as pd
import json

# Paths for the aggregated JSON input and the two CSV outputs.
input_json_path = "aggregated_debate_results.json"
soft_voting_csv_path = "soft_voting.csv"
hard_voting_csv_path = "hard_voting.csv"

# Column order shared by both CSVs. Passing it explicitly to DataFrame() keeps
# the header row even when a ranking list is empty, so later read_csv calls
# still find the expected columns.
ranking_columns = ["debate_id", "position", "name", "score"]

# Read the JSON with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
with open(input_json_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Flatten each debate's ranking lists into one row per (debate, debater).
soft_voting_rows = [
    {
        "debate_id": debate["debate_id"],
        "position": entry["position"],
        "name": entry["name"],
        "score": entry["score"],
    }
    for debate in data
    for entry in debate["soft_voting_ranking"]
]
hard_voting_rows = [
    {
        "debate_id": debate["debate_id"],
        "position": entry["position"],
        "name": entry["name"],
        "score": entry["score"],
    }
    for debate in data
    for entry in debate["hard_voting_ranking"]
]

# Build the DataFrames and persist them.
soft_voting_df = pd.DataFrame(soft_voting_rows, columns=ranking_columns)
hard_voting_df = pd.DataFrame(hard_voting_rows, columns=ranking_columns)

soft_voting_df.to_csv(soft_voting_csv_path, index=False)
hard_voting_df.to_csv(hard_voting_csv_path, index=False)

print(f"Soft voting CSV saved to: {soft_voting_csv_path}")
print(f"Hard voting CSV saved to: {hard_voting_csv_path}")


Soft voting CSV saved to: soft_voting.csv
Hard voting CSV saved to: hard_voting.csv


In [5]:
import pandas as pd

def evaluate_predictions(ground_truth_path, predictions_path, excluded_debates, voting_type):
    """
    Print the winner-prediction accuracy of a voting ranking against the ground truth.

    A debate counts as correctly predicted when at least one debater placed at
    position 1 in the predictions is also at position 1 in the ground truth
    (ties are allowed on both sides). Only debates that appear in both files
    and have at least one ground-truth winner are evaluated.

    Args:
        ground_truth_path (str): Path to the ground-truth CSV. Must contain
            'debate_id' plus either 'debater_name'/'debater_position' or
            'name'/'position'.
        predictions_path (str): Path to the predictions CSV with columns
            'debate_id', 'name' and 'position'.
        excluded_debates (list): Debate IDs to leave out of the evaluation.
        voting_type (str): Label used in the printed report (e.g. 'hard' or 'soft').

    Returns:
        None: The accuracy and the per-debate winners are printed.
    """
    # Load datasets
    ground_truth_df = pd.read_csv(ground_truth_path)
    predictions_df = pd.read_csv(predictions_path)

    # Exclude specific debates
    ground_truth_df = ground_truth_df[~ground_truth_df['debate_id'].isin(excluded_debates)]
    predictions_df = predictions_df[~predictions_df['debate_id'].isin(excluded_debates)]

    # Bring the ground-truth column names in line with the predictions file
    ground_truth_df = ground_truth_df.rename(columns={
        'debater_name': 'name',
        'debater_position': 'position'
    })

    # Winners (position 1) per debate on each side. Looking them up directly
    # avoids building the per-debate cross product of both tables.
    true_winner_rows = ground_truth_df[ground_truth_df['position'] == 1]
    predicted_winner_rows = predictions_df[predictions_df['position'] == 1]
    predicted_winners_by_debate = {
        debate_id: set(group['name'])
        for debate_id, group in predicted_winner_rows.groupby('debate_id')
    }
    debates_with_predictions = set(predictions_df['debate_id'])

    debate_details = []
    correct_predictions = []

    for debate_id, group in true_winner_rows.groupby('debate_id'):
        # Debates missing from the predictions file are not evaluated.
        if debate_id not in debates_with_predictions:
            continue

        true_winners = set(group['name'])
        predicted_winners = predicted_winners_by_debate.get(debate_id, set())

        debate_details.append((debate_id, true_winners, predicted_winners))
        # Any overlap between the two winner sets counts as a correct prediction.
        correct_predictions.append(1 if true_winners & predicted_winners else 0)

    # Nothing left to evaluate: report it instead of dividing by zero.
    if not correct_predictions:
        print(f'Accuracy of winners ({voting_type} voting): n/a (no debates to evaluate)')
        return

    accuracy = sum(correct_predictions) / len(correct_predictions)
    print(f'Accuracy of winners ({voting_type} voting): {accuracy * 100:.2f}%')

    # Print the expected and predicted winners for each debate. Names are
    # sorted so the report is identical from run to run.
    for debate_id, true_winners, predicted_winners in debate_details:
        print(f"Debate ID: {debate_id}")
        print(f"Expected winners: {', '.join(sorted(true_winners))}")
        print(f"Predicted winners: {', '.join(sorted(predicted_winners))}")
        print('-' * 40)

# Input files: the self-assessment ranking is the ground truth, the two voting
# CSVs are the predictions being scored.
self_assessment_path = 'rankings_self_assessment_.csv'
hard_voting_path = 'hard_voting.csv'
soft_voting_path = 'soft_voting.csv'

# Debates left out of the evaluation
excluded_debates = [10, 17]

# Hard voting report
print("Evaluating Hard Voting:")
evaluate_predictions(
    ground_truth_path=self_assessment_path,
    predictions_path=hard_voting_path,
    excluded_debates=excluded_debates,
    voting_type='hard',
)

# Soft voting report
print("\nEvaluating Soft Voting:")
evaluate_predictions(
    ground_truth_path=self_assessment_path,
    predictions_path=soft_voting_path,
    excluded_debates=excluded_debates,
    voting_type='soft',
)




Evaluating Hard Voting:
Accuracy of winners (hard voting): 71.43%
Debate ID: 1
Expected winners: Debater 2
Predicted winners: Debater 2
----------------------------------------
Debate ID: 2
Expected winners: Debater 1, Debater 4
Predicted winners: Debater 5
----------------------------------------
Debate ID: 3
Expected winners: Debater 4
Predicted winners: Debater 3
----------------------------------------
Debate ID: 5
Expected winners: Debater 3
Predicted winners: Debater 5
----------------------------------------
Debate ID: 6
Expected winners: Debater 1
Predicted winners: Debater 1
----------------------------------------
Debate ID: 7
Expected winners: Debater 2
Predicted winners: Debater 2
----------------------------------------
Debate ID: 8
Expected winners: Debater 1
Predicted winners: Debater 1
----------------------------------------
Debate ID: 9
Expected winners: Debater 1
Predicted winners: Debater 1, Debater 4
----------------------------------------
Debate ID: 11
Expected w

## ACURÁCIA DEBATERS

In [8]:
import pandas as pd

def calculate_ranking_accuracy(ground_truth_df, predictions_df, excluded_debates=(10, 17)):
    """
    Compute the position-by-position ranking accuracy of predictions against a ground truth.

    For every debate both rankings are sorted by position and compared slot by
    slot; a slot is a hit when the same debater name appears in it on both
    sides. Tied positions get no special treatment: only the sorted order matters.

    Parameters:
    - ground_truth_df: DataFrame with columns 'debate_id', 'name', 'position'.
    - predictions_df: DataFrame with columns 'debate_id', 'debater_name',
      'debater_position'.
    - excluded_debates: Iterable of debate IDs to skip (default: (10, 17)).
      None means no debate is excluded.

    Returns:
    - ranking_accuracy: Mean over debates of the fraction of correctly placed
      debaters (a value in [0, 1]); 0 when no debate is left to evaluate.
    """
    # An immutable default avoids sharing one list object across calls.
    excluded = list(excluded_debates) if excluded_debates is not None else []

    # Drop the excluded debates
    ground_truth_df = ground_truth_df[~ground_truth_df['debate_id'].isin(excluded)]
    predictions_df = predictions_df[~predictions_df['debate_id'].isin(excluded)]

    # Per-debate accuracies plus running totals for the pooled figure
    correct_rankings = []
    total_correct = 0
    total_debaters = 0

    for debate_id in ground_truth_df['debate_id'].unique():
        # A stable sort keeps tied positions in input order, so the comparison
        # is deterministic.
        true_names = (
            ground_truth_df.loc[ground_truth_df['debate_id'] == debate_id, ['name', 'position']]
            .sort_values(by='position', kind='stable')['name']
            .tolist()
        )
        predicted_names = (
            predictions_df.loc[
                predictions_df['debate_id'] == debate_id, ['debater_name', 'debater_position']
            ]
            .sort_values(by='debater_position', kind='stable')['debater_name']
            .tolist()
        )

        # zip() stops at the shorter ranking, so slots missing from the
        # predictions count as misses instead of raising IndexError.
        correct_rankings_count = sum(
            1 for expected, predicted in zip(true_names, predicted_names) if expected == predicted
        )

        # Fraction of debaters placed correctly in this debate
        correct_rankings.append(correct_rankings_count / len(true_names))

        total_correct += correct_rankings_count
        total_debaters += len(true_names)

        # Per-debate report
        print(f"Debate ID: {debate_id}")
        print(f"Ranking esperado: {', '.join(true_names)}")
        print(f"Ranking previsto: {', '.join(predicted_names)}")
        print(f"{correct_rankings_count} acertos de {len(true_names)} debatedores.\n")

    # Mean of the per-debate accuracies
    ranking_accuracy = sum(correct_rankings) / len(correct_rankings) if correct_rankings else 0

    print(f'Acurácia de ranking: {ranking_accuracy * 100:.2f}%')

    # Pooled accuracy over all debaters; guarded so an empty evaluation does
    # not divide by zero.
    pooled_percentage = (total_correct / total_debaters) * 100 if total_debaters else 0.0
    print(f"Total de acertos: {total_correct} de {total_debaters} debatedores ({pooled_percentage:.2f}%)")

    return ranking_accuracy

# Função principal para avaliar hard e soft voting
def evaluate_voting(ground_truth_path, predictions_path, excluded_debates, voting_type):
    """
    Load two ranking CSVs and print their ranking accuracy.

    Both files are read as-is and handed to calculate_ranking_accuracy, which
    reads 'debate_id', 'name' and 'position' from the first frame and
    'debate_id', 'debater_name' and 'debater_position' from the second.

    Parameters:
    - ground_truth_path: Path to the CSV used as the reference ranking.
    - predictions_path: Path to the CSV whose ranking is scored.
    - excluded_debates: Debate IDs to skip.
    - voting_type: Label shown in the printed section header (e.g. 'hard').

    Returns:
    - None. All results are printed.
    """
    print(f"\n--- Avaliando {voting_type.capitalize()} Voting ---\n")

    # Load the datasets. No column renaming is needed: the previous identity
    # renames (each column mapped to its own name) changed nothing.
    ground_truth_df = pd.read_csv(ground_truth_path)
    predictions_df = pd.read_csv(predictions_path)

    # Compute and print the ranking accuracy
    calculate_ranking_accuracy(ground_truth_df, predictions_df, excluded_debates)

# Input files. In this cell the voting CSVs are passed as the reference ranking
# and the self-assessment CSV as the ranking being scored.
self_assessment_path = "rankings_self_assessment_.csv"
hard_voting_path = "hard_voting.csv"
soft_voting_path = "soft_voting.csv"

# Debates left out of the evaluation
excluded_debates = [10, 17]

# Hard voting report
evaluate_voting(
    ground_truth_path=hard_voting_path,
    predictions_path=self_assessment_path,
    excluded_debates=excluded_debates,
    voting_type='hard',
)

# Soft voting report
evaluate_voting(
    ground_truth_path=soft_voting_path,
    predictions_path=self_assessment_path,
    excluded_debates=excluded_debates,
    voting_type='soft',
)



--- Avaliando Hard Voting ---

Debate ID: 1
Ranking esperado: Debater 2, Debater 3, Debater 1, Debater 4
Ranking previsto: Debater 2, Debater 4, Debater 1, Debater 3
2 acertos de 4 debatedores.

Debate ID: 2
Ranking esperado: Debater 5, Debater 4, Debater 1, Debater 2, Debater 3
Ranking previsto: Debater 1, Debater 4, Debater 5, Debater 2, Debater 3
3 acertos de 5 debatedores.

Debate ID: 3
Ranking esperado: Debater 3, Debater 4, Debater 1, Debater 2, Debater 5
Ranking previsto: Debater 4, Debater 3, Debater 1, Debater 2, Debater 5
3 acertos de 5 debatedores.

Debate ID: 5
Ranking esperado: Debater 5, Debater 3, Debater 1, Debater 2, Debater 4
Ranking previsto: Debater 3, Debater 5, Debater 1, Debater 2, Debater 4
3 acertos de 5 debatedores.

Debate ID: 6
Ranking esperado: Debater 1, Debater 2, Debater 3, Debater 4
Ranking previsto: Debater 1, Debater 2, Debater 3, Debater 4
4 acertos de 4 debatedores.

Debate ID: 7
Ranking esperado: Debater 2, Debater 3, Debater 1, Debater 4
Ranking 

## MRR

In [9]:
import pandas as pd

def _reciprocal_rank(ranked_names, relevant_names):
    """Return 1/rank of the first entry of ranked_names found in relevant_names, or 0 if none is."""
    for rank, name in enumerate(ranked_names, start=1):
        if name in relevant_names:
            return 1 / rank
    return 0


def calculate_mrr(hard_voting_df, soft_voting_df, self_assessment_df, excluded_debates=(10, 17)):
    """
    Compute the Mean Reciprocal Rank (MRR) of hard and soft voting against the self-assessment.

    For each debate the relevant items are the debaters at position 1 in the
    self-assessment (several in case of a tie). The reciprocal rank of a voting
    ranking is 1 / (rank of the first relevant debater in it), or 0 when none
    of them appears.

    Parameters:
    - hard_voting_df: DataFrame with columns 'debate_id', 'name', 'position'.
    - soft_voting_df: DataFrame with columns 'debate_id', 'name', 'position'.
    - self_assessment_df: DataFrame with columns 'debate_id', 'debater_name',
      'debater_position' (ground truth).
    - excluded_debates: Iterable of debate IDs to skip (default: (10, 17)).
      None means no debate is excluded.

    Returns:
    - mrr_hard: MRR (decimal) for hard voting; 0 when no debate is evaluated.
    - mrr_soft: MRR (decimal) for soft voting; 0 when no debate is evaluated.
    """
    # An immutable default avoids sharing one list object across calls.
    excluded = list(excluded_debates) if excluded_debates is not None else []

    # Drop the excluded debates
    hard_voting_df = hard_voting_df[~hard_voting_df['debate_id'].isin(excluded)]
    soft_voting_df = soft_voting_df[~soft_voting_df['debate_id'].isin(excluded)]
    self_assessment_df = self_assessment_df[~self_assessment_df['debate_id'].isin(excluded)]

    # Reciprocal ranks collected per debate
    reciprocal_ranks_hard = []
    reciprocal_ranks_soft = []

    for debate_id in self_assessment_df['debate_id'].unique():
        true_ranking = self_assessment_df.loc[
            self_assessment_df['debate_id'] == debate_id, ['debater_name', 'debater_position']
        ]

        # Ground-truth winners (more than one when position 1 is tied)
        true_winners = set(
            true_ranking.loc[true_ranking['debater_position'] == 1, 'debater_name']
        )

        # Predicted rankings ordered by position. A stable sort keeps tied
        # positions in input order, so the ranks are deterministic.
        hard_names = (
            hard_voting_df.loc[hard_voting_df['debate_id'] == debate_id, ['name', 'position']]
            .sort_values(by='position', kind='stable')['name']
            .tolist()
        )
        soft_names = (
            soft_voting_df.loc[soft_voting_df['debate_id'] == debate_id, ['name', 'position']]
            .sort_values(by='position', kind='stable')['name']
            .tolist()
        )

        rr_hard = _reciprocal_rank(hard_names, true_winners)
        rr_soft = _reciprocal_rank(soft_names, true_winners)

        reciprocal_ranks_hard.append(rr_hard)
        reciprocal_ranks_soft.append(rr_soft)

        # Per-debate report; winners are sorted so the output is reproducible.
        print(f"Debate ID: {debate_id}")
        print(f"Vencedores esperados: {', '.join(sorted(true_winners))}")
        print(f"Ranking previsto (Hard Voting): {', '.join(hard_names)}")
        print(f"Ranking previsto (Soft Voting): {', '.join(soft_names)}")
        print(f"Reciprocal Rank (Hard Voting): {rr_hard:.3f}")
        print(f"Reciprocal Rank (Soft Voting): {rr_soft:.3f}\n")

    # Average the reciprocal ranks; 0 when there was nothing to evaluate
    mrr_hard = sum(reciprocal_ranks_hard) / len(reciprocal_ranks_hard) if reciprocal_ranks_hard else 0
    mrr_soft = sum(reciprocal_ranks_soft) / len(reciprocal_ranks_soft) if reciprocal_ranks_soft else 0

    print(f"Mean Reciprocal Rank (MRR) - Hard Voting: {mrr_hard:.3f}")
    print(f"Mean Reciprocal Rank (MRR) - Soft Voting: {mrr_soft:.3f}")

    return mrr_hard, mrr_soft

In [10]:
import pandas as pd

# Read the voting rankings and the self-assessment ranking from disk
hard_voting_df = pd.read_csv('hard_voting.csv')
soft_voting_df = pd.read_csv('soft_voting.csv')
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')

# Compute the MRR of both voting schemes against the self-assessment
mrr_hard, mrr_soft = calculate_mrr(
    hard_voting_df=hard_voting_df,
    soft_voting_df=soft_voting_df,
    self_assessment_df=self_assessment_df,
)

# Summary of the two scores
print(f"MRR - Hard Voting: {mrr_hard:.3f}")
print(f"MRR - Soft Voting: {mrr_soft:.3f}")


Debate ID: 1
Vencedores esperados: Debater 2
Ranking previsto (Hard Voting): Debater 2, Debater 3, Debater 1, Debater 4
Ranking previsto (Soft Voting): Debater 2, Debater 3, Debater 1, Debater 4
Reciprocal Rank (Hard Voting): 1.000
Reciprocal Rank (Soft Voting): 1.000

Debate ID: 2
Vencedores esperados: Debater 1, Debater 4
Ranking previsto (Hard Voting): Debater 5, Debater 4, Debater 1, Debater 2, Debater 3
Ranking previsto (Soft Voting): Debater 5, Debater 4, Debater 3, Debater 1, Debater 2
Reciprocal Rank (Hard Voting): 0.500
Reciprocal Rank (Soft Voting): 0.500

Debate ID: 3
Vencedores esperados: Debater 4
Ranking previsto (Hard Voting): Debater 3, Debater 4, Debater 1, Debater 2, Debater 5
Ranking previsto (Soft Voting): Debater 3, Debater 4, Debater 1, Debater 5, Debater 2
Reciprocal Rank (Hard Voting): 0.500
Reciprocal Rank (Soft Voting): 0.500

Debate ID: 5
Vencedores esperados: Debater 3
Ranking previsto (Hard Voting): Debater 5, Debater 3, Debater 1, Debater 2, Debater 4
Rank

## NDCG

In [12]:
from sklearn.metrics import ndcg_score
import pandas as pd
import numpy as np

def calculate_ndcg_from_dataframes(ground_truth_df, predictions_df, k=None, exclude_debates=None):
    """
    Calculates the mean nDCG for all debates using scikit-learn's ndcg_score, excluding specified debates.

    Within each debate the two score vectors are aligned by debater name, so
    both frames must list exactly the same debaters for that debate.

    Parameters:
    - ground_truth_df: DataFrame containing the ground truth data.
      Must have columns: 'debate_id', 'debater_name', 'debater_score'.
    - predictions_df: DataFrame containing the predicted data.
      Must have the same columns as ground_truth_df.
    - k: Integer, optional. Defines the top-k for nDCG calculation. If None, considers all.
    - exclude_debates: List of debate IDs to exclude from the calculation.

    Returns:
    - mean_ndcg: Mean nDCG across all debates (float); NaN when no debate is
      left to evaluate.

    Raises:
    - ValueError: if a required column is missing, or if the two frames do not
      contain the same debaters (count or names) for some debate.
    """
    # Ensure the required columns are present
    required_columns = ['debate_id', 'debater_name', 'debater_score']
    for df in [ground_truth_df, predictions_df]:
        if not all(col in df.columns for col in required_columns):
            raise ValueError(f"Both DataFrames must contain the columns: {required_columns}")

    # Filter out excluded debates if provided
    if exclude_debates:
        ground_truth_df = ground_truth_df[~ground_truth_df['debate_id'].isin(exclude_debates)]
        predictions_df = predictions_df[~predictions_df['debate_id'].isin(exclude_debates)]

    # Get unique debates
    debate_ids = ground_truth_df['debate_id'].unique()
    ndcg_scores = []

    for debate_id in debate_ids:
        print(f"\nProcessing Debate ID: {debate_id}")

        # Sort both sides by debater name so position i refers to the same
        # debater in the two score vectors.
        ground_truth_rows = (
            ground_truth_df[ground_truth_df['debate_id'] == debate_id]
            .sort_values(by='debater_name')
        )
        predicted_rows = (
            predictions_df[predictions_df['debate_id'] == debate_id]
            .sort_values(by='debater_name')
        )

        # Ensure the ground truth and predictions have the same length
        if len(ground_truth_rows) != len(predicted_rows):
            raise ValueError(f"Mismatch in number of scores for debate {debate_id}.")

        # Equal length is not enough: different debaters on each side would
        # silently pair unrelated scores.
        if ground_truth_rows['debater_name'].tolist() != predicted_rows['debater_name'].tolist():
            raise ValueError(
                f"Debater names differ between ground truth and predictions for debate {debate_id}."
            )

        ground_truth_scores = ground_truth_rows['debater_score'].values
        predicted_scores = predicted_rows['debater_score'].values

        print(f"Ground truth scores: {ground_truth_scores}")
        print(f"Predicted scores: {predicted_scores}")

        # Reshape scores for scikit-learn (expects 2D arrays)
        ground_truth_scores = ground_truth_scores.reshape(1, -1)
        predicted_scores = predicted_scores.reshape(1, -1)

        # Compute nDCG using scikit-learn
        ndcg = ndcg_score(ground_truth_scores, predicted_scores, k=k)
        print(f"nDCG for this debate: {ndcg}")

        ndcg_scores.append(ndcg)

    # Compute mean nDCG; return NaN explicitly for an empty evaluation rather
    # than letting np.mean warn about an empty list.
    mean_ndcg = np.mean(ndcg_scores) if ndcg_scores else np.nan
    print(f"\nMean nDCG across all debates: {mean_ndcg}")
    return mean_ndcg


# Load the three ranking tables. The voting CSVs are renamed on load so their
# columns carry the 'debater_*' names that calculate_ndcg_from_dataframes requires.
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
hard_voting_df = pd.read_csv('hard_voting.csv').rename(
    columns={'position': 'debater_position', 'name': 'debater_name', 'score': 'debater_score'}
)
soft_voting_df = pd.read_csv('soft_voting.csv').rename(
    columns={'position': 'debater_position', 'name': 'debater_name', 'score': 'debater_score'}
)

# Debates 10 and 17 are left out of the evaluation
exclude_debates = [10, 17]

# nDCG of hard voting against the self-assessment
print("\nCalculating nDCG for Hard Voting...")
mean_ndcg_hard = calculate_ndcg_from_dataframes(
    ground_truth_df=self_assessment_df,
    predictions_df=hard_voting_df,
    exclude_debates=exclude_debates,
)

# nDCG of soft voting against the self-assessment
print("\nCalculating nDCG for Soft Voting...")
mean_ndcg_soft = calculate_ndcg_from_dataframes(
    ground_truth_df=self_assessment_df,
    predictions_df=soft_voting_df,
    exclude_debates=exclude_debates,
)

# Summary of the two scores
print(f"\nMean nDCG - Hard Voting: {mean_ndcg_hard:.3f}")
print(f"Mean nDCG - Soft Voting: {mean_ndcg_soft:.3f}")



Calculating nDCG for Hard Voting...

Processing Debate ID: 1
Ground truth scores: [0. 2. 0. 1.]
Predicted scores: [0 3 2 0]
nDCG for this debate: 0.937059712708037

Processing Debate ID: 2
Ground truth scores: [2. 0. 0. 2. 1.]
Predicted scores: [0 0 0 2 5]
nDCG for this debate: 0.8347500515066167

Processing Debate ID: 3
Ground truth scores: [0. 0. 1. 4. 0.]
Predicted scores: [1 0 3 2 0]
nDCG for this debate: 0.760909623292876

Processing Debate ID: 5
Ground truth scores: [0. 0. 2. 0. 1.]
Predicted scores: [0 0 2 0 4]
nDCG for this debate: 0.8597186998521971

Processing Debate ID: 6
Ground truth scores: [3. 0. 0. 0.]
Predicted scores: [4 2 2 0]
nDCG for this debate: 1.0

Processing Debate ID: 7
Ground truth scores: [0. 3. 0. 0.]
Predicted scores: [1 4 2 1]
nDCG for this debate: 1.0

Processing Debate ID: 8
Ground truth scores: [1. 0. 0.]
Predicted scores: [5 0 1]
nDCG for this debate: 1.0

Processing Debate ID: 9
Ground truth scores: [2. 1. 0. 0.]
Predicted scores: [3 0 0 3]
nDCG for 