# CÓDIGO PARA COMPARAÇÃO SELF ASSESSMENT X LLMS

## ACURÁCIA WINNERS

In [18]:
import pandas as pd

# Load the self-assessment rankings (used as ground truth) and the predictions
# stored in 'prompt1_gemini_.csv'.
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
gemini_predictions_df = pd.read_csv('prompt1_gemini_.csv')

# Filter out debate_id = 10 and debate_id = 17 from both datasets
self_assessment_df = self_assessment_df[~self_assessment_df['debate_id'].isin([10, 17])]
gemini_predictions_df = gemini_predictions_df[~gemini_predictions_df['debate_id'].isin([10, 17])]

# Print columns to check for discrepancies in column names
print("Self-Assessment Columns:", self_assessment_df.columns)
print("Gemini Predictions Columns:", gemini_predictions_df.columns)

# Inner-join on 'debate_id' only. This is a many-to-many join: every ground-truth row
# is paired with every prediction row of the same debate, so rows repeat. Below, the
# result is only used to keep ground-truth rows of debates that also have predictions.
merged_df = pd.merge(self_assessment_df[['debate_id', 'debater_name', 'debater_position']],
                     gemini_predictions_df[['debate_id', 'debater_name', 'debater_position']],
                     on='debate_id', suffixes=('_ground_truth', '_pred'))

# Print the merged DataFrame to check column names
print("\nMerged DataFrame Columns:", merged_df.columns)
print("\nMerged DataFrame Preview:\n", merged_df.head())

# Ground-truth winners are the rows ranked in position 1
ground_truth_winners = merged_df[merged_df['debater_position_ground_truth'] == 1]

# One 0/1 flag per debate, plus the winner sets kept for display
correct_predictions = []
debate_details = []

for debate_id, group in ground_truth_winners.groupby('debate_id'):
    # Names of the ground-truth winners (the set removes the duplicates created by the join)
    true_winners = set(group['debater_name_ground_truth'])

    # Names predicted in position 1 for this debate. Both conditions are combined into
    # one mask built on the predictions frame, instead of indexing an already-filtered
    # Series with a mask of a different length.
    is_predicted_winner = (
        (gemini_predictions_df['debate_id'] == debate_id)
        & (gemini_predictions_df['debater_position'] == 1)
    )
    predicted_winners = set(gemini_predictions_df.loc[is_predicted_winner, 'debater_name'])

    # Save the details for display
    debate_details.append({
        'debate_id': debate_id,
        'true_winners': true_winners,
        'predicted_winners': predicted_winners
    })

    # A debate counts as correct when at least one predicted winner is a true winner
    correct_predictions.append(1 if true_winners & predicted_winners else 0)

# Accuracy is the share of debates predicted correctly; it is undefined (NaN) when no
# debate was evaluated, which would otherwise raise ZeroDivisionError.
if correct_predictions:
    accuracy = sum(correct_predictions) / len(correct_predictions)
else:
    accuracy = float('nan')
print(f'Accuracy of winners: {accuracy * 100:.2f}%')

# Print the expected and predicted winners for each debate.
# Sets have no stable order, so the names are sorted to make the output reproducible.
for detail in debate_details:
    print(f"Debate ID: {detail['debate_id']}")
    print(f"Expected winners: {', '.join(sorted(detail['true_winners']))}")
    print(f"Predicted winners: {', '.join(sorted(detail['predicted_winners']))}")
    print('-' * 40)



Self-Assessment Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')
Gemini Predictions Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')

Merged DataFrame Columns: Index(['debate_id', 'debater_name_ground_truth',
       'debater_position_ground_truth', 'debater_name_pred',
       'debater_position_pred'],
      dtype='object')

Merged DataFrame Preview:
    debate_id debater_name_ground_truth  debater_position_ground_truth  \
0          1                 Debater 2                              1   
1          1                 Debater 2                              1   
2          1                 Debater 2                              1   
3          1                 Debater 2                              1   
4          1                 Debater 4                              2   

  debater_name_pred  debater_position_pred  
0         Debater 4                      1  
1         Debater 

In [19]:
import pandas as pd

# Load the self-assessment rankings (used as ground truth) and the predictions
# stored in 'prompt1_gpt4o.csv'.
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
gpt4o_predictions_df = pd.read_csv('prompt1_gpt4o.csv')

# Filter out debate_id = 10 and debate_id = 17 from both datasets
self_assessment_df = self_assessment_df[~self_assessment_df['debate_id'].isin([10, 17])]
gpt4o_predictions_df = gpt4o_predictions_df[~gpt4o_predictions_df['debate_id'].isin([10, 17])]

# Print columns to check for discrepancies in column names
print("Self-Assessment Columns:", self_assessment_df.columns)
print("GPT4o Predictions Columns:", gpt4o_predictions_df.columns)

# Inner-join on 'debate_id' only. This is a many-to-many join: every ground-truth row
# is paired with every prediction row of the same debate, so rows repeat. Below, the
# result is only used to keep ground-truth rows of debates that also have predictions.
merged_df = pd.merge(self_assessment_df[['debate_id', 'debater_name', 'debater_position']],
                     gpt4o_predictions_df[['debate_id', 'debater_name', 'debater_position']],
                     on='debate_id', suffixes=('_ground_truth', '_pred'))

# Print the merged DataFrame to check column names
print("\nMerged DataFrame Columns:", merged_df.columns)
print("\nMerged DataFrame Preview:\n", merged_df.head())

# Ground-truth winners are the rows ranked in position 1
ground_truth_winners = merged_df[merged_df['debater_position_ground_truth'] == 1]

# One 0/1 flag per debate, plus the winner sets kept for display
correct_predictions = []
debate_details = []

for debate_id, group in ground_truth_winners.groupby('debate_id'):
    # Names of the ground-truth winners (the set removes the duplicates created by the join)
    true_winners = set(group['debater_name_ground_truth'])

    # Names predicted in position 1 for this debate. Both conditions are combined into
    # one mask built on the predictions frame, instead of indexing an already-filtered
    # Series with a mask of a different length.
    is_predicted_winner = (
        (gpt4o_predictions_df['debate_id'] == debate_id)
        & (gpt4o_predictions_df['debater_position'] == 1)
    )
    predicted_winners = set(gpt4o_predictions_df.loc[is_predicted_winner, 'debater_name'])

    # Save the details for display
    debate_details.append({
        'debate_id': debate_id,
        'true_winners': true_winners,
        'predicted_winners': predicted_winners
    })

    # A debate counts as correct when at least one predicted winner is a true winner
    correct_predictions.append(1 if true_winners & predicted_winners else 0)

# Accuracy is the share of debates predicted correctly; it is undefined (NaN) when no
# debate was evaluated, which would otherwise raise ZeroDivisionError.
if correct_predictions:
    accuracy = sum(correct_predictions) / len(correct_predictions)
else:
    accuracy = float('nan')
print(f'Accuracy of winners: {accuracy * 100:.2f}%')

# Print the expected and predicted winners for each debate.
# Sets have no stable order, so the names are sorted to make the output reproducible.
for detail in debate_details:
    print(f"Debate ID: {detail['debate_id']}")
    print(f"Expected winners: {', '.join(sorted(detail['true_winners']))}")
    print(f"Predicted winners: {', '.join(sorted(detail['predicted_winners']))}")
    print('-' * 40)


Self-Assessment Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')
GPT4o Predictions Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')

Merged DataFrame Columns: Index(['debate_id', 'debater_name_ground_truth',
       'debater_position_ground_truth', 'debater_name_pred',
       'debater_position_pred'],
      dtype='object')

Merged DataFrame Preview:
    debate_id debater_name_ground_truth  debater_position_ground_truth  \
0          1                 Debater 2                              1   
1          1                 Debater 2                              1   
2          1                 Debater 2                              1   
3          1                 Debater 2                              1   
4          1                 Debater 4                              2   

  debater_name_pred  debater_position_pred  
0         Debater 2                      1  
1         Debater 4

ACURÁCIA - PROMPT 2

In [23]:
import pandas as pd

# Load the self-assessment rankings (used as ground truth) and the predictions
# stored in 'prompt2_gpt4o.csv'.
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
gpt4o_predictions_df = pd.read_csv('prompt2_gpt4o.csv')

# Filter out debate_id = 10 and debate_id = 17 from both datasets
self_assessment_df = self_assessment_df[~self_assessment_df['debate_id'].isin([10, 17])]
gpt4o_predictions_df = gpt4o_predictions_df[~gpt4o_predictions_df['debate_id'].isin([10, 17])]

# Print columns to check for discrepancies in column names
print("Self-Assessment Columns:", self_assessment_df.columns)
print("GPT4o Predictions Columns:", gpt4o_predictions_df.columns)

# Inner-join on 'debate_id' only. This is a many-to-many join: every ground-truth row
# is paired with every prediction row of the same debate, so rows repeat. Below, the
# result is only used to keep ground-truth rows of debates that also have predictions.
merged_df = pd.merge(self_assessment_df[['debate_id', 'debater_name', 'debater_position']],
                     gpt4o_predictions_df[['debate_id', 'debater_name', 'debater_position']],
                     on='debate_id', suffixes=('_ground_truth', '_pred'))

# Print the merged DataFrame to check column names
print("\nMerged DataFrame Columns:", merged_df.columns)
print("\nMerged DataFrame Preview:\n", merged_df.head())

# Ground-truth winners are the rows ranked in position 1
ground_truth_winners = merged_df[merged_df['debater_position_ground_truth'] == 1]

# One 0/1 flag per debate, plus the winner sets kept for display
correct_predictions = []
debate_details = []

for debate_id, group in ground_truth_winners.groupby('debate_id'):
    # Names of the ground-truth winners (the set removes the duplicates created by the join)
    true_winners = set(group['debater_name_ground_truth'])

    # Names predicted in position 1 for this debate. Both conditions are combined into
    # one mask built on the predictions frame, instead of indexing an already-filtered
    # Series with a mask of a different length.
    is_predicted_winner = (
        (gpt4o_predictions_df['debate_id'] == debate_id)
        & (gpt4o_predictions_df['debater_position'] == 1)
    )
    predicted_winners = set(gpt4o_predictions_df.loc[is_predicted_winner, 'debater_name'])

    # Save the details for display
    debate_details.append({
        'debate_id': debate_id,
        'true_winners': true_winners,
        'predicted_winners': predicted_winners
    })

    # A debate counts as correct when at least one predicted winner is a true winner
    correct_predictions.append(1 if true_winners & predicted_winners else 0)

# Accuracy is the share of debates predicted correctly; it is undefined (NaN) when no
# debate was evaluated, which would otherwise raise ZeroDivisionError.
if correct_predictions:
    accuracy = sum(correct_predictions) / len(correct_predictions)
else:
    accuracy = float('nan')
print(f'Accuracy of winners: {accuracy * 100:.2f}%')

# Print the expected and predicted winners for each debate.
# Sets have no stable order, so the names are sorted to make the output reproducible.
for detail in debate_details:
    print(f"Debate ID: {detail['debate_id']}")
    print(f"Expected winners: {', '.join(sorted(detail['true_winners']))}")
    print(f"Predicted winners: {', '.join(sorted(detail['predicted_winners']))}")
    print('-' * 40)

Self-Assessment Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')
GPT4o Predictions Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')

Merged DataFrame Columns: Index(['debate_id', 'debater_name_ground_truth',
       'debater_position_ground_truth', 'debater_name_pred',
       'debater_position_pred'],
      dtype='object')

Merged DataFrame Preview:
    debate_id debater_name_ground_truth  debater_position_ground_truth  \
0          1                 Debater 2                              1   
1          1                 Debater 2                              1   
2          1                 Debater 2                              1   
3          1                 Debater 2                              1   
4          1                 Debater 4                              2   

  debater_name_pred  debater_position_pred  
0         Debater 3                      1  
1         Debater 1

In [22]:
import pandas as pd

# Load the self-assessment rankings (used as ground truth) and the predictions
# stored in 'prompt2_gemini.csv'.
# NOTE: the variable keeps the name gpt4o_predictions_df used by this cell so far,
# even though the file it loads is the Gemini one.
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
gpt4o_predictions_df = pd.read_csv('prompt2_gemini.csv')

# Filter out debate_id = 10 and debate_id = 17 from both datasets
self_assessment_df = self_assessment_df[~self_assessment_df['debate_id'].isin([10, 17])]
gpt4o_predictions_df = gpt4o_predictions_df[~gpt4o_predictions_df['debate_id'].isin([10, 17])]

# Print columns to check for discrepancies in column names.
# The label says "Gemini" because the predictions come from 'prompt2_gemini.csv'.
print("Self-Assessment Columns:", self_assessment_df.columns)
print("Gemini Predictions Columns:", gpt4o_predictions_df.columns)

# Inner-join on 'debate_id' only. This is a many-to-many join: every ground-truth row
# is paired with every prediction row of the same debate, so rows repeat. Below, the
# result is only used to keep ground-truth rows of debates that also have predictions.
merged_df = pd.merge(self_assessment_df[['debate_id', 'debater_name', 'debater_position']],
                     gpt4o_predictions_df[['debate_id', 'debater_name', 'debater_position']],
                     on='debate_id', suffixes=('_ground_truth', '_pred'))

# Print the merged DataFrame to check column names
print("\nMerged DataFrame Columns:", merged_df.columns)
print("\nMerged DataFrame Preview:\n", merged_df.head())

# Ground-truth winners are the rows ranked in position 1
ground_truth_winners = merged_df[merged_df['debater_position_ground_truth'] == 1]

# One 0/1 flag per debate, plus the winner sets kept for display
correct_predictions = []
debate_details = []

for debate_id, group in ground_truth_winners.groupby('debate_id'):
    # Names of the ground-truth winners (the set removes the duplicates created by the join)
    true_winners = set(group['debater_name_ground_truth'])

    # Names predicted in position 1 for this debate. Both conditions are combined into
    # one mask built on the predictions frame, instead of indexing an already-filtered
    # Series with a mask of a different length.
    is_predicted_winner = (
        (gpt4o_predictions_df['debate_id'] == debate_id)
        & (gpt4o_predictions_df['debater_position'] == 1)
    )
    predicted_winners = set(gpt4o_predictions_df.loc[is_predicted_winner, 'debater_name'])

    # Save the details for display
    debate_details.append({
        'debate_id': debate_id,
        'true_winners': true_winners,
        'predicted_winners': predicted_winners
    })

    # A debate counts as correct when at least one predicted winner is a true winner
    correct_predictions.append(1 if true_winners & predicted_winners else 0)

# Accuracy is the share of debates predicted correctly; it is undefined (NaN) when no
# debate was evaluated, which would otherwise raise ZeroDivisionError.
if correct_predictions:
    accuracy = sum(correct_predictions) / len(correct_predictions)
else:
    accuracy = float('nan')
print(f'Accuracy of winners: {accuracy * 100:.2f}%')

# Print the expected and predicted winners for each debate.
# Sets have no stable order, so the names are sorted to make the output reproducible.
for detail in debate_details:
    print(f"Debate ID: {detail['debate_id']}")
    print(f"Expected winners: {', '.join(sorted(detail['true_winners']))}")
    print(f"Predicted winners: {', '.join(sorted(detail['predicted_winners']))}")
    print('-' * 40)

Self-Assessment Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')
GPT4o Predictions Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')

Merged DataFrame Columns: Index(['debate_id', 'debater_name_ground_truth',
       'debater_position_ground_truth', 'debater_name_pred',
       'debater_position_pred'],
      dtype='object')

Merged DataFrame Preview:
    debate_id debater_name_ground_truth  debater_position_ground_truth  \
0          1                 Debater 2                              1   
1          1                 Debater 2                              1   
2          1                 Debater 2                              1   
3          1                 Debater 2                              1   
4          1                 Debater 4                              2   

  debater_name_pred  debater_position_pred  
0         Debater 2                      1  
1         Debater 1

In [25]:
import pandas as pd

# Load the self-assessment rankings (used as ground truth) and the predictions
# stored in 'prompt3_gemini.csv'.
# NOTE: the variable keeps the name gpt4o_predictions_df used by this cell so far,
# even though the file it loads is the Gemini one.
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
gpt4o_predictions_df = pd.read_csv('prompt3_gemini.csv')

# Filter out debate_id = 10 and debate_id = 17 from both datasets
self_assessment_df = self_assessment_df[~self_assessment_df['debate_id'].isin([10, 17])]
gpt4o_predictions_df = gpt4o_predictions_df[~gpt4o_predictions_df['debate_id'].isin([10, 17])]

# Print columns to check for discrepancies in column names.
# The label says "Gemini" because the predictions come from 'prompt3_gemini.csv'.
print("Self-Assessment Columns:", self_assessment_df.columns)
print("Gemini Predictions Columns:", gpt4o_predictions_df.columns)

# Inner-join on 'debate_id' only. This is a many-to-many join: every ground-truth row
# is paired with every prediction row of the same debate, so rows repeat. Below, the
# result is only used to keep ground-truth rows of debates that also have predictions.
merged_df = pd.merge(self_assessment_df[['debate_id', 'debater_name', 'debater_position']],
                     gpt4o_predictions_df[['debate_id', 'debater_name', 'debater_position']],
                     on='debate_id', suffixes=('_ground_truth', '_pred'))

# Print the merged DataFrame to check column names
print("\nMerged DataFrame Columns:", merged_df.columns)
print("\nMerged DataFrame Preview:\n", merged_df.head())

# Ground-truth winners are the rows ranked in position 1
ground_truth_winners = merged_df[merged_df['debater_position_ground_truth'] == 1]

# One 0/1 flag per debate, plus the winner sets kept for display
correct_predictions = []
debate_details = []

for debate_id, group in ground_truth_winners.groupby('debate_id'):
    # Names of the ground-truth winners (the set removes the duplicates created by the join)
    true_winners = set(group['debater_name_ground_truth'])

    # Names predicted in position 1 for this debate. Both conditions are combined into
    # one mask built on the predictions frame, instead of indexing an already-filtered
    # Series with a mask of a different length.
    is_predicted_winner = (
        (gpt4o_predictions_df['debate_id'] == debate_id)
        & (gpt4o_predictions_df['debater_position'] == 1)
    )
    predicted_winners = set(gpt4o_predictions_df.loc[is_predicted_winner, 'debater_name'])

    # Save the details for display
    debate_details.append({
        'debate_id': debate_id,
        'true_winners': true_winners,
        'predicted_winners': predicted_winners
    })

    # A debate counts as correct when at least one predicted winner is a true winner
    correct_predictions.append(1 if true_winners & predicted_winners else 0)

# Accuracy is the share of debates predicted correctly; it is undefined (NaN) when no
# debate was evaluated, which would otherwise raise ZeroDivisionError.
if correct_predictions:
    accuracy = sum(correct_predictions) / len(correct_predictions)
else:
    accuracy = float('nan')
print(f'Accuracy of winners: {accuracy * 100:.2f}%')

# Print the expected and predicted winners for each debate.
# Sets have no stable order, so the names are sorted to make the output reproducible.
for detail in debate_details:
    print(f"Debate ID: {detail['debate_id']}")
    print(f"Expected winners: {', '.join(sorted(detail['true_winners']))}")
    print(f"Predicted winners: {', '.join(sorted(detail['predicted_winners']))}")
    print('-' * 40)

Self-Assessment Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')
GPT4o Predictions Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')

Merged DataFrame Columns: Index(['debate_id', 'debater_name_ground_truth',
       'debater_position_ground_truth', 'debater_name_pred',
       'debater_position_pred'],
      dtype='object')

Merged DataFrame Preview:
    debate_id debater_name_ground_truth  debater_position_ground_truth  \
0          1                 Debater 2                              1   
1          1                 Debater 2                              1   
2          1                 Debater 2                              1   
3          1                 Debater 2                              1   
4          1                 Debater 4                              2   

  debater_name_pred  debater_position_pred  
0         Debater 2                      1  
1         Debater 4

In [26]:
import pandas as pd

# Load the self-assessment rankings (used as ground truth) and the predictions
# stored in 'prompt3_gpt4o.csv'.
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
gpt4o_predictions_df = pd.read_csv('prompt3_gpt4o.csv')

# Filter out debate_id = 10 and debate_id = 17 from both datasets
self_assessment_df = self_assessment_df[~self_assessment_df['debate_id'].isin([10, 17])]
gpt4o_predictions_df = gpt4o_predictions_df[~gpt4o_predictions_df['debate_id'].isin([10, 17])]

# Print columns to check for discrepancies in column names
print("Self-Assessment Columns:", self_assessment_df.columns)
print("GPT4o Predictions Columns:", gpt4o_predictions_df.columns)

# Inner-join on 'debate_id' only. This is a many-to-many join: every ground-truth row
# is paired with every prediction row of the same debate, so rows repeat. Below, the
# result is only used to keep ground-truth rows of debates that also have predictions.
merged_df = pd.merge(self_assessment_df[['debate_id', 'debater_name', 'debater_position']],
                     gpt4o_predictions_df[['debate_id', 'debater_name', 'debater_position']],
                     on='debate_id', suffixes=('_ground_truth', '_pred'))

# Print the merged DataFrame to check column names
print("\nMerged DataFrame Columns:", merged_df.columns)
print("\nMerged DataFrame Preview:\n", merged_df.head())

# Ground-truth winners are the rows ranked in position 1
ground_truth_winners = merged_df[merged_df['debater_position_ground_truth'] == 1]

# One 0/1 flag per debate, plus the winner sets kept for display
correct_predictions = []
debate_details = []

for debate_id, group in ground_truth_winners.groupby('debate_id'):
    # Names of the ground-truth winners (the set removes the duplicates created by the join)
    true_winners = set(group['debater_name_ground_truth'])

    # Names predicted in position 1 for this debate. Both conditions are combined into
    # one mask built on the predictions frame, instead of indexing an already-filtered
    # Series with a mask of a different length.
    is_predicted_winner = (
        (gpt4o_predictions_df['debate_id'] == debate_id)
        & (gpt4o_predictions_df['debater_position'] == 1)
    )
    predicted_winners = set(gpt4o_predictions_df.loc[is_predicted_winner, 'debater_name'])

    # Save the details for display
    debate_details.append({
        'debate_id': debate_id,
        'true_winners': true_winners,
        'predicted_winners': predicted_winners
    })

    # A debate counts as correct when at least one predicted winner is a true winner
    correct_predictions.append(1 if true_winners & predicted_winners else 0)

# Accuracy is the share of debates predicted correctly; it is undefined (NaN) when no
# debate was evaluated, which would otherwise raise ZeroDivisionError.
if correct_predictions:
    accuracy = sum(correct_predictions) / len(correct_predictions)
else:
    accuracy = float('nan')
print(f'Accuracy of winners: {accuracy * 100:.2f}%')

# Print the expected and predicted winners for each debate.
# Sets have no stable order, so the names are sorted to make the output reproducible.
for detail in debate_details:
    print(f"Debate ID: {detail['debate_id']}")
    print(f"Expected winners: {', '.join(sorted(detail['true_winners']))}")
    print(f"Predicted winners: {', '.join(sorted(detail['predicted_winners']))}")
    print('-' * 40)

Self-Assessment Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')
GPT4o Predictions Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')

Merged DataFrame Columns: Index(['debate_id', 'debater_name_ground_truth',
       'debater_position_ground_truth', 'debater_name_pred',
       'debater_position_pred'],
      dtype='object')

Merged DataFrame Preview:
    debate_id debater_name_ground_truth  debater_position_ground_truth  \
0          1                 Debater 2                              1   
1          1                 Debater 2                              1   
2          1                 Debater 2                              1   
3          1                 Debater 2                              1   
4          1                 Debater 4                              2   

  debater_name_pred  debater_position_pred  
0         Debater 1                      1  
1         Debater 2

In [27]:
import pandas as pd

# Load the self-assessment rankings (used as ground truth) and the predictions
# stored in 'prompt4_gpt4o.csv'.
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
gpt4o_predictions_df = pd.read_csv('prompt4_gpt4o.csv')

# Filter out debate_id = 10 and debate_id = 17 from both datasets
self_assessment_df = self_assessment_df[~self_assessment_df['debate_id'].isin([10, 17])]
gpt4o_predictions_df = gpt4o_predictions_df[~gpt4o_predictions_df['debate_id'].isin([10, 17])]

# Print columns to check for discrepancies in column names
print("Self-Assessment Columns:", self_assessment_df.columns)
print("GPT4o Predictions Columns:", gpt4o_predictions_df.columns)

# Inner-join on 'debate_id' only. This is a many-to-many join: every ground-truth row
# is paired with every prediction row of the same debate, so rows repeat. Below, the
# result is only used to keep ground-truth rows of debates that also have predictions.
merged_df = pd.merge(self_assessment_df[['debate_id', 'debater_name', 'debater_position']],
                     gpt4o_predictions_df[['debate_id', 'debater_name', 'debater_position']],
                     on='debate_id', suffixes=('_ground_truth', '_pred'))

# Print the merged DataFrame to check column names
print("\nMerged DataFrame Columns:", merged_df.columns)
print("\nMerged DataFrame Preview:\n", merged_df.head())

# Ground-truth winners are the rows ranked in position 1
ground_truth_winners = merged_df[merged_df['debater_position_ground_truth'] == 1]

# One 0/1 flag per debate, plus the winner sets kept for display
correct_predictions = []
debate_details = []

for debate_id, group in ground_truth_winners.groupby('debate_id'):
    # Names of the ground-truth winners (the set removes the duplicates created by the join)
    true_winners = set(group['debater_name_ground_truth'])

    # Names predicted in position 1 for this debate. Both conditions are combined into
    # one mask built on the predictions frame, instead of indexing an already-filtered
    # Series with a mask of a different length.
    is_predicted_winner = (
        (gpt4o_predictions_df['debate_id'] == debate_id)
        & (gpt4o_predictions_df['debater_position'] == 1)
    )
    predicted_winners = set(gpt4o_predictions_df.loc[is_predicted_winner, 'debater_name'])

    # Save the details for display
    debate_details.append({
        'debate_id': debate_id,
        'true_winners': true_winners,
        'predicted_winners': predicted_winners
    })

    # A debate counts as correct when at least one predicted winner is a true winner
    correct_predictions.append(1 if true_winners & predicted_winners else 0)

# Accuracy is the share of debates predicted correctly; it is undefined (NaN) when no
# debate was evaluated, which would otherwise raise ZeroDivisionError.
if correct_predictions:
    accuracy = sum(correct_predictions) / len(correct_predictions)
else:
    accuracy = float('nan')
print(f'Accuracy of winners: {accuracy * 100:.2f}%')

# Print the expected and predicted winners for each debate.
# Sets have no stable order, so the names are sorted to make the output reproducible.
for detail in debate_details:
    print(f"Debate ID: {detail['debate_id']}")
    print(f"Expected winners: {', '.join(sorted(detail['true_winners']))}")
    print(f"Predicted winners: {', '.join(sorted(detail['predicted_winners']))}")
    print('-' * 40)

Self-Assessment Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')
GPT4o Predictions Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')

Merged DataFrame Columns: Index(['debate_id', 'debater_name_ground_truth',
       'debater_position_ground_truth', 'debater_name_pred',
       'debater_position_pred'],
      dtype='object')

Merged DataFrame Preview:
    debate_id debater_name_ground_truth  debater_position_ground_truth  \
0          1                 Debater 2                              1   
1          1                 Debater 2                              1   
2          1                 Debater 2                              1   
3          1                 Debater 2                              1   
4          1                 Debater 4                              2   

  debater_name_pred  debater_position_pred  
0         Debater 2                      1  
1         Debater 3

In [28]:
import pandas as pd

# Load the self-assessment rankings (used as ground truth) and the predictions
# stored in 'prompt4_gemini.csv'.
# NOTE: the variable keeps the name gpt4o_predictions_df used by this cell so far,
# even though the file it loads is the Gemini one.
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
gpt4o_predictions_df = pd.read_csv('prompt4_gemini.csv')

# Filter out debate_id = 10 and debate_id = 17 from both datasets, as the other
# winner-accuracy cells in this notebook do, so every accuracy figure is computed
# over the same set of debates.
self_assessment_df = self_assessment_df[~self_assessment_df['debate_id'].isin([10, 17])]
gpt4o_predictions_df = gpt4o_predictions_df[~gpt4o_predictions_df['debate_id'].isin([10, 17])]

# Print columns to check for discrepancies in column names
print("Self-Assessment Columns:", self_assessment_df.columns)
print("Gemini Predictions Columns:", gpt4o_predictions_df.columns)

# Inner-join on 'debate_id' only. This is a many-to-many join: every ground-truth row
# is paired with every prediction row of the same debate, so rows repeat. Below, the
# result is only used to keep ground-truth rows of debates that also have predictions.
merged_df = pd.merge(self_assessment_df[['debate_id', 'debater_name', 'debater_position']],
                     gpt4o_predictions_df[['debate_id', 'debater_name', 'debater_position']],
                     on='debate_id', suffixes=('_ground_truth', '_pred'))

# Print the merged DataFrame to check column names
print("\nMerged DataFrame Columns:", merged_df.columns)
print("\nMerged DataFrame Preview:\n", merged_df.head())

# Ground-truth winners are the rows ranked in position 1
ground_truth_winners = merged_df[merged_df['debater_position_ground_truth'] == 1]

# One 0/1 flag per debate, plus the winner sets kept for display
correct_predictions = []
debate_details = []

for debate_id, group in ground_truth_winners.groupby('debate_id'):
    # Names of the ground-truth winners (the set removes the duplicates created by the join)
    true_winners = set(group['debater_name_ground_truth'])

    # Names predicted in position 1 for this debate. Both conditions are combined into
    # one mask built on the predictions frame, instead of indexing an already-filtered
    # Series with a mask of a different length.
    is_predicted_winner = (
        (gpt4o_predictions_df['debate_id'] == debate_id)
        & (gpt4o_predictions_df['debater_position'] == 1)
    )
    predicted_winners = set(gpt4o_predictions_df.loc[is_predicted_winner, 'debater_name'])

    # Save the details for display
    debate_details.append({
        'debate_id': debate_id,
        'true_winners': true_winners,
        'predicted_winners': predicted_winners
    })

    # A debate counts as correct when at least one predicted winner is a true winner
    correct_predictions.append(1 if true_winners & predicted_winners else 0)

# Accuracy is the share of debates predicted correctly; it is undefined (NaN) when no
# debate was evaluated, which would otherwise raise ZeroDivisionError.
if correct_predictions:
    accuracy = sum(correct_predictions) / len(correct_predictions)
else:
    accuracy = float('nan')
print(f'Accuracy of winners: {accuracy * 100:.2f}%')

# Print the expected and predicted winners for each debate.
# Sets have no stable order, so the names are sorted to make the output reproducible.
for detail in debate_details:
    print(f"Debate ID: {detail['debate_id']}")
    print(f"Expected winners: {', '.join(sorted(detail['true_winners']))}")
    print(f"Predicted winners: {', '.join(sorted(detail['predicted_winners']))}")
    print('-' * 40)

Self-Assessment Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')
Gemini Predictions Columns: Index(['debate_id', 'debater_name', 'debater_position', 'debater_score'], dtype='object')

Merged DataFrame Columns: Index(['debate_id', 'debater_name_ground_truth',
       'debater_position_ground_truth', 'debater_name_pred',
       'debater_position_pred'],
      dtype='object')

Merged DataFrame Preview:
    debate_id debater_name_ground_truth  debater_position_ground_truth  \
0          1                 Debater 2                              1   
1          1                 Debater 2                              1   
2          1                 Debater 2                              1   
3          1                 Debater 2                              1   
4          1                 Debater 4                              2   

  debater_name_pred  debater_position_pred  
0         Debater 1                      1  
1         Debater 

CONVERTENDO JSON PARA CSV

In [15]:
import json
import csv

# Name of the JSON file to convert
json_file = 'prompt4_gpt4o.json'

# Load the JSON document. The encoding is passed explicitly because open() otherwise
# decodes with the platform's locale encoding, which is not UTF-8 on every system.
with open(json_file, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Função para transformar o JSON
def transform_json(json_data):
    """Flatten debate rankings into one flat row per ranked debater.

    Parameters:
    - json_data: iterable of mappings, each with a "debate_id" key and a "ranking"
      key; "ranking" is an iterable of mappings with "name", "position" and "score".

    Returns:
    - A list of dicts with the keys "debate_id", "debater_name", "debater_position"
      and "debater_score", in input order (debates first, then ranking entries).

    Raises:
    - KeyError if any of the keys listed above is missing.
    """

    def _rows_for(debate):
        # Read the id before touching the ranking so every row of this debate shares it
        current_id = debate["debate_id"]
        return [
            {
                "debate_id": current_id,
                "debater_name": placement["name"],
                "debater_position": placement["position"],
                "debater_score": placement["score"],
            }
            for placement in debate["ranking"]
        ]

    flattened = []
    for debate in json_data:
        flattened.extend(_rows_for(debate))
    return flattened

# Flatten the loaded JSON into one row per debater
transformed_data = transform_json(json_data)

# Fixed CSV header; these are the keys of the rows built by transform_json
headers = ["debate_id", "debater_name", "debater_position", "debater_score"]

# Save to CSV. The file is written as UTF-8 explicitly because it is read back with
# pd.read_csv, which decodes as UTF-8 by default, while open() would otherwise use
# the platform's locale encoding. newline='' lets the csv module control line endings.
csv_file = 'prompt4_gpt4o.csv'
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=headers)
    writer.writeheader()
    writer.writerows(transformed_data)

print(f"JSON convertido e salvo em {csv_file}.")

JSON convertido e salvo em prompt4_gpt4o.csv.


## ACURÁCIA RANKINGS

In [3]:
import pandas as pd

def calculate_ranking_accuracy(self_assessment_df, predictions_df, excluded_debates=(10, 17)):
    """
    Compute the slot-by-slot ranking accuracy of predicted rankings against the ground truth.

    For every debate present in the ground truth, the debaters of both frames are
    ordered by 'debater_position' and compared slot by slot: a slot is a hit when
    the same 'debater_name' occupies it in both orderings. Rows that tie on
    'debater_position' keep the row order of the input frame (stable sort), so the
    result is deterministic. A ground-truth slot with no predicted debater counts
    as a miss instead of raising.

    Per-debate details and the aggregate figures are printed as a side effect.

    Parameters:
    - self_assessment_df: DataFrame with the self-assessments (ground truth).
      Columns used: 'debate_id', 'debater_name', 'debater_position'.
    - predictions_df: DataFrame with the model predictions; same columns.
    - excluded_debates: iterable of debate ids left out of the analysis
      (default: debates 10 and 17).

    Returns:
    - ranking_accuracy: mean over debates of the per-debate hit fraction, as a
      fraction between 0 and 1 (it is only *printed* as a percentage). 0 when no
      debate remains after filtering.
    """
    # Drop the excluded debates from both frames
    excluded = list(excluded_debates)
    truth = self_assessment_df[~self_assessment_df['debate_id'].isin(excluded)]
    predictions = predictions_df[~predictions_df['debate_id'].isin(excluded)]

    # Per-debate hit fractions, plus running totals for the pooled figure
    correct_rankings = []
    total_correct = 0
    total_debaters = 0

    for debate_id in truth['debate_id'].unique():
        # Debater names ordered by rank; mergesort is stable, so ties keep input order
        true_names = (
            truth.loc[truth['debate_id'] == debate_id, ['debater_name', 'debater_position']]
            .sort_values(by='debater_position', kind='mergesort')['debater_name']
            .tolist()
        )
        predicted_names = (
            predictions.loc[predictions['debate_id'] == debate_id, ['debater_name', 'debater_position']]
            .sort_values(by='debater_position', kind='mergesort')['debater_name']
            .tolist()
        )

        # zip stops at the shorter list: missing predictions are simply not hits,
        # and extra predictions beyond the ground-truth length are ignored
        correct_rankings_count = sum(
            expected == predicted for expected, predicted in zip(true_names, predicted_names)
        )

        # Fraction of debaters placed in the right slot for this debate
        correct_rankings.append(correct_rankings_count / len(true_names))

        total_correct += correct_rankings_count
        total_debaters += len(true_names)

        # Per-debate report
        print(f"Debate ID: {debate_id}")
        print(f"Ranking esperado: {', '.join(true_names)}")
        print(f"Ranking previsto: {', '.join(predicted_names)}")
        print(f"{correct_rankings_count} acertos de {len(true_names)} debatedores.\n")

    # Macro average: every debate weighs the same regardless of its size
    ranking_accuracy = sum(correct_rankings) / len(correct_rankings) if correct_rankings else 0

    print(f'Acurácia de ranking: {ranking_accuracy * 100:.2f}%')

    # Micro average over all debaters; guarded so an empty input does not divide by zero
    overall_pct = (total_correct / total_debaters) * 100 if total_debaters else 0.0
    print(f"Total de acertos: {total_correct} de {total_debaters} debatedores ({overall_pct:.2f}%)")

    return ranking_accuracy




In [83]:
# Ground truth plus the prompt-1 predictions of both models
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
gemini_predictions_df = pd.read_csv('prompt1_gemini.csv')
gpt4_predictions_df = pd.read_csv('prompt1_gpt4o.csv')

# Ranking accuracy of the Gemini predictions (per-debate details are printed by the call)
ranking_accuracy_gemini = calculate_ranking_accuracy(
    self_assessment_df=self_assessment_df,
    predictions_df=gemini_predictions_df,
)

# Ranking accuracy of the GPT-4o predictions
ranking_accuracy_gpt4 = calculate_ranking_accuracy(
    self_assessment_df=self_assessment_df,
    predictions_df=gpt4_predictions_df,
)


Debate ID: 1
Ranking esperado: Debater 2, Debater 4, Debater 1, Debater 3
Ranking previsto: Debater 4, Debater 2, Debater 3, Debater 1
0 acertos de 4 debatedores.

Debate ID: 2
Ranking esperado: Debater 1, Debater 4, Debater 5, Debater 2, Debater 3
Ranking previsto: Debater 5, Debater 4, Debater 2, Debater 1, Debater 3
2 acertos de 5 debatedores.

Debate ID: 3
Ranking esperado: Debater 4, Debater 3, Debater 1, Debater 2, Debater 5
Ranking previsto: Debater 4, Debater 2, Debater 5, Debater 1, Debater 3
1 acertos de 5 debatedores.

Debate ID: 5
Ranking esperado: Debater 3, Debater 5, Debater 1, Debater 2, Debater 4
Ranking previsto: Debater 5, Debater 3, Debater 2, Debater 4, Debater 1
0 acertos de 5 debatedores.

Debate ID: 6
Ranking esperado: Debater 1, Debater 2, Debater 3, Debater 4
Ranking previsto: Debater 2, Debater 1, Debater 4, Debater 3
0 acertos de 4 debatedores.

Debate ID: 7
Ranking esperado: Debater 2, Debater 1, Debater 3, Debater 4
Ranking previsto: Debater 4, Debater 3, 

In [84]:
# Ground truth plus the prompt-2 predictions of both models
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
gemini_predictions_df = pd.read_csv('prompt2_gemini.csv')
gpt4_predictions_df = pd.read_csv('prompt2_gpt4o.csv')

# Ranking accuracy of the Gemini predictions (per-debate details are printed by the call)
ranking_accuracy_gemini = calculate_ranking_accuracy(
    self_assessment_df=self_assessment_df,
    predictions_df=gemini_predictions_df,
)

# Ranking accuracy of the GPT-4o predictions
ranking_accuracy_gpt4 = calculate_ranking_accuracy(
    self_assessment_df=self_assessment_df,
    predictions_df=gpt4_predictions_df,
)

Debate ID: 1
Ranking esperado: Debater 2, Debater 4, Debater 1, Debater 3
Ranking previsto: Debater 2, Debater 1, Debater 3, Debater 4
1 acertos de 4 debatedores.

Debate ID: 2
Ranking esperado: Debater 1, Debater 4, Debater 5, Debater 2, Debater 3
Ranking previsto: Debater 5, Debater 4, Debater 1, Debater 2, Debater 3
3 acertos de 5 debatedores.

Debate ID: 3
Ranking esperado: Debater 4, Debater 3, Debater 1, Debater 2, Debater 5
Ranking previsto: Debater 4, Debater 1, Debater 2, Debater 3, Debater 5
2 acertos de 5 debatedores.

Debate ID: 5
Ranking esperado: Debater 3, Debater 5, Debater 1, Debater 2, Debater 4
Ranking previsto: Debater 3, Debater 2, Debater 5, Debater 1, Debater 4
2 acertos de 5 debatedores.

Debate ID: 6
Ranking esperado: Debater 1, Debater 2, Debater 3, Debater 4
Ranking previsto: Debater 1, Debater 4, Debater 2, Debater 3
1 acertos de 4 debatedores.

Debate ID: 7
Ranking esperado: Debater 2, Debater 1, Debater 3, Debater 4
Ranking previsto: Debater 4, Debater 2, 

In [4]:
import pandas as pd

# Ground truth plus the prompt-3 predictions of both models
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
gemini_predictions_df = pd.read_csv('prompt3_gemini.csv')
gpt4_predictions_df = pd.read_csv('prompt3_gpt4o.csv')

# Ranking accuracy of the Gemini predictions (per-debate details are printed by the call)
ranking_accuracy_gemini = calculate_ranking_accuracy(
    self_assessment_df=self_assessment_df,
    predictions_df=gemini_predictions_df,
)

# Ranking accuracy of the GPT-4o predictions
ranking_accuracy_gpt4 = calculate_ranking_accuracy(
    self_assessment_df=self_assessment_df,
    predictions_df=gpt4_predictions_df,
)

Debate ID: 1
Ranking esperado: Debater 2, Debater 4, Debater 1, Debater 3
Ranking previsto: Debater 2, Debater 4, Debater 3, Debater 1
2 acertos de 4 debatedores.

Debate ID: 2
Ranking esperado: Debater 1, Debater 4, Debater 5, Debater 2, Debater 3
Ranking previsto: Debater 5, Debater 1, Debater 3, Debater 2, Debater 4
1 acertos de 5 debatedores.

Debate ID: 3
Ranking esperado: Debater 4, Debater 3, Debater 1, Debater 2, Debater 5
Ranking previsto: Debater 1, Debater 3, Debater 4, Debater 5, Debater 2
1 acertos de 5 debatedores.

Debate ID: 5
Ranking esperado: Debater 3, Debater 5, Debater 1, Debater 2, Debater 4
Ranking previsto: Debater 1, Debater 2, Debater 3, Debater 4, Debater 5
0 acertos de 5 debatedores.

Debate ID: 6
Ranking esperado: Debater 1, Debater 2, Debater 3, Debater 4
Ranking previsto: Debater 1, Debater 2, Debater 3, Debater 4
4 acertos de 4 debatedores.

Debate ID: 7
Ranking esperado: Debater 2, Debater 1, Debater 3, Debater 4
Ranking previsto: Debater 1, Debater 2, 

In [5]:
import pandas as pd

# Ground truth plus the prompt-4 predictions of both models
self_assessment_df = pd.read_csv('rankings_self_assessment_.csv')
gemini_predictions_df = pd.read_csv('prompt4_gemini.csv')
gpt4_predictions_df = pd.read_csv('prompt4_gpt4o.csv')

# Ranking accuracy of the Gemini predictions (per-debate details are printed by the call)
ranking_accuracy_gemini = calculate_ranking_accuracy(
    self_assessment_df=self_assessment_df,
    predictions_df=gemini_predictions_df,
)

# Ranking accuracy of the GPT-4o predictions
ranking_accuracy_gpt4 = calculate_ranking_accuracy(
    self_assessment_df=self_assessment_df,
    predictions_df=gpt4_predictions_df,
)

Debate ID: 1
Ranking esperado: Debater 2, Debater 4, Debater 1, Debater 3
Ranking previsto: Debater 1, Debater 2, Debater 3, Debater 4
0 acertos de 4 debatedores.

Debate ID: 2
Ranking esperado: Debater 1, Debater 4, Debater 5, Debater 2, Debater 3
Ranking previsto: Debater 5, Debater 4, Debater 1, Debater 2, Debater 3
3 acertos de 5 debatedores.

Debate ID: 3
Ranking esperado: Debater 4, Debater 3, Debater 1, Debater 2, Debater 5
Ranking previsto: Debater 4, Debater 1, Debater 3, Debater 5, Debater 2
1 acertos de 5 debatedores.

Debate ID: 5
Ranking esperado: Debater 3, Debater 5, Debater 1, Debater 2, Debater 4
Ranking previsto: Debater 2, Debater 3, Debater 1, Debater 4, Debater 5
1 acertos de 5 debatedores.

Debate ID: 6
Ranking esperado: Debater 1, Debater 2, Debater 3, Debater 4
Ranking previsto: Debater 2, Debater 1, Debater 4, Debater 3
0 acertos de 4 debatedores.

Debate ID: 7
Ranking esperado: Debater 2, Debater 1, Debater 3, Debater 4
Ranking previsto: Debater 1, Debater 4, 

## MRR

In [6]:
import pandas as pd

def calculate_mrr(self_assessment_df, predictions_df, excluded_debates=(10, 17)):
    """
    Compute the Mean Reciprocal Rank (MRR) of the predicted rankings against the ground truth.

    For every debate present in the ground truth, the winners are the debaters
    whose ground-truth 'debater_position' equals 1 (several on a tie). The
    predicted debaters are ordered by 'debater_position' (stable sort, so ties
    keep input row order) and the reciprocal rank is 1 / (1-based index of the
    first predicted debater who is a ground-truth winner), or 0 when none of the
    winners appears in the prediction.

    Per-debate details and the final MRR are printed as a side effect; winners
    are printed in sorted order so the output is reproducible across runs.

    Parameters:
    - self_assessment_df: DataFrame with the self-assessments (ground truth).
      Columns used: 'debate_id', 'debater_name', 'debater_position'.
    - predictions_df: DataFrame with the model predictions; same columns.
    - excluded_debates: iterable of debate ids left out of the analysis
      (default: debates 10 and 17).

    Returns:
    - mrr: Mean Reciprocal Rank as a decimal; 0 when no debate remains after filtering.
    """
    # Drop the excluded debates from both frames
    excluded = list(excluded_debates)
    truth = self_assessment_df[~self_assessment_df['debate_id'].isin(excluded)]
    predictions = predictions_df[~predictions_df['debate_id'].isin(excluded)]

    # One reciprocal rank per debate
    reciprocal_ranks = []

    for debate_id in truth['debate_id'].unique():
        truth_rows = truth[truth['debate_id'] == debate_id]

        # Ground-truth winners (a set because position 1 may be shared)
        true_winners = set(truth_rows.loc[truth_rows['debater_position'] == 1, 'debater_name'])

        # Predicted order, best first; mergesort is stable so ties keep input order
        predicted_names = (
            predictions.loc[predictions['debate_id'] == debate_id, ['debater_name', 'debater_position']]
            .sort_values(by='debater_position', kind='mergesort')['debater_name']
            .tolist()
        )

        # Reciprocal of the first slot that holds a true winner; 0 if there is none
        rr = next(
            (1 / rank for rank, debater in enumerate(predicted_names, start=1) if debater in true_winners),
            0,
        )
        reciprocal_ranks.append(rr)

        # Per-debate report (winners sorted: plain set iteration order is not stable across runs)
        print(f"Debate ID: {debate_id}")
        print(f"Vencedores esperados: {', '.join(sorted(true_winners))}")
        print(f"Ranking previsto: {', '.join(predicted_names)}")
        print(f"Reciprocal Rank: {rr:.3f}\n")

    # Mean over debates, guarded against an empty input
    mrr = sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0

    print(f"Mean Reciprocal Rank (MRR): {mrr:.3f}")

    return mrr


In [7]:
# Ground truth and the Gemini prompt-1 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt1_gemini.csv")

# The call prints each debate's reciprocal rank; the mean is kept in `mrr`
mrr = calculate_mrr(
    self_assessment_df=self_assessment_df,
    predictions_df=predictions_df,
)

Debate ID: 1
Vencedores esperados: Debater 2
Ranking previsto: Debater 4, Debater 2, Debater 3, Debater 1
Reciprocal Rank: 0.500

Debate ID: 2
Vencedores esperados: Debater 4, Debater 1
Ranking previsto: Debater 5, Debater 4, Debater 2, Debater 1, Debater 3
Reciprocal Rank: 0.500

Debate ID: 3
Vencedores esperados: Debater 4
Ranking previsto: Debater 4, Debater 2, Debater 5, Debater 1, Debater 3
Reciprocal Rank: 1.000

Debate ID: 5
Vencedores esperados: Debater 3
Ranking previsto: Debater 5, Debater 3, Debater 2, Debater 4, Debater 1
Reciprocal Rank: 0.500

Debate ID: 6
Vencedores esperados: Debater 1
Ranking previsto: Debater 2, Debater 1, Debater 4, Debater 3
Reciprocal Rank: 0.500

Debate ID: 7
Vencedores esperados: Debater 2
Ranking previsto: Debater 4, Debater 3, Debater 2, Debater 1
Reciprocal Rank: 0.333

Debate ID: 8
Vencedores esperados: Debater 1
Ranking previsto: Debater 2, Debater 1, Debater 3
Reciprocal Rank: 0.500

Debate ID: 9
Vencedores esperados: Debater 1
Ranking prev

In [8]:
# Ground truth and the GPT-4o prompt-1 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt1_gpt4o.csv")

# The call prints each debate's reciprocal rank; the mean is kept in `mrr`
mrr = calculate_mrr(
    self_assessment_df=self_assessment_df,
    predictions_df=predictions_df,
)

Debate ID: 1
Vencedores esperados: Debater 2
Ranking previsto: Debater 2, Debater 4, Debater 1, Debater 3
Reciprocal Rank: 1.000

Debate ID: 2
Vencedores esperados: Debater 4, Debater 1
Ranking previsto: Debater 5, Debater 4, Debater 2, Debater 1, Debater 3
Reciprocal Rank: 0.500

Debate ID: 3
Vencedores esperados: Debater 4
Ranking previsto: Debater 4, Debater 3, Debater 1, Debater 2, Debater 5
Reciprocal Rank: 1.000

Debate ID: 5
Vencedores esperados: Debater 3
Ranking previsto: Debater 3, Debater 4, Debater 2, Debater 5, Debater 1
Reciprocal Rank: 1.000

Debate ID: 6
Vencedores esperados: Debater 1
Ranking previsto: Debater 3, Debater 1, Debater 2, Debater 4
Reciprocal Rank: 0.500

Debate ID: 7
Vencedores esperados: Debater 2
Ranking previsto: Debater 2, Debater 1, Debater 4, Debater 3
Reciprocal Rank: 1.000

Debate ID: 8
Vencedores esperados: Debater 1
Ranking previsto: Debater 2, Debater 1, Debater 3
Reciprocal Rank: 0.500

Debate ID: 9
Vencedores esperados: Debater 1
Ranking prev

In [9]:
# Ground truth and the Gemini prompt-2 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt2_gemini.csv")

# The call prints each debate's reciprocal rank; the mean is kept in `mrr`
mrr = calculate_mrr(
    self_assessment_df=self_assessment_df,
    predictions_df=predictions_df,
)

Debate ID: 1
Vencedores esperados: Debater 2
Ranking previsto: Debater 2, Debater 1, Debater 3, Debater 4
Reciprocal Rank: 1.000

Debate ID: 2
Vencedores esperados: Debater 4, Debater 1
Ranking previsto: Debater 5, Debater 4, Debater 1, Debater 2, Debater 3
Reciprocal Rank: 0.500

Debate ID: 3
Vencedores esperados: Debater 4
Ranking previsto: Debater 4, Debater 1, Debater 2, Debater 3, Debater 5
Reciprocal Rank: 1.000

Debate ID: 5
Vencedores esperados: Debater 3
Ranking previsto: Debater 3, Debater 2, Debater 5, Debater 1, Debater 4
Reciprocal Rank: 1.000

Debate ID: 6
Vencedores esperados: Debater 1
Ranking previsto: Debater 1, Debater 4, Debater 2, Debater 3
Reciprocal Rank: 1.000

Debate ID: 7
Vencedores esperados: Debater 2
Ranking previsto: Debater 4, Debater 2, Debater 3, Debater 1
Reciprocal Rank: 0.500

Debate ID: 8
Vencedores esperados: Debater 1
Ranking previsto: Debater 1, Debater 3, Debater 2
Reciprocal Rank: 1.000

Debate ID: 9
Vencedores esperados: Debater 1
Ranking prev

In [10]:
# Ground truth and the GPT-4o prompt-2 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt2_gpt4o.csv")

# The call prints each debate's reciprocal rank; the mean is kept in `mrr`
mrr = calculate_mrr(
    self_assessment_df=self_assessment_df,
    predictions_df=predictions_df,
)

Debate ID: 1
Vencedores esperados: Debater 2
Ranking previsto: Debater 3, Debater 1, Debater 2, Debater 4
Reciprocal Rank: 0.333

Debate ID: 2
Vencedores esperados: Debater 4, Debater 1
Ranking previsto: Debater 5, Debater 4, Debater 1, Debater 2, Debater 3
Reciprocal Rank: 0.500

Debate ID: 3
Vencedores esperados: Debater 4
Ranking previsto: Debater 4, Debater 1, Debater 3, Debater 5, Debater 2
Reciprocal Rank: 1.000

Debate ID: 5
Vencedores esperados: Debater 3
Ranking previsto: Debater 3, Debater 2, Debater 4, Debater 5, Debater 1
Reciprocal Rank: 1.000

Debate ID: 6
Vencedores esperados: Debater 1
Ranking previsto: Debater 1, Debater 3, Debater 2, Debater 4
Reciprocal Rank: 1.000

Debate ID: 7
Vencedores esperados: Debater 2
Ranking previsto: Debater 2, Debater 4, Debater 3, Debater 1
Reciprocal Rank: 1.000

Debate ID: 8
Vencedores esperados: Debater 1
Ranking previsto: Debater 3, Debater 1, Debater 2
Reciprocal Rank: 0.500

Debate ID: 9
Vencedores esperados: Debater 1
Ranking prev

In [11]:
# Ground truth and the Gemini prompt-3 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt3_gemini.csv")

# The call prints each debate's reciprocal rank; the mean is kept in `mrr`
mrr = calculate_mrr(
    self_assessment_df=self_assessment_df,
    predictions_df=predictions_df,
)

Debate ID: 1
Vencedores esperados: Debater 2
Ranking previsto: Debater 2, Debater 4, Debater 3, Debater 1
Reciprocal Rank: 1.000

Debate ID: 2
Vencedores esperados: Debater 4, Debater 1
Ranking previsto: Debater 5, Debater 1, Debater 3, Debater 2, Debater 4
Reciprocal Rank: 0.500

Debate ID: 3
Vencedores esperados: Debater 4
Ranking previsto: Debater 1, Debater 3, Debater 4, Debater 5, Debater 2
Reciprocal Rank: 0.333

Debate ID: 5
Vencedores esperados: Debater 3
Ranking previsto: Debater 1, Debater 2, Debater 3, Debater 4, Debater 5
Reciprocal Rank: 0.333

Debate ID: 6
Vencedores esperados: Debater 1
Ranking previsto: Debater 1, Debater 2, Debater 3, Debater 4
Reciprocal Rank: 1.000

Debate ID: 7
Vencedores esperados: Debater 2
Ranking previsto: Debater 1, Debater 2, Debater 3, Debater 4
Reciprocal Rank: 0.500

Debate ID: 8
Vencedores esperados: Debater 1
Ranking previsto: Debater 1, Debater 3, Debater 2
Reciprocal Rank: 1.000

Debate ID: 9
Vencedores esperados: Debater 1
Ranking prev

In [12]:
# Ground truth and the GPT-4o prompt-3 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt3_gpt4o.csv")

# The call prints each debate's reciprocal rank; the mean is kept in `mrr`
mrr = calculate_mrr(
    self_assessment_df=self_assessment_df,
    predictions_df=predictions_df,
)

Debate ID: 1
Vencedores esperados: Debater 2
Ranking previsto: Debater 1, Debater 2, Debater 3, Debater 4
Reciprocal Rank: 0.500

Debate ID: 2
Vencedores esperados: Debater 4, Debater 1
Ranking previsto: Debater 1, Debater 4, Debater 5, Debater 2, Debater 3
Reciprocal Rank: 1.000

Debate ID: 3
Vencedores esperados: Debater 4
Ranking previsto: Debater 4, Debater 1, Debater 2, Debater 3, Debater 5
Reciprocal Rank: 1.000

Debate ID: 5
Vencedores esperados: Debater 3
Ranking previsto: Debater 1, Debater 3, Debater 2, Debater 4, Debater 5
Reciprocal Rank: 0.500

Debate ID: 6
Vencedores esperados: Debater 1
Ranking previsto: Debater 1, Debater 3, Debater 4, Debater 2
Reciprocal Rank: 1.000

Debate ID: 7
Vencedores esperados: Debater 2
Ranking previsto: Debater 2, Debater 3, Debater 4, Debater 1
Reciprocal Rank: 1.000

Debate ID: 8
Vencedores esperados: Debater 1
Ranking previsto: Debater 1, Debater 2, Debater 3
Reciprocal Rank: 1.000

Debate ID: 9
Vencedores esperados: Debater 1
Ranking prev

In [13]:
# Ground truth and the Gemini prompt-4 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt4_gemini.csv")

# The call prints each debate's reciprocal rank; the mean is kept in `mrr`
mrr = calculate_mrr(
    self_assessment_df=self_assessment_df,
    predictions_df=predictions_df,
)

Debate ID: 1
Vencedores esperados: Debater 2
Ranking previsto: Debater 1, Debater 2, Debater 3, Debater 4
Reciprocal Rank: 0.500

Debate ID: 2
Vencedores esperados: Debater 4, Debater 1
Ranking previsto: Debater 5, Debater 4, Debater 1, Debater 2, Debater 3
Reciprocal Rank: 0.500

Debate ID: 3
Vencedores esperados: Debater 4
Ranking previsto: Debater 4, Debater 1, Debater 3, Debater 5, Debater 2
Reciprocal Rank: 1.000

Debate ID: 5
Vencedores esperados: Debater 3
Ranking previsto: Debater 2, Debater 3, Debater 1, Debater 4, Debater 5
Reciprocal Rank: 0.500

Debate ID: 6
Vencedores esperados: Debater 1
Ranking previsto: Debater 2, Debater 1, Debater 4, Debater 3
Reciprocal Rank: 0.500

Debate ID: 7
Vencedores esperados: Debater 2
Ranking previsto: Debater 1, Debater 4, Debater 2, Debater 3
Reciprocal Rank: 0.333

Debate ID: 8
Vencedores esperados: Debater 1
Ranking previsto: Debater 1, Debater 3, Debater 2
Reciprocal Rank: 1.000

Debate ID: 9
Vencedores esperados: Debater 1
Ranking prev

In [14]:
# Ground truth and the GPT-4o prompt-4 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt4_gpt4o.csv")

# The call prints each debate's reciprocal rank; the mean is kept in `mrr`
mrr = calculate_mrr(
    self_assessment_df=self_assessment_df,
    predictions_df=predictions_df,
)

Debate ID: 1
Vencedores esperados: Debater 2
Ranking previsto: Debater 2, Debater 3, Debater 1, Debater 4
Reciprocal Rank: 1.000

Debate ID: 2
Vencedores esperados: Debater 4, Debater 1
Ranking previsto: Debater 5, Debater 1, Debater 4, Debater 2, Debater 3
Reciprocal Rank: 0.500

Debate ID: 3
Vencedores esperados: Debater 4
Ranking previsto: Debater 1, Debater 4, Debater 3, Debater 2, Debater 5
Reciprocal Rank: 0.500

Debate ID: 5
Vencedores esperados: Debater 3
Ranking previsto: Debater 3, Debater 5, Debater 1, Debater 2, Debater 4
Reciprocal Rank: 1.000

Debate ID: 6
Vencedores esperados: Debater 1
Ranking previsto: Debater 1, Debater 4, Debater 3, Debater 2
Reciprocal Rank: 1.000

Debate ID: 7
Vencedores esperados: Debater 2
Ranking previsto: Debater 2, Debater 3, Debater 4, Debater 1
Reciprocal Rank: 1.000

Debate ID: 8
Vencedores esperados: Debater 1
Ranking previsto: Debater 1, Debater 3, Debater 2
Reciprocal Rank: 1.000

Debate ID: 9
Vencedores esperados: Debater 1
Ranking prev

## NDCG

In [41]:
from sklearn.metrics import ndcg_score
import pandas as pd
import numpy as np

def calculate_ndcg_from_dataframes(ground_truth_df, predictions_df, k=None, exclude_debates=None):
    """
    Calculates the mean nDCG for all debates using scikit-learn's ndcg_score, excluding specified debates.

    For each debate in the ground truth, the rows of both frames are ordered by
    'debater_name' so that the i-th ground-truth score and the i-th predicted
    score belong to the same debater. The two name lists are checked for
    equality before scoring, because a mere length check would let scores of
    different debaters be compared silently.

    Per-debate scores and the final mean are printed as a side effect.

    Parameters:
    - ground_truth_df: DataFrame containing the ground truth data.
      Must have columns: 'debate_id', 'debater_name', 'debater_score'.
    - predictions_df: DataFrame containing the predicted data.
      Must have the same columns as ground_truth_df.
    - k: Integer, optional. Passed to ndcg_score as the top-k cutoff. If None, considers all.
    - exclude_debates: List of debate IDs to exclude from the calculation.

    Returns:
    - mean_ndcg: Mean nDCG across all debates (float); NaN when no debate is left to score.

    Raises:
    - ValueError: if a required column is missing, or if for some debate the two
      frames differ in the number of rows or in the debater names.
    """
    # Ensure the required columns are present
    required_columns = ['debate_id', 'debater_name', 'debater_score']
    for df in [ground_truth_df, predictions_df]:
        if not all(col in df.columns for col in required_columns):
            raise ValueError(f"Both DataFrames must contain the columns: {required_columns}")

    # Filter out excluded debates if provided
    if exclude_debates:
        ground_truth_df = ground_truth_df[~ground_truth_df['debate_id'].isin(exclude_debates)]
        predictions_df = predictions_df[~predictions_df['debate_id'].isin(exclude_debates)]

    ndcg_scores = []

    for debate_id in ground_truth_df['debate_id'].unique():
        print(f"\nProcessing Debate ID: {debate_id}")

        # Order both sides by debater name so scores line up position by position
        truth_rows = ground_truth_df[ground_truth_df['debate_id'] == debate_id].sort_values(by='debater_name')
        predicted_rows = predictions_df[predictions_df['debate_id'] == debate_id].sort_values(by='debater_name')

        # Ensure the ground truth and predictions have the same length
        if len(truth_rows) != len(predicted_rows):
            raise ValueError(f"Mismatch in number of scores for debate {debate_id}.")

        # Same length is not enough: the names themselves must match, otherwise
        # the positional alignment pairs scores of different debaters
        if truth_rows['debater_name'].tolist() != predicted_rows['debater_name'].tolist():
            raise ValueError(f"Mismatch in debater names for debate {debate_id}.")

        ground_truth_scores = truth_rows['debater_score'].values
        predicted_scores = predicted_rows['debater_score'].values

        print(f"Ground truth scores: {ground_truth_scores}")
        print(f"Predicted scores: {predicted_scores}")

        # scikit-learn expects 2D arrays of shape (n_samples, n_labels); one debate = one sample
        ndcg = ndcg_score(
            ground_truth_scores.reshape(1, -1),
            predicted_scores.reshape(1, -1),
            k=k,
        )
        print(f"nDCG for this debate: {ndcg}")

        ndcg_scores.append(ndcg)

    # Mean over debates; NaN (without numpy's empty-mean warning) when nothing was scored
    mean_ndcg = np.mean(ndcg_scores) if ndcg_scores else float('nan')
    print(f"\nMean nDCG across all debates: {mean_ndcg}")
    return mean_ndcg




In [47]:
# Ground truth and the Gemini prompt-1 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt1_gemini.csv")

# Debates left out of the metric
exclude_debates = [10, 17]

# Mean nDCG with a top-5 cutoff
mean_ndcg = calculate_ndcg_from_dataframes(
    ground_truth_df=self_assessment_df,
    predictions_df=predictions_df,
    k=5,
    exclude_debates=exclude_debates,
)
print(f"Mean nDCG (excluding debates 10 and 17): {mean_ndcg:.4f}")




Processing Debate ID: 1
Ground truth scores: [0. 2. 0. 1.]
Predicted scores: [7.5 8.8 8.2 9.1]
nDCG for this debate: 0.8597186998521971

Processing Debate ID: 2
Ground truth scores: [2. 0. 0. 2. 1.]
Predicted scores: [7.5 8.  6.5 8.5 9. ]
nDCG for this debate: 0.8302310645465177

Processing Debate ID: 3
Ground truth scores: [0. 0. 1. 4. 0.]
Predicted scores: [7.5 8.  7.  8.5 8. ]
nDCG for this debate: 0.9472941807962689

Processing Debate ID: 5
Ground truth scores: [0. 0. 2. 0. 1.]
Predicted scores: [6.5 7.5 8.  7.  8.5]
nDCG for this debate: 0.8597186998521971

Processing Debate ID: 6
Ground truth scores: [3. 0. 0. 0.]
Predicted scores: [8.5 9.  7.5 8. ]
nDCG for this debate: 0.6309297535714573

Processing Debate ID: 7
Ground truth scores: [0. 3. 0. 0.]
Predicted scores: [7.5 8.  8.5 9. ]
nDCG for this debate: 0.5

Processing Debate ID: 8
Ground truth scores: [1. 0. 0.]
Predicted scores: [8.5 9.  7.5]
nDCG for this debate: 0.6309297535714573

Processing Debate ID: 9
Ground truth scor

In [29]:
# Ground truth and the GPT-4o prompt-1 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt1_gpt4o.csv")

# Debates left out of the metric
exclude_debates = [10, 17]

# Mean nDCG with a top-5 cutoff
mean_ndcg = calculate_ndcg_from_dataframes(
    ground_truth_df=self_assessment_df,
    predictions_df=predictions_df,
    k=5,
    exclude_debates=exclude_debates,
)
print(f"Mean nDCG (excluding debates 10 and 17): {mean_ndcg:.4f}")

Mean nDCG (excluding debates 10 and 17): 0.8987


In [30]:
# Ground truth and the Gemini prompt-2 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt2_gemini.csv")

# Debates left out of the metric
exclude_debates = [10, 17]

# Mean nDCG with a top-5 cutoff
mean_ndcg = calculate_ndcg_from_dataframes(
    ground_truth_df=self_assessment_df,
    predictions_df=predictions_df,
    k=5,
    exclude_debates=exclude_debates,
)
print(f"Mean nDCG (excluding debates 10 and 17): {mean_ndcg:.4f}")

Mean nDCG (excluding debates 10 and 17): 0.9236


In [31]:
# Ground truth and the GPT-4o prompt-2 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt2_gpt4o.csv")

# Debates left out of the metric
exclude_debates = [10, 17]

# Mean nDCG with a top-5 cutoff
mean_ndcg = calculate_ndcg_from_dataframes(
    ground_truth_df=self_assessment_df,
    predictions_df=predictions_df,
    k=5,
    exclude_debates=exclude_debates,
)
print(f"Mean nDCG (excluding debates 10 and 17): {mean_ndcg:.4f}")

Mean nDCG (excluding debates 10 and 17): 0.8869


In [32]:
# Ground truth and the Gemini prompt-3 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt3_gemini.csv")

# Debates left out of the metric
exclude_debates = [10, 17]

# Mean nDCG with a top-5 cutoff
mean_ndcg = calculate_ndcg_from_dataframes(
    ground_truth_df=self_assessment_df,
    predictions_df=predictions_df,
    k=5,
    exclude_debates=exclude_debates,
)
print(f"Mean nDCG (excluding debates 10 and 17): {mean_ndcg:.4f}")

Mean nDCG (excluding debates 10 and 17): 0.8266


In [33]:
# Ground truth and the GPT-4o prompt-3 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt3_gpt4o.csv")

# Debates left out of the metric
exclude_debates = [10, 17]

# Mean nDCG with a top-5 cutoff
mean_ndcg = calculate_ndcg_from_dataframes(
    ground_truth_df=self_assessment_df,
    predictions_df=predictions_df,
    k=5,
    exclude_debates=exclude_debates,
)
print(f"Mean nDCG (excluding debates 10 and 17): {mean_ndcg:.4f}")

Mean nDCG (excluding debates 10 and 17): 0.8680


In [38]:
# Ground truth and the Gemini prompt-4 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt4_gemini.csv")

# Debates left out of the metric
exclude_debates = [10, 17]

# Mean nDCG with a top-5 cutoff
mean_ndcg = calculate_ndcg_from_dataframes(
    ground_truth_df=self_assessment_df,
    predictions_df=predictions_df,
    k=5,
    exclude_debates=exclude_debates,
)
print(f"Mean nDCG (excluding debates 10 and 17): {mean_ndcg:.4f}")

Mean nDCG (excluding debates 10 and 17): 0.8234


In [40]:
# Ground truth and the GPT-4o prompt-4 predictions
self_assessment_df = pd.read_csv("rankings_self_assessment_.csv")
predictions_df = pd.read_csv("prompt4_gpt4o.csv")

# Debates left out of the metric
exclude_debates = [10, 17]

# Mean nDCG with a top-5 cutoff
mean_ndcg = calculate_ndcg_from_dataframes(
    ground_truth_df=self_assessment_df,
    predictions_df=predictions_df,
    k=5,
    exclude_debates=exclude_debates,
)
print(f"Mean nDCG (excluding debates 10 and 17): {mean_ndcg:.4f}")

Mean nDCG (excluding debates 10 and 17): 0.9497
