In [1]:
import pandas as pd
import json
import time
import matplotlib.pyplot as plt
from tqdm import tqdm  # Import tqdm for the progress bar
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, classification_report
import seaborn as sns
import numpy as np
import sklearn

In [2]:
from confidenceinterval import classification_report_with_ci
from confidenceinterval.bootstrap import bootstrap_ci


# Evaluate Multi-Class


## Load and combine multiple files with predictions

In [3]:
# each of the sub-lists will determine the reading of the relevant files
prompt_ids_to_eval = [["P1"], ["P2"], ["P3_1"], ["P3_2", "P3_3", "P3_4"], ["P4_1", "P4_2"], ["P5", "P6", "P7"], ["P9", "P9_1"], ["P10", "P11"], ["P11_1", "P11_2", "P11_3"], ["P11_4", "P11_5"], ["P12"], ["P12_1", "P12_2"]]

model = "gpt-3.5-turbo" #"gpt-3.5-turbo" "gpt-4-turbo-preview"
data_type = "enriched_kw"
#data_type = "enriched"
#data_type = ""

In [4]:
# Define a function to process each sublist
def process_prompt_ids(prompt_ids):
    # Create the list of prediction columns based on the prompt IDs
    prediction_columns = [f'gpt_predictions_{prompt_id}' for prompt_id in prompt_ids]
    # Specify the basic columns to include in the DataFrame
    basic_columns = ['pmid', 'accepted_label', 'multi_label', 'binary_label']
    # Combine basic columns with the dynamically generated prediction columns
    columns_to_read = basic_columns + prediction_columns
    # Read the CSV file
    df = pd.read_csv(f"predictions/{model}_{data_type}_test_outputs_{'_'.join(prompt_ids)}.csv")[columns_to_read] #for multi-label
    return df, prediction_columns
    
# Initialize an empty set to store all prediction columns
all_prediction_columns = set()

# Process the first sublist to initialize the big DataFrame
df, prediction_columns = process_prompt_ids(prompt_ids_to_eval[0])
# Add the prediction columns to the set
all_prediction_columns.update(prediction_columns)

# Iterate over the remaining sublists
for prompt_ids in prompt_ids_to_eval[1:]:
    # Process the current sublist
    df_single_file, prediction_columns = process_prompt_ids(prompt_ids)
    # Merge the big DataFrame with the current DataFrame on 'pmid' using a left join
    df = pd.merge(df, df_single_file[['pmid'] + prediction_columns], on='pmid', how='left')
    # Keep only the prediction columns from the current DataFrame
    #big_df = big_df[['pmid', 'accepted_label', 'multi_label', 'binary_label'] + prediction_columns]
    # Add the prediction columns to the set
    all_prediction_columns.update(prediction_columns)

# Convert the set of prediction columns to a list
all_prediction_columns = list(all_prediction_columns)
df.head()

TypeError: sequence item 0: expected str instance, list found

In [None]:
prediction_columns = all_prediction_columns

## Map predictions to numerical

In [None]:
label_to_numerical = {
    'Remaining': 0,
    'Non-systematic-review': 1,
    'Human-non-RCT-non-drug-intervention': 2,
    'Human-non-RCT-drug-intervention': 3,
    'Human-case-report': 4,
    'Animal-other': 5,
    'Animal-drug-intervention': 6,
    'Human-systematic-review': 7,
    'In-vitro-study': 8,
    'Human-RCT-non-drug-intervention': 9,
    'Animal-non-drug-intervention': 10,
    'Human-RCT-drug-intervention': 11,
    'Clinical-study-protocol': 12,
    'Human-RCT-non-intervention': 13
}

In [None]:
numerical_to_label = {v: f"{k}" for k, v in label_to_numerical.items()}
numerical_to_label

In [None]:
import difflib

def map_label_fuzzy(label, label_dict):
    #label = label.lower().replace('-', '').replace('_', ' ').strip()
    best_match = difflib.get_close_matches(label, label_dict.keys(), n=1, cutoff=0.6)
    if best_match:
        #print(f'{label} matched to {best_match[0]}')
        return label_dict[best_match[0]]
    return -1

In [None]:
map_label_fuzzy("RCT-drug-intervention", label_to_numerical)

In [None]:
def map_label_to_numerical(label, label_dict):
    # Check if label is a dictionary
    if isinstance(label, dict):
        # Extract the label with the highest score/probability
        highest_label = max(normalized_label, key=label.get)
        return label_dict.get(highest_label, -1)
    else:
         # Normalize label
        normalized_label = label.replace(',', '').strip().replace(' ', '-')#.strip()
        # Directly map string labels to numerical IDs
        numerical_label = label_dict.get(label, -1)
        # Fuzzy match if no direct mapping possible
        if numerical_label == -1:
            numerical_label = map_label_fuzzy(normalized_label, label_dict)
        # If fuzzy match did not work, check if the label string contains the key 'label' and use it to split the string; keep everything to the right as the potential label
        if numerical_label == -1:
            if 'label' in label:
                label_part = label.split('label')[1]
                numerical_label = map_label_fuzzy(label_part, label_dict)
        return numerical_label
        
# Convert accepted labels to numerical
df['accepted_label_numerical'] = df['accepted_label'].apply(lambda x: map_label_to_numerical(x, label_to_numerical))


for col in prediction_columns:
    df[f'{col}_numerical'] = df[col].apply(lambda x: map_label_to_numerical(x, label_to_numerical))

df.head()

In [None]:
def is_list_of_lists(lst):
    return all(isinstance(sublist, list) for sublist in lst)

if is_list_of_lists(prompt_ids_to_eval):
    # Flatten the list of lists
    prompt_ids_to_eval_flat = [item for sublist in prompt_ids_to_eval for item in sublist]
    # Create a string suffix for the CSV file name
    csv_file_suffix = '_'.join(prompt_ids_to_eval_flat)
else:
    csv_file_suffix = '_'.join(prompt_ids_to_eval)

In [None]:
csv_file_suffix = 'all_prompts' # when many prompts were loaded, the file name becomes too long and cannot be saved

In [None]:
#df.to_csv(f"predictions/{model}_{data_type}test_outputs_{'_'.join(csv_file_suffix)}_structured.csv")
df.to_csv(f"predictions/{model}_{data_type}_test_outputs_{'_'.join(csv_file_suffix)}_structured.csv")

#### Important: some labels from GPT could not be mapped to a target label

In [None]:
rows_with_minus_one = df[(df == -1).any(axis=1)]
rows_with_minus_one

## Evaluate prompts

In [None]:
def evaluate_predictions_with_ci(df, target_label_col, prompt_ids_to_eval, model, eval_type, label_to_numerical, numerical_to_label, csv_file_suffix=None, digits=3):
    report_dfs = []
    summary_stats = []

    for prompt_id in prompt_ids_to_eval:
        print("Evaluating ", prompt_id)
        prediction_col = f'gpt_predictions_{prompt_id}'

        # Extract arrays for evaluation
        y_true = df[target_label_col].values
        y_pred = df[f'{prediction_col}_numerical'].values
        cm = confusion_matrix(y_true, y_pred, labels=range(len(label_to_numerical)))
        
        # Calculate metrics
        accuracy = accuracy_score(y_true, y_pred)
        accuracy_balanced = balanced_accuracy_score(y_true, y_pred)
        report = classification_report_with_ci(y_true, y_pred, numerical_to_label_map=numerical_to_label, round_ndigits = digits)

        
        # Create DataFrame from report
        report_df = pd.DataFrame(report)
        report_df['Prompt ID'] = prompt_id
        report_dfs.append(report_df)
        
        # Extract summary statistics
        report_df.set_index('class', inplace=True)
        summary = report_df.loc['weighted avg', ['precision', 'precision CI', 'recall', 'recall CI', 'f1-score', 'f1-score CI', 'accuracy', 'accuracy CI']].to_dict()
        summary['Prompt ID'] = prompt_id
        summary_stats.append(summary)

        # Plotting confusion matrix
        plt.figure(figsize=(10, 6))
        ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=range(len(label_to_numerical)), yticklabels=range(len(label_to_numerical)))
        ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=13)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=0, fontsize=13)
        plt.title(f'Confusion Matrix for Model {model} and Prompt {prompt_id}', fontsize=14)
        plt.xlabel('Predicted Labels', fontsize=13)
        plt.ylabel('True Labels', fontsize=13)

        # Add an inset with label mapping
        textstr = '\n'.join([f'{v}: {k}' for k, v in label_to_numerical.items()])
        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        ax.text(1.16, 1.0, textstr, transform=ax.transAxes, fontsize=10, verticalalignment='top', bbox=props)
        plt.tight_layout()
        plt.savefig(f'plots/confusion_matrix_{model}_{data_type}_{prompt_id}_{eval_type}_ci.pdf')  # Save to PDF

    # Combine all report DataFrames
    all_reports_df = pd.concat(report_dfs)

    # Create a summary table for average precision, recall, and F1-score
    summary_df = pd.DataFrame(summary_stats)

    if not csv_file_suffix:
        csv_file_suffix = '_'.join(prompt_ids_to_eval) + "_" + eval_type
    
    # Save results to CSV files
    all_reports_df.to_csv(f"evaluations/{model}_{data_type}_test_per_class_{csv_file_suffix}_with_ci.csv")
    summary_df.to_csv(f"evaluations/{model}_{data_type}_test_summary_{csv_file_suffix}_with_ci.csv")
    
    print("Results saved to evaluations/ and plots/ folders.")

    return all_reports_df, summary_df


In [None]:
if is_list_of_lists(prompt_ids_to_eval):
    # Flatten the list of lists
    prompt_ids_to_eval = [item for sublist in prompt_ids_to_eval for item in sublist]

In [None]:
csv_file_suffix # control that is correct one= "all prompts"

In [None]:
target_label_col = 'accepted_label_numerical'
#eval_type = 'multi_label'
eval_type = 'hierarchical'
all_reports_df, summary_df = evaluate_predictions_with_ci(df, target_label_col, prompt_ids_to_eval, model, eval_type, label_to_numerical, numerical_to_label, csv_file_suffix)

In [None]:
summary_df

In [None]:
all_reports_df

### Format Results and Generate LateX

In [None]:
#model = "gpt-4-turbo-preview"

#prompt_ids_to_eval = ["P6", "P7", "P11_3", "P11_4"]
#summary_gpt4_raw_1 = pd.read_csv(f"evaluations/{model}_{data_type}_test_summary_{'_'.join(prompt_ids_to_eval)}_{eval_type}_with_ci.csv", index_col=0)

#prompt_ids_to_eval = ["P1", "P4_1", "P5"]
#summary_gpt4_raw_2 = pd.read_csv(f"evaluations/{model}_{data_type}_test_summary_{'_'.join(prompt_ids_to_eval)}_{eval_type}_with_ci.csv", index_col=0)

#prompt_ids_to_eval = ["P12_2"]
#summary_gpt4_raw_3 = pd.read_csv(f"evaluations/{model}_{data_type}_test_summary_{'_'.join(prompt_ids_to_eval)}_{eval_type}_with_ci.csv", index_col=0)

#summary_gpt4_raw = pd.concat([summary_gpt4_raw_1,summary_gpt4_raw_2, summary_gpt4_raw_3]) 
#summary_gpt4_raw

In [None]:
# Case of combining separately evaluated prompt files

#model = "gpt-3.5-turbo"

#prompt_ids_to_eval = ["P5"]
#summary_gpt3_raw_1 = pd.read_csv(f"evaluations/{model}_{data_type}_test_summary_{'_'.join(prompt_ids_to_eval)}_{eval_type}_with_ci.csv", index_col=0)

#prompt_ids_to_eval = ["P6"]
#summary_gpt3_raw_2 = pd.read_csv(f"evaluations/{model}_{data_type}_test_summary_{'_'.join(prompt_ids_to_eval)}_{eval_type}_with_ci.csv", index_col=0)

#prompt_ids_to_eval = ["P1"]
#summary_gpt3_raw_3 = pd.read_csv(f"evaluations/{model}_{data_type}_test_summary_{'_'.join(prompt_ids_to_eval)}_{eval_type}_with_ci.csv", index_col=0)

#prompt_ids_to_eval = ["P7"]
#summary_gpt3_raw_4 = pd.read_csv(f"evaluations/{model}_{data_type}_test_summary_{'_'.join(prompt_ids_to_eval)}_{eval_type}_with_ci.csv", index_col=0)

#prompt_ids_to_eval = ["P3_3","P3_4","P4_1","P4_2"]
#summary_gpt3_raw_5 = pd.read_csv(f"evaluations/{model}_{data_type}_test_summary_{'_'.join(prompt_ids_to_eval)}_{eval_type}_with_ci.csv", index_col=0)

#prompt_ids_to_eval = ["P11","P11_1","P11_2","P11_3"]
#summary_gpt3_raw_6 = pd.read_csv(f"evaluations/{model}_{data_type}_test_summary_{'_'.join(prompt_ids_to_eval)}_{eval_type}_with_ci.csv", index_col=0)

#prompt_ids_to_eval = ["P12_2"]
#summary_gpt3_raw_7 = pd.read_csv(f"evaluations/{model}_{data_type}_test_summary_{'_'.join(prompt_ids_to_eval)}_{eval_type}_with_ci.csv", index_col=0)

#prompt_ids_to_eval = ["P11_4","P11_5","P12"]
#summary_gpt3_raw_8 = pd.read_csv(f"evaluations/{model}_{data_type}_test_summary_{'_'.join(prompt_ids_to_eval)}_{eval_type}_with_ci.csv", index_col=0)

#summary_gpt3_raw = pd.concat([summary_gpt3_raw_1,summary_gpt3_raw_2, summary_gpt3_raw_3, summary_gpt3_raw_4, summary_gpt3_raw_5, summary_gpt3_raw_6, summary_gpt3_raw_7, summary_gpt3_raw_8]) 
#summary_gpt3_raw

In [None]:
# case when the prompt files were combined before the evaluation and the evaluation is already in a single file

model = "gpt-3.5-turbo"

csv_file_suffix="all_prompts"
summary_gpt3_raw = pd.read_csv(f"evaluations/{model}_{data_type}_test_summary_{csv_file_suffix}_with_ci.csv", index_col=0)
summary_gpt3_raw

In [None]:
map_prompt_to_concept = {
    'P1': 'zero-shot',
    'P2': 'zero-shot',
    'P3_1': 'CC',
    'P3_2': 'CC',
    'P4_1': 'CC',
    'P4_2': 'CC',
    'P5': 'CC',
    'P6': 'CC',
    'P7': 'CoT',
    'P9': 'CoT + CC',
    'P9_1': 'CoT + CC',
    'P10': 'CoT',
    'P11': 'CoT + CC',
    'P11_1': 'CoT + CC',
    'P11_2': 'CoT + CC',
    'P11_3': 'CoT + CC',
    'P11_4': 'CoT + CC',
    'P11_5': 'CoT + CC',
    'P12': '2 CoT + CC',
    'P12_1': '2 CoT + CC',
    'P12_2': '2 CoT + CC'
}

In [None]:
# Custom sorting function
def custom_sort(prompt):
    import re
    # Extract numbers from the prompt
    numbers = re.findall(r'\d+', prompt)
    if numbers:
        # Primary sort by the first number, secondary sort by the full string
        return (int(numbers[0]), prompt)
    return (float('inf'), prompt)  # Handles cases without numbers
    
def format_summary_for_latex_report(summary_df, map_prompt_to_concept):
    summary_df = summary_df.copy()
    
    # Concatenating each metric with its CI
    summary_df['Precision (CI)'] = summary_df['precision'].astype(str) + ' ' + summary_df['precision CI'].astype(str)
    summary_df['Recall (CI)'] = summary_df['recall'].astype(str) + ' ' + summary_df['recall CI'].astype(str)
    summary_df['F1-Score (CI)'] = summary_df['f1-score'].astype(str) + ' ' + summary_df['f1-score CI'].astype(str)
    summary_df['Accuracy (CI)'] = summary_df['accuracy'].astype(str) + ' ' + summary_df['accuracy CI'].astype(str)
    
    # Dropping old columns
    summary_df.drop(columns=['precision', 'precision CI', 'recall', 'recall CI', 'f1-score', 'f1-score CI', 'accuracy', 'accuracy CI'], inplace=True)
    
    # Rename 'Prompt ID' to 'Prompt'
    summary_df.rename(columns={'Prompt ID': 'Prompt'}, inplace=True)
    
    # Apply the mapping
    summary_df['Concept'] = summary_df['Prompt'].map(map_prompt_to_concept)
    
    # Rearrange the columns to put 'Concept' after 'Prompt'
    summary_df = summary_df[['Prompt', 'Concept', 'Precision (CI)', 'Recall (CI)', 'F1-Score (CI)', 'Accuracy (CI)']]

    summary_df['sort_key'] = summary_df['Prompt'].apply(custom_sort)
    summary_df.sort_values('sort_key', inplace=True)
    summary_df.drop('sort_key', inplace=True, axis=1)
    
    return summary_df
    

In [None]:
#summary_gpt4 = format_summary_for_latex_report(summary_gpt4_raw, map_prompt_to_concept)
#summary_gpt4['Model'] = 'gpt-4'
#summary_gpt4

In [None]:
#print(summary_gpt4.to_latex(float_format="%.3f", index=False, formatters={'Prompt': lambda x: x.replace('_', r'\_')}))

In [None]:
summary_gpt3 = format_summary_for_latex_report(summary_gpt3_raw, map_prompt_to_concept)
summary_gpt3['Model'] = 'gpt-3.5'
summary_gpt3 = summary_gpt3.dropna(subset=['Concept'])
summary_gpt3

In [None]:
summary_gpt3.drop(columns=['Model', 'Accuracy (CI)'], inplace=True)

print(summary_gpt3.to_latex(float_format="%.3f", index=False, formatters={'Prompt': lambda x: x.replace('_', r'\_')}))

### combine gpt-3.5 and gpt-4 results in one table

In [None]:
# Combine the DataFrames
#combined_df = pd.concat([summary_gpt4, summary_gpt3])

# Pivot table with multi-level columns for metrics and models as subcolumns
#pivot_df = combined_df.pivot_table(
    #index=['Prompt', 'Concept'],
    #columns='Model',
    #values=['Precision (CI)', 'Recall (CI)', 'F1-Score (CI)', 'Accuracy (CI)'],
    #aggfunc='first'
)

# Simplify the MultiIndex in columns
#pivot_df.columns = [' '.join(col).strip() for col in pivot_df.columns.values]
#pivot_df = pivot_df.reset_index()

#pivot_df['sort_key'] = pivot_df['Prompt'].apply(custom_sort)
#pivot_df.sort_values('sort_key', inplace=True)
#pivot_df.drop('sort_key', inplace=True, axis=1)

#pivot_df

In [None]:
#pivot_df = pd.DataFrame(pivot_df).drop(columns=['Accuracy (CI) gpt-3.5', 'Accuracy (CI) gpt-4'])

In [None]:
# Creating MultiIndex for columns based on model and metric
#new_columns = [
    ##('Accuracy', 'gpt-3.5'), ('Accuracy', 'gpt-4'),
    #('Precision', 'gpt-3.5'), ('Precision', 'gpt-4'),
    #('Recall', 'gpt-3.5'), ('Recall', 'gpt-4'),
    #('F1-Score', 'gpt-3.5'), ('F1-Score', 'gpt-4')
]

# Define the new column index as a MultiIndex
#multi_index = pd.MultiIndex.from_tuples(new_columns, names=['Metric', 'Model'])

# Create a new DataFrame using only the relevant columns and assign the MultiIndex
#new_df = pd.DataFrame(pivot_df, columns=['Prompt', 'Concept', 
                                   ##'Accuracy (CI) gpt-3.5', 'Accuracy (CI) gpt-4',
                                   #'Precision (CI) gpt-3.5', 'Precision (CI) gpt-4',
                                   #'Recall (CI) gpt-3.5', 'Recall (CI) gpt-4',
                                   #'F1-Score (CI) gpt-3.5', 'F1-Score (CI) gpt-4',])

# Rename columns to match the MultiIndex
#new_df.columns = ['Prompt', 'Concept'] + multi_index.to_flat_index().tolist()

# Set the new column index
#new_df.columns = pd.MultiIndex.from_tuples([('', 'Prompt'), ('', 'Concept')] + new_columns)

#new_df

In [None]:
# Formatter function to convert float to percentage
#def to_percentage(x):
    #return "{:.1f}%".format(x * 100)

# Define column formatters
#formatters = {
    #'Prompt': lambda x: x.replace('_', r'\_'),
    #'Accuracy (gpt-3.5)': to_percentage,
    #'Accuracy (gpt-4)': to_percentage,
    #'F1-Score (gpt-3.5)': to_percentage,
    #'F1-Score (gpt-4)': to_percentage,
    #'Precision (gpt-3.5)': to_percentage,
    #'Precision (gpt-4)': to_percentage,
    #'Recall (gpt-3.5)': to_percentage,
    #'Recall (gpt-4)': to_percentage,
}

In [None]:
#print(new_df.to_latex(float_format="%.3f", index=False, formatters={'': lambda x: x.replace('_', r'\_')}))