In [61]:
import pandas as pd
import json
import time
import matplotlib.pyplot as plt
from tqdm import tqdm  # Import tqdm for the progress bar
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, classification_report
import seaborn as sns
import numpy as np
import sklearn

In [62]:
from confidenceinterval import classification_report_with_ci
from confidenceinterval.bootstrap import bootstrap_ci


# Evaluate Multi-Class


## Load individual file with predictions

Skip the cells below if you have already loaded multiple files!

In [86]:
# Run configuration for the single-file evaluation below.
# Prompt IDs whose prediction columns are evaluated.
# Alternative sets left by the author: ["P1", "P4_1", "P5"] and ["P6", "P7", "P11_3", "P11_4"].
prompt_ids_to_eval = ["P1_HIERARCHY","P2_HIERARCHY"] #["P1", "P4_1", "P5"] # ["P6", "P7", "P11_3", "P11_4"]
# Model name; it is interpolated into the input and output file names.
# Alternatives: "gpt-3.5-turbo", "gpt-4-turbo-preview".
model = "gpt-3.5-turbo" #"gpt-3.5-turbo" "gpt-4-turbo-preview"
# Dataset variant; it is interpolated into the input and output file names.
data_type = "enriched_kw" # change when evaluating enriched without kw/ not enriched to be able to load and save the correct names

In [87]:
# Columns present in every predictions file
basic_columns = ['pmid', 'accepted_label', 'multi_label', 'binary_label']
# One prediction column per evaluated prompt ID
prediction_columns = [f'gpt_predictions_{prompt_id}' for prompt_id in prompt_ids_to_eval]
# Basic columns first, followed by the per-prompt prediction columns
columns_to_read = [*basic_columns, *prediction_columns]

# Read the full predictions file, then keep only the columns of interest
predictions_raw = pd.read_csv(f"predictions/{model}_{data_type}_test_outputs_P2_{'_'.join(prompt_ids_to_eval)}_hierarchical.csv")
df = predictions_raw[columns_to_read]

In [88]:
columns_to_read

['pmid',
 'accepted_label',
 'multi_label',
 'binary_label',
 'gpt_predictions_P1_HIERARCHY',
 'gpt_predictions_P2_HIERARCHY']

In [89]:
df.shape

(534, 6)

In [90]:
df.head()

Unnamed: 0,pmid,accepted_label,multi_label,binary_label,gpt_predictions_P1_HIERARCHY,gpt_predictions_P2_HIERARCHY
0,32147509,Non-systematic-review,1,0,Animal-systematic-review,Animal-systematic-review
1,8800803,Non-systematic-review,1,0,Animal-other,Animal-other
2,23811310,Non-systematic-review,1,0,Animal-other,Animal-other
3,36314672,Remaining,0,0,Animal-other,Animal-other
4,11909745,Remaining,0,0,Animal-other,Animal-other


## Map predictions to numerical

In [91]:
# Target classes; the position in this tuple is the numerical ID of the class.
label_to_numerical = {
    label_name: class_id
    for class_id, label_name in enumerate((
        'Remaining',
        'Non-systematic-review',
        'Human-non-RCT-non-drug-intervention',
        'Human-non-RCT-drug-intervention',
        'Human-case-report',
        'Animal-other',
        'Animal-drug-intervention',
        'Human-systematic-review',
        'In-vitro-study',
        'Human-RCT-non-drug-intervention',
        'Animal-non-drug-intervention',
        'Human-RCT-drug-intervention',
        'Clinical-study-protocol',
        'Human-RCT-non-intervention',
    ))
}

In [92]:
# Inverse lookup: numerical class ID -> label name
numerical_to_label = dict(
    (class_id, f"{label_name}") for label_name, class_id in label_to_numerical.items()
)
numerical_to_label

{0: 'Remaining',
 1: 'Non-systematic-review',
 2: 'Human-non-RCT-non-drug-intervention',
 3: 'Human-non-RCT-drug-intervention',
 4: 'Human-case-report',
 5: 'Animal-other',
 6: 'Animal-drug-intervention',
 7: 'Human-systematic-review',
 8: 'In-vitro-study',
 9: 'Human-RCT-non-drug-intervention',
 10: 'Animal-non-drug-intervention',
 11: 'Human-RCT-drug-intervention',
 12: 'Clinical-study-protocol',
 13: 'Human-RCT-non-intervention'}

In [93]:
import difflib
def map_label_fuzzy(label, label_dict, cutoff=0.6):
    """Map a free-text label to a numerical ID via its closest match among the known labels.

    Args:
        label: Label text to look up (e.g. a raw model prediction).
        label_dict: Mapping from canonical label name to numerical ID.
        cutoff: Minimum difflib similarity ratio (between 0 and 1) a candidate must
            reach to be accepted. The default of 0.6 is permissive: a label that is
            not one of the target classes can still be mapped to its nearest
            neighbour, so raise it when stricter matching is needed.

    Returns:
        The numerical ID of the closest key in ``label_dict``, or -1 when ``label``
        is not a string or no key reaches ``cutoff``.
    """
    # difflib only handles strings; a missing prediction (e.g. NaN/None) is unmappable
    if not isinstance(label, str):
        return -1
    best_match = difflib.get_close_matches(label, label_dict.keys(), n=1, cutoff=cutoff)
    if best_match:
        return label_dict[best_match[0]]
    return -1

In [94]:
# Sanity check: a label without the 'Human-' prefix should still resolve to 'Human-RCT-drug-intervention' (ID 11)
map_label_fuzzy("RCT-drug-intervention", label_to_numerical)

11

In [95]:
def map_label_to_numerical(label, label_dict):
    """Convert a (possibly noisy) label into its numerical class ID.

    Resolution order for string labels:
      1. exact lookup of the raw label in ``label_dict``;
      2. fuzzy match of the normalized label (commas removed, surrounding
         whitespace stripped, inner spaces replaced by hyphens);
      3. if the text contains the word 'label', fuzzy match of whatever follows
         its first occurrence.

    Args:
        label: Either a label string, or a dict mapping label names to scores, in
            which case the highest-scoring label name is used.
        label_dict: Mapping from canonical label name to numerical ID.

    Returns:
        The numerical ID, or -1 when the label cannot be mapped (including empty
        dicts and non-string values such as NaN).
    """
    if isinstance(label, dict):
        # An empty score dict carries no prediction
        if not label:
            return -1
        # Pick the label name with the highest score/probability
        highest_label = max(label, key=label.get)
        return label_dict.get(highest_label, -1)

    # Non-string values (e.g. NaN for a missing prediction) cannot be normalized or matched
    if not isinstance(label, str):
        return -1

    # Exact match on the raw label
    numerical_label = label_dict.get(label, -1)
    if numerical_label != -1:
        return numerical_label

    # Fuzzy match on the normalized label
    normalized_label = label.replace(',', '').strip().replace(' ', '-')
    numerical_label = map_label_fuzzy(normalized_label, label_dict)

    # Last resort: keep everything to the right of the first 'label' and fuzzy match that part
    if numerical_label == -1 and 'label' in label:
        label_part = label.split('label')[1]
        numerical_label = map_label_fuzzy(label_part, label_dict)
    return numerical_label
        
# Numerical IDs for the gold labels
df['accepted_label_numerical'] = df['accepted_label'].apply(map_label_to_numerical, args=(label_to_numerical,))

# Numerical IDs for each prompt's predictions, stored in '<column>_numerical'
for prediction_col in prediction_columns:
    df[f'{prediction_col}_numerical'] = df[prediction_col].apply(map_label_to_numerical, args=(label_to_numerical,))

df.head()

Unnamed: 0,pmid,accepted_label,multi_label,binary_label,gpt_predictions_P1_HIERARCHY,gpt_predictions_P2_HIERARCHY,accepted_label_numerical,gpt_predictions_P1_HIERARCHY_numerical,gpt_predictions_P2_HIERARCHY_numerical
0,32147509,Non-systematic-review,1,0,Animal-systematic-review,Animal-systematic-review,1,7,7
1,8800803,Non-systematic-review,1,0,Animal-other,Animal-other,1,5,5
2,23811310,Non-systematic-review,1,0,Animal-other,Animal-other,1,5,5
3,36314672,Remaining,0,0,Animal-other,Animal-other,0,5,5
4,11909745,Remaining,0,0,Animal-other,Animal-other,0,5,5


In [96]:
def is_list_of_lists(lst):
    """Return True when every element of ``lst`` is a list (vacuously True for an empty input)."""
    for element in lst:
        if not isinstance(element, list):
            return False
    return True

# Build the CSV file-name suffix from the prompt IDs
if not is_list_of_lists(prompt_ids_to_eval):
    csv_file_suffix = '_'.join(prompt_ids_to_eval)
else:
    # Nested groups of prompt IDs: flatten them before joining
    prompt_ids_to_eval_flat = sum(prompt_ids_to_eval, [])
    csv_file_suffix = '_'.join(prompt_ids_to_eval_flat)

In [97]:
# Manual override of the computed suffix: adds the leading 'P2_' that the loaded predictions file name also carries
csv_file_suffix = 'P2_P1_HIERARCHY_P2_HIERARCHY' #redefine

In [98]:
# csv_file_suffix is already a string; '_'.join() on it would insert '_' between every character
df.to_csv(f"predictions/{model}_{data_type}_test_outputs_{csv_file_suffix}_structured.csv")

#### Important: some labels from GPT could not be mapped to a target label

In [99]:
# Rows in which any column holds -1, the value returned for labels that could not be mapped
has_minus_one = df.eq(-1).any(axis=1)
rows_with_minus_one = df[has_minus_one]
rows_with_minus_one

Unnamed: 0,pmid,accepted_label,multi_label,binary_label,gpt_predictions_P1_HIERARCHY,gpt_predictions_P2_HIERARCHY,accepted_label_numerical,gpt_predictions_P1_HIERARCHY_numerical,gpt_predictions_P2_HIERARCHY_numerical


## Evaluate prompts

In [100]:
def evaluate_predictions_with_ci(df, target_label_col, prompt_ids_to_eval, model, eval_type, label_to_numerical, numerical_to_label, csv_file_suffix=None, digits=3):
    """Evaluate each prompt's predictions, plot confusion matrices, and save reports with CIs.

    For every prompt ID the function reads ``gpt_predictions_{prompt_id}_numerical``
    from ``df``, builds a per-class report with confidence intervals, and saves a
    confusion-matrix heatmap under ``plots/``. The per-class reports and the
    'weighted avg' summaries of all prompts are then written under ``evaluations/``.

    NOTE: the output file names also interpolate the notebook-level variable
    ``data_type``, which is not a parameter of this function and must be defined
    before calling it.

    Args:
        df: DataFrame holding the target column and one
            ``gpt_predictions_{prompt_id}_numerical`` column per prompt.
        target_label_col: Name of the column with the true numerical labels.
        prompt_ids_to_eval: Iterable of prompt IDs to evaluate.
        model: Model name, used in the plot title and in file names.
        eval_type: Evaluation tag appended to all output file names.
        label_to_numerical: Mapping from label name to numerical ID.
        numerical_to_label: Mapping from numerical ID to label name.
        csv_file_suffix: Optional file-name part identifying the evaluated prompts;
            defaults to the prompt IDs joined by '_'.
        digits: Number of decimals the report values are rounded to.

    Returns:
        Tuple ``(all_reports_df, summary_df)``: the concatenated per-class reports
        (indexed by class) and one 'weighted avg' summary row per prompt.
    """
    summary_columns = ['precision', 'precision CI', 'recall', 'recall CI', 'f1-score', 'f1-score CI', 'accuracy', 'accuracy CI']
    # Only the known class IDs get a row/column in the confusion matrix,
    # so the "unmapped" value -1 has no cell of its own.
    class_ids = list(range(len(label_to_numerical)))
    # Legend text mapping numerical IDs back to label names (same for every plot)
    legend_text = '\n'.join([f'{v}: {k}' for k, v in label_to_numerical.items()])

    report_dfs = []
    summary_stats = []

    # The ground truth is shared by all prompts
    y_true = df[target_label_col].values

    for prompt_id in prompt_ids_to_eval:
        print("Evaluating ", prompt_id)
        y_pred = df[f'gpt_predictions_{prompt_id}_numerical'].values
        cm = confusion_matrix(y_true, y_pred, labels=class_ids)

        # Per-class metrics with confidence intervals
        report = classification_report_with_ci(y_true, y_pred, numerical_to_label_map=numerical_to_label, round_ndigits = digits)
        report_df = pd.DataFrame(report)
        report_df['Prompt ID'] = prompt_id
        # Index by class before storing, so the stored frame is not mutated afterwards
        report_df = report_df.set_index('class')
        report_dfs.append(report_df)

        # Summary statistics come from the 'weighted avg' row
        summary = report_df.loc['weighted avg', summary_columns].to_dict()
        summary['Prompt ID'] = prompt_id
        summary_stats.append(summary)

        # Confusion-matrix heatmap
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_ids, yticklabels=class_ids, ax=ax)
        ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=13)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=0, fontsize=13)
        ax.set_title(f'Confusion Matrix for Model {model} and Prompt {prompt_id}', fontsize=14)
        ax.set_xlabel('Predicted Labels', fontsize=13)
        ax.set_ylabel('True Labels', fontsize=13)

        # Inset with the ID -> label mapping, placed to the right of the heatmap
        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        ax.text(1.16, 1.0, legend_text, transform=ax.transAxes, fontsize=10, verticalalignment='top', bbox=props)
        fig.tight_layout()
        # The figure is left open on purpose so the notebook still renders it inline
        fig.savefig(f'plots/confusion_matrix_{model}_{data_type}_P2_{prompt_id}_{eval_type}_with_ci.pdf')  # Save to PDF

    # Combine all per-class reports and the per-prompt summaries
    all_reports_df = pd.concat(report_dfs)
    summary_df = pd.DataFrame(summary_stats)

    # eval_type is appended by the file names below, so the default suffix must not repeat it
    if not csv_file_suffix:
        csv_file_suffix = '_'.join(prompt_ids_to_eval)

    # Save results to CSV files
    all_reports_df.to_csv(f"evaluations/{model}_{data_type}_test_per_class_{csv_file_suffix}_{eval_type}_with_ci.csv")
    summary_df.to_csv(f"evaluations/{model}_{data_type}_test_summary_{csv_file_suffix}_{eval_type}_with_ci.csv")

    print("Results saved to evaluations/ and plots/ folders.")

    return all_reports_df, summary_df


In [101]:
csv_file_suffix

'P2_P1_HIERARCHY_P2_HIERARCHY'

In [102]:
prompt_ids_to_eval

['P1_HIERARCHY', 'P2_HIERARCHY']

In [None]:
# Evaluate all configured prompts against the numerical gold labels
target_label_col = 'accepted_label_numerical'
eval_type = 'hierarchical'
all_reports_df, summary_df = evaluate_predictions_with_ci(
    df=df,
    target_label_col=target_label_col,
    prompt_ids_to_eval=prompt_ids_to_eval,
    model=model,
    eval_type=eval_type,
    label_to_numerical=label_to_numerical,
    numerical_to_label=numerical_to_label,
    csv_file_suffix=csv_file_suffix,
)


Evaluating  P1_HIERARCHY


  q_ = count_a / nobs_a
  denom = 1 + crit2 / nobs_a
  center = (q_ + crit2 / (2 * nobs_a)) / denom
  q_ * (1.0 - q_) / nobs_a + crit2 / (4.0 * nobs_a**2)
  P_i = np.diag(p / total_detected_as_each_category)
  np.sqrt(delta_method_variance), recall_macro + \
  z * np.sqrt(delta_method_variance)


In [None]:
summary_df

In [None]:
all_reports_df

## Format Results and Generate LaTeX

In [60]:
# Case of combining separately evaluated prompt files:
# the three summary files differ only in their P1/P2/P3 file-name prefix.

model = "gpt-3.5-turbo"
prompt_ids_to_eval = ["P1_HIERARCHY", "P2_HIERARCHY"]

summary_gpt3_raw_1, summary_gpt3_raw_2, summary_gpt3_raw_3 = (
    pd.read_csv(f"evaluations/{model}_{data_type}_test_summary_{file_prefix}_{'_'.join(prompt_ids_to_eval)}_{eval_type}_with_ci.csv", index_col=0)
    for file_prefix in ("P1", "P2", "P3")
)

# Stack the three summaries into one table
summary_gpt3_raw = pd.concat([summary_gpt3_raw_1, summary_gpt3_raw_2, summary_gpt3_raw_3])
summary_gpt3_raw

FileNotFoundError: [Errno 2] No such file or directory: 'evaluations/gpt-3.5-turbo_enriched_kw_test_summary_P1_P1_HIERARCHY_P2_HIERARCHY_multi_label_with_ci.csv'

In [644]:
# Prompting concept for each prompt ID
map_prompt_to_concept = dict(
    P1_HIERARCHY='zero-shot',
    P2_HIERARCHY='CC',
)

In [645]:
# Sort key for prompt IDs
def custom_sort(prompt):
    """Return a sort key that orders prompts by their first embedded number, then by name.

    Prompts without any digits get ``float('inf')`` as the numeric part, so they
    sort after all numbered prompts.
    """
    import re
    first_number = re.search(r'\d+', prompt)
    if first_number is None:
        # Handles cases without numbers
        return (float('inf'), prompt)
    # Primary key: first number in the prompt; secondary key: the full string
    return (int(first_number.group()), prompt)
    
def format_summary_for_latex_report(summary_df, map_prompt_to_concept):
    """Reshape an evaluation summary into the table layout used for the LaTeX report.

    Each metric is merged with its confidence interval into one text column
    ("<value> <CI>"), 'Prompt ID' is renamed to 'Prompt', a 'Concept' column is
    added from ``map_prompt_to_concept``, and rows are ordered with ``custom_sort``.
    The input frame is not modified.

    Args:
        summary_df: Summary table with the columns 'Prompt ID', 'precision',
            'recall', 'f1-score', 'accuracy' and a matching '<metric> CI' column
            for each metric.
        map_prompt_to_concept: Mapping from prompt ID to concept name. Prompts
            missing from the mapping get NaN in 'Concept'.

    Returns:
        A new DataFrame with the columns 'Prompt', 'Concept', 'Precision (CI)',
        'Recall (CI)', 'F1-Score (CI)' and 'Accuracy (CI)'.
    """
    metric_to_display_name = {
        'precision': 'Precision (CI)',
        'recall': 'Recall (CI)',
        'f1-score': 'F1-Score (CI)',
        'accuracy': 'Accuracy (CI)',
    }

    formatted = summary_df.copy()

    # Concatenate each metric with its CI into a single text column
    for metric, display_name in metric_to_display_name.items():
        formatted[display_name] = formatted[metric].astype(str) + ' ' + formatted[f'{metric} CI'].astype(str)

    formatted = formatted.rename(columns={'Prompt ID': 'Prompt'})
    formatted['Concept'] = formatted['Prompt'].map(map_prompt_to_concept)

    output_columns = ['Prompt', 'Concept', *metric_to_display_name.values()]

    # Sort through a temporary key column; every step returns a new frame, so no
    # column is ever added to a slice of another frame (avoids SettingWithCopyWarning)
    return (
        formatted[output_columns]
        .assign(sort_key=lambda frame: frame['Prompt'].apply(custom_sort))
        .sort_values('sort_key')
        .drop(columns='sort_key')
    )
    

In [648]:
# Format the raw summary, tag it with the model name, and keep only prompts that have a concept
summary_gpt3 = (
    format_summary_for_latex_report(summary_gpt3_raw, map_prompt_to_concept)
    .assign(Model='gpt-3.5')
    .dropna(subset=['Concept'])
)
summary_gpt3

Unnamed: 0,Prompt,Concept,Precision (CI),Recall (CI),F1-Score (CI),Accuracy (CI),Model
0,P1,zero-shot,"0.454 (0.351, 0.539)","0.331 (0.292, 0.371)","0.261 (0.22, 0.305)","0.331 (0.292, 0.373)",gpt-3.5
1,P2,zero-shot,"0.45 (0.292, 0.652)","0.279 (0.243, 0.318)","0.203 (0.167, 0.242)","0.279 (0.242, 0.316)",gpt-3.5
2,P3_1,CC,"0.584 (0.513, 0.64)","0.453 (0.412, 0.494)","0.43 (0.386, 0.476)","0.453 (0.412, 0.496)",gpt-3.5
3,P3_2,CC,"0.556 (0.487, 0.614)","0.451 (0.41, 0.494)","0.427 (0.382, 0.472)","0.451 (0.41, 0.494)",gpt-3.5
6,P4_1,CC,"0.563 (0.498, 0.611)","0.434 (0.393, 0.476)","0.416 (0.372, 0.461)","0.434 (0.393, 0.476)",gpt-3.5
7,P4_2,CC,"0.506 (0.441, 0.556)","0.414 (0.371, 0.457)","0.392 (0.347, 0.437)","0.414 (0.373, 0.457)",gpt-3.5
8,P5,CC,"0.608 (0.54, 0.65)","0.511 (0.466, 0.552)","0.498 (0.452, 0.542)","0.511 (0.468, 0.552)",gpt-3.5
9,P6,CC,"0.591 (0.536, 0.631)","0.541 (0.5, 0.584)","0.532 (0.487, 0.575)","0.541 (0.5, 0.584)",gpt-3.5
10,P7,CoT,"0.467 (0.334, 0.589)","0.305 (0.266, 0.345)","0.229 (0.19, 0.269)","0.305 (0.268, 0.345)",gpt-3.5
11,P9,CoT + CC,"0.542 (0.453, 0.616)","0.41 (0.371, 0.453)","0.375 (0.332, 0.42)","0.41 (0.369, 0.453)",gpt-3.5


In [650]:
# Non in-place drop with errors='ignore' keeps this cell re-runnable:
# the in-place version raised KeyError once the columns were already gone.
summary_gpt3 = summary_gpt3.drop(columns=['Model', 'Accuracy (CI)'], errors='ignore')

# Escape underscores in prompt IDs so they render correctly in LaTeX
print(summary_gpt3.to_latex(float_format="%.3f", index=False, formatters={'Prompt': lambda x: x.replace('_', r'\_')}))

\begin{tabular}{lllll}
\toprule
Prompt & Concept & Precision (CI) & Recall (CI) & F1-Score (CI) \\
\midrule
P1 & zero-shot & 0.454 (0.351, 0.539) & 0.331 (0.292, 0.371) & 0.261 (0.22, 0.305) \\
P2 & zero-shot & 0.45 (0.292, 0.652) & 0.279 (0.243, 0.318) & 0.203 (0.167, 0.242) \\
P3\_1 & CC & 0.584 (0.513, 0.64) & 0.453 (0.412, 0.494) & 0.43 (0.386, 0.476) \\
P3\_2 & CC & 0.556 (0.487, 0.614) & 0.451 (0.41, 0.494) & 0.427 (0.382, 0.472) \\
P4\_1 & CC & 0.563 (0.498, 0.611) & 0.434 (0.393, 0.476) & 0.416 (0.372, 0.461) \\
P4\_2 & CC & 0.506 (0.441, 0.556) & 0.414 (0.371, 0.457) & 0.392 (0.347, 0.437) \\
P5 & CC & 0.608 (0.54, 0.65) & 0.511 (0.466, 0.552) & 0.498 (0.452, 0.542) \\
P6 & CC & 0.591 (0.536, 0.631) & 0.541 (0.5, 0.584) & 0.532 (0.487, 0.575) \\
P7 & CoT & 0.467 (0.334, 0.589) & 0.305 (0.266, 0.345) & 0.229 (0.19, 0.269) \\
P9 & CoT + CC & 0.542 (0.453, 0.616) & 0.41 (0.371, 0.453) & 0.375 (0.332, 0.42) \\
P9\_1 & CoT + CC & 0.521 (0.444, 0.589) & 0.406 (0.365, 0.448) & 0.375 