In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import ast
import re
import json


# Step 1: Parse the list string into an actual list if needed
def fix_ranked_list(value):
    """Return ``value`` as a list, parsing it first when it is a string.

    Args:
        value: A list (or tuple), or the string form of a Python list
            literal such as ``"['a', 'b']"``.

    Returns:
        list: The passed-through or parsed entries. An empty list is
        returned when the string cannot be parsed, when it parses to
        something that is not a list/tuple, or when ``value`` is neither a
        string nor a list/tuple (for example ``None`` or a float NaN).
    """
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Errors ast.literal_eval can raise on malformed input.
            return []

    if isinstance(value, list):
        return value  # already a list
    if isinstance(value, tuple):
        return list(value)
    # Scalars such as NaN/None (or a parsed non-list literal) cannot be
    # iterated as a ranking, so treat them as "no ranking available".
    return []


# Step 2: Normalize templates
def normalize_template(template):
    """Collapse all whitespace in a template so templates compare reliably.

    Windows line endings are converted, surrounding whitespace is stripped,
    and every run of whitespace becomes a single space. Anything that is not
    a string yields the empty string.
    """
    if isinstance(template, str):
        unix_newlines = template.replace('\r\n', '\n')
        trimmed = unix_newlines.strip()
        return re.sub(r'\s+', ' ', trimmed)
    return ""


# Step 3: Find rank
def find_template_rank(row):
    """Return the 1-based rank of the row's template, or -1 if it is absent.

    The entries of ``row['all_prompts_ranked']`` and the value of
    ``row['sparql_template']`` are both passed through
    ``normalize_template`` before being compared, and the position of the
    first equal entry is reported.
    """
    candidates = list(map(normalize_template, fix_ranked_list(row['all_prompts_ranked'])))
    target = normalize_template(row['sparql_template'])
    for position, candidate in enumerate(candidates, start=1):
        if candidate == target:
            return position
    return -1
    


def clean_and_parse_result(raw):
    """Parse a stringified query result into a list of row dicts.

    The stored strings look like ``[{'patient': IRI<https://...>}]``, which
    is neither valid JSON nor a valid Python literal. The text is massaged
    into JSON (single quotes become double quotes, bare values get quoted)
    and then decoded, so every bare value comes back as a string.

    Args:
        raw: A list (returned unchanged), a string to parse, or anything
            else (treated as "no result").

    Returns:
        list: The decoded rows. An empty list is returned when ``raw`` is
        not a list or string, when the massaged text is not valid JSON, or
        when it decodes to something other than a list.
    """
    if isinstance(raw, list):
        return raw
    if not isinstance(raw, str):
        return []

    # Replace single quotes with double quotes
    text = raw.replace("'", '"')

    # Regex: match any value after a colon that is NOT already quoted, and quote it.
    # The value is taken up to the next ',', ']' or '}', so a bare value that
    # itself contains one of those characters will not survive this step.
    text = re.sub(r'(": )([^"{\[\],\s][^,\]}]*)', r'\1"\2"', text)

    try:
        parsed = json.loads(text)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a subclass of ValueError.
        return []

    # Callers iterate the result as a list of dicts; a bare scalar or a
    # single dict would make them fail, so only a list is handed back.
    return parsed if isinstance(parsed, list) else []
    

def extract_iris(results):
    """
    Collect the first value of every result row as a set of strings.

    Intended for lists of dicts like [{'patient': IRI<...>}], but it works
    the same way for other single-value rows such as a mean or a count,
    because each value is simply converted with ``str``.

    Args:
        results: Iterable of row dicts.

    Returns:
        set: ``str`` of the first value of each non-empty dict. Entries that
        are empty dicts or not dicts at all are skipped.
    """
    values = set()
    for row in results:
        # An empty dict has no first value and a non-dict has no .values();
        # skip both instead of aborting the whole evaluation.
        if isinstance(row, dict) and row:
            values.add(str(next(iter(row.values()))))
    return values


def compute_precision_recall(row):
    """Score one row's ``result`` against its ``expected_result``.

    Both cells are parsed with ``clean_and_parse_result`` and reduced to
    sets with ``extract_iris``. Returns a ``pd.Series`` holding
    ``[precision, recall, exact]`` where ``exact`` is True when the two sets
    are equal. When both sets are empty, precision and recall are both 1;
    otherwise a side whose set is empty scores 0.
    """
    parsed_result = clean_and_parse_result(row['result'])
    parsed_expected = clean_and_parse_result(row['expected_result'])

    predicted = extract_iris(parsed_result)
    gold = extract_iris(parsed_expected)

    if not predicted and not gold:
        # Nothing expected and nothing returned counts as a perfect answer.
        precision, recall = 1, 1
    else:
        hits = len(predicted & gold)
        precision = hits / len(predicted) if predicted else 0
        recall = hits / len(gold) if gold else 0

    return pd.Series([precision, recall, predicted == gold])

def load_result_data(test_set_path, result_path):
    """Load a test set and a result file, score every question, and summarise per template.

    Both CSV files are read with pandas and inner-joined on ``question``.
    Each remaining row gets the rank of its ``sparql_template`` within
    ``all_prompts_ranked`` and precision / recall / exact-match of
    ``result`` against ``expected_result``.

    Args:
        test_set_path: Path of the test-set CSV; it must have a ``question``
            column.
        result_path: Path of the result CSV; its
            ``natural_language_question`` column is renamed to ``question``
            before merging.

    Returns:
        tuple: ``(merged_df, summary)``. ``merged_df`` has one row per
        retained question plus the added ``template_rank``, ``precision``,
        ``recall`` and ``exact_match`` columns. ``summary`` has one row per
        ``sparql_template`` with the means of the numeric columns,
        ``exact_match_percent`` and ``n_queries``.

    Raises:
        KeyError: If a required column (``question`` or any entry of
            ``columns_to_keep``) is missing.
    """
    # Load the data; rename on the returned frame rather than with inplace=True
    test_set_df = pd.read_csv(test_set_path)
    pipeline_results_df = pd.read_csv(result_path).rename(
        columns={'natural_language_question': 'question'}
    )

    # Remove erroneous test data: keep only questions that occur exactly once
    test_set_df = test_set_df[test_set_df['question'].map(test_set_df['question'].value_counts()) == 1]
    pipeline_results_df = pipeline_results_df[
        pipeline_results_df['question'].map(pipeline_results_df['question'].value_counts()) == 1]

    # Merge the data
    merged_df = pd.merge(pipeline_results_df, test_set_df, on="question", how='inner')

    # Remove more erroneous data: questions containing a literal '{'.
    # regex=False states the literal match explicitly, and .copy() detaches
    # the filtered rows so the column assignments below do not trigger
    # SettingWithCopyWarning.
    keep_mask = ~merged_df['question'].str.contains('{', regex=False, na=False)
    merged_df = merged_df[keep_mask].copy()

    # Rank of each row's sparql_template within its all_prompts_ranked list
    merged_df['template_rank'] = merged_df.apply(find_template_rank, axis=1)

    # Precision, recall and exact-match metrics
    merged_df[['precision', 'recall', 'exact_match']] = merged_df.apply(compute_precision_recall, axis=1)

    # Prepare a boiled-down summary frame
    columns_to_keep = ["validation_time", "attempts", "initial_query_time", "final_query_execution_time", "total_time",
                       "template_rank",
                       'precision', 'recall', 'exact_match', "sparql_template"]

    # 1) Work on an explicit copy so the edits below never touch merged_df
    reduced = merged_df[columns_to_keep].copy()

    def _total_validation_time(cell):
        # A cell is either the string form of a list, a list, or something
        # else (e.g. NaN), which counts as zero.
        if isinstance(cell, str):
            return sum(ast.literal_eval(cell))
        if isinstance(cell, list):
            return sum(cell)
        return 0

    reduced["validation_time"] = reduced["validation_time"].apply(_total_validation_time)

    # 2) Add exact_match_int
    reduced['exact_match_int'] = reduced['exact_match'].astype(int)

    # 3) Group and compute means on numeric columns only
    #    Pandas >= 1.5 lets you pass numeric_only=True to .mean()
    numeric_means = (
        reduced
        .groupby('sparql_template', as_index=False)
        .mean(numeric_only=True)
    )

    # 4) Convert the fraction into a percentage
    numeric_means['exact_match_percent'] = numeric_means['exact_match_int'] * 100

    # 5) Bring back the count per template
    counts = reduced.groupby('sparql_template').size().rename('n_queries').reset_index()
    summary = numeric_means.merge(counts, on='sparql_template')

    # 6) Drop the raw exact_match_int column
    summary = summary.drop(columns=['exact_match_int'])

    return merged_df, summary




In [4]:
# Directory holding the test set and the per-pipeline result CSVs.
# NOTE: this is a machine-specific absolute path; change it here only.
SRC_DIR = '/home/mathiasyap/Code/university/phkg/MAI_Project_PHKG/src'
TEST_SET_PATH = f'{SRC_DIR}/test_set.csv'

merged_full_pipeline_df, summary_full_pipeline_df = load_result_data(
    TEST_SET_PATH, f'{SRC_DIR}/pipeline_results.csv')
merged_baseline_pipeline_df, summary_baseline = load_result_data(
    TEST_SET_PATH, f'{SRC_DIR}/baseline_results.csv')
merged_sparql_llm_pipeline_df, summary_sparql_llm_pipeline_df = load_result_data(
    TEST_SET_PATH, f'{SRC_DIR}/sparql_llm_results.csv')

# Save the dataframes to CSV (written to the current working directory)
frames_to_save = {
    'merged_full_pipeline.csv': merged_full_pipeline_df,
    'summary_full_pipeline.csv': summary_full_pipeline_df,
    'merged_baseline_pipeline.csv': merged_baseline_pipeline_df,
    'summary_baseline.csv': summary_baseline,
    'merged_sparql_llm_pipeline.csv': merged_sparql_llm_pipeline_df,
    'summary_sparql_llm_pipeline.csv': summary_sparql_llm_pipeline_df,
}
for csv_name, frame in frames_to_save.items():
    frame.to_csv(csv_name, index=False)

In [5]:
# Read back the merged full-pipeline results that the previous cell wrote to CSV.
merged_load = pd.read_csv('merged_full_pipeline.csv')

In [9]:
# Display the raw expected_result string stored under index label 0.
merged_load.loc[0, 'expected_result']

"[{'patient': IRI<https://www.biomedit.ch/rdf/sphn-schema/sphn/PATIENTS/10052992>}, {'patient': IRI<https://www.biomedit.ch/rdf/sphn-schema/sphn/PATIENTS/10076144>}, {'patient': IRI<https://www.biomedit.ch/rdf/sphn-schema/sphn/PATIENTS/10407702>}, {'patient': IRI<https://www.biomedit.ch/rdf/sphn-schema/sphn/PATIENTS/10517523>}, {'patient': IRI<https://www.biomedit.ch/rdf/sphn-schema/sphn/PATIENTS/10610599>}, {'patient': IRI<https://www.biomedit.ch/rdf/sphn-schema/sphn/PATIENTS/10648665>}, {'patient': IRI<https://www.biomedit.ch/rdf/sphn-schema/sphn/PATIENTS/10789187>}, {'patient': IRI<https://www.biomedit.ch/rdf/sphn-schema/sphn/PATIENTS/11058454>}, {'patient': IRI<https://www.biomedit.ch/rdf/sphn-schema/sphn/PATIENTS/11287545>}, {'patient': IRI<https://www.biomedit.ch/rdf/sphn-schema/sphn/PATIENTS/11374785>}, {'patient': IRI<https://www.biomedit.ch/rdf/sphn-schema/sphn/PATIENTS/11709560>}, {'patient': IRI<https://www.biomedit.ch/rdf/sphn-schema/sphn/PATIENTS/12543255>}, {'patient': IR