### Combine all results 

In [None]:
import os
import json
from pathlib import Path

def get_all_json_files(directory):
    """Return the paths of all ``*.json`` files found below *directory*.

    The search is recursive (``Path.rglob``), so files in nested sub-folders
    are included.

    Args:
        directory: Root folder to search, as a string or path-like object.

    Returns:
        A list of path strings in sorted order. Sorting makes the result
        reproducible: the raw traversal order depends on the filesystem.
    """
    return sorted(str(json_path) for json_path in Path(directory).rglob('*.json'))

def load_json_files(json_file_paths):
    """Parse every JSON file in *json_file_paths*, tolerating individual failures.

    Args:
        json_file_paths: Iterable of paths to JSON files.

    Returns:
        A ``(json_data_list, error_file_list)`` tuple: the parsed documents in
        input order, and the paths that could not be opened or parsed.
    """
    json_data_list = []
    error_file_list = []
    for file_path in json_file_paths:
        try:
            # Read explicitly as UTF-8: the platform default encoding differs
            # between systems and can garble or reject non-ASCII text.
            with open(file_path, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
        except Exception as e:
            # Deliberately broad: this is a best-effort batch load, so one bad
            # file is reported and recorded instead of aborting the whole run.
            print(f"Failed to load {file_path}: {e}")
            error_file_list.append(file_path)
        else:
            json_data_list.append(data)
    return json_data_list, error_file_list

# Example usage
# Model whose evaluation files are collected; the name selects the input
# sub-folder below and is reused further down to name the merged output file.
model_name = "gem7b"

directory_path = f"GEM_Evaluation_PTBXL/gpt_evaluated/{model_name}"  # Replace with your actual path
# Recursively gather every *.json path below that folder.
all_json_files = get_all_json_files(directory_path)
# Quick sanity check: how many files were discovered.
print(len(all_json_files))
# json_data: parsed documents; error_data: paths that failed to load.
json_data, error_data = load_json_files(all_json_files)

In [None]:
# Merge the per-file documents into one mapping: id -> that document's "results".
# When two documents share an id, the one loaded last wins.
all_results = {entry['id']: entry['results'] for entry in json_data}

save_path = f"GEM_Evaluation_PTBXL/gpt_evaluated_all/{model_name}_all_results.json"

# Create the destination folder (and any missing parents) before writing.
output_dir = os.path.dirname(save_path)
os.makedirs(output_dir, exist_ok=True)

with open(save_path, 'w') as f:
    json.dump(all_results, f, indent=4)

### Calculate scores

In [None]:
import os
import re
import json
import pandas as pd

model_name = "gem7b"

# Despite its name, this is the path of the single merged results file,
# not of a directory.
directory_path = f"GEM_Evaluation_PTBXL/gpt_evaluated_all/{model_name}_all_results.json"

with open(directory_path, 'r') as json_file:
    json_data = json.load(json_file)

# Metric names that are searched for in each evaluation text.
expected_keys = [
    'DiagnosisAccuracy',
    'AnalysisCompleteness',
    'AnalysisRelevance',
    'LeadAssessmentCoverage',
    'LeadAssessmentAccuracy',
    'ECGFeatureGrounding',
    'EvidenceBasedReasoning',
    'ClinicalDiagnosticFidelity'
]

# Matches `"<metric>": [ ... ]`. The array part is non-greedy, so it stops at
# the first `]`; DOTALL lets it span several lines.
key_alternation = '|'.join(expected_keys)
pattern = re.compile(
    r'\"(?P<key>' + key_alternation + r')\":\s*(?P<content>\[.*?\])',
    re.DOTALL,
)

# Filled by the parsing loop: id -> metric -> scores/explanations.
result = {}

# Additional cleaning functions
def fix_unterminated_string(content):
    """Close a dangling string by inserting one quote before the first closer.

    If *content* holds an odd number of unescaped double quotes, a ``"`` is
    inserted in front of the first ``}`` or ``]`` (and any whitespace directly
    preceding it). Text with an even quote count is returned unchanged.
    """
    unescaped_quotes = re.findall(r'(?<!\\)"', content)
    if len(unescaped_quotes) % 2 == 0:
        return content
    return re.sub(r'(\s*[}\]])', r'"\1', content, count=1)

def escape_inner_quotes_in_explanation(content):
    """Rewrite each ``"Explanation": "..."`` pair, escaping quotes in the value.

    NOTE(review): the value is captured with ``[^"]*?``, so the captured text
    can never contain a double quote and the escaping step has nothing to
    act on. The visible effect is only that the whitespace after the colon
    is normalised to a single space.
    """
    explanation_pair = r'"Explanation":\s*"([^"]*?)"'

    def _rebuild(found):
        escaped = re.sub(r'(?<!\\)"', r'\\"', found.group(1))
        return '"Explanation": "' + escaped + '"'

    return re.sub(explanation_pair, _rebuild, content)

def remove_extra_quotes(content):
    """Collapse every run of two or more consecutive double quotes into one.

    Note that an empty string literal (``""``) is such a run and is therefore
    reduced to a single quote as well.
    """
    return re.sub(r'""+', '"', content)

def fix_unmatched_brackets(content):
    """Remove square brackets from the text of each ``"Explanation"`` value.

    Every ``[`` and ``]`` inside a quote-free explanation value is dropped
    (matched or not); brackets elsewhere in *content* are left alone. The
    whitespace after the colon is normalised to a single space.
    """
    explanation_pair = r'"Explanation":\s*"([^"]*?)"'

    def _strip_brackets(found):
        cleaned = found.group(1).replace('[', '').replace(']', '')
        return '"Explanation": "' + cleaned + '"'

    return re.sub(explanation_pair, _strip_brackets, content)


def fix_missing_commas(content):
    """Insert a comma wherever a ``}`` is followed (after optional whitespace) by ``{``."""
    return re.sub(
        r'(\})(\s*\{)',
        lambda found: found.group(1) + ',' + found.group(2),
        content,
    )

def safe_eval(match):
    """Regex-substitution callback: replace an arithmetic expression by its value.

    Args:
        match: A regex match whose first group holds the expression text.

    Returns:
        ``str`` of the evaluated result, or the original text unchanged when
        evaluation fails (syntax error, division by zero, unknown name, ...).
    """
    expression = match.group(1)
    try:
        # SECURITY: eval() is applied to text that originates from generated
        # model output. Builtins are withheld so that only literal arithmetic
        # can be evaluated; anything referring to a name fails and falls
        # through to the fallback below.
        # NOTE(review): this does not bound the cost of pathological input
        # such as long chains of "**"; consider a dedicated arithmetic parser.
        return str(eval(expression, {"__builtins__": {}}, {}))
    except Exception:
        # Not a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        return expression

# Parse each record's raw evaluation text into, per metric, a list of scores
# and a list of explanations. The text is expected to contain fragments of the
# form `"<metric>": [ {...}, ... ]`; each fragment is repaired heuristically
# before it is handed to json.loads. The order of the repair steps matters.
for sample_id, raw_text in json_data.items():
    result[sample_id] = {}

    if not isinstance(raw_text, str):
        # Without text there is nothing to search; keep the empty entry.
        print(f"Skipping id {sample_id}: expected text, got {type(raw_text).__name__}")
        continue

    # Drop an optional Markdown code fence around the payload. str.strip()
    # is not used here: with a multi-character argument it strips a *set* of
    # characters from both ends, not a prefix or suffix.
    json_content = re.sub(r'^\s*```(?:json)?\s*|\s*```\s*$', '', raw_text)

    for match in pattern.finditer(json_content):
        key = match.group('key')
        content = match.group('content')

        # Original cleaning steps
        # Curly quotes -> straight quotes.
        content = content.replace("“", '"').replace("”", '"')
        # Drop `//` comments up to the end of the line (must run before the
        # whitespace collapse below removes the line breaks).
        content = re.sub(r'//.*', '', content)
        # Drop trailing commas in front of `}` or `]`.
        content = re.sub(r',\s*([}\]])', r'\1', content)
        # Two quotes separated only by whitespace become a single space.
        content = re.sub(r'"\s*"', ' ', content)
        # Drop an explicit plus sign in front of a digit (e.g. `+1` -> `1`).
        content = re.sub(r'\+(\d)', r'\1', content)
        # Replace inline arithmetic such as `1 + 1` by its value.
        # SECURITY: safe_eval evaluates text taken from generated output.
        content = re.sub(r'(\d+[\d\s\*\+\-\/]+\d+)', safe_eval, content)
        content = content.replace('");', '"')
        # Collapse every whitespace run (line breaks included) to one space.
        content = re.sub(r'\s+', ' ', content)

        # Remove unmatched trailing characters after quote
        content = re.sub(r'"\s*[\]\)]', '"', content)
        # Fix missing commas between JSON objects in arrays
        content = re.sub(r'\}\s*\{', '},{', content)
        # An explanation string followed directly by `{` is missing its
        # closing brace: insert `}` (and a comma) before the next object.
        content = re.sub(r'("Explanation":\s*".*?)(?<!\\)"\s*,?\s*\{', r'\1"}, {', content)

        # Additional cleaning steps
        content = fix_unterminated_string(content)
        content = escape_inner_quotes_in_explanation(content)
        content = remove_extra_quotes(content)
        content = fix_unmatched_brackets(content)
        content = fix_missing_commas(content)

        # Append whatever closers are still missing: braces first, then brackets.
        content += '}' * max(0, content.count('{') - content.count('}'))
        content += ']' * max(0, content.count('[') - content.count(']'))

        try:
            content_json = json.loads(content)
        except json.JSONDecodeError as e:
            print(f"JSON decoding error for id {sample_id}, key {key}: {e}")
            print("Content:", content)
            continue

        scores = []
        explanations = []

        for item in content_json:
            if not isinstance(item, dict):
                # e.g. a bare number or string inside the array; it carries no
                # Score/Explanation fields, so it cannot be recorded.
                print(f"Skipping non-object entry for id {sample_id}, key {key}: {item!r}")
                continue

            score = item.get('Score')

            explanation = item.get('Explanation', '')
            if not isinstance(explanation, str):
                # null or a number would otherwise break .strip() below.
                explanation = '' if explanation is None else str(explanation)
            explanation = explanation.strip()

            # Preserve any unexpected fields by appending them to the explanation.
            extra_fields = {k: v for k, v in item.items() if k not in ('Score', 'Explanation')}
            if extra_fields:
                explanation += " Additional details: " + json.dumps(extra_fields)

            scores.append(score)
            explanations.append(explanation)

        result[sample_id][key] = {
            'Scores': scores,
            'Explanations': explanations
        }

In [None]:
# Metrics whose per-item scores are added up; every other metric is reported
# as the mean of its strictly positive scores (0 when there are none).
summed_metrics = {
    'LeadAssessmentCoverage',
    'LeadAssessmentAccuracy',
    'AnalysisCompleteness',
    'AnalysisRelevance',
}

results = {}
for sample_id, metrics in result.items():
    results[sample_id] = {}
    for key, value in metrics.items():
        # Keep only usable numbers. A missing score (None) or free text would
        # otherwise raise TypeError in sum() / the `> 0` comparison below.
        # Numeric strings such as "2" are converted.
        scores = []
        for raw_score in value['Scores']:
            if isinstance(raw_score, (int, float)):
                scores.append(raw_score)
                continue
            try:
                scores.append(float(raw_score))
            except (TypeError, ValueError):
                print(f"Ignoring non-numeric score {raw_score!r} for id {sample_id}, key {key}")

        if key in summed_metrics:
            # Plain total: zero scores are kept (they do not change the sum).
            aggregate = sum(scores)
        else:
            # Filter out zero (and negative) scores before averaging.
            positive_scores = [x for x in scores if x > 0]
            aggregate = sum(positive_scores) / len(positive_scores) if positive_scores else 0

        results[sample_id][key] = aggregate

In [None]:
# `results` maps id -> {metric: value}; transposing turns the ids into rows and
# the metrics into columns.
df = pd.DataFrame(results).T

In [None]:
# Convert three metrics to percentages, in place. Running this block a second
# time on the same `df` would rescale the already rescaled columns again.
# NOTE(review): the divisors 2, 12 and 24 are presumably the maximum attainable
# raw values (e.g. 12 leads, up to 2 points each) - confirm against the rubric.
diagnosis_raw = df['DiagnosisAccuracy']
df['DiagnosisAccuracy'] = diagnosis_raw.div(2).mul(100)

# Coverage totals above 12 are capped at 12 before scaling.
coverage_capped = df['LeadAssessmentCoverage'].clip(upper=12)
df['LeadAssessmentCoverage'] = coverage_capped.div(12).mul(100)

lead_accuracy_raw = df['LeadAssessmentAccuracy']
df['LeadAssessmentAccuracy'] = lead_accuracy_raw.div(24).mul(100)

# Column-wise mean over all ids, rounded to two decimals.
df.mean().round(2)