# JSON based Key Value metrics

## Setup

In [None]:
# %pip installs into the environment of the running kernel; the version is pinned
# to the one this notebook was last executed with so the scores stay reproducible.
%pip install levenshtein==0.25.1

Collecting levenshtein
  Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Collecting rapidfuzz<4.0.0,>=3.8.0 (from levenshtein)
  Downloading rapidfuzz-3.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, levenshtein
Successfully installed levenshtein-0.25.1 rapidfuzz-3.9.6


## Upload data: the results to be evaluated and the ground truth (GT)

In [None]:
# Extract both uploaded archives into the current working directory.
# The first archive holds the ground-truth JSON files; the second presumably
# holds the model outputs to evaluate (its name matches the results folder
# used in the evaluation cell) -- TODO confirm.
!unzip spark_generic_reports_v6_gt.zip
!unzip vLLM_v3_CustomReports01_fixoutput.zip

Archive:  spark_generic_reports_v6_gt.zip
   creating: spark_generic_reports_v6_gt/
  inflating: spark_generic_reports_v6_gt/Caris-Molecular-Intelligence_MI-Profile_Breast_NOS_WEBchanged_0.json  
  inflating: spark_generic_reports_v6_gt/Caris-Molecular-Intelligence_MI-Profile_Breast_NOS_WEB_0.json  
  inflating: spark_generic_reports_v6_gt/Caris-Molecular-Intelligence_MI-Profile_Breast_NOS_WEB_1.json  
  inflating: spark_generic_reports_v6_gt/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_10.json  
  inflating: spark_generic_reports_v6_gt/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_11.json  
  inflating: spark_generic_reports_v6_gt/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_12.json  
  inflating: spark_generic_reports_v6_gt/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_13.json  
  inflating: spark_generic_reports_v6_gt/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_14.json  
  inflating: spark_generic_reports_v6_gt/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1-unlocked_15.json  


## JSON processing functions for evaluation

In [None]:
import os
import json
import Levenshtein as lev
from Levenshtein import distance


# Recursively convert dictionary keys to lowercase
def convert_keys_to_lowercase(obj):
    """
    Return a copy of ``obj`` in which every dictionary key is lowercased.

    Dictionaries and lists are walked recursively; any other value is
    returned unchanged. Only keys are lowercased, never values.

    :param obj: A dict, a list, or any other value (e.g. parsed JSON).
    :return: A new dict/list mirroring ``obj`` with lowercased keys, or
             ``obj`` itself when it is neither a dict nor a list.
    """
    # Lists: rebuild element by element so nested dicts are reached
    if isinstance(obj, list):
        return [convert_keys_to_lowercase(element) for element in obj]

    # Scalars and anything else pass through untouched
    if not isinstance(obj, dict):
        return obj

    lowered = {}
    for key, value in obj.items():
        lowered[key.lower()] = convert_keys_to_lowercase(value)
    return lowered


# let's define the metric here, we can modularize later
def score(sample, gold, taos=[.5]):
    """
    Score a 'sample' JSON string against a 'gold' reference JSON string.

    Both strings are parsed with ``json.loads`` and must decode to JSON
    objects. Every value is compared as ``str(value)``; keys and values are
    compared case-sensitively.

    For each key of the sample:

    * exact key match: the pair adds ``1 - d`` to the true positives, where
      ``d`` is the edit distance between the two values divided by the length
      of the longer value, or 0.0 when ``d`` is not below the threshold. The
      gold key is consumed either way;
    * no exact match: the remaining gold key with the highest key similarity
      (normalised key distance below the threshold) is the candidate. When its
      value is also within the threshold, the mean of key and value similarity
      is added to the true positives and its complement to the false
      positives;
    * no candidate key at all: 1.0 is added to the false positives.

    Every gold key still unmatched at the end counts as one false negative.

    :param sample: JSON string to be scored.
    :param gold: Reference 'gold' JSON string.
    :param taos: Sequence of thresholds (default ``[0.5]``); only ``taos[0]``
                 is used. A normalised distance must be strictly below it to
                 count as a match, so a lower threshold is stricter.
    :return: A ``(precision, recall)`` tuple. Either value is 0.0 when its
             denominator is zero.
    """

    def normalized_distance(left, right):
        # Edit distance scaled by the longer string. Two empty strings are
        # identical, so return 0.0 instead of dividing by zero.
        longest = max(len(left), len(right))
        if longest == 0:
            return 0.0
        return distance(left, right) / longest

    def similarity(left, right):
        # 1 - normalised distance when under the threshold, otherwise no match.
        norm_dist = normalized_distance(left, right)
        return 1 - norm_dist if norm_dist < taos[0] else 0.0

    sample = json.loads(sample)
    gold = json.loads(gold)

    # stringify all the values
    sample = {k: str(v) for k, v in sample.items()}
    gold = {k: str(v) for k, v in gold.items()}

    # True Positives: (partial) credit for sample pairs matched against gold.
    tp = []
    # False Positives: sample keys with no gold counterpart, plus the penalty
    # share of soft-matched pairs.
    fp = []
    for sample_key, sample_value in sample.items():
        if sample_key in gold:
            # Exact key match: compare value against value. The gold key is
            # consumed so it can be neither re-matched nor counted as missed.
            tp.append(similarity(sample_value, gold.pop(sample_key)))
            continue

        # Otherwise soft match on the keys that are still unclaimed.
        cand_keys = [(k, similarity(sample_key, k)) for k in gold]
        cand_keys = [(k, s) for k, s in cand_keys if s > 0.0]
        if not cand_keys:
            fp.append(1.0)
            continue

        key, kscore = max(cand_keys, key=lambda x: x[1])
        vscore = similarity(gold[key], sample_value)

        # If both thresholds are met, count the average of the k/v scores
        if vscore > 0.0:
            pair_score = (kscore + vscore) / 2
            tp.append(pair_score)
            # penalize the OCR mismatch
            fp.append(1 - pair_score)
            gold.pop(key)
        # NOTE(review): when the key soft-matches but the value does not, the
        # sample pair is counted neither as tp nor as fp, and the gold key
        # stays unclaimed. Confirm this is the intended metric definition.

    # False Negatives: everything in gold that is still unmatched.
    fn = len(gold)
    fp = sum(fp)
    tp = sum(tp)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    return (precision, recall)


def distance(s1, s2):
    """
    Return the Levenshtein edit distance between ``s1`` and ``s2``.

    Thin wrapper around ``lev.distance``. Defining it here replaces the
    ``distance`` name imported from ``Levenshtein`` at the top of the cell.

    :param s1: First sequence to compare.
    :param s2: Second sequence to compare.
    :return: Whatever ``lev.distance(s1, s2)`` returns.
    """
    edit_distance = lev.distance(s1, s2)
    return edit_distance


def score_without_lowerupper_letters(sample, gold, taos=[.5], test=False):
    """
    Case-insensitive scoring of a 'sample' against a 'gold' reference.

    Keys and stringified values of both inputs are lowercased before any
    comparison. The matching rules are then:

    * exact key match: the pair adds ``1 - d`` to the true positives, where
      ``d`` is the edit distance between the two values divided by the length
      of the longer value, or 0.0 when ``d`` is not below the threshold. The
      gold key is consumed either way;
    * no exact match: the remaining gold key with the highest key similarity
      (normalised key distance below the threshold) is the candidate. When its
      value is also within the threshold, the mean of key and value similarity
      is added to the true positives and its complement to the false
      positives;
    * no candidate key at all: 1.0 is added to the false positives.

    Every gold key still unmatched at the end counts as one false negative.

    :param sample: JSON string to be scored (or a dict when ``test`` is True).
    :param gold: Reference 'gold' JSON string (or a dict when ``test`` is True).
    :param taos: Sequence of thresholds (default ``[0.5]``); only ``taos[0]``
                 is used. A normalised distance must be strictly below it to
                 count as a match, so a lower threshold is stricter.
    :param test: When True, ``sample`` and ``gold`` are used as already-parsed
                 dicts and JSON decoding is skipped.
    :return: A ``(precision, recall)`` tuple. Either value is 0.0 when its
             denominator is zero.
    """

    def normalized_distance(left, right):
        # Edit distance scaled by the longer string. Two empty strings are
        # identical, so return 0.0 instead of dividing by zero.
        longest = max(len(left), len(right))
        if longest == 0:
            return 0.0
        return distance(left, right) / longest

    def similarity(left, right):
        # 1 - normalised distance when under the threshold, otherwise no match.
        norm_dist = normalized_distance(left, right)
        return 1 - norm_dist if norm_dist < taos[0] else 0.0

    if not test:
        sample = json.loads(sample)
        gold = json.loads(gold)

    # Convert all keys and values to lowercase strings. New dicts are built,
    # so dicts passed in with test=True are not modified.
    sample = {k.lower(): str(v).lower() for k, v in sample.items()}
    gold = {k.lower(): str(v).lower() for k, v in gold.items()}

    # True Positives: (partial) credit for sample pairs matched against gold.
    tp = []
    # False Positives: sample keys with no gold counterpart, plus the penalty
    # share of soft-matched pairs.
    fp = []
    for sample_key, sample_value in sample.items():
        if sample_key in gold:
            # Exact key match: the gold key is consumed so it can be neither
            # re-matched nor counted as missed.
            tp.append(similarity(sample_value, gold.pop(sample_key)))
            continue

        # Collect candidate keys among the gold keys that are still unclaimed.
        cand_keys = [(k, similarity(sample_key, k)) for k in gold]
        cand_keys = [(k, s) for k, s in cand_keys if s > 0.0]
        if not cand_keys:
            fp.append(1.0)
            continue

        key, kscore = max(cand_keys, key=lambda x: x[1])
        vscore = similarity(gold[key], sample_value)

        # If both thresholds are met, count the average of the k/v scores
        if vscore > 0.0:
            pair_score = (kscore + vscore) / 2
            tp.append(pair_score)
            # Penalize the OCR mismatch
            fp.append(1 - pair_score)
            gold.pop(key)
        # NOTE(review): when the key soft-matches but the value does not, the
        # sample pair is counted neither as tp nor as fp, and the gold key
        # stays unclaimed. Confirm this is the intended metric definition.

    # False Negatives: keys that were missed by the provider
    fn = len(gold)
    fp = sum(fp)
    tp = sum(tp)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    return precision, recall


# Function to process all JSON files and calculate average score
def calculate_average_score(gt_path, results_path):
    """
    Average per-file precision and recall over a folder of ground-truth JSONs.

    Every ``*.json`` file in ``gt_path`` is paired with the file of the same
    name in ``results_path`` and scored with
    ``score_without_lowerupper_letters``. A file with no matching result, or
    whose reading/scoring raises, is reported as a problem and left out of
    the averages. Progress and a final summary are printed.

    :param gt_path: Directory containing the ground-truth JSON files.
    :param results_path: Directory containing the JSON files to evaluate.
    :return: ``(avg_precision, avg_recall)`` averaged over the files that were
             scored successfully; ``(0.0, 0.0)`` when none were.
    """
    total_precision = 0.0
    total_recall = 0.0

    images_with_problems = []
    images_with_results = []

    # Only JSON files are ground truth, so only they count towards the total.
    # os.listdir returns entries in arbitrary order; sort for a reproducible log.
    gt_filenames = sorted(name for name in os.listdir(gt_path) if name.endswith('.json'))

    for filename in gt_filenames:
        gt_filepath = os.path.join(gt_path, filename)
        results_filepath = os.path.join(results_path, filename)
        print("\nGT:", gt_filepath)
        print("RESULT:", results_filepath)

        if not os.path.exists(results_filepath):
            print("------> ERROR: Result file not found")
            images_with_problems.append(filename)
            continue

        # Best effort: one unreadable or malformed file must not abort the run.
        try:
            with open(gt_filepath, 'r', encoding='utf-8') as gt_file:
                gt_data = gt_file.read()
            with open(results_filepath, 'r', encoding='utf-8') as results_file:
                results_data = results_file.read()

            # IMPORTANT: this is the case-insensitive metric. Call
            # score(results_data, gt_data) instead for case-sensitive scoring.
            precision, recall = score_without_lowerupper_letters(results_data, gt_data)
        except Exception as e:
            print("------> ERROR:", e)
            images_with_problems.append(filename)
            continue

        print(f"... Precision: {precision} Recall: {recall}")
        total_precision += precision
        total_recall += recall
        images_with_results.append(filename)

    count = len(images_with_results)
    if count > 0:
        avg_precision = total_precision / count
        avg_recall = total_recall / count
    else:
        avg_precision = 0.0
        avg_recall = 0.0

    print(f"\nTotal files processed from GT: {count} out of {len(gt_filenames)}")

    print("\nImages with problems:")
    print(images_with_problems)

    print("\nImages with good results:")
    print(images_with_results)

    return avg_precision, avg_recall

## Evaluation

In [None]:
# Folder of ground-truth JSONs and folder of outputs to evaluate
gt_path = "spark_generic_reports_v6_gt"
results_path = "vLLM_v3_CustomReports01_fixoutput"

# Precision and recall averaged over the files that could be scored
avg_precision, avg_recall = calculate_average_score(gt_path, results_path)

# F1 combines the two averages; it falls back to 0.0 when their sum is not positive
f1_score = (
    2 * (avg_precision * avg_recall) / (avg_precision + avg_recall)
    if avg_precision + avg_recall > 0
    else 0.0
)

print(
    f"\nAverage Precision: {avg_precision}",
    f"Average Recall: {avg_recall}",
    f"F1-Score: {f1_score}",
    sep="\n",
)


GT: spark_generic_reports_v6_gt/oncoextra-tnbc-ntrk-wm-sample-report_4.json
RESULT: vLLM_v3_CustomReports01_fixoutput/oncoextra-tnbc-ntrk-wm-sample-report_4.json
... Precision: 0.9724738515430454 Recall: 1.0

GT: spark_generic_reports_v6_gt/Tempus-Onco_Clinical-Report-Sample_9.json
RESULT: vLLM_v3_CustomReports01_fixoutput/Tempus-Onco_Clinical-Report-Sample_9.json
... Precision: 0.9425426702339866 Recall: 0.4289934819321714

GT: spark_generic_reports_v6_gt/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1_15.json
RESULT: vLLM_v3_CustomReports01_fixoutput/CarisReport_2023_NSCLC_KRAS_G12C_PD-L1_15.json
... Precision: 1.0 Recall: 1.0

GT: spark_generic_reports_v6_gt/Tempus-Onco_Clinical-Report-Sample_6.json
RESULT: vLLM_v3_CustomReports01_fixoutput/Tempus-Onco_Clinical-Report-Sample_6.json
... Precision: 0.0 Recall: 0.0

GT: spark_generic_reports_v6_gt/Tempus-Onco_Clinical-Report-Sample_4.json
RESULT: vLLM_v3_CustomReports01_fixoutput/Tempus-Onco_Clinical-Report-Sample_4.json
... Precision: 0.83283