In [None]:
# importing necessary libraries
import json
import os
from collections import Counter
import pandas as pd

# Load gold labels

In [15]:
# Load the gold-standard labels used as ground truth by the evaluation below.
# JSON is UTF-8 by specification, so the encoding is passed explicitly instead
# of relying on the platform default (often cp1252 on Windows), which would
# corrupt any non-ASCII characters in the labels.
# NOTE(review): this absolute, user-specific path ties the notebook to one
# machine — consider a path relative to the project directory.
with open(
    "C:\\Users\\kaliv\\SamProjects\\CommVersion_Assignment\\data\\gold_labels.json",
    encoding="utf-8",
) as f:
    gold = json.load(f)

# Discover all output files

In [17]:
# Collect every JSON file in the "outputs" directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep every downstream printout and table reproducible across runs/platforms.
output_files = sorted(
    f for f in os.listdir("outputs")
    if f.endswith(".json")
)

# Safe JSON parser

In [None]:
import re

def safe_parse_json(text):
    """
    Safely attempt to parse a JSON string.

    The function first tries to parse the whole text as JSON. If that
    fails, it decodes the JSON object that begins at the first ``{`` in the
    text and ignores anything that follows it, so output wrapped in Markdown
    code fences or surrounded by commentary can still be recovered.
    Returns None if parsing is unsuccessful.

    Parameters
    ----------
    text : str or None
        The input string that may contain JSON data. If None, the function
        immediately returns None. Any other non-string input that cannot be
        parsed directly also yields None instead of raising.

    Returns
    -------
    dict or list or None
        Whatever ``json.loads`` produces when the whole text is valid JSON;
        otherwise the dict decoded from the first ``{`` onwards; otherwise
        None.
    """
    if text is None:
        return None

    # Try direct parse. ValueError covers json.JSONDecodeError (malformed
    # text), TypeError covers inputs json.loads cannot accept at all, and
    # RecursionError covers pathologically deep nesting. Unlike a bare
    # `except:`, this does not swallow KeyboardInterrupt / SystemExit.
    try:
        return json.loads(text)
    except (ValueError, TypeError, RecursionError):
        pass

    # Only strings can be searched for an embedded object.
    if not isinstance(text, str):
        return None

    start = text.find("{")
    if start == -1:
        return None

    # raw_decode stops at the end of the first complete JSON value, so text
    # after the object (even text containing "}") no longer breaks parsing.
    try:
        parsed, _end = json.JSONDecoder().raw_decode(text, start)
    except (ValueError, RecursionError):
        return None

    return parsed


# Parse & normalize all outputs

In [22]:
# Maps each output file name to {chat_id: parsed output}, where the parsed
# output is None whenever safe_parse_json could not recover any JSON.
all_parsed_outputs = {}

for file_name in output_files:
    # Explicit UTF-8 so the read does not depend on the platform's default
    # encoding; os.path.join mirrors the os.listdir("outputs") discovery step.
    with open(os.path.join("outputs", file_name), encoding="utf-8") as f:
        raw_outputs = json.load(f)

    parsed_outputs = {
        chat_id: safe_parse_json(raw_text)
        for chat_id, raw_text in raw_outputs.items()
    }

    all_parsed_outputs[file_name] = parsed_outputs

## sanity check

In [23]:
# Report, for every output file, how many chats yielded a parsed (non-None) result.
for file, chats in all_parsed_outputs.items():
    valid = sum(v is not None for v in chats.values())
    # Use the actual number of chats in the file rather than a hard-coded 20,
    # so the ratio stays correct if a file has more or fewer chats.
    print(file, valid, "/", len(chats), "valid JSON")


gpt_prompt1_raw.json 20 / 20 valid JSON
gpt_prompt2_raw.json 20 / 20 valid JSON
gpt_prompt3_raw.json 20 / 20 valid JSON
gpt_prompt4_raw.json 20 / 20 valid JSON
gpt_prompt5_raw.json 20 / 20 valid JSON
gpt_prompt6_raw.json 20 / 20 valid JSON
llama_prompt1_raw.json 20 / 20 valid JSON
llama_prompt2_raw.json 20 / 20 valid JSON
llama_prompt3_raw.json 20 / 20 valid JSON
llama_prompt4_raw.json 20 / 20 valid JSON
llama_prompt5_raw.json 20 / 20 valid JSON
llama_prompt6_raw.json 20 / 20 valid JSON
qwen_prompt1_raw.json 20 / 20 valid JSON
qwen_prompt2_raw.json 17 / 20 valid JSON
qwen_prompt3_raw.json 19 / 20 valid JSON
qwen_prompt4_raw.json 16 / 20 valid JSON
qwen_prompt5_raw.json 14 / 20 valid JSON
qwen_prompt6_raw.json 20 / 20 valid JSON


- GPT: perfect JSON reliability across all prompts
- LLaMA: perfect JSON reliability after parsing cleanup
- Qwen: sensitive to prompt strategy (drops in strict / negative prompts)

👉 Some models are more prompt-sensitive in structured extraction tasks.

# PHASE 3B — Entity-level Metrics (Precision / Recall / F1)

We now answer the core question:

When JSON is valid, how accurate is the extraction?

## Define entities

In [24]:
# Names of the fields that are scored. Each name is used as a key into both
# the gold record and the parsed model output of every chat, and as a key of
# the per-entity count/metric dictionaries built during evaluation.
ENTITIES = [
    "first_name",
    "last_name",
    "phone_number",
    "email",
    "budget",
    "current_location",
    "preferred_location",
    "profession",
    "date_of_visit",
    "buying_timeline_weeks"
]

## Comparison logic

In [None]:
def compare_entity(pred, truth):
    """
    Classify a single predicted value against its gold value.

    Outcomes:
      - "TN": neither side has a value (both are None).
      - "FP": a value was predicted although the gold value is None.
      - "FN": nothing was predicted although a gold value exists, OR a
        value was predicted but differs from the gold value.
      - "TP": both values exist and compare equal with ``==``.

    The comparison is a plain equality check; no normalisation of case,
    whitespace, or type is applied.

    NOTE(review): a wrong (non-matching) value is counted only as "FN", so
    it lowers recall but not precision — confirm this is the intended
    scoring rule.

    Parameters
    ----------
    pred : any or None
        The predicted entity value.
    truth : any or None
        The ground truth entity value.

    Returns
    -------
    str
        One of {"TP", "TN", "FP", "FN"}.
    """
    pred_missing = pred is None
    truth_missing = truth is None

    if pred_missing and truth_missing:
        return "TN"
    if truth_missing:
        # Something was extracted where the gold label has nothing.
        return "FP"
    if pred_missing:
        # The gold label has a value the model did not produce.
        return "FN"

    # Both sides carry a value: exact match or miss.
    return "TP" if pred == truth else "FN"

## Evaluation 

In [None]:
def evaluate_file(parsed_outputs, gold):
    """
    Evaluate predicted outputs against gold-standard labels.

    For each chat in `gold` and each entity in `ENTITIES`, the predicted
    value is compared with the ground truth using `compare_entity`, and the
    resulting "TP" / "TN" / "FP" / "FN" label is tallied per entity.

    A chat that is missing from `parsed_outputs`, or whose parsed output is
    not a dict (None for unparseable text, or e.g. a JSON list), is scored
    as if the model had predicted None for every entity.

    Parameters
    ----------
    parsed_outputs : dict
        Mapping of chat IDs to parsed model outputs (dicts of entity values).
    gold : dict
        Mapping of chat IDs to gold-standard entity labels. Every gold
        record must contain every name in `ENTITIES`.

    Returns
    -------
    dict
        A dictionary where each entity maps to a Counter object summarizing
        the number of TP, TN, FP, and FN results.

    Raises
    ------
    KeyError
        If a gold record lacks one of the names in `ENTITIES`.
    """

    counts = {e: Counter() for e in ENTITIES}

    for chat_id, truth_record in gold.items():
        # .get() tolerates chats absent from the outputs; the isinstance
        # check guards against parsed JSON that is not an object, which has
        # no .get() for entity lookup.
        prediction = parsed_outputs.get(chat_id)
        if not isinstance(prediction, dict):
            prediction = {}

        for entity in ENTITIES:
            result = compare_entity(prediction.get(entity), truth_record[entity])
            counts[entity][result] += 1

    return counts

## Run evaluation for all 18 outputs

In [27]:
# Per-entity TP/TN/FP/FN counters for every model/prompt output file,
# keyed by the output file name.
all_metrics = {
    file_name: evaluate_file(parsed_outputs, gold)
    for file_name, parsed_outputs in all_parsed_outputs.items()
}

In [None]:
# Spot-check the per-entity TP/TN/FP/FN counters of a single model/prompt output file
all_metrics['gpt_prompt2_raw.json']

{'first_name': Counter({'TP': 18, 'TN': 2}),
 'last_name': Counter({'TP': 13, 'TN': 7}),
 'phone_number': Counter({'FN': 11, 'TN': 9}),
 'email': Counter({'TP': 12, 'TN': 8}),
 'budget': Counter({'FN': 19, 'FP': 1}),
 'current_location': Counter({'TP': 20}),
 'preferred_location': Counter({'TP': 20}),
 'profession': Counter({'FN': 10, 'TN': 5, 'FP': 3, 'TP': 2}),
 'date_of_visit': Counter({'TP': 12, 'TN': 4, 'FN': 4}),
 'buying_timeline_weeks': Counter({'FN': 12, 'TP': 7, 'TN': 1})}

## Compute Precision / Recall / F1

In [None]:
def compute_prf(counter):
    """
    Compute precision, recall, and F1 score from classification counts.

    Parameters
    ----------
    counter : collections.Counter or dict
        A mapping containing counts for "TP", "FP", and "FN". Missing keys
        are treated as a count of 0; other keys (e.g. "TN") are ignored.

    Returns
    -------
    tuple of float
        (precision, recall, f1) where:
        - precision = TP / (TP + FP), 0.0 if denominator is 0
        - recall    = TP / (TP + FN), 0.0 if denominator is 0
        - f1        = harmonic mean of precision and recall, 0.0 if denominator is 0
    """

    # .get() rather than indexing: a Counter yields 0 for absent keys either
    # way, but a plain dict would raise KeyError when indexed.
    tp = counter.get("TP", 0)
    fp = counter.get("FP", 0)
    fn = counter.get("FN", 0)

    # Float fallbacks keep the return type consistent with the documented
    # "tuple of float" even in the degenerate zero-denominator cases.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, f1

## Aggregate per file (OVERALL score)

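Use the micro-average: TP, FP, and FN counts are pooled across all entities before precision, recall, and F1 are computed, so every individual entity decision carries equal weight: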

In [None]:
def overall_micro_f1(metrics):
    """
    Compute micro-averaged precision, recall, and F1 score across all entities.

    True positives, false positives, and false negatives are summed over
    every entity in `ENTITIES`, and the pooled counts are passed to
    `compute_prf`, so the per-entity and overall scores share one formula.

    Parameters
    ----------
    metrics : dict
        A dictionary mapping each entity to a Counter or dict containing
        counts for "TP", "FP", and "FN". Missing count keys are treated as 0.

    Returns
    -------
    tuple of float
        (precision, recall, f1) representing the micro-averaged scores.
    """

    pooled = Counter()

    for entity in ENTITIES:
        entity_counts = metrics[entity]
        for outcome in ("TP", "FP", "FN"):
            # .get() works for both Counter and plain dict inputs.
            pooled[outcome] += entity_counts.get(outcome, 0)

    return compute_prf(pooled)

## Create a summary table

In [32]:
import pandas as pd

# One summary record per model/prompt output file with its micro-averaged scores.
rows = [
    {
        "model_prompt": file_name,
        **dict(zip(("precision", "recall", "f1"), overall_micro_f1(metrics))),
    }
    for file_name, metrics in all_metrics.items()
]

# Rank the configurations from highest to lowest overall F1.
summary_df = pd.DataFrame(rows).sort_values(by="f1", ascending=False)
summary_df

Unnamed: 0,model_prompt,precision,recall,f1
5,gpt_prompt6_raw.json,0.992593,0.8375,0.908475
11,llama_prompt6_raw.json,0.977612,0.81875,0.891156
17,qwen_prompt6_raw.json,0.984848,0.8125,0.890411
4,gpt_prompt5_raw.json,0.976923,0.79375,0.875862
10,llama_prompt5_raw.json,0.969231,0.7875,0.868966
2,gpt_prompt3_raw.json,0.963636,0.6625,0.785185
8,llama_prompt3_raw.json,0.963303,0.65625,0.780669
1,gpt_prompt2_raw.json,0.962963,0.65,0.776119
3,gpt_prompt4_raw.json,0.980769,0.6375,0.772727
7,llama_prompt2_raw.json,0.962617,0.64375,0.771536


Top performers (overall micro-F1)

1. GPT + Prompt 6 : F1 = 0.908  
2. LLaMA + Prompt 6 : F1 = 0.891  
3. Qwen + Prompt 6 : F1 = 0.890  

👉 Prompt Strategy 6 (Entity-by-Entity extraction) is the winner across all models.

## Entity-wise table of top 3 configs:

In [None]:
def entity_wise_df(metrics, label):
    """
    Tabulate per-entity precision, recall, and F1 for one configuration.

    Each entity listed in `ENTITIES` contributes one row, whose scores are
    obtained by passing that entity's counts to `compute_prf`. The given
    label is repeated on every row so frames from several configurations
    remain distinguishable.

    Parameters
    ----------
    metrics : dict
        Maps each entity name to a Counter or dict holding its "TP", "FP",
        and "FN" counts.
    label : str
        Name of the model/prompt configuration, written to the
        "model_prompt" column.

    Returns
    -------
    pandas.DataFrame
        One row per entity with the columns, in order:
        model_prompt, entity, precision, recall, f1.
    """

    def _score_row(entity):
        # Build the record for a single entity.
        precision, recall, f1_score = compute_prf(metrics[entity])
        return {
            "model_prompt": label,
            "entity": entity,
            "precision": precision,
            "recall": recall,
            "f1": f1_score,
        }

    return pd.DataFrame(list(map(_score_row, ENTITIES)))

In [None]:
# Per-entity scores for GPT with prompt 6. entity_wise_df already returns a
# complete DataFrame, so wrapping a single frame in pd.concat is unnecessary.
gpt_prompt6_entities_df = entity_wise_df(
    all_metrics["gpt_prompt6_raw.json"], "GPT + Prompt 6"
)

gpt_prompt6_entities_df


Unnamed: 0,model_prompt,entity,precision,recall,f1
0,GPT + Prompt 6,first_name,1.0,1.0,1.0
1,GPT + Prompt 6,last_name,1.0,1.0,1.0
2,GPT + Prompt 6,phone_number,0.0,0.0,0.0
3,GPT + Prompt 6,email,1.0,1.0,1.0
4,GPT + Prompt 6,budget,1.0,0.894737,0.944444
5,GPT + Prompt 6,current_location,1.0,1.0,1.0
6,GPT + Prompt 6,preferred_location,1.0,1.0,1.0
7,GPT + Prompt 6,profession,0.916667,0.916667,0.916667
8,GPT + Prompt 6,date_of_visit,1.0,0.9375,0.967742
9,GPT + Prompt 6,buying_timeline_weeks,1.0,0.421053,0.592593


In [36]:
# Per-entity scores for LLaMA with prompt 6. entity_wise_df already returns a
# complete DataFrame, so wrapping a single frame in pd.concat is unnecessary.
llama_prompt6_entities_df = entity_wise_df(
    all_metrics["llama_prompt6_raw.json"], "LLaMA + Prompt 6"
)

llama_prompt6_entities_df

Unnamed: 0,model_prompt,entity,precision,recall,f1
0,LLaMA + Prompt 6,first_name,1.0,1.0,1.0
1,LLaMA + Prompt 6,last_name,1.0,1.0,1.0
2,LLaMA + Prompt 6,phone_number,0.0,0.0,0.0
3,LLaMA + Prompt 6,email,1.0,1.0,1.0
4,LLaMA + Prompt 6,budget,1.0,0.684211,0.8125
5,LLaMA + Prompt 6,current_location,1.0,0.95,0.974359
6,LLaMA + Prompt 6,preferred_location,1.0,1.0,1.0
7,LLaMA + Prompt 6,profession,0.8,1.0,0.888889
8,LLaMA + Prompt 6,date_of_visit,1.0,0.875,0.933333
9,LLaMA + Prompt 6,buying_timeline_weeks,1.0,0.526316,0.689655


In [37]:
# Per-entity scores for Qwen with prompt 6. entity_wise_df already returns a
# complete DataFrame, so wrapping a single frame in pd.concat is unnecessary.
qwen_prompt6_entities_df = entity_wise_df(
    all_metrics["qwen_prompt6_raw.json"], "Qwen + Prompt 6"
)

qwen_prompt6_entities_df

Unnamed: 0,model_prompt,entity,precision,recall,f1
0,Qwen + Prompt 6,first_name,1.0,1.0,1.0
1,Qwen + Prompt 6,last_name,1.0,1.0,1.0
2,Qwen + Prompt 6,phone_number,0.0,0.0,0.0
3,Qwen + Prompt 6,email,1.0,1.0,1.0
4,Qwen + Prompt 6,budget,1.0,0.894737,0.944444
5,Qwen + Prompt 6,current_location,1.0,0.95,0.974359
6,Qwen + Prompt 6,preferred_location,1.0,0.8,0.888889
7,Qwen + Prompt 6,profession,0.857143,1.0,0.923077
8,Qwen + Prompt 6,date_of_visit,1.0,0.875,0.933333
9,Qwen + Prompt 6,buying_timeline_weeks,1.0,0.473684,0.642857
