In [None]:
import pandas as pd
import numpy as np
from typing import Set, List, Dict, Any, Tuple

# Import your cleaning functions (ensure these are accessible)
from survey_assist_utils.data_cleaning.prep_data import (
    prep_clerical_codes,
    prep_model_codes,
)





In [None]:

# ----------------------------------------------------------------------
# Logic Engine: "Main" (Set Theory / OO, OM, MO, MM)
# ----------------------------------------------------------------------
def evaluate_main_method(cc_set: Set[str], llm_set: Set[str]) -> Dict[str, Any]:
    """
    Score a clerical-vs-model comparison using the set-based 'Main' method.

    The pair is first classified by how many codes each side holds, and the
    match rule depends on that classification:

    * One-to-One   - both sides hold one code; match when the sets are equal.
    * One-to-Many  - one clerical code, several model codes; match when the
                     clerical code is among the model codes.
    * Many-to-One  - several clerical codes, one model code; match when the
                     model code is among the clerical codes.
    * Many-to-Many - several codes on both sides; match when they share at
                     least one code.

    Args:
        cc_set: Clerical coder codes.
        llm_set: Model (LLM) codes.

    Returns:
        A dict with two keys: ``main_case`` (the classification label, or
        ``"Missing"`` when either input is empty/falsy) and ``main_result``
        (True when the pair counts as a match under that case's rule).
    """
    # Nothing to compare if either side is empty (or None).
    if not (cc_set and llm_set):
        return {"main_case": "Missing", "main_result": False}

    n_cc, n_llm = len(cc_set), len(llm_set)

    if n_cc == 1 and n_llm == 1:
        case, matched = "One-to-One", cc_set == llm_set
    elif n_cc == 1 and n_llm > 1:
        # Single clerical code must appear somewhere in the model's list.
        case, matched = "One-to-Many", cc_set.issubset(llm_set)
    elif n_cc > 1 and n_llm == 1:
        # Single model code must appear somewhere in the clerical list.
        case, matched = "Many-to-One", llm_set.issubset(cc_set)
    elif n_cc > 1 and n_llm > 1:
        # Any shared code is enough.
        case, matched = "Many-to-Many", not cc_set.isdisjoint(llm_set)
    else:
        # Defensive fallback; not expected for ordinary non-empty sets.
        case, matched = "Unhandled", False

    return {"main_case": case, "main_result": matched}


In [None]:

# ----------------------------------------------------------------------
# Logic Engine: "Legacy / 160c" (Column / Positional)
# ----------------------------------------------------------------------
def evaluate_legacy_method(
    cc_set: Set[str],
    llm_set: Set[str],
    config: str = "col1_vs_col1"
) -> bool:
    """
    Score a clerical-vs-model comparison using column/positional logic
    (the '160c' method).

    Sets carry no order, so each input is sorted to simulate
    "Column 1, Column 2, ...". "Column 1" is therefore the smallest code in
    sort order, which is only the true primary code if the upstream data
    happens to agree with that ordering.

    Args:
        cc_set: Clerical coder codes. ``None`` is treated as empty.
        llm_set: Model (LLM) codes. ``None`` is treated as empty.
        config: Which positional rule to apply:
            * ``"col1_vs_col1"``       - first clerical code equals first model code.
            * ``"all_cc_vs_col1_llm"`` - first model code appears among the clerical codes.
            * ``"col1_cc_vs_all_llm"`` - first clerical code appears among the model codes.
            * ``"any_vs_any"``         - the two sides share at least one code.

    Returns:
        True when the pair matches under the chosen rule. A rule that needs
        a "first" code returns False when that side is empty.

    Raises:
        ValueError: If ``config`` is not one of the rules listed above.
            Previously an unknown rule silently returned False, which made a
            mistyped config indistinguishable from a genuine non-match.
    """
    valid_configs = (
        "col1_vs_col1",
        "all_cc_vs_col1_llm",
        "col1_cc_vs_all_llm",
        "any_vs_any",
    )
    if config not in valid_configs:
        raise ValueError(
            f"Unknown legacy config {config!r}; expected one of {valid_configs}"
        )

    # Sort to simulate "Column 1, Column 2..."; None counts as no codes,
    # matching how evaluate_main_method treats missing input.
    cc_list = sorted(() if cc_set is None else cc_set)
    llm_list = sorted(() if llm_set is None else llm_set)

    # "Column 1" of each side, or None when that side has no codes.
    cc_first = cc_list[0] if cc_list else None
    llm_first = llm_list[0] if llm_list else None

    if config == "col1_vs_col1":
        return cc_first is not None and cc_first == llm_first

    if config == "all_cc_vs_col1_llm":
        return llm_first is not None and llm_first in cc_list

    if config == "col1_cc_vs_all_llm":
        return cc_first is not None and cc_first in llm_list

    # config == "any_vs_any": plain intersection check.
    return not set(cc_list).isdisjoint(llm_list)

In [None]:
# ----------------------------------------------------------------------
# Comparison Pipeline: run the Main and Legacy methods side by side
# ----------------------------------------------------------------------
def run_comparison_pipeline(
    cc_filepath: str,
    llm_filepath: str,
    legacy_mode: str = "col1_vs_col1"
) -> pd.DataFrame:
    """
    Load clerical and model codes, evaluate every shared record with both the
    Main and Legacy methods, and report where the two methods agree.

    Args:
        cc_filepath: Path to the clerical-coder CSV file.
        llm_filepath: Path to the model output. It is read as Parquet when the
            name ends in ``.parquet`` (case-insensitive), otherwise as CSV.
            ``pathlib.Path`` objects are accepted as well as strings.
        legacy_mode: Positional rule forwarded to ``evaluate_legacy_method``
            as its ``config`` argument.

    Returns:
        One row per merged record, with columns ``unique_id``, ``cc_codes``,
        ``llm_codes``, ``main_case``, ``main_result``, ``legacy_result``,
        ``legacy_config`` and ``methods_agree``. The columns are present even
        when no records merge, so downstream column access never raises
        KeyError on an empty result.
    """
    output_columns = [
        "unique_id",
        "cc_codes",
        "llm_codes",
        "main_case",
        "main_result",
        "legacy_result",
        "legacy_config",
        "methods_agree",
    ]

    # Load Data
    print("Loading Data...")
    cc_df = pd.read_csv(cc_filepath)
    # str() lets pathlib.Path through (Path has no .endswith); lower() also
    # recognises an upper-case ".PARQUET" suffix.
    if str(llm_filepath).lower().endswith(".parquet"):
        llm_df = pd.read_parquet(llm_filepath)
    else:
        llm_df = pd.read_csv(llm_filepath)

    # Prep Data
    print("Cleaning Data...")
    # NOTE(review): the prep functions are project helpers; the code below
    # relies on them returning 'unique_id' plus 'clerical_codes' /
    # 'model_codes' columns holding collections of codes.
    cc_clean = prep_clerical_codes(cc_df)
    llm_clean = prep_model_codes(llm_df)

    # Merge: only unique_id and the cleaned code collections are needed.
    # The inner join drops any record that is missing from either side.
    merged = pd.merge(
        cc_clean[["unique_id", "clerical_codes"]],
        llm_clean[["unique_id", "model_codes"]],
        on="unique_id",
        how="inner"
    )

    results = []

    print("Running Evaluations...")
    # Zip the three columns directly; avoids building a Series per row.
    for uid, cc, llm in zip(
        merged["unique_id"], merged["clerical_codes"], merged["model_codes"]
    ):
        main_res = evaluate_main_method(cc, llm)
        legacy_res = evaluate_legacy_method(cc, llm, config=legacy_mode)

        results.append({
            "unique_id": uid,
            "cc_codes": str(cc),
            "llm_codes": str(llm),
            "main_case": main_res["main_case"],
            "main_result": main_res["main_result"],
            "legacy_result": legacy_res,
            "legacy_config": legacy_mode,
            # Do they agree?
            "methods_agree": main_res["main_result"] == legacy_res,
        })

    # Passing columns= fixes the schema even when `results` is empty; the cast
    # keeps the flag columns boolean (an empty frame would otherwise be object
    # dtype, which breaks `~df['methods_agree']` as a row mask).
    comparison = pd.DataFrame(results, columns=output_columns)
    return comparison.astype(
        {"main_result": bool, "legacy_result": bool, "methods_agree": bool}
    )

In [8]:
# Example input paths - update these to point at your own data.
# CC_FILE is the clerical-coder file and is read as CSV by the pipeline;
# LLM_FILE is the model output and is read as Parquet because of its
# ".parquet" suffix.
CC_FILE = "fake_cc_data.csv"
LLM_FILE = "fake_llm_data.parquet"

In [9]:
# Run comparison: evaluate every merged record with both the Main and Legacy
# methods. legacy_mode selects the positional rule used by the Legacy method
# ("col1_vs_col1" compares the first code on each side).
df_comparison = run_comparison_pipeline(CC_FILE, LLM_FILE, legacy_mode="col1_vs_col1")

Loading Data...


Item '00000' has no valid codes.
Item '00000' has no valid codes.
Item '99999' has no valid codes.
Item '99999' has no valid codes.
Item '99999' has no valid codes.


Cleaning Data...
Running Evaluations...


In [11]:
# Analysis 1: keep only the rows where the Main and Legacy verdicts differ.
disagreements = df_comparison[~df_comparison['methods_agree']]

print(f"\nTotal Records: {len(df_comparison)}")
print(f"Disagreements: {len(disagreements)}")

if len(disagreements) > 0:
    print("\nSample Disagreements:")
    print(
        disagreements[
            ['unique_id', 'main_case', 'main_result', 'legacy_result', 'cc_codes', 'llm_codes']
        ].head(10)
    )

    # Analysis 2: how the disagreements spread across the Main case types.
    print("\nDisagreements by Main Case Type:")
    print(disagreements['main_case'].value_counts())

# Preview of the full comparison table, shown whether or not anything disagreed.
print(df_comparison.head(10))


Total Records: 7
Disagreements: 0
            unique_id            cc_codes  llm_codes    main_case  \
0            OO_Match           {'86101'}  {'86101'}   One-to-One   
1         OO_Mismatch           {'86101'}      set()      Missing   
2            OM_Match           {'86101'}  {'86101'}   One-to-One   
3            MO_Match  {'72200', '86101'}  {'72200'}  Many-to-One   
4            MM_Match  {'72200', '86101'}  {'72200'}  Many-to-One   
5       Invalid_Input               set()      set()      Missing   
6  Legacy_Unambiguous           {'01110'}  {'01110'}   One-to-One   

   main_result  legacy_result legacy_config  methods_agree  
0         True           True  col1_vs_col1           True  
1        False          False  col1_vs_col1           True  
2         True           True  col1_vs_col1           True  
3         True           True  col1_vs_col1           True  
4         True           True  col1_vs_col1           True  
5        False          False  col1_vs_col1   