In [2]:
import pandas as pd 
import numpy as np
from typing import Dict
from sdmetrics.single_column import CategoryAdherence, BoundaryAdherence

In [10]:
# Input CSV locations (relative to the notebook's working directory).
original_data_splits = "../data/original_training_dataset.csv"
real_test_path = "../data/test_df.csv"

# Synthetic datasets, one CSV per prompt variant.
baseline_synthetic = "../data/synthetic_data_baseline_prompt.csv"
synthetic_data_no_grounding_synthetic = "../data/prompt_not_grounded_in_synthetic.csv"
synthetic_data_no_info = "../data/prompt_no_info_orignal_data.csv"


# Columns scored as categorical.
# Keep one name per line with a trailing comma: Python silently concatenates
# adjacent string literals, so a missing comma merges two column names into
# one non-existent column.
categorical_cols = [
    'numberRating',
    'highestRating',
    'lowestRating',
    'medianRating',
    'numberLowRating',
    'numberMediumRating',
    'numberHighRating',
    'numberMessageRead',
    'readAllMessage',
    'numberMessageReceived',
]

# Columns scored as continuous.
continuous_cols = ['sdRating']

# all columns
all_columns = categorical_cols + continuous_cols


# Reference frame read from `real_test_path`.
real_df_test = pd.read_csv(real_test_path)

# Synthetic frames, one per prompt-variant CSV configured in this cell.
synthetic_df = pd.read_csv(baseline_synthetic)
synthetic_df_no_grounding = pd.read_csv(synthetic_data_no_grounding_synthetic)
synthetic_df_no_orignal_inofs = pd.read_csv(synthetic_data_no_info)

## Boundary check

In [11]:
def boundary_adherence(real_df, synthetic_df, categorical_cols, continuous_cols):
    """
    Check the boundary adherence for categorical and continuous columns.

    Each categorical column present in both frames is scored with
    ``CategoryAdherence.compute``; each continuous column present in both
    frames is scored with ``BoundaryAdherence.compute``. Columns missing from
    either frame are skipped, and a single ``UserWarning`` listing them is
    emitted so that typos in the column lists do not go unnoticed.

    Parameters
    ----------
    real_df : pandas.DataFrame
        Reference data.
    synthetic_df : pandas.DataFrame
        Synthetic data to evaluate against ``real_df``.
    categorical_cols : iterable of str
        Column names scored with ``CategoryAdherence``.
    continuous_cols : iterable of str
        Column names scored with ``BoundaryAdherence``.

    Returns
    -------
    dict
        Maps ``"<col>_coverage"`` (categorical) and
        ``"<col>_boundary_conformity"`` (continuous) to the computed score.
        Skipped columns have no entry.
    """
    import warnings  # local import so the cell does not depend on the import cell

    results = {}
    skipped = []

    for col in categorical_cols:
        if col not in real_df.columns or col not in synthetic_df.columns:
            skipped.append(col)
            continue
        results[f"{col}_coverage"] = CategoryAdherence.compute(
            real_data=real_df[col],
            synthetic_data=synthetic_df[col]
        )

    for col in continuous_cols:
        if col not in real_df.columns or col not in synthetic_df.columns:
            skipped.append(col)
            continue
        results[f"{col}_boundary_conformity"] = BoundaryAdherence.compute(
            real_data=real_df[col],
            synthetic_data=synthetic_df[col]
        )

    if skipped:
        # Still best-effort (no exception), but no longer silent.
        warnings.warn(
            "boundary_adherence skipped columns missing from one of the DataFrames: "
            + ", ".join(map(str, skipped)),
            UserWarning,
            stacklevel=2,
        )

    return results

In [12]:
# Boundary/category adherence of each synthetic dataset against the real test frame
results_with_original = boundary_adherence(
    real_df=real_df_test,
    synthetic_df=synthetic_df,
    categorical_cols=categorical_cols,
    continuous_cols=continuous_cols,
)
results_no_info = boundary_adherence(
    real_df=real_df_test,
    synthetic_df=synthetic_df_no_orignal_inofs,
    categorical_cols=categorical_cols,
    continuous_cols=continuous_cols,
)
results_no_grounding = boundary_adherence(
    real_df=real_df_test,
    synthetic_df=synthetic_df_no_grounding,
    categorical_cols=categorical_cols,
    continuous_cols=continuous_cols,
)



# Calculate the CAT score

In [15]:
def cat_score_exact(real_df, syn_df, cat_cols):
    """
    Calculate the CAT score.

    For every column in ``cat_cols`` the number of distinct non-null values is
    counted in both frames and the ratio ``syn / real`` is taken. The overall
    score is the mean of those ratios, ignoring NaN entries.

    Parameters
    ----------
    real_df : pandas.DataFrame
        Reference data.
    syn_df : pandas.DataFrame
        Synthetic data.
    cat_cols : iterable of str
        Columns to score. A column missing from either frame gets a row with
        NaN values and an explanatory ``note``; it does not affect the mean.

    Returns
    -------
    (float, pandas.DataFrame)
        The overall score (NaN when ``cat_cols`` is empty or no ratio is
        defined) and a per-column detail frame sorted by ``column`` with the
        columns ``column``, ``real_n_classes``, ``syn_n_classes``,
        ``ratio_|D_syn|/|D_real|`` and ``note``.
    """
    ratio_key = "ratio_|D_syn|/|D_real|"
    detail_columns = ["column", "real_n_classes", "syn_n_classes", ratio_key, "note"]
    rows = []

    for col in cat_cols:
        if col not in real_df.columns or col not in syn_df.columns:
            rows.append({
                "column": col,
                "real_n_classes": np.nan,
                "syn_n_classes": np.nan,
                ratio_key: np.nan,
                "note": "missing in one of the DataFrames"
            })
            continue

        # nunique() always yields an integer count, so only zero needs guarding.
        real_n = int(real_df[col].dropna().nunique())
        syn_n = int(syn_df[col].dropna().nunique())
        ratio = (syn_n / real_n) if real_n != 0 else np.nan

        rows.append({
            "column": col,
            "real_n_classes": real_n,
            "syn_n_classes": syn_n,
            ratio_key: ratio,
            "note": ""
        })

    # Explicit columns keep the frame well-formed (and sortable) even when
    # `cat_cols` is empty; without them sort_values("column") raises KeyError.
    details_df = (
        pd.DataFrame(rows, columns=detail_columns)
        .sort_values("column")
        .reset_index(drop=True)
    )
    if details_df.empty:
        return np.nan, details_df

    cat_overall = float(details_df[ratio_key].mean(skipna=True))
    return cat_overall, details_df


In [5]:
# CAT score (overall value and per-column details) for each synthetic dataset
cat_overall_synthetic, per_col_syn = cat_score_exact(
    real_df=real_df_test,
    syn_df=synthetic_df,
    cat_cols=categorical_cols,
)
cat_overal_synthetic_no_info, per_col_no_info = cat_score_exact(
    real_df=real_df_test,
    syn_df=synthetic_df_no_orignal_inofs,
    cat_cols=categorical_cols,
)
cat_overal_synthetic_no_grounding, per_col_no_grounding = cat_score_exact(
    real_df=real_df_test,
    syn_df=synthetic_df_no_grounding,
    cat_cols=categorical_cols,
)
