In [3]:
import pandas as pd 
import numpy as np
from typing import Dict
from sdmetrics.single_column import CategoryAdherence, BoundaryAdherence

In [4]:
# --- Column groups -----------------------------------------------------------
# Columns scored with the categorical metrics further down in the notebook.
categorical_cols = [
    'numberRating',
    'highestRating',
    'lowestRating',
    'numberLowRating',
    'numberMediumRating',
    'numberHighRating',
    'numberMessageRead',
    'readAllMessage',
    'numberMessageReceived',
]

# Columns scored with the continuous (boundary) metric.
continuous_cols = ['sdRating', 'medianRating']

# Every evaluated column: categorical ones first, then continuous ones.
all_columns = [*categorical_cols, *continuous_cols]

# --- Input locations ---------------------------------------------------------
# NOTE(review): these are absolute, machine-specific Windows paths, so the
# notebook only runs where this exact directory layout exists. Consider a
# configurable base directory.

# Real data splits.
original_data_splits = r"C:\Users\User\PycharmProjects\master_thesis\simulation_data\final_run_data_preparation\data_splits\original_training_dataset.csv"
real_test_path = r"C:\Users\User\PycharmProjects\master_thesis\simulation_data\final_run_data_preparation\data_splits\test_df.csv"

# Synthetic datasets. Going by the file names these are a default variant, a
# "not grounded" variant and a "no info" variant -- confirm with the generator.
synthetic_path = r"C:\Users\User\PycharmProjects\master_thesis\simulation_data\final_data_evaluation\create_plots\evaluation_results\fully_fixed_data.csv"
synthetic_data_no_grounding = r"C:\Users\User\PycharmProjects\master_thesis\simulation_data\final_data_evaluation\create_plots\evaluation_results\fully_fixed_data_not_grounded_synthetic.csv"
synthetic_new_no_info = r"C:\Users\User\PycharmProjects\master_thesis\simulation_data\final_data_evaluation\create_plots\evaluation_results\fully_fixed_data_no_info_new.csv"

# --- Load everything into DataFrames -----------------------------------------
synthetic_df = pd.read_csv(synthetic_path)
synthetic_df_no_orignal_inofs = pd.read_csv(synthetic_new_no_info)
synthetic_df_no_grounding = pd.read_csv(synthetic_data_no_grounding)

real_df_original = pd.read_csv(original_data_splits)
real_df_test = pd.read_csv(real_test_path)

## Boundary check

In [5]:
from typing import Dict
from sdmetrics.single_column import CategoryAdherence, BoundaryAdherence

def boundary_adherence(
    real_df,
    synthetic_df,
    categorical_cols,
    continuous_cols
) -> Dict[str, float]:
    """
    Check whether the synthetic data respects the limits of the real data.

    Each categorical column is scored with ``CategoryAdherence.compute`` and
    each continuous column with ``BoundaryAdherence.compute`` (both from
    ``sdmetrics.single_column``). A column that is absent from either
    DataFrame is skipped without an entry in the result.

    Parameters
    ----------
    real_df : DataFrame holding the real reference data.
    synthetic_df : DataFrame holding the synthetic data to score.
    categorical_cols : iterable of categorical column names.
    continuous_cols : iterable of continuous column names.

    Returns
    -------
    dict
        ``"<col>_coverage"`` for every scored categorical column and
        ``"<col>_boundary_conformity"`` for every scored continuous column,
        each mapped to the value returned by the respective metric.

    NOTE(review): the ``_coverage`` suffix may be misleading -- sdmetrics
    documents CategoryAdherence as adherence of synthetic values to the real
    category set, not as coverage of the real categories. Confirm the naming.
    """
    scores = {}

    # Categorical variables
    for name in categorical_cols:
        if name not in real_df.columns or name not in synthetic_df.columns:
            continue
        scores[f"{name}_coverage"] = CategoryAdherence.compute(
            real_data=real_df[name],
            synthetic_data=synthetic_df[name],
        )

    # Continuous variables
    for name in continuous_cols:
        if name not in real_df.columns or name not in synthetic_df.columns:
            continue
        scores[f"{name}_boundary_conformity"] = BoundaryAdherence.compute(
            real_data=real_df[name],
            synthetic_data=synthetic_df[name],
        )

    return scores



In [12]:
# Score each synthetic dataset against the real test split.
results_with_original = boundary_adherence(
    real_df=real_df_test,
    synthetic_df=synthetic_df,
    categorical_cols=categorical_cols,
    continuous_cols=continuous_cols,
)
results_no_info = boundary_adherence(
    real_df=real_df_test,
    synthetic_df=synthetic_df_no_orignal_inofs,
    categorical_cols=categorical_cols,
    continuous_cols=continuous_cols,
)
results_no_grounding = boundary_adherence(
    real_df=real_df_test,
    synthetic_df=synthetic_df_no_grounding,
    categorical_cols=categorical_cols,
    continuous_cols=continuous_cols,
)



In [15]:
results_no_grounding

{'numberRating_coverage': np.float64(1.0),
 'highestRating_coverage': np.float64(1.0),
 'lowestRating_coverage': np.float64(1.0),
 'numberLowRating_coverage': np.float64(1.0),
 'numberMediumRating_coverage': np.float64(1.0),
 'numberHighRating_coverage': np.float64(1.0),
 'numberMessageRead_coverage': np.float64(1.0),
 'readAllMessage_coverage': np.float64(1.0),
 'numberMessageReceived_coverage': np.float64(1.0),
 'sdRating_boundary_conformity': np.float64(1.0),
 'medianRating_boundary_conformity': np.float64(1.0)}

# Calculate the CAT score

In [16]:
def calculate_cat_score(real_df, synthetic_df, categorical_cols) -> float:
    """
    CAT: category coverage averaged over the categorical variables.

    For every categorical column u present in both DataFrames:
      coverage_u = | C_syn(u) ∩ C_real(u) | / | C_real(u) |
    where C_*(u) is the set of distinct non-missing values in column u.
    CAT is the mean of all coverage_u.

    Columns missing from either DataFrame, and columns whose real data holds
    no non-missing value, do not contribute. Returns 0.0 when no column
    contributes.
    """
    per_column_coverage = []

    for name in categorical_cols:
        if name not in real_df.columns or name not in synthetic_df.columns:
            continue

        observed_real = set(real_df[name].dropna().unique())
        if len(observed_real) == 0:
            # Nothing to cover, and the ratio would divide by zero.
            continue

        observed_syn = set(synthetic_df[name].dropna().unique())
        shared = observed_real.intersection(observed_syn)
        per_column_coverage.append(len(shared) / len(observed_real))

    if not per_column_coverage:
        return 0.0
    return float(np.mean(per_column_coverage))


In [22]:
import pandas as pd
import numpy as np

def cat_score_exact(real_df: pd.DataFrame,
                    syn_df: pd.DataFrame,
                    cat_cols: list,
                    include_na: bool = False) -> tuple[float, pd.DataFrame]:
    """
    CAT = (1/J) * sum_j |D_syn^j| / |D_real^j|
    where D_*^j are the sets of distinct observed categories in column j.

    Note that this is a ratio of category *counts*: it does not check that the
    synthetic categories are the same values as the real ones, and it exceeds
    1 when the synthetic column has more distinct values than the real one.

    Parameters
    ----------
    real_df : DataFrame (real data)
    syn_df  : DataFrame (synthetic data)
    cat_cols: list of column names to evaluate (binary/categorical variables)
    include_na : if True, treat NaN as a category; else exclude NaN (default)

    Returns
    -------
    cat_overall : float
        Mean ratio over the columns with a defined ratio; NaN when
        ``cat_cols`` is empty or no column yields a ratio.
    details_df : DataFrame
        Per-column breakdown with |D_real|, |D_syn|, the ratio and a note,
        sorted by column name. Columns missing from either input get NaN
        entries and an explanatory note.
    """
    ratio_key = "ratio_|D_syn|/|D_real|"
    detail_columns = ["column", "real_n_classes", "syn_n_classes", ratio_key, "note"]
    rows = []

    for col in cat_cols:
        if col not in real_df.columns or col not in syn_df.columns:
            rows.append({
                "column": col,
                "real_n_classes": np.nan,
                "syn_n_classes": np.nan,
                ratio_key: np.nan,
                "note": "missing in one of the DataFrames"
            })
            continue

        real_n = int(real_df[col].nunique(dropna=not include_na))
        syn_n = int(syn_df[col].nunique(dropna=not include_na))

        rows.append({
            "column": col,
            "real_n_classes": real_n,
            "syn_n_classes": syn_n,
            # A real column without any category has no defined ratio.
            ratio_key: (syn_n / real_n) if real_n else np.nan,
            "note": ""
        })

    # Passing the column names explicitly keeps the frame well-formed (and
    # sortable) even when `rows` is empty.
    details_df = (
        pd.DataFrame(rows, columns=detail_columns)
        .sort_values("column")
        .reset_index(drop=True)
    )
    if details_df.empty:
        return float("nan"), details_df

    cat_overall = float(details_df[ratio_key].mean(skipna=True))
    return cat_overall, details_df


# CAT score of `synthetic_df` measured against the original training split.
cat_overall, per_col = cat_score_exact(
    real_df=real_df_original,
    syn_df=synthetic_df,
    cat_cols=categorical_cols,
)



In [23]:
# CAT score of each synthetic dataset measured against the real test split.
cat_overall_synthetic, per_col_syn = cat_score_exact(
    real_df=real_df_test,
    syn_df=synthetic_df,
    cat_cols=categorical_cols,
)
cat_overal_synthetic_no_info, per_col_no_info = cat_score_exact(
    real_df=real_df_test,
    syn_df=synthetic_df_no_orignal_inofs,
    cat_cols=categorical_cols,
)
cat_overal_synthetic_no_grounding, per_col_no_grounding = cat_score_exact(
    real_df=real_df_test,
    syn_df=synthetic_df_no_grounding,
    cat_cols=categorical_cols,
)


In [24]:
cat_overall_synthetic

0.8490740740740742

In [25]:
cat_overal_synthetic_no_info

0.6324074074074074

In [26]:
cat_overal_synthetic_no_grounding


0.6824074074074074

In [96]:
# Mean of nine hand-entered values.
# NOTE(review): presumably per-column ratios copied from one of the per-column
# result tables (there are nine categorical columns) -- confirm the source and
# prefer computing this from the DataFrame instead of retyping numbers.
array = [
    0.375, 0.25, 0.25, 0.2,
    1.0, 1.0, 1.0, 1.0, 1.0,
]
np.mean(array)


np.float64(0.675)

In [28]:
# Median of the `medianRating` column in `synthetic_df`.
synthetic_df.loc[:, "medianRating"].median()

np.float64(4.0)