In [7]:
import pandas as pd
import numpy as np
from scipy.stats import wasserstein_distance
from scipy.spatial.distance import jensenshannon


# Load the data

In [15]:
original_data_splits = "../data/original_training_dataset.csv"
real_test_path = "../data/test_df.csv"

baseline_synthetic = "../data/synthetic_data_baseline_prompt.csv"
synthetic_data_no_grounding_synthetic = "../data/prompt_not_grounded_in_synthetic.csv"
synthetic_data_no_info = "../data/prompt_no_info_orignal_data.csv"
original_data ="../data/original_training_dataset.csv"


categorical_cols = [
            'numberRating', 'highestRating', 'lowestRating','medianRating'
            'numberLowRating', 'numberMediumRating', 'numberHighRating',
            'numberMessageRead', 'readAllMessage', 'numberMessageReceived', 
        ]

continuous_cols = ['sdRating']

ordinal_cols = ['highestRating','lowestRating']

# all columns 
all_columns = categorical_cols + continuous_cols


synthetic_df = pd.read_csv(baseline_synthetic)
synthetic_df_no_orignal_inofs = pd.read_csv(synthetic_data_no_info)
synthetic_df_no_grounding = pd.read_csv(synthetic_data_no_grounding_synthetic)
real_df_test = pd.read_csv(real_test_path)
real_df_original = pd.read_csv(original_data_splits)


### Calculate the Wasserstein Distance

In [28]:
def calculate_wasserstein_distances(real_df, synthetic_df, continuous_cols, ordinal_cols):
    """Calculate Wasserstein-1 distances for continuous and ordinal columns."""
    results = []

    # Calculate Wasserstein-1 Distance for continuous columns
    for col in continuous_cols:
        dist = wasserstein_distance(real_df[col], synthetic_df[col])
        results.append({
            'Column': col,
            'Type': 'Continuous',
            'Metric': 'Wasserstein-1',
            'Value': dist
        })
        
    # Calculate Wasserstein-1 Distance for ordinal columns with proper support
    for col in ordinal_cols:
        vc_synthetic = synthetic_df[col].value_counts()
        vc_original = real_df[col].value_counts()
        
        support = sorted(set(vc_synthetic.index).union(vc_original.index))
        
        u_vals = np.array(support, dtype=float)
        v_vals = np.array(support, dtype=float)
        
        u_weights = np.array([vc_original.get(x, 0) for x in support], dtype=float)
        v_weights = np.array([vc_synthetic.get(x, 0) for x in support], dtype=float)
        
            
        dist = wasserstein_distance(u_vals, v_vals, u_weights=u_weights, v_weights=v_weights)
        
        results.append({
            'Column': col,
            'Type': 'Ordinal',
            'Metric': 'Wasserstein-1',
            'Value': dist,
            'Support_Size': len(support),
            'Original_Count': int(vc_original.sum()),
            'Synthetic_Count': int(vc_synthetic.sum())
        })
        
    results  = pd.DataFrame(results)
        
        
    
    return results

In [21]:
original_synthetic = calculate_wasserstein_distances(synthetic_df, real_df_original, continuous_cols, ordinal_cols)
print(f"Mean = {original_synthetic['Value'].mean()} and Std = {original_synthetic['Value'].std()}")


Mean = 1.2488828426330687 and Std = 0.7063936015288309


In [22]:
original_synthetic

Unnamed: 0,Column,Type,Metric,Value,Support_Size,Original_Count,Synthetic_Count
0,sdRating,Continuous,Wasserstein-1,0.479629,,,
1,highestRating,Ordinal,Wasserstein-1,1.86841,8.0,9000.0,1435.0
2,lowestRating,Ordinal,Wasserstein-1,1.39861,8.0,9000.0,1435.0


In [23]:
no_grounding = calculate_wasserstein_distances(real_df_original, synthetic_df_no_grounding, continuous_cols, ordinal_cols)
no_grounding
print(f"Mean = {no_grounding['Value'].mean()} and Std = {no_grounding['Value'].std()}")


Mean = 1.05093648106815 and Std = 0.802098926623282


In [24]:
no_grounding

Unnamed: 0,Column,Type,Metric,Value,Support_Size,Original_Count,Synthetic_Count
0,sdRating,Continuous,Wasserstein-1,0.128977,,,
1,highestRating,Ordinal,Wasserstein-1,1.588443,8.0,1435.0,9000.0
2,lowestRating,Ordinal,Wasserstein-1,1.435389,8.0,1435.0,9000.0


In [25]:
no_dataset_info = calculate_wasserstein_distances(real_df_original, synthetic_df_no_orignal_inofs, continuous_cols, ordinal_cols)
print(f"Mean = {no_dataset_info['Value'].mean()} and Std = {no_dataset_info['Value'].std()}")

Mean = 1.8871312616788731 and Std = 1.2241796397875535


### Calculate the Jensen-Shannon Distance 

In [26]:
def calculate_jensen_shannon_distances(real_df, synthetic_df,categorical_cols):
    """Calculate Jensen-Shannon distances for categorical columns only."""
    results = []

    # Calculate Jensen-Shannon Distance for categorical columns
    for col in categorical_cols:
        # Get value counts and normalize to probabilities
        p = real_df[col].value_counts(normalize=True)
        q = synthetic_df[col].value_counts(normalize=True)

        # Align the distributions
        all_categories = sorted(list(set(p.index) | set(q.index)))
        p_aligned = p.reindex(all_categories, fill_value=0)
        q_aligned = q.reindex(all_categories, fill_value=0)

        # Calculate Jensen-Shannon Distance (square root of divergence)
        js_distance = jensenshannon(p_aligned, q_aligned, base=2)
        results.append({
            'Column': col,
            'Type': 'Categorical',
            'Metric': 'Jensen-Shannon Distance',
            'Value': js_distance
        })
        print(f"Jensen-Shannon distance for {col}: {js_distance:.6f}")

    return pd.DataFrame(results)

In [27]:
original_synthetic_js = calculate_jensen_shannon_distances(real_df_test, synthetic_df, categorical_cols)
original_synthetic_js["Value"].mean()
print(f"Mean = {original_synthetic_js["Value"].mean()} and Std = {original_synthetic_js["Value"].std()}")


Jensen-Shannon distance for numberRating: 0.369735
Jensen-Shannon distance for highestRating: 0.497747
Jensen-Shannon distance for lowestRating: 0.686265


KeyError: 'medianRatingnumberLowRating'

In [122]:
no_grounding_js = calculate_jensen_shannon_distances(real_df_test, synthetic_df_no_grounding, categorical_cols)
print(f"Mean = {no_grounding_js["Value"].mean()} and Std = {no_grounding_js["Value"].std()}")

Jensen-Shannon distance for numberRating: 0.405947
Jensen-Shannon distance for numberLowRating: 0.143579
Jensen-Shannon distance for numberMediumRating: 0.403245
Jensen-Shannon distance for numberHighRating: 0.453409
Jensen-Shannon distance for numberMessageRead: 0.090069
Jensen-Shannon distance for readAllMessage: 0.224013
Jensen-Shannon distance for numberMessageReceived: 0.021710
Jensen-Shannon distance for medianRating: 0.738945
Mean = 0.3101145322400702 and Std = 0.23574536924452766


In [8]:
no_dataset_info_js = calculate_jensen_shannon_distances(real_df_test, synthetic_df_no_dataset_info, categorical_cols)
print(f"Mean = {no_dataset_info_js["Value"].mean()} and Std = {no_dataset_info_js["Value"].std()}")

Jensen-Shannon distance for numberRating: 0.725772
Jensen-Shannon distance for numberLowRating: 0.222124
Jensen-Shannon distance for numberMediumRating: 0.775909
Jensen-Shannon distance for numberHighRating: 0.384463
Jensen-Shannon distance for numberMessageRead: 0.284005
Jensen-Shannon distance for readAllMessage: 0.159766
Jensen-Shannon distance for numberMessageReceived: 0.019570
Jensen-Shannon distance for medianRating: 0.769268
Mean = 0.4176094903365107 and Std = 0.2997830314174479
