In [2]:
import pandas as pd
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp, mannwhitneyu, chi2_contingency

## Load the Data

In [12]:
# --- Input files (paths are relative to the notebook's working directory) ---
original_data_splits = "../data/original_training_dataset.csv"
real_test_path = "../data/test_df.csv"

# Synthetic datasets, one CSV per prompt variant.
baseline_synthetic = "../data/synthetic_data_baseline_prompt.csv"
synthetic_data_no_grounding_synthetic = "../data/prompt_not_grounded_in_synthetic.csv"
# NOTE(review): "orignal" is kept verbatim; presumably it matches the file name on disk — confirm.
synthetic_data_no_info = "../data/prompt_no_info_orignal_data.csv"

# --- Column groups handed to the statistical comparison ---
# Columns listed here are eligible for the chi-square stage of hierarchical_test.
categorical_cols = [
    'numberRating',
    'highestRating',
    'lowestRating',
    'numberLowRating',
    'numberMediumRating',
    'numberHighRating',
    'numberMessageRead',
    'readAllMessage',
    'numberMessageReceived',
    "medianRating",
]

continuous_cols = ['sdRating']

# Categorical columns first, then continuous ones.
all_columns = [*categorical_cols, *continuous_cols]


# --- Load the synthetic datasets and the real test set ---
synthetic_df_no_grounding = pd.read_csv(synthetic_data_no_grounding_synthetic)
synthetic_df_no_data_info = pd.read_csv(synthetic_data_no_info)
synthetic_df = pd.read_csv(baseline_synthetic)
real_df_test = pd.read_csv(real_test_path)

## Run the hierarchical tests

In [13]:
def hierarchical_test(real_df, synthetic_df, columns, alpha=0.05, categorical_cols=None):
    """
    Perform hierarchical statistical tests to compare distributions of specified columns.

    For every column the real and synthetic samples (NaNs dropped) go through
    up to three stages, stopping at the first test whose p-value is >= ``alpha``:

    1. two-sample Kolmogorov-Smirnov (``ks_2samp``),
    2. Mann-Whitney U (``mannwhitneyu``),
    3. chi-square on the value-by-source contingency table
       (``chi2_contingency``), only for columns in ``categorical_cols``.

    Parameters
    ----------
    real_df : pandas.DataFrame
        Reference data; must contain every name in ``columns``.
    synthetic_df : pandas.DataFrame
        Data compared against ``real_df``; must contain every name in ``columns``.
    columns : iterable of str
        Columns to test, in the order they should appear in the output.
    alpha : float, default 0.05
        Threshold; a p-value >= ``alpha`` counts as "test passed".
    categorical_cols : iterable of str, optional
        Columns eligible for the chi-square stage. ``None`` means none.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``columns`` with the columns ``Variable``,
        ``KS_statistic``, ``KS_pvalue``, ``MW_statistic``, ``MW_pvalue``,
        ``Chi2_statistic``, ``Chi2_pvalue``, ``Test_Passed``,
        ``Final_pvalue`` and ``Significant_Difference``. Statistics of stages
        that were not reached stay NaN. ``Test_Passed`` is ``'KS'``,
        ``'Mann-Whitney'``, ``'Chi-Square'``, ``'None'`` (every applicable
        stage rejected) or ``'Error'`` (chi-square raised ``ValueError``).
        An empty ``columns`` yields an empty table with the same columns.

    Notes
    -----
    Only a ``ValueError`` from ``chi2_contingency`` is caught. Exceptions from
    column lookup or from the KS / Mann-Whitney calls are not handled here and
    propagate to the caller.
    """
    # Fixed output schema, so the table has the same shape even with zero rows.
    result_columns = [
        'Variable', 'KS_statistic', 'KS_pvalue', 'MW_statistic', 'MW_pvalue',
        'Chi2_statistic', 'Chi2_pvalue', 'Test_Passed', 'Final_pvalue',
        'Significant_Difference',
    ]

    results = []
    if categorical_cols is None:
        categorical_cols = []

    for col in columns:

        real_data = real_df[col].dropna()
        synthetic_data = synthetic_df[col].dropna()

        result = {
            'Variable': col, 'KS_statistic': np.nan, 'KS_pvalue': np.nan, 'MW_statistic': np.nan,
            'MW_pvalue': np.nan, 'Chi2_statistic': np.nan, 'Chi2_pvalue': np.nan
        }

        # --- Stage 1: Kolmogorov-Smirnov Test ---
        ks_stat, ks_pvalue = ks_2samp(real_data, synthetic_data)
        result.update({'KS_statistic': ks_stat, 'KS_pvalue': ks_pvalue})

        if ks_pvalue >= alpha:
            result.update({'Test_Passed': 'KS', 'Final_pvalue': ks_pvalue, 'Significant_Difference': False})
            results.append(result)
            continue

        # --- Stage 2: Mann-Whitney U Test (if KS fails) ---
        mwu_stat, mwu_pvalue = mannwhitneyu(real_data, synthetic_data)
        result.update({'MW_statistic': mwu_stat, 'MW_pvalue': mwu_pvalue})

        if mwu_pvalue >= alpha:
            result.update({'Test_Passed': 'Mann-Whitney', 'Final_pvalue': mwu_pvalue, 'Significant_Difference': False})
            results.append(result)
            continue

        # --- Stage 3: Chi-Square Test (categorical columns only) ---
        if col in categorical_cols:
            # Rows: observed values; columns: which sample the value came from.
            # ignore_index avoids duplicate row labels from the two samples.
            pooled_values = pd.concat([real_data, synthetic_data], ignore_index=True)
            source_labels = ['real'] * len(real_data) + ['synth'] * len(synthetic_data)
            contingency_table = pd.crosstab(pooled_values, source_labels)
            try:
                chi2_stat, pvalue, _, _ = chi2_contingency(contingency_table)
                result.update({'Chi2_statistic': chi2_stat, 'Chi2_pvalue': pvalue})
                if pvalue >= alpha:
                    result.update({'Test_Passed': 'Chi-Square', 'Final_pvalue': pvalue, 'Significant_Difference': False})
                else:
                    result.update({'Test_Passed': 'None', 'Final_pvalue': pvalue, 'Significant_Difference': True})
            except ValueError:
                # Chi-square could not be computed; fall back to the Mann-Whitney
                # p-value and flag the column as different.
                result.update({'Test_Passed': 'Error', 'Final_pvalue': mwu_pvalue, 'Significant_Difference': True})
        else:
            result.update({'Test_Passed': 'None', 'Final_pvalue': mwu_pvalue, 'Significant_Difference': True})

        results.append(result)

    # Build the table once, after the loop, so that rows added on the early
    # `continue` paths are included and the result exists even when every
    # column passes early or `columns` is empty.
    return pd.DataFrame(results, columns=result_columns)

In [14]:
# Compare each synthetic dataset against the real test set on the same columns,
# using hierarchical_test's default alpha.
results_synthetic = hierarchical_test(
    real_df=real_df_test,
    synthetic_df=synthetic_df,
    columns=all_columns,
    categorical_cols=categorical_cols,
)
results_synthetic_no_grounding = hierarchical_test(
    real_df=real_df_test,
    synthetic_df=synthetic_df_no_grounding,
    columns=all_columns,
    categorical_cols=categorical_cols,
)
results_synthetic_no_info = hierarchical_test(
    real_df=real_df_test,
    synthetic_df=synthetic_df_no_data_info,
    columns=all_columns,
    categorical_cols=categorical_cols,
)