In [2]:
import pandas as pd

In [3]:
original_data_splits = r"C:\Users\User\PycharmProjects\master_thesis\simulation_data\final_run_data_preparation\data_splits\original_training_dataset.csv"

synthetic_path=r"C:\Users\User\PycharmProjects\master_thesis\simulation_data\final_data_evaluation\create_plots\evaluation_results\fully_fixed_data.csv"

synthetic_data_no_grounding= r"C:\Users\User\PycharmProjects\master_thesis\simulation_data\final_data_evaluation\create_plots\evaluation_results\fully_fixed_data_not_grounded_synthetic.csv"

real_test_path = r"C:\Users\User\PycharmProjects\master_thesis\simulation_data\final_run_data_preparation\data_splits\test_df.csv"
synthetic_new_no_info = r"C:\Users\User\PycharmProjects\master_thesis\simulation_data\final_data_evaluation\create_plots\evaluation_results\fully_fixed_data_no_info_new.csv"

categorical_cols = [
            'numberRating', 'highestRating', 'lowestRating',
            'numberLowRating', 'numberMediumRating', 'numberHighRating',
            'numberMessageRead', 'readAllMessage', 'numberMessageReceived', "medianRating"
        ]

continuous_cols = ['sdRating']

# all columns 
all_columns = categorical_cols + continuous_cols


synthetic_df_no_grounding = pd.read_csv(synthetic_data_no_grounding)
synthetic_df_no_data_info = pd.read_csv(synthetic_new_no_info)
synthetic_df = pd.read_csv(synthetic_path)
real_df_original = pd.read_csv(original_data_splits)
real_df_test = pd.read_csv(real_test_path)

### Run the statistical tests

In [4]:
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp, mannwhitneyu, chi2_contingency,fisher_exact

def hierarchical_tests_with_readall_special(
    real_df: pd.DataFrame,
    synthetic_df: pd.DataFrame,
    all_cols: list,
    read_all_col: str = "readAllMessage",
    alpha: float = 0.05,
    exclude_cols: set | None = None,   # e.g., {"sdRating"} if you want to skip it
) -> pd.DataFrame:
    """
    For all columns in `all_cols` (present in both dataframes):

      - If col == read_all_col:
          Perform Chi-square test of independence on the 2 categories (Yes/No)
          vs Source (real/synthetic). Also report Fisher's exact p-value.
          Returns category counts per source for transparency.

      - Else:
          Hierarchical numeric tests:
            1) KS two-sample test (two-sided)
               - if p >= alpha -> PASS ("KS")
               - else
            2) Mann–Whitney U (two-sided)
               - if p >= alpha -> PASS ("Mann-Whitney")
               - else -> FAIL ("None")
          (No χ² fallback for these columns.)

    Notes:
      - For KS/MW, values are coerced to numeric. If insufficient numeric
        data remain, the test is skipped with a note.
    """
    if exclude_cols is None:
        exclude_cols = set()

    # Only keep columns that exist in both and not excluded
    cols = [c for c in all_cols if c in real_df.columns and c in synthetic_df.columns and c not in exclude_cols]

    rows = []
    for col in cols:
        a_raw = real_df[col].dropna()
        b_raw = synthetic_df[col].dropna()
        n_a, n_b = len(a_raw), len(b_raw)

        # defaults
        ks_stat = ks_p = mw_stat = mw_p = chi2_stat = chi2_p = fisher_p = np.nan
        test_passed = "None"
        final_p = np.nan
        significant = None
        notes = ""
        cat1 = cat2 = None
        cat1_real = cat1_synth = cat2_real = cat2_synth = np.nan

        if n_a < 2 or n_b < 2:
            rows.append({
                "Variable": col,
                "KS_statistic": ks_stat, "KS_pvalue": ks_p,
                "MW_statistic": mw_stat, "MW_pvalue": mw_p,
                "Chi2_statistic": chi2_stat, "Chi2_pvalue": chi2_p,
                "Fisher_pvalue": fisher_p,
                "Test_Passed": test_passed, "Final_pvalue": final_p,
                "Significant_Difference": significant,
                "N_real": n_a, "N_synth": n_b,
                "Cat1": cat1, "Cat1_real": cat1_real, "Cat1_synth": cat1_synth,
                "Cat2": cat2, "Cat2_real": cat2_real, "Cat2_synth": cat2_synth,
                "Notes": "Too few non-NA values",
            })
            continue

        if col == read_all_col:
            # ---- Chi-square (2x2) on readAllMessage only ----
            # Treat values as categories (no coercion to numeric).
            a_cat = a_raw.astype("object")
            b_cat = b_raw.astype("object")

            combined = pd.concat([
                pd.DataFrame({"value": a_cat.to_numpy(), "source": "real"}),
                pd.DataFrame({"value": b_cat.to_numpy(), "source": "synthetic"}),
            ], ignore_index=True)

            contingency = pd.crosstab(combined["value"], combined["source"])

            # Expect exactly 2 categories; if more/less, we still run χ² but report a note.
            if contingency.shape[0] == 2:
                cats = list(contingency.index)
                # prefer 'yes' first if present
                cats_sorted = sorted(cats, key=lambda x: (str(x).strip().lower() != "yes", str(x)))
            else:
                cats_sorted = list(contingency.index)
                notes += f"Expected 2 categories; found {len(cats_sorted)}. "

            # Fill missing columns if needed
            for src in ("real", "synthetic"):
                if src not in contingency.columns:
                    contingency[src] = 0
            contingency = contingency[["real", "synthetic"]]

            # Extract up to two categories for reporting
            if len(cats_sorted) >= 1:
                c1 = cats_sorted[0]
                cat1 = str(c1)
                cat1_real = int(contingency.loc[c1, "real"]) if c1 in contingency.index else 0
                cat1_synth = int(contingency.loc[c1, "synthetic"]) if c1 in contingency.index else 0
            if len(cats_sorted) >= 2:
                c2 = cats_sorted[1]
                cat2 = str(c2)
                cat2_real = int(contingency.loc[c2, "real"]) if c2 in contingency.index else 0
                cat2_synth = int(contingency.loc[c2, "synthetic"]) if c2 in contingency.index else 0

            # Chi-square test (works for any number of categories)
            if contingency.shape[0] > 1 and contingency.shape[1] > 1:
                chi2_stat, chi2_p, dof, expected = chi2_contingency(contingency)
                chi2_stat, chi2_p = float(chi2_stat), float(chi2_p)
                test_passed = "Chi-square" if chi2_p >= alpha else "None"
                final_p = chi2_p
                significant = (chi2_p < alpha)

                # For 2x2 specifically, also report Fisher’s exact p-value
                if contingency.shape[0] == 2:
                    try:
                        fisher_p = float(fisher_exact(contingency.to_numpy())[1])
                    except Exception as e:
                        notes += f"Fisher exact failed: {e}. "
            else:
                notes += "Insufficient categories for χ². "
                significant = None

        # ---- KS -> Mann–Whitney (numeric only) ----
        a_num = pd.to_numeric(a_raw, errors="coerce").dropna()
        b_num = pd.to_numeric(b_raw, errors="coerce").dropna()
        if len(a_num) < 2 or len(b_num) < 2:
            notes += "Not enough numeric values after coercion for KS/MW. "
            significant = None
        else:
            # KS
            try:
                res = ks_2samp(a_num, b_num, alternative="two-sided", method="auto")
                ks_stat, ks_p = float(res.statistic), float(res.pvalue)
            except Exception as e:
                notes += f"KS failed: {e}. "
                ks_stat, ks_p = np.nan, np.nan

            if np.isfinite(ks_p) and ks_p >= alpha:
                test_passed, final_p, significant = "KS", ks_p, False
            else:
                # Mann–Whitney
                try:
                    res = mannwhitneyu(a_num, b_num, alternative="two-sided", method="auto")
                    mw_stat, mw_p = float(res.statistic), float(res.pvalue)
                except Exception as e:
                    notes += f"Mann–Whitney failed: {e}. "
                    mw_stat, mw_p = np.nan, np.nan

                if np.isfinite(mw_p) and mw_p >= alpha:
                    test_passed, final_p, significant = "Mann-Whitney", mw_p, False
                else:
                    test_passed, final_p, significant = "None", mw_p, (mw_p < alpha if np.isfinite(mw_p) else None)

        rows.append({
            "Variable": col,
            "KS_statistic": ks_stat, "KS_pvalue": ks_p,
            "MW_statistic": mw_stat, "MW_pvalue": mw_p,
            "Chi2_statistic": chi2_stat, "Chi2_pvalue": chi2_p,
            "Fisher_pvalue": fisher_p,
            "Test_Passed": test_passed, "Final_pvalue": final_p,
            "Significant_Difference": significant,
            "N_real": n_a, "N_synth": n_b,
            "Cat1": cat1, "Cat1_real": cat1_real, "Cat1_synth": cat1_synth,
            "Cat2": cat2, "Cat2_real": cat2_real, "Cat2_synth": cat2_synth,
            "Notes": notes.strip(),
        })

    # keep input order
    out = pd.DataFrame(rows)
    if not out.empty:
        order = {c: i for i, c in enumerate(cols)}
        out["__ord__"] = out["Variable"].map(order)
        out = out.sort_values(["__ord__", "Variable"]).drop(columns="__ord__")
    return out


## Conduct the tests

In [5]:
results_original = hierarchical_tests_with_readall_special(real_df_test, synthetic_df, all_columns)
results_original_no_info = hierarchical_tests_with_readall_special(real_df_test, synthetic_df_no_data_info, all_columns)
results_original_no_grounding = hierarchical_tests_with_readall_special(real_df_test, synthetic_df_no_grounding, all_columns)

In [6]:
results_original

Unnamed: 0,Variable,KS_statistic,KS_pvalue,MW_statistic,MW_pvalue,Chi2_statistic,Chi2_pvalue,Fisher_pvalue,Test_Passed,Final_pvalue,Significant_Difference,N_real,N_synth,Cat1,Cat1_real,Cat1_synth,Cat2,Cat2_real,Cat2_synth,Notes
0,numberRating,0.31068,3.2363009999999996e-42,1345481.5,6.199483e-60,,,,,6.199483e-60,True,515,9000,,,,,,,
1,highestRating,0.317292,4.706246999999999e-44,2147583.0,0.0016717,,,,,0.0016717,True,515,9000,,,,,,,
2,lowestRating,0.460025,1.45871e-94,2810360.0,1.157784e-18,,,,,1.157784e-18,True,515,9000,,,,,,,
3,numberLowRating,0.010468,1.0,,,,,,KS,1.0,False,515,9000,,,,,,,
4,numberMediumRating,0.561165,1.990186e-144,729452.0,3.0125710000000005e-159,,,,,3.0125710000000005e-159,True,515,9000,,,,,,,
5,numberHighRating,0.204006,3.015132e-18,2847766.0,1.542847e-42,,,,,1.542847e-42,True,515,9000,,,,,,,
6,numberMessageRead,0.069117,0.0180977,2134747.0,0.001425328,,,,,0.001425328,True,515,9000,,,,,,,
7,readAllMessage,0.179762,3.055428e-14,1900902.5,8.226999e-88,390.154079,7.661859e-87,1.487562e-49,,8.226999e-88,True,515,9000,0.0,109.0,287.0,1.0,406.0,8713.0,
8,numberMessageReceived,0.014056,0.9999611,,,,,,KS,0.9999611,False,515,9000,,,,,,,
9,medianRating,0.503177,2.899343e-114,2647400.5,8.75133e-10,,,,,8.75133e-10,True,515,9000,,,,,,,


In [28]:
results_original_no_info

Unnamed: 0,Variable,KS_statistic,KS_pvalue,MW_statistic,MW_pvalue,Chi2_statistic,Chi2_pvalue,Fisher_pvalue,Test_Passed,Final_pvalue,Significant_Difference,N_real,N_synth,Cat1,Cat1_real,Cat1_synth,Cat2,Cat2_real,Cat2_synth,Notes
0,numberRating,0.544723,2.0253439999999998e-135,799041.0,2.921999e-153,,,,,2.921999e-153,True,515,9000,,,,,,,
1,highestRating,0.44466,4.485898e-88,1124133.5,5.236816e-96,,,,,5.236816e-96,True,515,9000,,,,,,,
2,lowestRating,0.458691,5.466025e-94,2452212.0,2.833454e-08,,,,,2.833454e-08,True,515,9000,,,,,,,
3,numberLowRating,0.096865,0.000197443,2541996.0,6.850974e-185,,,,,6.850974e-185,True,515,9000,,,,,,,
4,numberMediumRating,0.757721,1.8138740000000003e-287,480821.0,6.495406999999999e-281,,,,,6.495406999999999e-281,True,515,9000,,,,,,,
5,numberHighRating,0.313105,6.9281500000000004e-43,1667591.5,6.364327e-30,,,,,6.364327e-30,True,515,9000,,,,,,,
6,numberMessageRead,0.118447,2.059935e-06,2428827.0,0.04827065,,,,,0.04827065,True,515,9000,,,,,,,
7,readAllMessage,0.171016,6.344388e-13,2713830.0,6.05568e-15,60.161247,8.739606e-15,6.11116e-16,,6.05568e-15,True,515,9000,0.0,109.0,3444.0,1.0,406.0,5556.0,
8,numberMessageReceived,0.011501,0.9999999,,,,,,KS,0.9999999,False,515,9000,,,,,,,
9,medianRating,0.473453,1.8655509999999998e-100,2043263.0,7.188479e-08,,,,,7.188479e-08,True,515,9000,,,,,,,


In [29]:
results_original_no_grounding

Unnamed: 0,Variable,KS_statistic,KS_pvalue,MW_statistic,MW_pvalue,Chi2_statistic,Chi2_pvalue,Fisher_pvalue,Test_Passed,Final_pvalue,Significant_Difference,N_real,N_synth,Cat1,Cat1_real,Cat1_synth,Cat2,Cat2_real,Cat2_synth,Notes
0,numberRating,0.272087,2.511858e-32,2533683.5,0.0001170115,,,,,0.0001170115,True,515,9000,,,,,,,
1,highestRating,0.555229,3.9234560000000004e-141,2973455.0,6.05895e-53,,,,,6.05895e-53,True,515,9000,,,,,,,
2,lowestRating,0.462136,1.784057e-95,2693251.5,5.248608e-18,,,,,5.248608e-18,True,515,9000,,,,,,,
3,numberLowRating,0.073421,0.009914879,2488659.5,2.7290080000000005e-23,,,,,2.7290080000000005e-23,True,515,9000,,,,,,,
4,numberMediumRating,0.315824,1.2126550000000001e-43,1600383.0,1.45385e-37,,,,,1.45385e-37,True,515,9000,,,,,,,
5,numberHighRating,0.35534,1.85982e-55,3141000.0,0.0,,,,,0.0,True,515,9000,,,,,,,
6,numberMessageRead,0.076228,0.006566969,2142683.0,0.002183387,,,,,0.002183387,True,515,9000,,,,,,,
7,readAllMessage,0.169095,1.21005e-12,1925622.5,9.964491e-64,280.617739,5.50783e-63,1.190865e-39,,9.964491e-64,True,515,9000,0.0,109.0,383.0,1.0,406.0,8617.0,
8,numberMessageReceived,0.01357,0.9999831,,,,,,KS,0.9999831,False,515,9000,,,,,,,
9,medianRating,0.526102,1.1606130000000001e-125,2855830.5,3.686971e-36,,,,,3.686971e-36,True,515,9000,,,,,,,


In [22]:
results_original_no_info

Unnamed: 0,Variable,KS_statistic,KS_pvalue,MW_statistic,MW_pvalue,Chi2_statistic,Chi2_pvalue,Fisher_pvalue,Test_Passed,Final_pvalue,Significant_Difference,N_real,N_synth,Cat1,Cat1_real,Cat1_synth,Cat2,Cat2_real,Cat2_synth,Notes
0,numberRating,0.544723,2.0253439999999998e-135,799041.0,2.921999e-153,,,,,2.921999e-153,True,515,9000,,,,,,,
1,highestRating,0.44466,4.485898e-88,1124133.5,5.236816e-96,,,,,5.236816e-96,True,515,9000,,,,,,,
2,lowestRating,0.458691,5.466025e-94,2452212.0,2.833454e-08,,,,,2.833454e-08,True,515,9000,,,,,,,
3,numberLowRating,0.096865,0.000197443,2541996.0,6.850974e-185,,,,,6.850974e-185,True,515,9000,,,,,,,
4,numberMediumRating,0.757721,1.8138740000000003e-287,480821.0,6.495406999999999e-281,,,,,6.495406999999999e-281,True,515,9000,,,,,,,
5,numberHighRating,0.313105,6.9281500000000004e-43,1667591.5,6.364327e-30,,,,,6.364327e-30,True,515,9000,,,,,,,
6,numberMessageRead,0.118447,2.059935e-06,2428827.0,0.04827065,,,,,0.04827065,True,515,9000,,,,,,,
7,readAllMessage,0.171016,6.344388e-13,2713830.0,6.05568e-15,60.161247,8.739606e-15,6.11116e-16,,6.05568e-15,True,515,9000,0.0,109.0,3444.0,1.0,406.0,5556.0,
8,numberMessageReceived,0.011501,0.9999999,,,,,,KS,0.9999999,False,515,9000,,,,,,,
9,medianRating,0.473453,1.8655509999999998e-100,2043263.0,7.188479e-08,,,,,7.188479e-08,True,515,9000,,,,,,,


In [11]:
 synthetic_df["action"].value_counts()

KeyError: 'action'

In [15]:
synthetic_df["rl_action"].value_counts()

rl_action
0    2290
1    2241
2    2239
3    2230
Name: count, dtype: int64

In [16]:
# run a KS statistics
from scipy.stats import ks_2samp
ks_2samp(synthetic_df["rl_action"],real_df_test["action"], alternative="two-sided", method="auto")

KstestResult(statistic=np.float64(0.11014778856526429), pvalue=np.float64(1.3279078621782419e-05), statistic_location=np.int64(1), statistic_sign=np.int8(-1))

In [17]:
real_df_test

Unnamed: 0,serverTimestamp,day_part_x,user_id,numberRating,highestRating,lowestRating,medianRating,sdRating,numberLowRating,numberMediumRating,numberHighRating,numberMessageReceived,numberMessageRead,readAllMessage,reward,timestamp,action
0,2021-09-29,1,ML090,3.0,3.0,2.0,3.0,0.471405,1.0,2.0,0.0,0,0,0,1.50,2021-09-29 13:01:05.816545,0
1,2021-09-29,2,ML090,4.0,4.0,2.0,3.0,0.707107,1.0,3.0,0.0,0,0,0,2.00,2021-09-29 20:01:05.180532,0
2,2021-09-30,0,ML090,1.0,3.0,3.0,3.0,0.000000,0.0,1.0,0.0,1,1,1,1.00,2021-09-30 09:01:05.966740,1
3,2021-09-30,1,ML090,1.0,3.0,3.0,3.0,0.000000,0.0,1.0,0.0,2,2,1,1.00,2021-09-30 13:01:06.621816,1
4,2021-09-30,2,ML090,2.0,4.0,3.0,3.5,0.500000,0.0,2.0,0.0,2,2,1,1.50,2021-09-30 20:01:05.645037,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,2022-02-17,1,ML469,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,2,2,1,0.50,2022-02-17 13:01:34.203446,2
511,2022-02-17,2,ML469,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,3,3,1,0.50,2022-02-17 20:01:34.524268,1
512,2022-02-18,0,ML469,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1,1,1,0.50,2022-02-18 09:01:35.935410,2
513,2022-02-18,1,ML469,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1,1,1,0.50,2022-02-18 13:01:33.145101,0


In [18]:
# python
import pandas as pd
from scipy.stats import ks_2samp, chi2_contingency

alpha = 0.05

# KS (nur sinnvoll, wenn die Daten als stetig/ordinal interpretiert werden)
ks_res = ks_2samp(
    pd.to_numeric(synthetic_df["rl_action"], errors="coerce").dropna(),
    pd.to_numeric(real_df_test["action"], errors="coerce").dropna(),
    alternative="two-sided",
    method="auto",
)
print(f"KS: statistic={ks_res.statistic:.4f}, p={ks_res.pvalue:.6g}, significant={ks_res.pvalue < alpha}")

# Chi-Quadrat für kategoriale Verteilungen (empfohlen für `action`)
df = pd.concat([
    pd.DataFrame({"action": real_df_test["action"], "source": "real"}),
    pd.DataFrame({"action": synthetic_df["rl_action"], "source": "synthetic"}),
], ignore_index=True).dropna()

cont = pd.crosstab(df["action"], df["source"])
chi2, p_chi2, dof, expected = chi2_contingency(cont)
print(f"Chi-square: chi2={chi2:.4f}, dof={dof}, p={p_chi2:.6g}, significant={p_chi2 < alpha}")

KS: statistic=0.1101, p=1.32791e-05, significant=True
Chi-square: chi2=40.7861, dof=3, p=7.25924e-09, significant=True
