In [15]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Read the source workbook into a DataFrame (pandas defaults for sheet/header).
data = pd.read_excel("dataset_a_week_participants.xlsx")

Select the data from the Pre-Trial phase

In [7]:
# Keep only the rows whose `phase` value is exactly 'Pre-Trial'.
data_pre = data.loc[data['phase'].eq('Pre-Trial')]

In [13]:
# Collapse the Pre-Trial rows to one row per participant (PID).
# The measure columns are summed; the group labels use pandas' `first`
# aggregation within each participant.
summed_columns = [
    'FV_sales',
    'total_coupon_used',
    'FV_out_of_pocket',
    'NFV_sales',
    'DP_out_of_pocket ',  # NOTE: the trailing space is part of the column name
    'other_sales',
    'portion',
    'transaction',
    'total_sales',
]
label_columns = ['Intervention', 'COHORT #']

data_pre_sum = (
    data_pre
    .groupby('PID')
    .agg({
        **dict.fromkeys(summed_columns, 'sum'),
        **dict.fromkeys(label_columns, 'first'),
    })
    .reset_index()  # bring PID back as an ordinary column
)


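Balance check across intervention groups using one-way ANOVA and Welch's t-test (compare means), the Kruskal–Wallis test (rank-based comparison across all groups), the Kolmogorov–Smirnov (KS) test (compare distributions), and the Standardized Mean Difference (SMD / Cohen’s d)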

In [24]:
# Short working name for the per-participant Pre-Trial totals.
# NOTE: this is an alias, not a copy -- both names refer to the same DataFrame.
df = data_pre_sum
def smd(x, y):
    """Return the standardized mean difference between samples ``x`` and ``y``.

    The difference in means (``x`` minus ``y``) is divided by the square root
    of the average of the two sample variances (computed with ``ddof=1``).
    """
    mean_gap = np.mean(x) - np.mean(y)
    avg_variance = (np.var(x, ddof=1) + np.var(y, ddof=1)) / 2
    return mean_gap / np.sqrt(avg_variance)

# Columns whose per-participant totals are compared across intervention groups.
# NOTE: 'DP_out_of_pocket ' carries a trailing space; it must match the column
# name in `df` exactly.
variables = ["FV_sales", "NFV_sales", "DP_out_of_pocket ",
             "other_sales", "portion", "transaction", "total_sales"]

# Group labels in order of first appearance. Missing labels are dropped:
# `unique()` alone would keep NaN, and `== NaN` never matches, which would feed
# an empty sample into every test below and add a meaningless "Mean_nan" column.
groups = df['Intervention'].dropna().unique()
results = []

for col in variables:
    # Slice each group's non-missing values once per variable and reuse them
    # for the overall tests, the means, and every pairwise comparison.
    samples = {g: df.loc[df['Intervention'] == g, col].dropna() for g in groups}

    # --- Overall tests across all groups ---
    _, p_anova = stats.f_oneway(*samples.values())  # one-way ANOVA
    _, p_kw = stats.kruskal(*samples.values())      # Kruskal-Wallis

    # Per-group means (missing values are already excluded from `samples`)
    mean_vals = {f"Mean_{g}": values.mean() for g, values in samples.items()}

    # --- Pairwise comparisons: every unordered pair of groups ---
    pairwise = []
    for i, g1 in enumerate(groups):
        for g2 in groups[i + 1:]:
            x, y = samples[g1], samples[g2]

            # Welch's t-test (does not assume equal variances)
            _, p_t = stats.ttest_ind(x, y, equal_var=False)

            # Two-sample Kolmogorov-Smirnov test
            _, p_ks = stats.ks_2samp(x, y)

            pairwise.append({
                "Comparison": f"{g1} vs {g2}",
                "p_ttest": p_t,
                "p_KS": p_ks,
                "SMD": smd(x, y),
            })

    results.append({
        "Variable": col,
        "Overall_p_ANOVA": p_anova,
        "Overall_p_KW": p_kw,
        **mean_vals,
        "Pairwise": pairwise,
    })

# Overall table: one row per variable. The nested "Pairwise" lists stay in this
# frame but are dropped when it is displayed.
balance_df = pd.DataFrame(results)

# Long-format table: one row per (variable, pair of groups).
pairwise_expanded = [
    {"Variable": row["Variable"], **pw}
    for row in results
    for pw in row["Pairwise"]
]
pairwise_df = pd.DataFrame(pairwise_expanded)

# Show results
print("=== Overall Balance ===")
display(balance_df.drop(columns="Pairwise"))

print("\n=== Pairwise Balance ===")
display(pairwise_df)


=== Overall Balance ===


Unnamed: 0,Variable,Overall_p_ANOVA,Overall_p_KW,Mean_COUPON,Mean_DELIVERY,Mean_USUAL
0,FV_sales,0.973779,0.89869,107.434,102.0975,112.540556
1,NFV_sales,0.381024,0.243095,48.0805,80.996,76.696111
2,DP_out_of_pocket,0.86919,0.818826,155.5145,183.0935,189.236667
3,other_sales,0.218487,0.186419,575.286,967.615,681.380556
4,portion,0.987094,0.769638,171.906529,175.79204,183.109
5,transaction,0.203504,0.311925,15.15,33.2,25.111111
6,total_sales,0.331344,0.293185,730.8005,1150.7085,870.617222



=== Pairwise Balance ===


Unnamed: 0,Variable,Comparison,p_ttest,p_KS,SMD
0,FV_sales,COUPON vs DELIVERY,0.872028,0.999992,0.051285
1,FV_sales,COUPON vs USUAL,0.922735,0.780627,-0.032282
2,FV_sales,DELIVERY vs USUAL,0.841619,0.525289,-0.066592
3,NFV_sales,COUPON vs DELIVERY,0.149239,0.174533,-0.467275
4,NFV_sales,COUPON vs USUAL,0.284533,0.712544,-0.360275
5,NFV_sales,DELIVERY vs USUAL,0.886727,0.657641,0.046828
6,DP_out_of_pocket,COUPON vs DELIVERY,0.600045,0.83197,-0.167241
7,DP_out_of_pocket,COUPON vs USUAL,0.661634,0.780627,-0.146019
8,DP_out_of_pocket,DELIVERY vs USUAL,0.938385,0.596741,-0.025646
9,other_sales,COUPON vs DELIVERY,0.082914,0.335591,-0.565568


Save the balance check results

In [None]:
# Save both balance tables to one workbook, one sheet each.
# The original cell referenced `summary_df`, which is never defined in this
# notebook, and a regression-summary filename that does not describe this
# output. The nested "Pairwise" column is dropped from the overall sheet
# because the same values are written, one per row, on the pairwise sheet.
with pd.ExcelWriter('balance_check_results.xlsx') as writer:
    balance_df.drop(columns="Pairwise").to_excel(writer, sheet_name="overall", index=False)
    pairwise_df.to_excel(writer, sheet_name="pairwise", index=False)