In [None]:
# Notebook 06: compare_sampling_methods
from sampling_framework import SamplingFramework
from pyspark.sql import functions as F
import pandas as pd

sf = SamplingFramework(spark)
methods = ["random", "stratified", "cube"]
numeric_cols = ["balance", "n_web_logins", "n_mobile_logins"]

# Divisor used to scale a test-group total up to a population-level estimate.
# NOTE(review): assumes every "<method>_test_group" table holds 90% of
# "processed_customers" -- confirm against the notebooks that build the groups.
TEST_FRACTION = 0.9


def _column_total(df, column):
    """Return the sum of *column* over *df* as a float, or None if Spark yields NULL.

    Spark's ``sum`` aggregate is NULL for an empty (or all-NULL) input, and is a
    ``decimal.Decimal`` for DecimalType columns, which cannot be mixed with
    floats in arithmetic. Normalising here keeps the callers simple.
    """
    total = df.select(F.sum(column).alias("total")).first()["total"]
    return None if total is None else float(total)


def _relative_error(estimate, reference):
    """Return |estimate - reference| / |reference|.

    Returns None when either value is missing, and 0.0 when the reference is
    zero (the ratio is undefined there; 0.0 is the value this notebook has
    always reported for that case).
    """
    if estimate is None or reference is None:
        return None
    if reference == 0:
        return 0.0
    # Divide by the magnitude so a negative reference total cannot flip the sign.
    return abs(estimate - reference) / abs(reference)


# The population total does not depend on the sampling method, so compute it
# once instead of launching the same Spark job on every loop iteration.
pop_total = _column_total(spark.table("processed_customers"), "balance")

comparison_results = []

for m in methods:
    test = spark.table(f"{m}_test_group")
    ctrl = spark.table(f"{m}_ctrl_group")

    # 1. Distribution Balance (SMD), averaged over the numeric columns.
    smd_df = sf.calculate_smd(test, ctrl, numeric_cols)
    avg_smd = smd_df["smd"].mean()

    # 2. Kolmogorov-Smirnov Test (for continuous variables)
    ks_result = sf.calculate_ks_test(test, ctrl, "balance")
    ks_stat = ks_result.get("statistic")
    ks_pvalue = ks_result.get("pvalue")

    # 3. Chi-Square Test (for categorical variables)
    chi2_result = sf.calculate_chi_square(test, ctrl, "visa_ind")
    chi2_stat = chi2_result.get("statistic")
    chi2_pvalue = chi2_result.get("pvalue")

    # 4. Total Balancing Error (Specific to Cube use-case)
    # How far is the scaled-up sample total from the population total?
    test_total = _column_total(test, "balance")
    sample_total = None if test_total is None else test_total / TEST_FRACTION
    rel_error = _relative_error(sample_total, pop_total)

    comparison_results.append({
        "method": m,
        "avg_smd": avg_smd,
        "ks_statistic": ks_stat,
        "ks_pvalue": ks_pvalue,
        "chi2_statistic": chi2_stat,
        "chi2_pvalue": chi2_pvalue,
        "balance_rel_error": rel_error,
    })

# One row per sampling method; missing metrics show up as NaN/None.
results_df = pd.DataFrame(comparison_results)
display(results_df)

# Qualitative Recommendation logic
"""
- Use RANDOM: For pilot studies or where speed is the only priority.
- Use STRATIFIED: When you have clear, non-overlapping segments (e.g., Visa vs No-Visa) 
  and want to guarantee representation.
- Use CUBE: When campaign ROI is highly sensitive to the exact totals of 'balance' 
  and 'logins', and you need multivariate balance without creating sparse strata.
"""