In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, norm
from statsmodels.stats.power import NormalIndPower


In [None]:
# Load the processed experiment results (path is relative to the notebook's working directory)
df = pd.read_csv("../data/processed_experiment_results.csv")

# Overview of Data
print("\n Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() -- that would emit a stray "None" line.
df.info()

print("\n Summary Statistics:")
print(df.describe())

In [None]:
# Per-strategy mean and median of each metric column, plus the number of
# session_id entries in each strategy group.
summary_stats = df.groupby("strategy").agg(
    **{
        f"{label}_{stat}": (column, stat)
        for label, column in (
            ("clicks", "clicks"),
            ("interest", "high_interest"),
            ("session_length", "session_length"),
        )
        for stat in ("mean", "median")
    },
    count=("session_id", "count"),
)

# Leave the table as the cell's last expression so the notebook renders it
summary_stats

In [None]:
# Box plot of the `clicks` column, one box per strategy
fig, ax = plt.subplots(figsize=(9, 3))
sns.boxplot(x="strategy", y="clicks", data=df, ax=ax)
ax.set_title("Clicks Distribution by Strategy")
plt.show()

In [None]:
# Box plot of the `high_interest` column, one box per strategy
fig, ax = plt.subplots(figsize=(9, 3))
sns.boxplot(x="strategy", y="high_interest", data=df, ax=ax)
ax.set_title("Interest Actions by Strategy")
plt.show()

In [None]:
# Box plot of the `session_length` column, one box per strategy
fig, ax = plt.subplots(figsize=(9, 3))
sns.boxplot(x="strategy", y="session_length", data=df, ax=ax)
ax.set_title("Session Length by Strategy")
plt.show()

### Sample size

In [None]:
def calculate_sample_size(baseline_rate, mde=0.05, alpha=0.05, power=0.8):
    """
    Calculates the minimum required sample size per group for detecting a given Minimum Detectable Effect (MDE).

    The MDE is applied as a *relative* change: the expected rate is
    ``baseline_rate * (1 + mde)``. Both rates are converted to Cohen's h and the
    per-group size is solved with ``NormalIndPower`` for a two-sided test with
    equally sized groups (``ratio=1``).

    Args:
        baseline_rate : The current conversion rate (CTR or interest rate), given as a
            proportion in [0, 1] (e.g. 0.10, not 10).
        mde : The relative minimum detectable effect (default = 0.05, i.e. a 5% relative improvement).
        alpha : Significance level (default = 0.05).
        power : Statistical power (default = 0.8).

    Returns:
        int: Minimum required sample size per group.

    Raises:
        ValueError: If ``baseline_rate`` or the implied expected rate is not a valid
            proportion, or if the two rates coincide (an effect of zero cannot be detected).
    """
    p0 = baseline_rate
    if not 0 <= p0 <= 1:
        raise ValueError(f"baseline_rate must be a proportion in [0, 1], got {baseline_rate!r}")

    p1 = p0 * (1 + mde)  # Expected rate after the relative change
    if not 0 <= p1 <= 1:
        # arcsin(sqrt(p)) is only defined for p in [0, 1]; fail here with a clear
        # message instead of letting NaN propagate into the solver.
        raise ValueError(
            f"baseline_rate * (1 + mde) = {p1!r} is not a proportion in [0, 1]; "
            "lower mde or baseline_rate"
        )

    # Compute effect size (Cohen's h)
    effect_size = 2 * np.arcsin(np.sqrt(p1)) - 2 * np.arcsin(np.sqrt(p0))
    if effect_size == 0:
        raise ValueError("Expected rate equals baseline rate; an effect size of 0 cannot be detected")

    # Calculate sample size per group
    analysis = NormalIndPower()
    sample_size = analysis.solve_power(effect_size, power=power, alpha=alpha, ratio=1, alternative="two-sided")

    return int(np.ceil(sample_size))


In [None]:
# Baseline click-through rate assumed for the power calculation: 10% (0.10)
baseline_ctr = 0.10
required_sample = calculate_sample_size(baseline_ctr)

print(f"Minimum sample size per group: {required_sample}")

# One row per strategy: how many rows were observed, and whether that count
# reaches the required minimum computed above.
session_counts = (
    df.groupby(['strategy'])['strategy']
    .count()
    .to_frame(name="observed_sessions")
    .assign(sufficient_data=lambda counts: counts["observed_sessions"] >= required_sample)
)

print('Number of samples in a group exceeds required minimum: ')
print(session_counts)

## AB Test (against strategy_0)

#### T-Test (Student’s T-Test)

A **T-test** compares the means of two groups and determines whether the difference is statistically significant or just random variation. It is appropriate when the population variance of user behavior (clicks, interest events, session length) is unknown and has to be estimated from the sample.

Compare the mean of each metric (number of clicks, interest events, session length) between strategy 0 (no treatment) and each of the other strategies.

The p-value tells if the difference is statistically significant:
- If p < 0.05, we reject the null hypothesis (H₀) and say the strategy makes a difference.
- If p ≥ 0.05, we fail to reject H₀: the data do not provide sufficient evidence that the strategy has an effect.


#### Z-Score Test

A Z-Test is similar to a T-Test, but it assumes a large sample size and known variance. The Z-score indicates how far our observed result is from the expected mean, measured in standard deviations.

Z-Score Test calculates how many standard deviations away the treatment is from strategy_0 (control).
The Z-score is converted into a p-value, which tells us whether the difference is statistically significant.

If the sample is big enough (n > 30), the **Central Limit Theorem** allows us to use the normal distribution.
This gives more stable results for large experiments (thousands of sessions).

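If sample size > 30, the Z-test and the T-test give nearly identical results, because the t distribution converges to the normal distribution as the sample grows.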


In [None]:
def run_z_test(group1, group2, metric):
    """
    Two-sample, two-tailed z-test on `metric` between two strategies.

    Reads the notebook-level DataFrame `df`, selecting rows whose "strategy"
    column equals `group1` / `group2`. The standard error is built from each
    group's own sample variance (unpooled).

    Args:
        group1 : Value of the "strategy" column identifying the first group.
        group2 : Value of the "strategy" column identifying the second group.
        metric : Name of the column in `df` to compare.

    Returns:
        tuple: (z_score, p_value), where z_score is (mean1 - mean2) / standard error
        and p_value is the two-tailed probability under the standard normal.
    """
    # Select each group's metric values once instead of re-filtering df for every statistic
    sample1 = df.loc[df["strategy"] == group1, metric]
    sample2 = df.loc[df["strategy"] == group2, metric]

    # Compute means, standard deviations and sizes
    x1, x2 = sample1.mean(), sample2.mean()
    s1, s2 = sample1.std(), sample2.std()
    n1, n2 = sample1.count(), sample2.count()

    # Compute Z-score
    se = np.sqrt((s1**2 / n1) + (s2**2 / n2))
    z_score = (x1 - x2) / se

    # Two-tailed test. The survival function is used rather than 1 - cdf:
    # for large |z| the cdf rounds to 1.0 and the subtraction collapses to
    # exactly 0, whereas sf keeps the small tail probability.
    p_value = 2 * norm.sf(abs(z_score))

    return z_score, p_value

def run_t_test(group1, group2, metric):
    """
    Independent two-sample t-test on `metric` between two strategies.

    Reads the notebook-level DataFrame `df` and calls `ttest_ind` with
    `equal_var=False`, i.e. the groups are not assumed to share a variance.

    Args:
        group1 : Value of the "strategy" column identifying the first group.
        group2 : Value of the "strategy" column identifying the second group.
        metric : Name of the column in `df` to compare.

    Returns:
        tuple: (t statistic, p-value) as reported by `ttest_ind`.
    """
    in_group1 = df["strategy"] == group1
    in_group2 = df["strategy"] == group2
    outcome = ttest_ind(df.loc[in_group1, metric], df.loc[in_group2, metric], equal_var=False)
    return outcome.statistic, outcome.pvalue

def generate_test_report(baseline_strategy):
    """
    Compare every non-baseline strategy against the baseline on every metric.

    Iterates over the notebook-level `strategies` and `metrics` collections
    (defined outside this function), runs `run_t_test` and `run_z_test` for each
    (strategy, metric) pair, prints the resulting table and returns it.

    Args:
        baseline_strategy : Strategy identifier used as the control; it is skipped
            in the loop and passed as the first group to both tests.

    Returns:
        pandas.DataFrame: One row per (compared strategy, metric) with columns
        "Baseline", "Compared Strategy", "Metric", "T-Statistic", "P-Value"
        (t-test p-value), "Z-Score" and "Z P-Value".
    """
    results = []

    for strategy in strategies:
        if strategy == baseline_strategy:
            continue

        for metric in metrics:
            t_stat, t_p_value = run_t_test(baseline_strategy, strategy, metric)
            z_stat, z_p_value = run_z_test(baseline_strategy, strategy, metric)
            results.append([baseline_strategy, strategy, metric, t_stat, t_p_value, z_stat, z_p_value])

    # Each row holds seven values, so all seven columns must be named here;
    # passing fewer names makes the DataFrame constructor raise on the mismatch.
    t_test_results_df = pd.DataFrame(
        results,
        columns=[
            "Baseline",
            "Compared Strategy",
            "Metric",
            "T-Statistic",
            "P-Value",
            "Z-Score",
            "Z P-Value",
        ],
    )

    # Print results as a clean table
    print("\n📊 **T-Test Results Against Baseline (strategy_0)**")
    print(t_test_results_df.to_string(index=False))

    return t_test_results_df