<a href="https://colab.research.google.com/github/IIIaryanIII/A-B-Testing-Experiment/blob/main/A_B_Testing_Experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pandas numpy scipy




In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import zipfile

print("Loading dataset...")

# The dataset ships as a zip archive; unpack it into the working directory first.
zip_file_path = "ab_data.csv.zip"
csv_file_name = "ab_data.csv"

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(".")  # Extract to the current directory

df = pd.read_csv(csv_file_name)


print(df.head())
print("\nData Summary:")
print(df.describe())

# Split rows by experiment arm using the values stored in the 'group' column.
groupA = df[df['group'] == 'control']
groupB = df[df['group'] == 'treatment']

# Mean of the 0/1 'converted' flag is the conversion rate of each arm.
convA = groupA['converted'].mean()
convB = groupB['converted'].mean()

print("\nConversion Rate A:", round(convA, 4))
print("Conversion Rate B:", round(convB, 4))

# Success / failure counts for BOTH arms must exist before the zero-guards
# below read them (previously failB was used before it was assigned, which
# raises NameError on a fresh kernel).
successA = groupA['converted'].sum()
failA = len(groupA) - successA

successB = groupB['converted'].sum()
failB = len(groupB) - successB

# Guard against a zero failure count so the chi-square table has no empty
# failure cell.
# NOTE(review): this substitutes 1 for a true count of 0, which slightly
# distorts the table in that degenerate case; the success cells are not
# guarded. Consider Fisher's exact test instead if this ever triggers.
save_failA_if_zero = max(1, failA)
save_failB_if_zero = max(1, failB)

# Rows: control, treatment. Columns: converted, not converted.
contingency_table = np.array([[successA, save_failA_if_zero],
                              [successB, save_failB_if_zero]])

chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)

print("\nChi-square test results:")
print("Chi2 value:", chi2)
print("p-value:", p_value)


# Relative change of B's conversion rate versus A's, in percent.
lift = (convB - convA) / convA * 100
print("\nLift of Variant B over A:", round(lift, 2), "%")


alpha = 0.05  # significance threshold for the two-sided test

print("\nFINAL RECOMMENDATION:")
if p_value < alpha and convB > convA:
    print(" Version B is statistically significantly better. Recommend rollout.")
elif p_value < alpha and convA > convB:
    print(" Version A performs better. Keep A as control.")
else:
    print(" No statistically significant difference. Keep running the experiment or collect more data.")

Loading dataset...
   user_id                   timestamp      group landing_page  converted
0   851104  2017-01-21 22:11:48.556739    control     old_page          0
1   804228  2017-01-12 08:01:45.159739    control     old_page          0
2   661590  2017-01-11 16:55:06.154213  treatment     new_page          0
3   853541  2017-01-08 18:28:03.143765  treatment     new_page          0
4   864975  2017-01-21 01:52:26.210827    control     old_page          1

Data Summary:
             user_id      converted
count  294478.000000  294478.000000
mean   787974.124733       0.119659
std     91210.823776       0.324563
min    630000.000000       0.000000
25%    709032.250000       0.000000
50%    787933.500000       0.000000
75%    866911.750000       0.000000
max    945999.000000       1.000000

Conversion Rate A: 0.1204
Conversion Rate B: 0.1189

Chi-square test results:
Chi2 value: 1.5159618356336584
p-value: 0.2182316121631168

Lift of Variant B over A: -1.23 %

FINAL RECOMMENDATION:
 N

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import zipfile
import os
import sys

# ---------- USER CONFIG ----------
# Tunable inputs for the analysis below; edit these rather than the functions.
# ZIP_PATH / CSV_NAME are handed to load_csv(): the zip is used when the path is
# set and exists on disk, otherwise CSV_NAME is read directly.
ZIP_PATH = "ab_data.csv.zip"   # archive holding the CSV; set to None if not zipped
# File name of the CSV (looked up inside the zip first, then on disk).
CSV_NAME = "ab_data.csv"
GROUP_COL = "group"            # column with the experiment arm; adjust if your dataset uses 'variant' or 'bucket'
CONV_COL = "converted"         # column with the conversion flag; adjust if your dataset uses 'is_converted' etc.
# Arm labels are compared case-insensitively (main() lower-cases both sides).
CONTROL_LABEL = "control"      # or "A"
TREATMENT_LABEL = "treatment"  # or "B"
# Significance threshold: main() treats p-value < ALPHA as significant.
ALPHA = 0.05
# ---------------------------------

def load_csv(zip_path, csv_name):
    """Load the experiment CSV, unpacking it from a zip archive when one exists.

    Lookup order:
      1. If ``zip_path`` is truthy and exists on disk, open it as a zip archive.
         * When ``csv_name`` is a member, extract that member into the current
           directory and read it.
         * Otherwise extract the whole archive and read the first genuine
           ``.csv`` member. macOS metadata entries (``__MACOSX/...`` and
           ``._name`` resource forks) are skipped because they end in ``.csv``
           but are not CSV data.
      2. Otherwise, if ``csv_name`` exists on disk, read it directly.

    Args:
        zip_path: Path to the zip archive, or ``None``/empty to skip the zip.
        csv_name: CSV file name (archive member name and/or path on disk).

    Returns:
        pandas.DataFrame with the parsed CSV contents.

    Raises:
        FileNotFoundError: if the archive holds no usable CSV, or if neither
            the archive nor the CSV exists.
    """
    if zip_path and os.path.exists(zip_path):
        print(f"Unzipping {zip_path} ...")
        with zipfile.ZipFile(zip_path, "r") as z:
            members = z.namelist()

            # Preferred path: the requested file is present under its exact name.
            if csv_name in members:
                z.extract(csv_name, ".")
                return pd.read_csv(csv_name)

            # Fallback: unpack everything and use the first real CSV member.
            z.extractall(".")
            for name in members:
                if not name.lower().endswith(".csv"):
                    continue
                # Skip macOS archive metadata that merely looks like a CSV.
                if name.startswith("__MACOSX/") or os.path.basename(name).startswith("._"):
                    continue
                print(f"Using extracted file: {name}")
                return pd.read_csv(name)
            raise FileNotFoundError("No CSV found inside the zip.")
    elif os.path.exists(csv_name):
        return pd.read_csv(csv_name)
    else:
        raise FileNotFoundError("Neither zip nor CSV found. Check paths.")

def safe_int(x):
    """Convert ``x`` with ``int()``; return ``np.nan`` when it cannot be converted.

    Only the errors ``int()`` raises for bad input are handled (``TypeError``
    for unsupported types such as ``None``, ``ValueError`` for unparseable
    strings or NaN, ``OverflowError`` for infinities). Anything else, notably
    ``KeyboardInterrupt`` and ``SystemExit``, propagates instead of being
    silently turned into NaN.

    Args:
        x: Any value.

    Returns:
        ``int(x)`` on success, otherwise ``np.nan``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan

def main():
    """Run the A/B conversion analysis end to end and print a recommendation.

    Steps:
      1. Load the CSV via ``load_csv(ZIP_PATH, CSV_NAME)``.
      2. Check that ``GROUP_COL`` and ``CONV_COL`` exist.
      3. Coerce the conversion column with ``safe_int`` and drop rows missing
         the group or the conversion value; lower-case the group labels.
      4. Build the 2x2 table (rows: control, treatment; columns: converted,
         not converted).
      5. Use the chi-square test, switching to Fisher's exact test when any
         expected frequency is below 5, any cell is zero, or the chi-square
         test cannot be computed at all.
      6. Print the lift of B over A and a recommendation based on ``ALPHA``.

    Returns:
        None. All results are printed.

    Raises:
        SystemExit: with status 1 when the data cannot be loaded, required
            columns are missing, or fewer than two groups are available.
    """
    print("Loading dataset...")
    try:
        df = load_csv(ZIP_PATH, CSV_NAME)
    except Exception as e:
        print("ERROR loading data:", e)
        sys.exit(1)

    print(df.head())
    print("\nData Summary:")
    print(df.describe(include="all"))

    # Basic checks
    if GROUP_COL not in df.columns or CONV_COL not in df.columns:
        print(f"Required columns not found. Expected '{GROUP_COL}' and '{CONV_COL}' in CSV.")
        print("Columns available:", df.columns.tolist())
        sys.exit(1)

    # Coerce conversions to int (unparseable values become NaN) and drop rows
    # that lack either the group or the conversion value. `assign` builds new
    # frames instead of writing into the result of `dropna`.
    df = df.assign(**{CONV_COL: df[CONV_COL].apply(safe_int)})
    df = df.dropna(subset=[GROUP_COL, CONV_COL])

    # Filter groups robustly (case-insensitive)
    df = df.assign(**{GROUP_COL: df[GROUP_COL].astype(str).str.lower()})
    control_label = CONTROL_LABEL.lower()
    treatment_label = TREATMENT_LABEL.lower()

    uniques = df[GROUP_COL].unique()
    if control_label not in uniques or treatment_label not in uniques:
        print("Warning: specified group labels not found. Available groups:", uniques)
        # try to auto-detect two groups (first two labels in order of appearance)
        if len(uniques) >= 2:
            control_label, treatment_label = uniques[0], uniques[1]
            print(f"Auto-using control='{control_label}', treatment='{treatment_label}'")
        else:
            print("Not enough distinct groups to run A/B test.")
            sys.exit(1)

    groupA = df[df[GROUP_COL] == control_label]
    groupB = df[df[GROUP_COL] == treatment_label]

    nA = len(groupA)
    nB = len(groupB)
    successA = int(groupA[CONV_COL].sum())
    successB = int(groupB[CONV_COL].sum())
    failA = nA - successA
    failB = nB - successB

    convA = successA / nA if nA > 0 else 0.0
    convB = successB / nB if nB > 0 else 0.0

    print(f"\nCounts: nA={nA}, successA={successA}, failA={failA}")
    print(f"        nB={nB}, successB={successB}, failB={failB}")
    print(f"\nConversion Rate A (control): {convA:.4f}")
    print(f"Conversion Rate B (treatment): {convB:.4f}")

    # Contingency table
    table = np.array([[successA, failA],
                      [successB, failB]])
    print("\nContingency table:\n", table)

    # Choose test: if expected freq < 5 anywhere -> use Fisher's exact test.
    # chi2_contingency raises ValueError when an expected frequency is zero
    # (e.g. no conversions in either arm); treat that as "chi-square not
    # applicable" and fall through to Fisher instead of crashing.
    try:
        chi2, p_chi2, dof, expected = stats.chi2_contingency(table)
    except ValueError as e:
        print("\nChi-square test not applicable:", e)
        chi2, p_chi2, expected = float("nan"), float("nan"), None

    if expected is not None:
        print("\nExpected frequencies (from chi2):\n", expected)

    if expected is None or (expected < 5).any() or (table == 0).any():
        print("\nUsing Fisher's exact test (small counts or zero cell detected).")
        # fisher_exact expects a 2x2 table of integers
        try:
            oddsratio, p_value = stats.fisher_exact(table)
            test_used = "fisher_exact"
        except Exception as e:
            print("Fisher failed:", e)
            p_value = p_chi2
            test_used = "chi2_fallback"
    else:
        p_value = p_chi2
        test_used = "chi2"

    # Lift (handle convA == 0)
    if convA == 0:
        lift = float('inf') if convB > 0 else 0.0
    else:
        lift = (convB - convA) / convA * 100

    print(f"\nTest used: {test_used}")
    print(f"Chi2 stat (if computed): {chi2:.4f}")
    print(f"p-value: {p_value:.6f}")
    if np.isfinite(lift):
        print(f"\nLift of Variant B over A: {lift:.2f} %")
    else:
        print("\nLift of Variant B over A: infinite (control conv=0)")

    # Recommendation (two-sided p-value + direction). A NaN p-value compares
    # False here and therefore lands in the "no significant difference" branch.
    if p_value < ALPHA:
        if convB > convA:
            print("\n Recommendation: Treatment (B) is significantly better than Control (A). Consider rolling out B.")
        else:
            print("\n Recommendation: Control (A) is significantly better than Treatment (B). Keep A.")
    else:
        print("\n Recommendation: No statistically significant difference (p >= {}). Collect more data or run longer.".format(ALPHA))

# Entry point: run the analysis only when this code executes as the top-level
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


Loading dataset...
Unzipping ab_data.csv.zip ...
   user_id                   timestamp      group landing_page  converted
0   851104  2017-01-21 22:11:48.556739    control     old_page          0
1   804228  2017-01-12 08:01:45.159739    control     old_page          0
2   661590  2017-01-11 16:55:06.154213  treatment     new_page          0
3   853541  2017-01-08 18:28:03.143765  treatment     new_page          0
4   864975  2017-01-21 01:52:26.210827    control     old_page          1

Data Summary:
              user_id                   timestamp      group landing_page  \
count   294478.000000                      294478     294478       294478   
unique            NaN                      294478          2            2   
top               NaN  2017-01-16 12:40:24.467417  treatment     old_page   
freq              NaN                           1     147276       147239   
mean    787974.124733                         NaN        NaN          NaN   
std      91210.823776         