In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Primary Metric**

### **Calculate Conversion Rate for Each Category**


In [14]:
import pandas as pd

path = "/content/drive/MyDrive/extracted_data.csv"
df = pd.read_csv(path)

df["variant"] = df["variant"].astype(str).str.upper().str.strip()
df["converted_7d"] = df["converted_7d"].fillna(0).astype(int)

summary = (
    df.groupby("variant")
      .agg(users=("user_id", "nunique"),
           converted=("converted_7d", "sum"))
      .assign(conversion_rate=lambda x: x["converted"] / x["users"])
      .reset_index()
      .sort_values("variant")
)

summary


Unnamed: 0,variant,users,converted,conversion_rate
0,A,6426,302,0.046997
1,B,6426,329,0.051198


### **Calculate Absolute uplift**

In [16]:
cr_a = summary[summary["variant"] == "A"]["conversion_rate"].values[0]
cr_b = summary[summary["variant"] == "B"]["conversion_rate"].values[0]

absolute_lift = cr_b - cr_a
absolute_lift

np.float64(0.0042016806722689065)

### **A/B Testing**

In [28]:

import numpy as np
from scipy.stats import norm

# counts
n_a = summary[summary["variant"] == "A"]["users"].values[0]
x_a = summary[summary["variant"] == "A"]["converted"].values[0]
p_a = x_a / n_a

n_b = summary[summary["variant"] == "B"]["users"].values[0]
x_b = summary[summary["variant"] == "B"]["converted"].values[0]
p_b = x_b / n_b

# pooled proportion under H0: pA = pB
p_pool = (x_a + x_b) / (n_a + n_b)

# standard error (pooled)
se_pool = np.sqrt(p_pool * (1 - p_pool) * (1/n_a + 1/n_b))

# z statistic
z = (p_b - p_a) / se_pool

# one-tailed p-value for H1: pB > pA
p_value_one_tailed = 1 - norm.cdf(z)

print(f'the p-value is: {p_value_one_tailed}')


the p-value is: 0.1351759686677776


# **Guardrail Metric**

### **Refund Rate**

In [33]:
import numpy as np
from scipy.stats import norm

df['refunded_14d'] = df['refunded_14d'].fillna(0).astype(int)

# Group by variant and calculate users and refunds for 'refunded_14d'
summary_refunded = (
    df.groupby("variant")
      .agg(users=("user_id", "nunique"),
           refunded=("refunded_14d", "sum"))
      .assign(refund_rate=lambda x: x["refunded"] / x["users"])
      .reset_index()
      .sort_values("variant")
)

# Extract counts for Variant A
n_a_refund = summary_refunded[summary_refunded["variant"] == "A"]["users"].values[0]
x_a_refund = summary_refunded[summary_refunded["variant"] == "A"]["refunded"].values[0]
p_a_refund = x_a_refund / n_a_refund

# Extract counts for Variant B
n_b_refund = summary_refunded[summary_refunded["variant"] == "B"]["users"].values[0]
x_b_refund = summary_refunded[summary_refunded["variant"] == "B"]["refunded"].values[0]
p_b_refund = x_b_refund / n_b_refund

# Calculate pooled proportion under H0: pA = pB
p_pool_refund = (x_a_refund + x_b_refund) / (n_a_refund + n_b_refund)

# Calculate standard error (pooled)
se_pool_refund = np.sqrt(p_pool_refund * (1 - p_pool_refund) * (1/n_a_refund + 1/n_b_refund))

# Calculate z statistic
z_refund = (p_b_refund - p_a_refund) / se_pool_refund

# Calculate two-tailed p-value for H1: pA != pB
p_value_two_tailed_refund = 2 * (1 - norm.cdf(abs(z_refund)))

print(f'Z-statistic for refunded_14d: {z_refund}')
print(f'Two-tailed p-value for refunded_14d: {p_value_two_tailed_refund}')

# Display the modified summary_refunded to show the artificial refunds
display(summary_refunded)

Z-statistic for refunded_14d: 2.23650306943812
Two-tailed p-value for refunded_14d: 0.02531883647068489


Unnamed: 0,variant,users,refunded,refund_rate
0,A,6426,0,0.0
1,B,6426,5,0.000778


### **Churn Rate**

In [34]:
import numpy as np
from scipy.stats import norm

# Ensure 'early_churn_30d' is in an appropriate format (e.g., int, 0/1)
df['early_churn_30d'] = df['early_churn_30d'].fillna(0).astype(int)

# Group by variant and calculate users and churns for 'early_churn_30d'
summary_churn = (
    df.groupby("variant")
      .agg(users=("user_id", "nunique"),
           churned=("early_churn_30d", "sum"))
      .assign(churn_rate=lambda x: x["churned"] / x["users"])
      .reset_index()
      .sort_values("variant")
)

# Extract counts for Variant A
n_a_churn = summary_churn[summary_churn["variant"] == "A"]["users"].values[0]
x_a_churn = summary_churn[summary_churn["variant"] == "A"]["churned"].values[0]
p_a_churn = x_a_churn / n_a_churn

# Extract counts for Variant B
n_b_churn = summary_churn[summary_churn["variant"] == "B"]["users"].values[0]
x_b_churn = summary_churn[summary_churn["variant"] == "B"]["churned"].values[0]
p_b_churn = x_b_churn / n_b_churn

# Calculate pooled proportion under H0: pA = pB
p_pool_churn = (x_a_churn + x_b_churn) / (n_a_churn + n_b_churn)

# Calculate standard error (pooled)
se_pool_churn = np.sqrt(p_pool_churn * (1 - p_pool_churn) * (1/n_a_churn + 1/n_b_churn))

# Calculate z statistic
z_churn = (p_b_churn - p_a_churn) / se_pool_churn

# Calculate two-tailed p-value for H1: pA != pB
p_value_two_tailed_churn = 2 * (1 - norm.cdf(abs(z_churn)))

print(f'Z-statistic for early_churn_30d: {z_churn}')
print(f'Two-tailed p-value for early_churn_30d: {p_value_two_tailed_churn}')

# Display the summary of churn rates
display(summary_churn)


Z-statistic for early_churn_30d: 1.1422186518308086
Two-tailed p-value for early_churn_30d: 0.25336314490851963


Unnamed: 0,variant,users,churned,churn_rate
0,A,6426,302,0.046997
1,B,6426,330,0.051354


# **Secondary Metric**

### **Click CTA Rate**

In [35]:
import numpy as np
from scipy.stats import norm

# Ensure 'clicked_cta_7d' is in an appropriate format (e.g., int, 0/1)
df['clicked_cta_7d'] = df['clicked_cta_7d'].fillna(0).astype(int)

# Group by variant and calculate users and clicks for 'clicked_cta_7d'
summary_cta = (
    df.groupby("variant")
      .agg(users=("user_id", "nunique"),
           clicked=("clicked_cta_7d", "sum"))
      .assign(click_rate=lambda x: x["clicked"] / x["users"])
      .reset_index()
      .sort_values("variant")
)

# Extract counts for Variant A
n_a_cta = summary_cta[summary_cta["variant"] == "A"]["users"].values[0]
x_a_cta = summary_cta[summary_cta["variant"] == "A"]["clicked"].values[0]
p_a_cta = x_a_cta / n_a_cta

# Extract counts for Variant B
n_b_cta = summary_cta[summary_cta["variant"] == "B"]["users"].values[0]
x_b_cta = summary_cta[summary_cta["variant"] == "B"]["clicked"].values[0]
p_b_cta = x_b_cta / n_b_cta

# Calculate pooled proportion under H0: pA = pB
p_pool_cta = (x_a_cta + x_b_cta) / (n_a_cta + n_b_cta)

# Calculate standard error (pooled)
se_pool_cta = np.sqrt(p_pool_cta * (1 - p_pool_cta) * (1/n_a_cta + 1/n_b_cta))

# Calculate z statistic
z_cta = (p_b_cta - p_a_cta) / se_pool_cta

# Calculate two-tailed p-value for H1: pA != pB
p_value_two_tailed_cta = 2 * (1 - norm.cdf(abs(z_cta)))

print(f'Z-statistic for clicked_cta_7d: {z_cta}')
print(f'Two-tailed p-value for clicked_cta_7d: {p_value_two_tailed_cta}')

# Display the summary of click rates
display(summary_cta)


Z-statistic for clicked_cta_7d: 9.762802803509384
Two-tailed p-value for clicked_cta_7d: 0.0


Unnamed: 0,variant,users,clicked,click_rate
0,A,6426,2583,0.401961
1,B,6426,3133,0.487551
