In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import PipelineModel
from scipy.stats import ttest_ind
import pandas as pd

In [4]:
spark = SparkSession.builder.appName("ABTesting").getOrCreate()

In [5]:
# Load test data and trained models
test_df = spark.read.parquet("test_data.parquet")
model_A = PipelineModel.load("model_A")
model_B = PipelineModel.load("model_B")

In [6]:
# Make predictions with both models
predictions_A = model_A.transform(test_df)
predictions_B = model_B.transform(test_df)

In [8]:
# Evaluate models
evaluator = MulticlassClassificationEvaluator(labelCol="churn", predictionCol="prediction", metricName="f1")
f1_A = evaluator.evaluate(predictions_A)
f1_B = evaluator.evaluate(predictions_B)

print(f"Model A (Logistic Regression) F1 Score: {f1_A:.4f}")
print(f"Model B (Gradient Boosted Tree) F1 Score: {f1_B:.4f}")

Model A (Logistic Regression) F1 Score: 0.7274
Model B (Gradient Boosted Tree) F1 Score: 0.7314


In [9]:
# Perform a statistical t-test on the predictions
# We need to collect the predictions to do this with SciPy
df_A_pandas = predictions_A.select("churn", "prediction").toPandas()
df_B_pandas = predictions_B.select("churn", "prediction").toPandas()

In [10]:
# Calculate whether the prediction was correct (1) or not (0)
df_A_pandas['is_correct'] = (df_A_pandas['churn'] == df_A_pandas['prediction']).astype(int)
df_B_pandas['is_correct'] = (df_B_pandas['churn'] == df_B_pandas['prediction']).astype(int)

In [11]:
# Perform a t-test on the correctness scores
t_stat, p_value = ttest_ind(df_A_pandas['is_correct'], df_B_pandas['is_correct'])

print(f"\nResults of the A/B Test (t-test):")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4f}")


Results of the A/B Test (t-test):
t-statistic: -0.4864
p-value: 0.6267


In [12]:
# Interpret the results
alpha = 0.05
if p_value < alpha:
    print("\nConclusion: The difference in performance is statistically significant.")
    if f1_B > f1_A:
        print("Model B (Gradient Boosted Tree) is the winner!")
    else:
        print("Model A (Logistic Regression) is the winner!")
else:
    print("\nConclusion: The difference in performance is not statistically significant.")
    print("There is no clear winner between the two models.")


Conclusion: The difference in performance is not statistically significant.
There is no clear winner between the two models.
