In [523]:
!pip install xgboost==1.7.6
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import col, abs
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from xgboost.spark import SparkXGBClassifier
from pyspark.sql.functions import when
from pyspark.sql import Row
from pyspark.sql.functions import lit
import uuid
import warnings
import pandas as pd
warnings.filterwarnings('ignore')



In [524]:
spark = SparkSession.builder.appName("ChurnXGBoost").master("local[*]").getOrCreate()

In [525]:
def get_churn_data(fileName: str) -> DataFrame:
    try:
        churnDataframe = spark.read.csv(fileName, header=True, inferSchema=True)
        print("Churn data loaded...")
        return churnDataframe
    except Exception as e:
        print(f"Could not load data: {e}")
        return None

In [526]:
def get_churn_metadata(churnDataFrame) -> None:
    tempPandasDf = churnDataFrame.toPandas() # Localised spark df conversion to pandas
    tempPandasDf.info() # Display metadata

In [527]:
def split_dataframe(churnDataFrame):
    churnlabeled = churnDataFrame.filter(col('churn_risk_score').isNotNull())
    churnunlabeled = churnDataFrame.filter(col('churn_risk_score').isNull())
    reserve = 0.01
    reservedFlag = False
    if churnunlabeled.count() == 0:
        reservedFlag = True
        print('no empty churn_risk_score rows...')
        print(f"Reserving {int(1)}% of labeled data for demo predictions...")
        trainDf, reservedDf = churnlabeled.randomSplit([1 - reserve, reserve], seed=42)
        churnlabeled = trainDf
        churnunlabeled = reservedDf
        print(f"Reserve dataframe with {churnunlabeled.count()} rows")
        churnunlabeled = churnunlabeled.withColumnRenamed("churn_risk_score", "actual_churn_risk_score")
        churnunlabeled = churnunlabeled.withColumn("predicted_churn_risk_score", lit(None))
    return churnlabeled, churnunlabeled, reservedFlag

In [528]:
def separate_feature_target_cols(churnDataFrame) -> [list,str]:
    featureCols = churnDataFrame.columns[:-1]
    targetCol = churnDataFrame.columns[-1]
    print(f"All Feature Columns: {featureCols}")
    print()
    print(f"Target Column: {targetCol}")
    return featureCols, targetCol

In [529]:
def correlation_analysis(churnDataFrame: DataFrame, featureCols: list,
                         targetCol: str, moderateVal: float, strongVal: float) -> None:
    print("Begin correlation analysis of all features...\n")

    corrVals = []
    for feature in featureCols:
        corr = churnDataFrame.stat.corr(feature, targetCol)
        corrVals.append((feature, corr))

    corrDf = spark.createDataFrame(corrVals, ["feature", "pearson_correlation"]) \
                  .orderBy(abs(col("pearson_correlation")).desc())

    corrDf = corrDf.withColumn("feature_strength",
         when(abs(col("pearson_correlation")) > strongVal, "Strong")
        .when(abs(col("pearson_correlation")) > moderateVal, "Moderate")
        .otherwise("Weak"))
    print("All Correlation Scores:")
    corrDf.show(n=corrDf.count(), truncate=False)
    print()
    print("Features displaying the strongest predictive signal:")
    filteredCorrDf = corrDf.filter(col("feature_strength").isin("Strong", "Moderate"))
    filteredCorrDf.orderBy(abs(col("pearson_correlation")).desc()).show(truncate=False)
    strongCorrFeatures = filteredCorrDf.select("feature").rdd.flatMap(lambda x: x).collect()
    return strongCorrFeatures

In [530]:
def feature_assemble(churnDataframe, featureCols, trainSplit:float, testSplit:float) -> [DataFrame, DataFrame]:
    print("Assembling features...")
    print("Combining all features into single vector...")
    assembler_all = VectorAssembler(inputCols = featureCols,
                                    outputCol="features") # Combine all features into a vector
    pipeline = Pipeline(stages=[assembler_all])
    vectorDf = pipeline.fit(churnDataframe).transform(churnDataframe)
    print("Splitting data...")
    print(f"{trainSplit*100}/{testSplit*100} Test Train Split...")
    trainDf, testDf = vectorDf.randomSplit([trainSplit, testSplit], seed=42)
    return trainDf, testDf

In [531]:
def baseline_model(trainDf, testDf, targetCol) -> [DataFrame, str]:
    xgbClassifier = SparkXGBClassifier(
            features_col="features",
            label_col=targetCol,
            prediction_col="prediction",
            num_workers=spark.sparkContext.defaultParallelism )
    xgbModel = xgbClassifier.fit(trainDf)
    predictions = xgbModel.transform(testDf)
    evaluator = MulticlassClassificationEvaluator(
        labelCol=targetCol,
        predictionCol="prediction",
        metricName="f1")
    baselineF1Score = round(evaluator.evaluate(predictions),5)
    print(f"Baseline model F1 Score - All Features: {baselineF1Score}")
    return xgbModel, baselineF1Score

In [532]:
def feature_information_gain(model, featureCols) -> [str]:
    importances = model.get_booster().get_score(importance_type="gain")
    print("All Feature Importances by Information Gain:")

    featureMap = {f"f{i}": name for i, name in enumerate(featureCols)}
    sortedImportances = sorted(importances.items(), key=lambda x: x[1], reverse=True)

    featureDict = {}
    for feature, score in sortedImportances:
        feature = featureMap.get(feature, feature)
        featureDict[feature] = score
    scoreList = [[featureMap.get(feat, feat), score] for feat, score in sortedImportances]
    allFeatureGainDf = spark.createDataFrame(scoreList, ["feature", "gain"])
    allFeatureGainDf = allFeatureGainDf.withColumn("gain_strength",
         when(abs(col("gain")) > 30, "Strong")
        .when(abs(col("gain")) > 10, "Moderate")
        .otherwise("Weak"))
    allFeatureGainDf.show(n=allFeatureGainDf.count(), truncate=False)
    print("Features with stronger information gain scores...")
    strongGainDf = allFeatureGainDf.filter(col("gain_strength").isin("Strong", "Moderate"))
    strongGainDf.orderBy(abs(col("gain_strength")).desc()).show(truncate=False)
    strongGainFeatures = strongGainDf.select("feature").rdd.flatMap(lambda x: x).collect()
    return strongGainFeatures

In [533]:
def get_strong_features(corrFeatures: list, gainFeatures: list) -> [str]:
        strongFeatures = list(set(corrFeatures + gainFeatures))
        print(f"Strong feature list: {strongFeatures}")
        return strongFeatures

In [None]:
def kfold_cross_validation(trainDf, testDf, featureCols, TargetCol, modelType = "GBT", k=3):
    print(f"Running {k}-fold CV on training data using model: {modelType}")

    assembler = VectorAssembler(inputCols= featureCols, outputCol="strongFeatures")
    train = assembler.transform(trainDf).select("strongFeatures", TargetCol)
    test = assembler.transform(testDf).select("strongFeatures", TargetCol)

    model = GBTClassifier(labelCol=TargetCol, featuresCol="strongFeatures")
    paramGrid = ParamGridBuilder() \
        .addGrid(model.maxDepth, [3, 5]) \
        .addGrid(model.maxIter, [10, 20]) \
        .addGrid(model.stepSize, [0.05, 0.1]) \
        .addGrid(model.subsamplingRate, [0.8, 1.0]) \
        .build()

    # Evaluator
    evaluator = MulticlassClassificationEvaluator(
        labelCol=TargetCol,
        predictionCol="prediction",
        metricName="f1")

    # CrossValidator
    cv = CrossValidator(
        estimator=model,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=k,
        parallelism=spark.sparkContext.defaultParallelism)

    cvModel = cv.fit(train)
    predictions = cvModel.transform(test)
    print(predictions)
    optimisedF1Score = round(evaluator.evaluate(predictions),5)
    print(f"F1 score on test data: {optimisedF1Score}")
    return cvModel.bestModel, optimisedF1Score

In [535]:
def select_best_model(baselineF1, optimisedF1, baseModel, optimisedModel, strongFeatures, allFeatures):
    print(f"Base model F1 Score: {baselineF1}")
    print(f"Optimised model F1 Score: {optimisedF1}")
    if optimisedF1 > baselineF1:
        model = optimisedModel
        modelType = "optimised"
        features = strongFeatures
        print(f"Optimised model selected...")
    else:
        model = baseModel
        modelType = "base"
        features = allFeatures
        print(f"Base model selected...")
    return model, modelType, features

In [None]:
def predict_churn_score(modelType, model, churnDataUnlabelled: DataFrame, features, reservedFlag):

    if modelType == "optimised":
        feature_vector_col = "strongFeatures"
        cols_to_select = ['actual_churn_risk_score'] + features
    else:
        feature_vector_col = "features"
        cols_to_select = ['actual_churn_risk_score'] + features

    churnDataUnlabelled = churnDataUnlabelled.select(*[c for c in cols_to_select if c in churnDataUnlabelled.columns])

    assembler = VectorAssembler(inputCols=features, outputCol=feature_vector_col)
    churnunlabeledPredict = assembler.transform(churnDataUnlabelled)
    predictions = model.transform(churnunlabeledPredict)

    predictions = predictions.withColumnRenamed("prediction", "predicted_churn_risk_score")

    selected_df = predictions.select("actual_churn_risk_score", "predicted_churn_risk_score")
    print(f"\n----- Prediction Results ({modelType} model) -----")
    selected_df.show(truncate=False)

    if "actual_churn_risk_score" in predictions.columns:
        correct = predictions.filter(col("actual_churn_risk_score") == col("predicted_churn_risk_score")).count()
        total = predictions.count()
        accuracy = correct / total if total else 0
        print(f"Prediction Accuracy: {accuracy:.2%} ({correct}/{total})")

In [None]:
def execute_ml_pipeline():

    print("Spark session created: ChurnXGBoost ")
    print()

    randomId = uuid.uuid4()
    stringID = str(randomId)
    print(f"Executing Machine Learning Pipeline...")
    print() # Creates a gap for for clean logging output
    print(f"Run ID: {stringID}")
    print()

    print("***** DATA PREPARATION *****", end="\n")
    print()

    print("===== Load Cleaned Data =====", end="\n")
    churnData = get_churn_data("churn_clean.csv")
    print()

    print("===== Cleaned Churn Metadata =====", end="\n")
    get_churn_metadata(churnData)
    print()

    print("===== Split Dataframe by labels =====", end="\n")
    churnDataLabelled, churnDataUnlabelled, reservedFlag = split_dataframe(churnData)
    print()

    print("***** TRAIN AND SCORE BASELINE MODEL *****", end="\n")
    print()

    print("===== Baseline Model: Separate Feature and Target Column/s =====", end="\n")
    allFeatures, targetCol = separate_feature_target_cols(churnDataLabelled)
    print()

    print("===== Baseline Model: Correlation Analysis =====", end="\n")
    strongCorrFeatures = correlation_analysis(churnDataLabelled, allFeatures, targetCol, 0.2, 0.4)
    print()

    print("===== Baseline Model: Assemble Features =====", end="\n")
    trainSetDf, testSetDf = feature_assemble(churnDataLabelled, allFeatures, trainSplit=0.8, testSplit=0.2)
    print()

    print("===== Baseline Model: Create and Score =====",end="\n")
    baseModel, baselineF1Score = baseline_model(trainSetDf, testSetDf, targetCol)
    print()

    print("===== Baseline Model: Feature Information Gain =====",end="\n")
    strongGainFeatures = feature_information_gain(baseModel, allFeatures)
    print()

    print("***** TRAIN AND SCORE OPTIMISED MODEL *****",end="\n")
    print()

    print("===== Optimised Model: Strongest Features =====",end="\n")
    strongFeatures = get_strong_features(strongCorrFeatures, strongGainFeatures)
    print()

    print("===== Optimised Model: K-Fold Cross Validation =====",end="\n")
    optimisedModel, optimisedF1Score = kfold_cross_validation(trainSetDf, testSetDf, strongFeatures, targetCol)
    print()

    print("***** SELECT BEST MODEL AND PREDICT UNSEEN DATA *****",end="\n")
    print()

    print("===== Select Best Model =====",end="\n")
    bestModel, modelType, features = select_best_model(baselineF1Score, optimisedF1Score,
                                             baseModel, optimisedModel, strongFeatures, allFeatures)
    print()

    print("===== Apply Model to Unseen Data =====",end="\n")
    predict_churn_score(modelType, bestModel, churnDataUnlabelled, features, reservedFlag)

In [538]:
execute_ml_pipeline()

Spark session created: ChurnXGBoost 

Executing Machine Learning Pipeline...

Run ID: 9427b6f9-a144-4ee9-852f-19ccbdd3ee9b

***** DATA PREPARATION *****

===== Load Cleaned Data =====
Churn data loaded...

===== Cleaned Churn Metadata =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           20000 non-null  int32  
 1   gender                        20000 non-null  int32  
 2   region_category               20000 non-null  int32  
 3   membership_category           20000 non-null  int32  
 4   joined_through_referral       20000 non-null  int32  
 5   preferred_offer_types         20000 non-null  int32  
 6   medium_of_operation           20000 non-null  int32  
 7   internet_option               20000 non-null  int32  
 8   days_since_last_login         20000 non-null  int32  
 

DataFrame[strongFeatures: vector, churn_risk_score: int, rawPrediction: vector, probability: vector, prediction: double]
F1 score on test data: 0.93301

***** SELECT BEST MODEL AND PREDICT UNSEEN DATA *****

===== Select Best Model =====
Base model F1 Score: 0.93422
Optimised model F1 Score: 0.93301
Base model selected...

===== Apply Model to Unseen Data =====

----- Prediction Results (base model) -----
+-----------------------+--------------------------+
|actual_churn_risk_score|predicted_churn_risk_score|
+-----------------------+--------------------------+
|1                      |1.0                       |
|0                      |0.0                       |
|0                      |0.0                       |
|1                      |1.0                       |
|0                      |0.0                       |
|1                      |1.0                       |
|1                      |1.0                       |
|1                      |1.0                       |
|0      