In [418]:
!pip install xgboost==1.7.6
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import col, abs
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from xgboost.spark import SparkXGBClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import when
import uuid
import warnings
import pandas as pd
warnings.filterwarnings('ignore')



In [419]:
# Shared Spark session for the whole notebook: application name "ChurnXGBoost",
# master "local[*]". getOrCreate() reuses an already-active session if one exists.
spark = (
    SparkSession.builder
    .appName("ChurnXGBoost")
    .master("local[*]")
    .getOrCreate()
)

In [420]:
def get_churn_data(fileName: str) -> DataFrame:
    """Read a CSV file into a Spark DataFrame.

    The file is read with a header row and with schema inference enabled.

    Args:
        fileName: Path of the CSV file handed to ``spark.read.csv``.

    Returns:
        The loaded DataFrame, or ``None`` when any exception is raised while
        loading (the error is printed, not re-raised).
    """
    try:
        loadedDf = spark.read.csv(fileName, header=True, inferSchema=True)
        print("Churn data loaded...")
    except Exception as loadError:
        # Best-effort load: report the problem and signal failure with None.
        print(f"Could not load data: {loadError}")
        loadedDf = None
    return loadedDf

In [421]:
def get_churn_metadata(churnDataFrame) -> None:
    """Print column metadata for the given DataFrame.

    The frame is converted to pandas via ``toPandas()`` and pandas'
    ``info()`` summary is written to stdout.

    Args:
        churnDataFrame: Object exposing ``toPandas()`` (a Spark DataFrame in
            this notebook).
    """
    # The pandas copy is only needed for its info() report, so it is not kept.
    churnDataFrame.toPandas().info()

In [422]:
def separate_feature_target_cols(churnDataFrame) -> [list,str]:
    """Split the DataFrame's column names into features and target.

    The last column is taken as the target; every column before it is a
    feature. Both are printed before being returned.

    Args:
        churnDataFrame: Object exposing a ``columns`` sequence.

    Returns:
        ``(featureCols, targetCol)`` - the feature column names and the name
        of the target column.
    """
    allColumns = churnDataFrame.columns
    featureCols, targetCol = allColumns[:-1], allColumns[-1]
    # end="\n\n" emits the blank separator line between the two messages.
    print(f"All Feature Columns: {featureCols}", end="\n\n")
    print(f"Target Column: {targetCol}")
    return featureCols, targetCol

In [423]:
def correlation_analysis(churnDataFrame: DataFrame, featureCols: list,
                         targetCol: str, moderateVal: float, strongVal: float) -> list:
    """Rank features by Pearson correlation with the target column.

    For every name in ``featureCols`` the correlation against ``targetCol``
    is computed with ``DataFrame.stat.corr``. A feature is labelled "Strong"
    when the absolute correlation exceeds ``strongVal``, "Moderate" when it
    exceeds ``moderateVal`` and "Weak" otherwise. The full table and the
    Strong/Moderate subset are printed.

    Args:
        churnDataFrame: DataFrame holding the feature and target columns.
        featureCols: Names of the feature columns to analyse.
        targetCol: Name of the target column.
        moderateVal: Absolute-correlation threshold for "Moderate".
        strongVal: Absolute-correlation threshold for "Strong".

    Returns:
        list: Names of the features labelled "Strong" or "Moderate", ordered
        by descending absolute correlation.
    """
    import math  # function-scope import: only needed for the NaN guard below

    print("Begin correlation analysis of all features...\n")

    corrVals = []
    for feature in featureCols:
        corr = churnDataFrame.stat.corr(feature, targetCol)
        # An undefined correlation (e.g. NaN for a zero-variance column) is
        # stored as null. Spark SQL treats NaN as larger than any other
        # number, so a NaN would otherwise pass both "> threshold" tests and
        # be reported as the strongest feature.
        if corr is None or math.isnan(corr):
            corr = None
        else:
            corr = float(corr)
        corrVals.append((feature, corr))

    # Explicit schema: type inference fails for an empty feature list and for
    # a correlation column that contains only nulls.
    corrDf = spark.createDataFrame(
        corrVals, schema="feature string, pearson_correlation double")

    # `abs` is pyspark.sql.functions.abs (imported at the top of the file),
    # so this is a Column expression evaluated by Spark.
    absCorr = abs(col("pearson_correlation"))
    corrDf = corrDf.orderBy(absCorr.desc()).withColumn(
        "feature_strength",
        when(absCorr > strongVal, "Strong")
        .when(absCorr > moderateVal, "Moderate")
        .otherwise("Weak"))

    print("All Correlation Scores:")
    # One row per feature, so the row count is already known on the driver;
    # no extra count() job is needed to show every row.
    corrDf.show(n=len(corrVals), truncate=False)
    print()
    print("Features displaying the strongest predictive signal:")
    filteredCorrDf = (
        corrDf
        .filter(col("feature_strength").isin("Strong", "Moderate"))
        .orderBy(absCorr.desc())
    )
    filteredCorrDf.show(truncate=False)
    strongCorrFeatures = [
        row["feature"] for row in filteredCorrDf.select("feature").collect()
    ]
    return strongCorrFeatures

In [424]:
def feature_assemble(churnDataframe, featureCols, trainSplit:float, testSplit:float) -> [DataFrame, DataFrame]:
    """Assemble the feature columns into one vector column and split the data.

    A ``VectorAssembler`` (wrapped in a single-stage ``Pipeline``) writes the
    columns named in ``featureCols`` into a "features" vector column. The
    result is then split with ``randomSplit`` using a fixed seed of 42.

    Args:
        churnDataframe: DataFrame containing the feature columns.
        featureCols: Names of the columns to combine into the vector.
        trainSplit: Weight of the training partition.
        testSplit: Weight of the test partition.

    Returns:
        ``(trainDf, testDf)`` - the two partitions of the assembled DataFrame.
    """
    print("Assembling features...")
    print("Combining all features into single vector...")
    vectorStage = VectorAssembler(inputCols=featureCols, outputCol="features")
    fittedPipeline = Pipeline(stages=[vectorStage]).fit(churnDataframe)
    assembledDf = fittedPipeline.transform(churnDataframe)

    print("Splitting data...")
    print(f"{trainSplit*100}/{testSplit*100} Test Train Split...")
    splitWeights = [trainSplit, testSplit]
    # Fixed seed so the partitions are the same on every run.
    trainDf, testDf = assembledDf.randomSplit(splitWeights, seed=42)
    return trainDf, testDf

In [425]:
def baseline_model(trainDf, testDf, targetCol) -> tuple:
    """Train an XGBoost classifier on the assembled features and score it.

    Args:
        trainDf: Training DataFrame; the classifier reads its "features"
            column and the label column named by ``targetCol``.
        testDf: Test DataFrame the fitted model is applied to.
        targetCol: Name of the label column.

    Returns:
        tuple: ``(xgbModel, f1Score)`` - the fitted model and the "f1" metric
        computed by ``MulticlassClassificationEvaluator`` on the test
        predictions (printed as well).
    """
    xgbClassifier = SparkXGBClassifier(
        features_col="features",
        label_col=targetCol,
        prediction_col="prediction",
        # Worker count follows the default parallelism of the active Spark context.
        num_workers=spark.sparkContext.defaultParallelism,
    )
    xgbModel = xgbClassifier.fit(trainDf)
    predictions = xgbModel.transform(testDf)

    evaluator = MulticlassClassificationEvaluator(
        labelCol=targetCol,
        predictionCol="prediction",
        metricName="f1")
    f1Score = evaluator.evaluate(predictions)
    print(f"Baseline model F1 Score - All Features: {f1Score}")
    return xgbModel, f1Score

In [426]:
def feature_information_gain(model, featureCols,
                             strongGain: float = 10, moderateGain: float = 3) -> list:
    """Report per-feature gain importance and return the notable features.

    Gain scores are read from ``model.get_booster().get_score(importance_type="gain")``.
    Keys of the form ``f<i>`` are translated to ``featureCols[i]``; any other
    key is kept unchanged. A feature is labelled "Strong" when its gain
    exceeds ``strongGain``, "Moderate" when it exceeds ``moderateGain`` and
    "Weak" otherwise. The full table and the Strong/Moderate subset are printed.

    Args:
        model: Fitted model exposing ``get_booster()``.
        featureCols: Feature names in the order they were assembled.
        strongGain: Gain threshold for the "Strong" label (default 10).
        moderateGain: Gain threshold for the "Moderate" label (default 3).

    Returns:
        list: Names of the features labelled "Strong" or "Moderate", ordered
        by descending gain.
    """
    importances = model.get_booster().get_score(importance_type="gain")
    print("All Feature Importances by Information Gain:")

    featureMap = {f"f{i}": name for i, name in enumerate(featureCols)}
    scoreList = sorted(
        ((featureMap.get(key, key), float(score)) for key, score in importances.items()),
        key=lambda pair: pair[1],
        reverse=True)

    # Explicit schema: type inference fails when the booster reports no scores.
    allFeatureGainDf = spark.createDataFrame(
        scoreList, schema="feature string, gain double")

    # `abs` is pyspark.sql.functions.abs (imported at the top of the file).
    gainMagnitude = abs(col("gain"))
    allFeatureGainDf = allFeatureGainDf.withColumn(
        "gain_strength",
        when(gainMagnitude > strongGain, "Strong")
        .when(gainMagnitude > moderateGain, "Moderate")
        .otherwise("Weak"))
    # The row count equals len(scoreList), so no count() job is needed.
    allFeatureGainDf.show(n=len(scoreList), truncate=False)

    print("Features with stronger information gain scores...")
    # Order by the numeric gain; the previous code ordered by abs() of the
    # "gain_strength" label, which is a string column and not a ranking key.
    strongGainDf = (
        allFeatureGainDf
        .filter(col("gain_strength").isin("Strong", "Moderate"))
        .orderBy(gainMagnitude.desc())
    )
    strongGainDf.show(truncate=False)
    strongGainFeatures = [
        row["feature"] for row in strongGainDf.select("feature").collect()
    ]
    return strongGainFeatures

In [427]:
def get_strong_features(corrFeatures: list, gainFeatures: list) -> list:
    """Merge the two feature lists into one list without duplicates.

    Args:
        corrFeatures: Features selected by correlation analysis.
        gainFeatures: Features selected by information gain.

    Returns:
        list: Every distinct feature name, in first-seen order
        (``corrFeatures`` first, then any new names from ``gainFeatures``).
    """
    # dict.fromkeys de-duplicates while keeping insertion order. A set would
    # return the names in an arbitrary order that can differ between runs,
    # which makes the downstream feature order non-reproducible.
    strongFeatures = list(dict.fromkeys([*corrFeatures, *gainFeatures]))
    print(f"Strong feature list: {strongFeatures}")
    return strongFeatures

In [428]:
def execute_ml_pipeline(dataPath: str = "churn_clean.csv",
                        moderateCorr: float = 0.2, strongCorr: float = 0.4,
                        trainSplit: float = 0.8, testSplit: float = 0.2) -> None:
    """Run the churn pipeline end to end, logging each stage to stdout.

    Stages: load the CSV, print its metadata, separate feature/target
    columns, run the correlation analysis, assemble and split the data,
    train and score the baseline model, report information gain, and merge
    the correlation- and gain-selected feature lists.

    NOTE(review): the "optimised model" section currently only selects the
    strongest features; no second model is trained in this function.

    Args:
        dataPath: CSV file to load (default "churn_clean.csv").
        moderateCorr: "Moderate" threshold passed to correlation_analysis.
        strongCorr: "Strong" threshold passed to correlation_analysis.
        trainSplit: Training weight passed to feature_assemble.
        testSplit: Test weight passed to feature_assemble.

    Raises:
        RuntimeError: If the data could not be loaded.
    """
    print("Spark session created: ChurnXGBoost ")
    print()

    stringID = str(uuid.uuid4())
    print("Executing Machine Learning Pipeline...")
    print()  # Blank line for clean logging output
    print(f"Run ID: {stringID}")
    print()

    print("***** GET DATA *****")
    print()

    print("----- Load Cleaned Data -----")
    churnData = get_churn_data(dataPath)
    if churnData is None:
        # get_churn_data reports load failures by returning None; stop here
        # with a clear message instead of failing later on a None attribute.
        raise RuntimeError(
            f"Churn data could not be loaded from '{dataPath}'; aborting pipeline.")
    print()

    print("----- Cleaned Churn Metadata -----")
    get_churn_metadata(churnData)
    print()

    print("***** TRAIN AND SCORE BASELINE MODEL *****")
    print()

    print("----- Baseline Model: Separate Feature and Target Column/s -----")
    featureCols, targetCol = separate_feature_target_cols(churnData)
    print()

    print("----- Baseline Model: Correlation Analysis -----")
    strongCorrFeatures = correlation_analysis(
        churnData, featureCols, targetCol, moderateCorr, strongCorr)
    print()

    print("----- Baseline Model: Assemble Features -----")
    trainSetDf, testSetDf = feature_assemble(
        churnData, featureCols, trainSplit=trainSplit, testSplit=testSplit)
    print()

    print("----- Baseline Model: Create and Score -----")
    xgbModel, BaselineF1score = baseline_model(trainSetDf, testSetDf, targetCol)
    print()

    print("----- Baseline Model: Feature Information Gain -----")
    strongGainFeatures = feature_information_gain(xgbModel, featureCols)
    print()

    print("***** TRAIN AND SCORE OPTIMISED MODEL *****")
    print()

    print("----- Optimised Model: Get strongest features -----")
    strongFeatures = get_strong_features(strongCorrFeatures, strongGainFeatures)
    print()
    

In [429]:
# Run the whole pipeline; every stage prints its progress and results to stdout.
execute_ml_pipeline()

Spark session created: ChurnXGBoost 

Executing Machine Learning Pipeline...

Run ID: b27ab952-d314-49d1-9b00-476d837be206

***** GET DATA *****

----- Load Cleaned Data -----
Churn data loaded...

----- Cleaned Churn Metadata -----
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           20000 non-null  int32  
 1   gender                        20000 non-null  int32  
 2   region_category               20000 non-null  int32  
 3   membership_category           20000 non-null  int32  
 4   joined_through_referral       20000 non-null  int32  
 5   preferred_offer_types         20000 non-null  int32  
 6   medium_of_operation           20000 non-null  int32  
 7   internet_option               20000 non-null  int32  
 8   days_since_last_login         20000 non-null  int32  
 9   avg_

In [430]:
### ─────────────────────────────────────────────────────────────
#            PHASE 1: BASELINE TRAINING WITH ALL FEATURES
# ─────────────────────────────────────────────────────────────

# def load_data(file_path):
#     # Read CSV, infer schema, return Spark DataFrame
#     return df

# def separate_features_and_target(df):
#     features = df.columns[:-1]
#     target = df.columns[-1]
#     return features, target

# def run_correlation_analysis(df, features, target):
#     # Compute Pearson correlation between each feature and target
#     # Optionally label as 'Strong', 'Moderate', 'Weak'
#     return correlation_df

# def assemble_features(df, featureCols):
#     # Use VectorAssembler to create 'features' column
#     return assembled_df

# def train_baseline_model(train_df, target_col):
#     # Fit XGBoost on full feature set
#     return model

# def evaluate_model(model, test_df, target_col):
#     # Get predictions, compute F1 score, return evaluation
#     return f1_score, predictions

# def log_baseline_results(f1_score, feature_importance, correlation_df):
#     # Store results for comparison
#     pass

# # ─────────────────────────────────────────────────────────────
# #          PHASE 2: FEATURE + PARAMETER OPTIMIZATION
# # ─────────────────────────────────────────────────────────────

# def filter_features_by_correlation(correlation_df, threshold=0.2):
#     # Return only features with abs(corr) > threshold
#     return selected_features

# def run_kfold_cv(train_df, estimator, param_grid, evaluator, k=5):
#     # Setup CrossValidator with K folds and param grid
#     # Return best model and best params
#     return best_model, best_params

# def log_optimized_results(f1_score, best_params, feature_set):
#     # Save to tracking system or file
#     pass

# # ─────────────────────────────────────────────────────────────
# #                  PHASE 3: PIPELINE DRIVER
# # ─────────────────────────────────────────────────────────────

# def full_pipeline(file_path):
#     df = load_data(file_path)
#     featureCols, targetCol = separate_features_and_target(df)

#     # PHASE 1: BASELINE
#     correlation_df = run_correlation_analysis(df, featureCols, targetCol)
#     df_all = assemble_features(df, featureCols)
#     train_df, test_df = split_data(df_all)

#     baseline_model = train_baseline_model(train_df, targetCol)
#     f1_base, _ = evaluate_model(baseline_model, test_df, targetCol)
#     log_baseline_results(f1_base, get_importance(baseline_model), correlation_df)

#     # PHASE 2: OPTIMIZATION
#     good_feats = filter_features_by_correlation(correlation_df, threshold=0.2)
#     df_opt = assemble_features(df, good_feats)
#     train_opt, test_opt = split_data(df_opt)

#     best_model, best_params = run_kfold_cv(train_opt, xgb_estimator, param_grid, evaluator)
#     f1_opt, _ = evaluate_model(best_model, test_opt, targetCol)
#     log_optimized_results(f1_opt, best_params, good_feats)