In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import pandas as pd



In [2]:
spark = SparkSession.builder \
    .appName("ChurnModelTraining") \
    .master("spark://spark-master:7077") \
    .getOrCreate()

In [3]:
df = spark.read.parquet("hdfs://namenode:8020/user/telco/cleaned/telco_cleaned.parquet")

In [4]:
df.printSchema()

root
 |-- customerid: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- seniorcitizen: string (nullable = true)
 |-- partner: string (nullable = true)
 |-- dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- phoneservice: string (nullable = true)
 |-- multiplelines: string (nullable = true)
 |-- internetservice: string (nullable = true)
 |-- onlinesecurity: string (nullable = true)
 |-- onlinebackup: string (nullable = true)
 |-- deviceprotection: string (nullable = true)
 |-- techsupport: string (nullable = true)
 |-- streamingtv: string (nullable = true)
 |-- streamingmovies: string (nullable = true)
 |-- contract: string (nullable = true)
 |-- paperlessbilling: string (nullable = true)
 |-- paymentmethod: string (nullable = true)
 |-- monthlycharges: double (nullable = true)
 |-- totalcharges: double (nullable = true)
 |-- churn: integer (nullable = true)



In [5]:
df_pandas = df.toPandas()

In [6]:
df_pandas[['churn']]

Unnamed: 0,churn
0,0
1,0
2,1
3,0
4,1
...,...
7038,0
7039,0
7040,0
7041,1


In [7]:
df_pandas.corr()

Unnamed: 0,tenure,monthlycharges,totalcharges,churn
tenure,1.0,0.2479,0.828025,-0.352229
monthlycharges,0.2479,1.0,0.651139,0.193356
totalcharges,0.828025,0.651139,1.0,-0.198342
churn,-0.352229,0.193356,-0.198342,1.0


In [8]:
df_pandas.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [9]:
df = df.withColumn("churn", when(col("churn") == "Yes", 1).otherwise(0))

## We Want to detect the outliers

In [10]:
def detect_outliers(df, column):
    quantiles = df.approxQuantile(column, [0.25, 0.75], 0.05)
    Q1 = quantiles[0]
    Q3 = quantiles[1]
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df.filter((col(column) < lower_bound) | (col(column) > upper_bound)) 
    print(f"Number of outliers in {column}: {outliers.count()}")
    return lower_bound, upper_bound


In [11]:
t_lower_bound, t_upper_bound = detect_outliers(df , "totalcharges")

Number of outliers in totalcharges: 0


In [12]:
detect_outliers(df , "monthlycharges")

Number of outliers in monthlycharges: 0


(-31.250000000000007, 157.15)

In [13]:
detect_outliers(df , "tenure")

Number of outliers in tenure: 0


(-55.5, 116.5)

In [14]:
def cap_outliers(df, col_name, lower_bound, upper_bound):
    return df.withColumn(
        col_name,
        when(col(col_name) < lower_bound, lower_bound)
        .when(col(col_name) > upper_bound, upper_bound)
        .otherwise(col(col_name))
    )

In [15]:
df = cap_outliers(df, "totalcharges", t_lower_bound, t_upper_bound)

In [16]:
detect_outliers(df , "totalcharges")

Number of outliers in totalcharges: 0


(-3927.1499999999996, 7662.049999999999)

## Outliers removed (winsorizing)

## Split the data

In [17]:
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [18]:
trainCols = train_data.dtypes
train_data.dtypes

[('customerid', 'string'),
 ('gender', 'string'),
 ('seniorcitizen', 'string'),
 ('partner', 'string'),
 ('dependents', 'string'),
 ('tenure', 'int'),
 ('phoneservice', 'string'),
 ('multiplelines', 'string'),
 ('internetservice', 'string'),
 ('onlinesecurity', 'string'),
 ('onlinebackup', 'string'),
 ('deviceprotection', 'string'),
 ('techsupport', 'string'),
 ('streamingtv', 'string'),
 ('streamingmovies', 'string'),
 ('contract', 'string'),
 ('paperlessbilling', 'string'),
 ('paymentmethod', 'string'),
 ('monthlycharges', 'double'),
 ('totalcharges', 'double'),
 ('churn', 'int')]

In [19]:
stringCols = [f for (f , v) in trainCols if v == "string" and f not in  ["customerid" , "streamingmovies", "deviceprotection", "streamingtv", "paymentmethod"]]

In [20]:
numericCols = [f for (f, v) in trainCols if ((v == "double") & (f != "churn"))]

In [21]:
strIndexCols = [col+"_index" for col in stringCols]
strIndexCols

['gender_index',
 'seniorcitizen_index',
 'partner_index',
 'dependents_index',
 'phoneservice_index',
 'multiplelines_index',
 'internetservice_index',
 'onlinesecurity_index',
 'onlinebackup_index',
 'techsupport_index',
 'contract_index',
 'paperlessbilling_index']

In [22]:
oheCols = [col+"_ohe" for col in stringCols]
oheCols

['gender_ohe',
 'seniorcitizen_ohe',
 'partner_ohe',
 'dependents_ohe',
 'phoneservice_ohe',
 'multiplelines_ohe',
 'internetservice_ohe',
 'onlinesecurity_ohe',
 'onlinebackup_ohe',
 'techsupport_ohe',
 'contract_ohe',
 'paperlessbilling_ohe']

In [23]:
allDataCols = oheCols + numericCols 

In [24]:
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index", handleInvalid='keep') for col in stringCols]

ohe = OneHotEncoder(inputCols=strIndexCols, outputCols=oheCols)

assembler = VectorAssembler(inputCols=allDataCols, outputCol="features_unscaled")

scaler = StandardScaler(inputCol="features_unscaled", outputCol="features")

lr = LogisticRegression(featuresCol="features" , labelCol="churn",predictionCol="churnPrediction")

In [25]:
pipeline = Pipeline(stages = indexers +  [ohe ,  assembler  , scaler , lr])

pipeline_model = pipeline.fit(train_data)

In [26]:
predictions = pipeline_model.transform(test_data)

In [27]:
evaluator = BinaryClassificationEvaluator(
    labelCol="churn", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
)

auc = evaluator.evaluate(predictions)
print(f"AUC-ROC: {auc:.4f}")

IllegalArgumentException: requirement failed: rawPredictionCol vectors must have length=2, but got 1

In [28]:
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="churn", predictionCol="churnPrediction", metricName="precisionByLabel"
)
recall_evaluator = MulticlassClassificationEvaluator(
    labelCol="churn", predictionCol="churnPrediction", metricName="recallByLabel"
)
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="churn", predictionCol="churnPrediction", metricName="f1"
)

precision = precision_evaluator.evaluate(predictions)
recall = recall_evaluator.evaluate(predictions)
f1 = f1_evaluator.evaluate(predictions)

print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

Precision: 1.0000
Recall:    1.0000
F1 Score:  1.0000


In [29]:
lr_model = pipeline_model.stages[-1]  # last stage is RandomForestClassifier
importances = lr_model.coefficients.toArray()
features = pipeline_model.stages[-3].getInputCols()  # VectorAssembler input features

# Zip and sort
importance = sorted(zip(features, importances), key=lambda x: x[1], reverse=True)

# Print feature importance
for feature, score in importance:
    print(f"{feature:30s}: {score:.4f}")

gender_ohe                    : 0.0000
seniorcitizen_ohe             : 0.0000
partner_ohe                   : 0.0000
dependents_ohe                : 0.0000
phoneservice_ohe              : 0.0000
multiplelines_ohe             : 0.0000
internetservice_ohe           : 0.0000
onlinesecurity_ohe            : 0.0000
onlinebackup_ohe              : 0.0000
techsupport_ohe               : 0.0000
contract_ohe                  : 0.0000
paperlessbilling_ohe          : 0.0000
monthlycharges                : 0.0000
totalcharges                  : 0.0000


## Try a new pipeline with RandomForest

In [30]:
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index", handleInvalid='keep') for col in stringCols]

ohe = OneHotEncoder(inputCols=strIndexCols, outputCols=oheCols)

assembler = VectorAssembler(inputCols=allDataCols, outputCol="features_unscaled")

scaler = StandardScaler(inputCol="features_unscaled", outputCol="features")

rf = RandomForestClassifier(featuresCol="features" , labelCol="churn",predictionCol="churnPrediction")

In [31]:
pipeline = Pipeline(stages = indexers +  [ohe ,  assembler  , scaler , rf])

pipeline_model = pipeline.fit(train_data)

In [32]:
predictions = pipeline_model.transform(test_data)

In [36]:
evaluator = BinaryClassificationEvaluator(
    labelCol="churn", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
)

auc = evaluator.evaluate(predictions)
print(f"AUC-ROC: {auc:.4f}")

IllegalArgumentException: requirement failed: rawPredictionCol vectors must have length=2, but got 1

In [None]:
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="churn", predictionCol="churnPrediction", metricName="precisionByLabel"
)
recall_evaluator = MulticlassClassificationEvaluator(
    labelCol="churn", predictionCol="churnPrediction", metricName="recallByLabel"
)
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="churn", predictionCol="churnPrediction", metricName="f1"
)

precision = precision_evaluator.evaluate(predictions)
recall = recall_evaluator.evaluate(predictions)
f1 = f1_evaluator.evaluate(predictions)

print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

Precision: 0.7959
Recall:    0.9508
F1 Score:  0.7599


In [None]:
rf_model = pipeline_model.stages[-1]  # last stage is RandomForestClassifier
importances = rf_model.featureImportances.toArray()
features = pipeline_model.stages[-3].getInputCols()  # VectorAssembler input features

# Zip and sort
importance = sorted(zip(features, importances), key=lambda x: x[1], reverse=True)

# Print feature importance
for feature, score in importance:
    print(f"{feature:30s}: {score:.4f}")


totalcharges                  : 0.0650
phoneservice_ohe              : 0.0075
dependents_ohe                : 0.0028
internetservice_ohe           : 0.0018
onlinesecurity_ohe            : 0.0018
monthlycharges                : 0.0014
partner_ohe                   : 0.0012
gender_ohe                    : 0.0010
contract_ohe                  : 0.0006
paperlessbilling_ohe          : 0.0006
multiplelines_ohe             : 0.0006
seniorcitizen_ohe             : 0.0005
onlinebackup_ohe              : 0.0000
techsupport_ohe               : 0.0000


In [None]:
predictions.select("customerid", "churn", "churnPrediction").show(5)

+----------+-----+---------------+
|customerid|churn|churnPrediction|
+----------+-----+---------------+
|0004-TLHLJ|    1|            1.0|
|0013-SMEOE|    0|            0.0|
|0015-UOCOJ|    0|            0.0|
|0019-EFAEP|    0|            0.0|
|0023-HGHWL|    1|            1.0|
+----------+-----+---------------+
only showing top 5 rows



## The `logisiticRegression` model is more balanced than the `RandomForest`