# Logistic Regression

General Pipeline:

- Importing Data
- Feature vectorization via an RFormula transformation
- Split into train and test
- Building the model
- Prediction on the test set
- Evaluation

## Importing

In [1]:
# findspark has to run *before* anything from pyspark is imported: init()
# locates the Spark installation and puts its Python bindings on sys.path.
# Importing pyspark first either makes the call pointless or fails outright.
import findspark

findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Start a session named "logistic", or attach to one that is already running.
spark = SparkSession.builder.appName("logistic").getOrCreate()

In [23]:
from pyspark.ml.feature    import RFormula
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

## Loading Data

In [3]:
# Reader settings for the churn extract: a ';'-separated CSV with a header row.
# inferSchema asks Spark to derive column types from the data instead of
# reading every column as a string.
csv_options = dict(
    format="csv",
    sep=";",
    header=True,
    inferSchema=True,
)
churn = spark.read.load("../../data/Churn.csv", **csv_options)

# Quick look at the first two rows to confirm the file parsed as expected.
churn.show(2)

+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure|Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|      0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1|8380786|            1|        0|             1|       11254258|     0|
+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
only showing top 2 rows



## Data Preparation

In [4]:
# 'Exited ~ .' declares Exited as the label and every other column as a
# predictor. The assembled predictors are written to "features" and the label
# to "target".
rformula = RFormula(
    formula="Exited ~ .",
    labelCol="target",
    featuresCol="features",
)
rformula_model = rformula.fit(churn)

# NOTE(review): this rebinds `churn` to the transformed frame, so re-running
# this cell without reloading the data is not expected to work (the output
# columns would already exist) -- confirm before relying on re-execution.
churn = rformula_model.transform(churn)

In [5]:
# Show only the two columns the model will consume, first ten rows.
feature_preview = churn.select("features", "target")
feature_preview.show(10)

+--------------------+------+
|            features|target|
+--------------------+------+
|[619.0,1.0,0.0,0....|   1.0|
|[608.0,0.0,0.0,0....|   0.0|
|[502.0,1.0,0.0,0....|   1.0|
|(11,[0,1,4,5,7,10...|   0.0|
|[850.0,0.0,0.0,0....|   0.0|
|[645.0,0.0,0.0,1....|   1.0|
|[822.0,1.0,0.0,1....|   0.0|
|[376.0,0.0,1.0,0....|   1.0|
|[501.0,1.0,0.0,1....|   0.0|
|[684.0,1.0,0.0,1....|   0.0|
+--------------------+------+
only showing top 10 rows



## Split into Train and Test

In [7]:
# Random 70/30 train/test partition; the fixed seed keeps the split repeatable.
churnTrain, churnTest = churn.randomSplit(weights=[0.7, 0.3], seed=11)

In [8]:
# Row counts of the two partitions, as a (train, test) pair.
tuple(part.count() for part in (churnTrain, churnTest))

(7028, 2972)

## Model Development and Training

In [10]:
# Estimator settings: read the columns produced by the RFormula step, allow up
# to 100 optimizer iterations, and apply a regularization strength of 0.08.
lr_params = {
    "featuresCol": "features",
    "labelCol": "target",
    "maxIter": 100,
    "regParam": 0.08,
}
lr = LogisticRegression(**lr_params)

# Fit on the training partition only; the test partition stays unseen.
model = lr.fit(churnTrain)

In [15]:
# model.summary is the summary attached to the fitted model, i.e. these numbers
# describe the data the model was trained on, not the held-out test set.
model_summary = model.summary

acc, precision, recall, auc = (
    model_summary.accuracy,
    model_summary.weightedPrecision,
    model_summary.weightedRecall,
    model_summary.areaUnderROC,
)

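Model metrics (taken from `model.summary`, so they are computed on the training data, not on the held-out test set):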

In [19]:
# Alternating label/value pairs; print() joins them with single spaces and the
# embedded newlines put each metric on its own line.
metric_report = ("Acc: ", acc, '\nPrecision: ', precision, '\nRecall: ', recall, '\nAUC: ', auc)
print(*metric_report)

Acc:  0.806061468412066 
Precision:  0.7807859812023539 
Recall:  0.8060614684120659 
AUC:  0.7703570589972973


## Predicting on Test Set

In [22]:
# Score the held-out partition with the fitted model.
predictions = model.transform(churnTest)

# Show the true label next to the model outputs, without truncating the vectors.
prediction_columns = ["target", "prediction", "probability", "rawPrediction"]
predictions.select(*prediction_columns).show(truncate=False)

+------+----------+----------------------------------------+------------------------------------------+
|target|prediction|probability                             |rawPrediction                             |
+------+----------+----------------------------------------+------------------------------------------+
|1.0   |1.0       |[0.3604245259765515,0.6395754740234485] |[-0.5735220589021472,0.5735220589021472]  |
|1.0   |0.0       |[0.5914189707831217,0.40858102921687833]|[0.36983443209111444,-0.36983443209111444]|
|1.0   |0.0       |[0.8034155096303097,0.19658449036969028]|[1.407779711398495,-1.407779711398495]    |
|1.0   |0.0       |[0.7061757122454291,0.2938242877545709] |[0.8768821627574368,-0.8768821627574368]  |
|1.0   |0.0       |[0.7252036034728151,0.2747963965271849] |[0.9704220006170146,-0.9704220006170146]  |
|1.0   |0.0       |[0.8862059085371382,0.11379409146286179]|[2.052558726033117,-2.052558726033117]    |
|1.0   |0.0       |[0.6740301735463888,0.32596982645361117]|[0.7

## Model Evaluation

In [24]:
# Area under the ROC curve on the test-set predictions, computed from the raw
# prediction column against the "target" label.
evaluator_params = {
    "metricName": "areaUnderROC",
    "labelCol": "target",
    "rawPredictionCol": "rawPrediction",
}
evaluation = BinaryClassificationEvaluator(**evaluator_params)

auroc = evaluation.evaluate(predictions)

print(auroc)

0.7573441745476315
