# Logistic Regression (Customer Churn)
## 1.) Brief
* Classification model to predict which customers are most likely to churn
* The business can then decide which account managers to assign to high risk clients
* The model will be trained on historical data and then be used on future data

In [1]:
# find spark
import findspark

# point to spark
findspark.init('/home/matt/spark-3.0.2-bin-hadoop3.2')

# load spark lib
import pyspark
from pyspark.sql import SparkSession

# create session
spark = SparkSession.builder.appName('churn').getOrCreate()

# read in data
df = spark.read.csv('Data/customer_churn.csv', inferSchema=True, header=True)

# show schema
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [3]:
# peek at data
df.head(1)

[Row(Names='Cameron Williams', Age=42.0, Total_Purchase=11066.8, Account_Manager=0, Years=7.22, Num_Sites=8.0, Onboard_date='2013-08-30 07:00:40', Location='10265 Elizabeth Mission Barkerburgh, AK 89518', Company='Harvey LLC', Churn=1)]

In [5]:
# import libs
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

# drop irrelevant columns
# drop name, onboard date, location and company
# name and date not relevant to churn, account manager randomly assigned but keep just in case

# assemble data into vectors
assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'],
                            outputCol='features')

# vectorize data
output = assembler.transform(df)

# extract final data
df_final = output.select('features', 'churn')

# split train/test data
train, test = df_final.randomSplit([0.7, 0.3])

# build logreg model instance
logreg = LogisticRegression(labelCol='churn')

# train model on train data
logreg_fit = logreg.fit(train)

# summarise predictions
train_summary = logreg_fit.summary
train_summary.predictions.show(5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[25.0,9672.03,0.0...|  0.0|[4.87367702951321...|[0.99241280373953...|       0.0|
|[26.0,8939.61,0.0...|  0.0|[6.46156069957429...|[0.99844008170767...|       0.0|
|[27.0,8628.8,1.0,...|  0.0|[5.48329700525221...|[0.99586159156313...|       0.0|
|[28.0,11128.95,1....|  0.0|[4.3068478816979,...|[0.98670322625939...|       0.0|
|[28.0,11204.23,0....|  0.0|[2.02574404872598...|[0.88347365099455...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [6]:
# make test predictions
pred = logreg_fit.evaluate(test)

# classifier libs
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# create evaluator
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                        labelCol='churn')

# evaluate model
AUC = my_eval.evaluate(pred.predictions)

# check AUC
AUC

0.7429384170076854

In [10]:
# create final model on entire dataset
final_model = logreg.fit(df_final)

# read in new customer data
new_df = spark.read.csv('Data/new_customers.csv', inferSchema=True, header=True)

# transform new data
new_df_tf = assembler.transform(new_df)

# make predictions on new customers
new_pred = final_model.transform(new_df_tf)

# show predictions
new_pred.select('Company', 'prediction').show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

