### Notebook written by Jesús González Ferrer

In [39]:
from pyspark.sql import SparkSession
# Create (or reuse, if one is already running) the Spark session that backs
# every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('churnlogreg')
    .getOrCreate()
)

In [40]:
# Load the customer churn data set. The first CSV row is used as the header
# and column types are inferred from the data rather than read as strings.
df = spark.read.csv(
    'customer_churn.csv',
    header=True,
    inferSchema=True,
)

In [41]:
# Print the inferred schema (column names, types, nullability) to confirm the
# CSV was parsed as expected.
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [42]:
# List the column names so the candidate feature columns can be picked below.
df.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [43]:
# Peek at the first row to see what the raw values look like.
df.head()

Row(Names='Cameron Williams', Age=42.0, Total_Purchase=11066.8, Account_Manager=0, Years=7.22, Num_Sites=8.0, Onboard_date='2013-08-30 07:00:40', Location='10265 Elizabeth Mission Barkerburgh, AK 89518', Company='Harvey LLC', Churn=1)

In [44]:
# Count distinct companies. This column could be mildly interesting as a
# feature, but not much: with this many distinct values it would mostly lead
# to overfitting.
# The column is referenced by its exact schema name ('Company') so the lookup
# does not depend on Spark's case-insensitive column resolution.
df.select('Company').distinct().count()

873

In [45]:
# Count distinct locations. This column does not look useful as a feature.
df.select('Location').distinct().count()

900

In [46]:
from pyspark.ml.feature import (VectorAssembler, OneHotEncoder, VectorIndexer, StringIndexer)

In [47]:
# First feature set: the numeric customer attributes, including whether an
# account manager is assigned. They are packed into a single 'features' vector
# column for the classifier.
assembler = VectorAssembler(
    inputCols=[
        'Age',
        'Total_Purchase',
        'Years',
        'Account_Manager',
        'Num_Sites',
    ],
    outputCol='features',
)

In [48]:
# Apply the assembler: adds the 'features' vector column to the original frame.
output = assembler.transform(df)

In [49]:
# Keep only what the model needs: the assembled feature vector and the label.
# NOTE(review): 'churn' matches the schema's 'Churn' column only through
# Spark's case-insensitive column resolution; the lowercase name is what
# downstream cells (labelCol='churn') refer to.
final_dt = output.select('features', 'churn')
type(final_dt)

pyspark.sql.dataframe.DataFrame

In [50]:
# 70/30 train/test split; the fixed seed is there so the split can be
# reproduced on re-run.
train_data, test_data = final_dt.randomSplit(
    [0.7, 0.3],
    seed=231120,
)

In [51]:
# Summary statistics of the training split. Only the label column appears in
# the output below; its mean is the churn rate in the training data.
train_data.describe().show()

+-------+-------------------+
|summary|              churn|
+-------+-------------------+
|  count|                631|
|   mean|0.15213946117274169|
| stddev| 0.3594409497994419|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



In [52]:
from pyspark.ml.classification import LogisticRegression

In [53]:
# Fit a logistic regression on the training split with 'churn' as the label.
# No features column is passed, so this relies on the estimator's default
# matching the assembler's 'features' output column.
lr = LogisticRegression(labelCol='churn')
model = lr.fit(train_data)

In [54]:
# Evaluate on the held-out test split. The returned summary exposes the PR
# curve, F-measure by threshold, ROC AUC and predictions inspected below.
result = model.evaluate(test_data)

In [55]:
# Precision/recall pairs along the precision-recall curve on the test split.
result.pr.show()

+--------------------+------------------+
|              recall|         precision|
+--------------------+------------------+
|                 0.0|               1.0|
|0.037037037037037035|               1.0|
| 0.07407407407407407|               1.0|
|  0.1111111111111111|               1.0|
| 0.14814814814814814|               1.0|
| 0.16666666666666666|               0.9|
|  0.2037037037037037|0.9166666666666666|
| 0.24074074074074073|0.9285714285714286|
| 0.25925925925925924|             0.875|
|  0.2962962962962963|0.8888888888888888|
|  0.3148148148148148|              0.85|
| 0.35185185185185186|0.8636363636363636|
| 0.37037037037037035|0.8333333333333334|
|  0.4074074074074074|0.8461538461538461|
|  0.4444444444444444|0.8571428571428571|
| 0.48148148148148145|0.8666666666666667|
|  0.5185185185185185|             0.875|
|  0.5370370370370371|0.8529411764705882|
|  0.5555555555555556|0.8333333333333334|
|  0.5555555555555556|0.7894736842105263|
+--------------------+------------

In [56]:
# F-measure at each candidate decision threshold, useful for choosing a
# cut-off on the predicted probability.
result.fMeasureByThreshold.show()

+-------------------+-------------------+
|          threshold|          F-Measure|
+-------------------+-------------------+
|  0.932519696525025|0.07142857142857142|
| 0.8975608979502081|0.13793103448275862|
| 0.8494763014876786|0.19999999999999998|
| 0.7977005741310882|0.25806451612903225|
| 0.7158272245476318|            0.28125|
| 0.7034220629358172| 0.3333333333333333|
| 0.6681109070588281| 0.3823529411764706|
| 0.6533149186777937|0.39999999999999997|
| 0.6176888803064793| 0.4444444444444444|
| 0.6035278781091192| 0.4594594594594595|
| 0.5678666202081647| 0.5000000000000001|
|  0.544636092356009| 0.5128205128205128|
| 0.5369684224900719| 0.5499999999999999|
|0.49570849800464484| 0.5853658536585367|
| 0.4816661237269361| 0.6190476190476191|
| 0.4683948211116506| 0.6511627906976744|
|0.39779547722631997| 0.6590909090909091|
|0.37658588312410957| 0.6666666666666667|
| 0.3713561260866751| 0.6521739130434783|
|0.36721040030929136| 0.6595744680851063|
+-------------------+-------------

In [57]:
# Area under the ROC curve on the test split (model with Account_Manager).
result.areaUnderROC

0.8980189491817403

In [58]:
# Compare the actual churn rate with the predicted churn rate on the test
# split (the means of 'churn' and 'prediction' in the output below).
result.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|                269|                269|
|   mean|0.20074349442379183|0.10037174721189591|
| stddev|0.40130315382864634|0.30105512725506384|
|    min|                  0|                0.0|
|    max|                  1|                1.0|
+-------+-------------------+-------------------+



In [59]:
# Show actual label vs. predicted label side by side for a sample of test rows.
result.predictions.select('churn','prediction').show()

+-----+----------+
|churn|prediction|
+-----+----------+
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
+-----+----------+
only showing top 20 rows



In [60]:
# Second feature set: same as before but WITHOUT 'Account_Manager', to see how
# the model performs when it cannot use that column.
assembler = VectorAssembler(
    inputCols=[
        'Age',
        'Total_Purchase',
        'Years',
        'Num_Sites',
    ],
    outputCol='features',
)

In [61]:
# Rebuild the modelling table from the reduced feature set, then split it with
# the same weights and seed as the first experiment.
output = assembler.transform(df)
final_dt = output.select('features', 'churn')
train_data, test_data = final_dt.randomSplit(
    [0.7, 0.3],
    seed=231120,
)

In [62]:
# Refit and re-evaluate the logistic regression on the reduced feature set.
# The last expression displays the test-set ROC AUC, for comparison with the
# first model's value.
lr = LogisticRegression(labelCol='churn')
model = lr.fit(train_data)
result = model.evaluate(test_data)
result.areaUnderROC

0.8956072351421193

If we include the account manager feature, we get slightly better results in our prediction (AUC of about 0.898 versus 0.896 without it). However, this is not good for our model: we already know that if we assign an account manager to a business they will stay as our client, and what we want to know is when this is necessary and when it is not.
The model without the account manager still gives us good results.