In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('Customer_churn_predictor').getOrCreate()

In [3]:
from pyspark.ml.classification import LogisticRegression

In [4]:
data = spark.read.csv('/home/ubuntu/Course_Notes/Spark_for_Machine_Learning/Logistic_Regression/customer_churn.csv',inferSchema=True,header=True)

In [5]:
data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [9]:
from pyspark.ml.feature import StringIndexer,VectorAssembler,VectorIndexer

In [10]:
loc_indexer  = StringIndexer(inputCol="Location",outputCol="Loc_index")

In [11]:
comp_indexer = StringIndexer(inputCol='Company',outputCol='Comp_index')

In [12]:
indexed_data = loc_indexer.fit(data).transform(data)

In [13]:
indexed_data = comp_indexer.fit(indexed_data).transform(indexed_data)

In [14]:
indexed_data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)
 |-- Loc_index: double (nullable = true)
 |-- Comp_index: double (nullable = true)



In [15]:
from pyspark.sql.functions import year,month

In [17]:
indexed_data.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn',
 'Loc_index',
 'Comp_index']

In [34]:
assembler = VectorAssembler(inputCols=['Age',
                                         'Total_Purchase',
                                         'Years',
                                         'Num_Sites',
                                         'Loc_index',
                                         'Comp_index'],outputCol = 'features')

In [35]:
final_data = assembler.transform(indexed_data)

In [37]:
finalized_data = final_data.select(['features','Churn'])

In [38]:
train,test = finalized_data.randomSplit([0.7,0.3])

In [39]:
mod = LogisticRegression(labelCol='Churn')

In [40]:
log_model = mod.fit(train)

In [41]:
evaluaor = log_model.evaluate(test)

In [53]:
roc_data = evaluaor.roc.show()

+--------------------+--------------------+
|                 FPR|                 TPR|
+--------------------+--------------------+
|                 0.0|                 0.0|
|0.004587155963302...|0.022727272727272728|
|0.004587155963302...| 0.06818181818181818|
|0.004587155963302...| 0.11363636363636363|
|0.004587155963302...|  0.1590909090909091|
|0.004587155963302...| 0.20454545454545456|
|0.009174311926605505| 0.22727272727272727|
|0.009174311926605505|  0.2727272727272727|
|0.013761467889908258| 0.29545454545454547|
|0.013761467889908258|  0.3409090909090909|
| 0.01834862385321101| 0.36363636363636365|
| 0.01834862385321101|  0.4090909090909091|
|0.022935779816513763|  0.4318181818181818|
|0.022935779816513763|  0.4772727272727273|
|0.027522935779816515|                 0.5|
| 0.03211009174311927|  0.5227272727272727|
| 0.03211009174311927|  0.5681818181818182|
| 0.03669724770642202|  0.5909090909090909|
| 0.04128440366972477|  0.6136363636363636|
| 0.05045871559633028|  0.613636

In [56]:
evaluaor.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|Churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[28.0,9090.43,5.7...|    0|[1.62974796513985...|[0.83613510960236...|       0.0|
|[28.0,11245.38,6....|    0|[3.45077045695433...|[0.96925410901071...|       0.0|
|[29.0,13240.01,4....|    0|[6.80648932039890...|[0.99889465224100...|       0.0|
|[29.0,13255.05,4....|    0|[4.24658632162209...|[0.98588896083100...|       0.0|
|[31.0,5387.75,6.8...|    0|[2.13360140847095...|[0.89412641760864...|       0.0|
|[31.0,11743.24,5....|    0|[6.42455003192392...|[0.99838136160768...|       0.0|
|[32.0,6367.22,2.8...|    0|[2.85236818996961...|[0.94544096851340...|       0.0|
|[32.0,9885.12,6.9...|    1|[1.90555165564585...|[0.87051857420108...|       0.0|
|[32.0,12547.91,7....|    0|[0.11480872843149...|[0.52867069654774...|       0.0|
|[33.0,5738.82,7

In [57]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [68]:
bi_evalutor = BinaryClassificationEvaluator(rawPredictionCol= 'prediction',labelCol= 'Churn')

In [69]:
auc = bi_evalutor.evaluate(evaluaor.predictions)

In [70]:
auc

0.6840075062552127

In [71]:
final_lrZ_model  = mod.fit(final_data)

In [74]:
new_custs = spark.read.csv('/home/ubuntu/Course_Notes/Spark_for_Machine_Learning/Logistic_Regression/new_customers.csv',inferSchema=True,header=True)

In [75]:
new_custs.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [76]:
indexed_test = loc_indexer.fit(new_custs).transform(new_custs)

In [77]:
indexed_test = comp_indexer.fit(indexed_test).transform(indexed_test)

In [78]:
new_cust_test = assembler.transform(indexed_test)

In [79]:
new_cust_test.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Loc_index: double (nullable = true)
 |-- Comp_index: double (nullable = true)
 |-- features: vector (nullable = true)



In [80]:
final_test = log_model.transform(new_cust_test)

In [81]:
final_test.select(['Company','prediction']).show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

