# Logistic Regression Consulting Project

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('LogProject').getOrCreate()

In [8]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [2]:
raw_data = spark.read.csv('customer_churn.csv', inferSchema=True, header=True)

In [4]:
raw_data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [5]:
raw_data.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [6]:
cols = raw_data.select(['Age',
                        'Total_Purchase',
                        'Years',
                        'Num_Sites',
                        'Churn'
                        ])


In [7]:
cleaned_data = cols.na.drop()

In [9]:
cleaned_data.printSchema()

root
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Churn: integer (nullable = true)



In [13]:
assembler = VectorAssembler(inputCols=['Age',
                                       'Total_Purchase',
                                       'Years',
                                       'Num_Sites'
                                       ],
                                       outputCol='features')


In [14]:
output = assembler.transform(raw_data)

In [15]:
final_data = output.select('features', 'Churn')

In [16]:
train, test = final_data.randomSplit([0.7,0.3])

In [17]:
lr_churn = LogisticRegression(labelCol='Churn')

In [18]:
fitted_churn_model = lr_churn.fit(train)

In [19]:
training_summary = fitted_churn_model.summary

In [20]:
training_summary.predictions.describe().show()

+-------+-------------------+------------------+
|summary|              Churn|        prediction|
+-------+-------------------+------------------+
|  count|                625|               625|
|   mean|             0.1712|             0.128|
| stddev|0.37698534284232965|0.3343574012862161|
|    min|                0.0|               0.0|
|    max|                1.0|               1.0|
+-------+-------------------+------------------+



In [30]:
pred_and_labels = fitted_churn_model.evaluate(test)

In [33]:
pred_and_labels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|Churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,8787.39,5.4...|    1|[0.75083439654010...|[0.67936048281598...|       0.0|
|[28.0,8670.98,3.9...|    0|[6.76602400075232...|[0.99884905922332...|       0.0|
|[28.0,9090.43,5.7...|    0|[1.55273785504629...|[0.82530881146474...|       0.0|
|[28.0,11128.95,5....|    0|[4.03586634495532...|[0.98263645532100...|       0.0|
|[29.0,9617.59,5.4...|    0|[3.78197100167917...|[0.97772951932518...|       0.0|
|[29.0,11274.46,4....|    0|[4.34527014678139...|[0.98719801173174...|       0.0|
|[29.0,12711.15,5....|    0|[4.73931164776281...|[0.99133114271715...|       0.0|
|[29.0,13255.05,4....|    0|[4.11305483438520...|[0.98390554081190...|       0.0|
|[30.0,6744.87,5.1...|    0|[2.82502085807464...|[0.94401302334842...|       0.0|
|[30.0,8874.83,5

In [34]:
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Churn')

In [35]:
AUC = churn_eval.evaluate(pred_and_labels.predictions)

In [36]:
AUC

0.7566659983961508

In [37]:
final_lr_model = lr_churn.fit(final_data)

In [38]:
new_customers = spark.read.csv('new_customers.csv', inferSchema=True, header=True)

In [39]:
new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [40]:
test_new_customers = assembler.transform(new_customers)

In [41]:
test_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [42]:
final_results = final_lr_model.transform(test_new_customers)

In [45]:
final_results.select('Company','prediction').show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

