In [35]:
# Binary Customer Churn

# A marketing agency has many customers that use their service to produce ads for the
# client/customer websites. They've noticed that they have quite a bit of churn in clients.
# They basically randomly assign account managers right now, but want you to create a machine 
# learning model that will help predict which customers will churn (stop buying their service)
# so that they can correctly assign the customers most at risk to churn an account manager. 
# We will create a classification algorithm that will help classify whether or not a customer 
# churned. Then the company can test this against incoming data for future customers to predict which customers will churn 
# and assign them an account manager.

# The data is saved as customer_churn.csv. Here are the fields and their definitions:

# Name : Name of the latest contact at Company
# Age: Customer Age
# Total_Purchase: Total Ads Purchased
# Account_Manager: Binary 0=No manager, 1= Account manager assigned
# Years: Totaly Years as a customer
# Num_sites: Number of websites that use the service.
# Onboard_date: Date that the name of the latest contact was onboarded
# Location: Client HQ Address
# Company: Name of Client Company

In [1]:
# Import the findspark module and initialize it with the specified Spark path
import findspark
findspark.init('/home/mina/python-spark/spark-3.4.0-bin-hadoop3/')

# Import the pyspark module and the SparkSession class
import pyspark
from pyspark.sql import SparkSession

# Create a Spark session with the specified app name
spark = SparkSession.builder.appName('Custmoer_Churn').getOrCreate()

23/09/13 16:34:55 WARN Utils: Your hostname, mina-VirtualBox resolves to a loopback address: 127.0.1.1; using 192.168.1.143 instead (on interface enp0s3)
23/09/13 16:34:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/13 16:34:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/09/13 16:34:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
# Read a CSV file named 'customer_churn.csv' into a DataFrame
# The 'inferSchema=True' option infers data types for columns, and 'header=True' treats the first row as column names
dataset = spark.read.csv('customer_churn.csv', header=True, inferSchema=True)

<h1>Check out the data</h1>

In [38]:
# Loop through the first two rows of the dataset and print each row
for record in dataset.head(2):
    print(record, '\n')

Row(Names='Cameron Williams', Age=42.0, Total_Purchase=11066.8, Account_Manager=0, Years=7.22, Num_Sites=8.0, Onboard_date=datetime.datetime(2013, 8, 30, 7, 0, 40), Location='10265 Elizabeth Mission Barkerburgh, AK 89518', Company='Harvey LLC', Churn=1) 

Row(Names='Kevin Mueller', Age=41.0, Total_Purchase=11916.22, Account_Manager=0, Years=6.5, Num_Sites=11.0, Onboard_date=datetime.datetime(2013, 8, 13, 0, 38, 46), Location='6157 Frank Gardens Suite 019 Carloshaven, RI 17756', Company='Wilson PLC', Churn=1) 



In [39]:
# Print the schema of the dataset
dataset.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [40]:
# Compute summary statistics and display them
dataset.summary().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

In [41]:
# show the names of columns
dataset.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

<h1>Format for MLlib</h1>

In [42]:
# Import the necessary modules for creating VectorAssembly
from pyspark.ml.feature import VectorAssembler

# Create a VectorAssembler to assemble selected columns into a feature vector
assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase',
                                       'Account_Manager' ,'Years',
                                       'Num_Sites'],
                                        outputCol='Features')

# Transform the DataFrame using the VectorAssembler to add the 'Features' column
output = assembler.transform(dataset)

# Select the desired columns 'Features' and 'Churn' from the transformed DataFrame
final_dataset = output.select('Features','Churn')

# Split the final dataset into training and testing sets
train_churn,test_churn = final_dataset.randomSplit([0.7,0.3])

<h1>Fit the Model</h1>

In [43]:
# Import the necessary modules
from pyspark.ml.classification import LogisticRegression

# Create a LogisticRegression model
lr_churn = LogisticRegression(featuresCol='Features',labelCol='Churn')

# Fit the model on the training data
fit_model = lr_churn.fit(train_churn)

# Get the summary of the model's training
training_sum = fit_model.summary

# Show columns from the predictions DataFrame
training_sum.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            Features|Churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,1....|  0.0|[4.63431745785773...|[0.99038069561911...|       0.0|
|[25.0,9672.03,0.0...|  0.0|[4.49286992774660...|[0.98893530961276...|       0.0|
|[26.0,8787.39,1.0...|  1.0|[0.71154311629332...|[0.67074204239835...|       0.0|
|[26.0,8939.61,0.0...|  0.0|[6.11524442531096...|[0.99779593285916...|       0.0|
|[28.0,11128.95,1....|  0.0|[4.16352113113749...|[0.98468548374846...|       0.0|
|[28.0,11204.23,0....|  0.0|[1.67555171650752...|[0.84231460619983...|       0.0|
|[28.0,11245.38,0....|  0.0|[3.60424545734254...|[0.97351269896182...|       0.0|
|[29.0,9617.59,0.0...|  0.0|[4.23322410369632...|[0.98570185474190...|       0.0|
|[29.0,10203.18,1....|  0.0|[3.76817665844374...|[0.97742716661120...|       0.0|
|[29.0,12711.15,

In [44]:
training_sum.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              Churn|         prediction|
+-------+-------------------+-------------------+
|  count|                615|                615|
|   mean|0.17560975609756097|0.12682926829268293|
| stddev| 0.3807975223122206|0.33305250212923654|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [45]:
# Evaluate the model on the test dataset
pre_and_label = fit_model.evaluate(test_churn)

# Show the predictions made on the test dataset
pre_and_label.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            Features|Churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[27.0,8628.8,1.0,...|    0|[5.39365940844732...|[0.99547525663991...|       0.0|
|[28.0,8670.98,0.0...|    0|[7.44872594739648...|[0.99941815586819...|       0.0|
|[28.0,9090.43,1.0...|    0|[1.57429680591052...|[0.82839528947316...|       0.0|
|[29.0,5900.78,1.0...|    0|[4.05432909154966...|[0.98294867665427...|       0.0|
|[29.0,8688.17,1.0...|    1|[2.71373002577927...|[0.93783197639881...|       0.0|
|[29.0,9378.24,0.0...|    0|[4.53169247112310...|[0.98935215136931...|       0.0|
|[29.0,11274.46,1....|    0|[4.45037849011203...|[0.98846056547699...|       0.0|
|[30.0,6744.87,0.0...|    0|[3.28031662080738...|[0.96374734747347...|       0.0|
|[30.0,7960.64,1.0...|    1|[3.04744413501567...|[0.95467205418320...|       0.0|
|[30.0,8874.83,0

In [46]:
pre_and_label.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              Churn|         prediction|
+-------+-------------------+-------------------+
|  count|                285|                285|
|   mean|0.14736842105263157|0.12982456140350876|
| stddev|0.35509632850873796| 0.3367015397342311|
|    min|                  0|                0.0|
|    max|                  1|                1.0|
+-------+-------------------+-------------------+



<h1>Evaluate Results</h1>

In [47]:
# Import Evaluation module
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create a BinaryClassificationEvaluator specifying the raw prediction and label columns
churn_evl = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Churn')

# Calculate the AUC by evaluating the predictions
AUC = churn_evl.evaluate(pre_and_label.predictions)

# Display the calculated AUC
AUC

0.8148148148148149

<h1>Predict on brand new unlabeled data</h1>

In [48]:
# Create and fit a logistic regression model on the final dataset
final_lr_model = lr_churn.fit(final_dataset)

# Read data from 'new_customers.csv' into a DataFrame
new_customers = spark.read.csv('new_customers.csv',inferSchema=True,
                              header=True)

# Print the schema of the 'new_customers' DataFrame
new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [50]:
# Transform the 'new_customers' DataFrame using the VectorAssembler
test_new_customers = assembler.transform(new_customers)

# Print the schema of the 'test_new_customers' DataFrame
test_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Features: vector (nullable = true)



In [53]:
# Use the trained logistic regression model to make predictions on 'test_new_customers'
final_results = final_lr_model.transform(test_new_customers)

# Select the 'Company' and 'prediction' columns from the results
final_results.select('Company','prediction').show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

