In [2]:
import findspark
findspark.init('/home/mysparkub/spark-3.0.0-bin-hadoop2.7')

In [36]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql.functions import countDistinct
from pyspark.ml.classification import LogisticRegression

In [4]:
spark = SparkSession.builder.appName('customer_churn').getOrCreate()

In [5]:
data = spark.read.csv('files/customer_churn.csv', inferSchema=True, header=True)

In [6]:
data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [9]:
for i in data.head(1)[0]:
    print(i)

Cameron Williams
42.0
11066.8
0
7.22
8.0
2013-08-30 07:00:40
10265 Elizabeth Mission Barkerburgh, AK 89518
Harvey LLC
1


In [22]:
company_count = data.groupBy('Company').count()
# company_count.orderBy(company_count['count'].desc()).show()
data.select(countDistinct('Company')).show()

+-----------------------+
|count(DISTINCT Company)|
+-----------------------+
|                    873|
+-----------------------+



In [25]:
data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [24]:
data.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [31]:
assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Account_Manager',
                                       'Years', 'Num_Sites'],
                           outputCol='features')

In [52]:
final_data = assembler.transform(data)

In [53]:
final_data = final_data.select('features', 'Churn')

In [54]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [55]:
lgr = LogisticRegression(labelCol='Churn')

In [56]:
lgr = lgr.fit(train_data)

In [57]:
test_results = lgr.evaluate(test_data)

In [58]:
test_results.accuracy

0.8952702702702703

In [59]:
test_results.areaUnderROC

0.9199834543616138

In [60]:
test_df = spark.read.csv('files/new_customers.csv', inferSchema=True, header=True)

In [61]:
test_df.show()

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|
|  Jeremy Chang|65.0|         100.0|              1|  1.0|     15.0|2006-12-11 07:48:13|085 Austin Views ...|Barron-Robertson|
|Megan Ferguson|32.0|        6487.5|              0|  9.4|     14.0|2016-10-28 05:32:13|922 Wright Branch...|   Sexton-Golden|
|  Taylor Young|32.0|      13147.71|              1| 10.0|      8.0|2012-03-20 00:36:46|Unit 0789 Box 073...|  

In [62]:
final_df = assembler.transform(test_df)

In [63]:
final_df = final_df.select('features')

In [66]:
pred = lgr.transform(final_df)

In [67]:
pred.show()

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|[37.0,9935.53,1.0...|[2.01308665282866...|[0.88216425990873...|       0.0|
|[23.0,7526.94,1.0...|[-6.4833515708504...|[0.00152634531777...|       1.0|
|[65.0,100.0,1.0,1...|[-3.8113638330809...|[0.02163937361001...|       1.0|
|[32.0,6487.5,0.0,...|[-5.2145320932962...|[0.00540757601356...|       1.0|
|[32.0,13147.71,1....|[0.91181954710562...|[0.71337235401934...|       0.0|
|[22.0,8445.26,1.0...|[-1.8849352127704...|[0.13182303255750...|       1.0|
+--------------------+--------------------+--------------------+----------+

