In [27]:
import findspark
findspark.init('/home/ubuntu/Spark/spark-3.3.0-bin-hadoop3')

In [28]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Logistic_Regression_Test').getOrCreate()

In [29]:
df = spark.read.format("csv").option("header", "true").load("customer_churn.csv")

In [30]:
df.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [31]:
df.head(1)

[Row(Names='Cameron Williams', Age='42.0', Total_Purchase='11066.8', Account_Manager='0', Years='7.22', Num_Sites='8.0', Onboard_date='2013-08-30 07:00:40', Location='10265 Elizabeth Mission Barkerburgh, AK 89518', Company='Harvey LLC', Churn='1')]

In [32]:
df = df.selectExpr("cast(Names as string) Names", "cast(Age as float) Age", "cast(Total_Purchase as float) Total_Purchase", 
                   "cast(Account_Manager as int) Account_Manager",
                     "cast(Years as float) Years", 
                  "cast(Num_Sites as float) Num_Sites", "cast(Onboard_date as string) Onboard_date", 
                  "cast(Location as string) Location", "cast(Company as string) Company", 
                  "cast(Churn as int) Churn")
df.head(1)

[Row(Names='Cameron Williams', Age=42.0, Total_Purchase=11066.7998046875, Account_Manager=0, Years=7.21999979019165, Num_Sites=8.0, Onboard_date='2013-08-30 07:00:40', Location='10265 Elizabeth Mission Barkerburgh, AK 89518', Company='Harvey LLC', Churn=1)]

In [33]:
my_cols = df.select([
    'Age',
    'Total_Purchase',
    'Years',
    'Num_Sites',
    'Onboard_date',
    'Churn',
    'Account_Manager',
    'Location',
    'Company'
])

In [34]:
#Deal with missing data - keeping it simple
my_final_data = my_cols.na.drop()

In [35]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, StringIndexer, OneHotEncoder

In [36]:
date_indexer = StringIndexer(inputCol='Onboard_date', outputCol='Onboard_dateIndex')
date_indexer.setHandleInvalid('keep')
date_encoder = OneHotEncoder(inputCol='Onboard_dateIndex', outputCol='Onboard_dateVec')

In [11]:
# location_indexer = StringIndexer(inputCol='Location', outputCol='LocationIndex')
# location_indexer.setHandleInvalid('keep')
# location_encoder = OneHotEncoder(inputCol='LocationIndex', outputCol='LocationVec')

In [12]:
# company_indexer = StringIndexer(inputCol='Company', outputCol='CompanyIndex')
# company_indexer.setHandleInvalid('keep')
# company_encoder = OneHotEncoder(inputCol='CompanyIndex', outputCol='CompanyVec')

In [37]:
assembler = VectorAssembler(inputCols=[  #'LocationVec', 'CompanyVec',
                                        'Onboard_dateVec', 'Age'
                                       ,'Total_Purchase','Years','Num_Sites','Account_Manager',], outputCol='features')

In [38]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [39]:
log_reg_churn = LogisticRegression(featuresCol='features', labelCol='Churn')

In [40]:
pipeline = Pipeline(stages=[date_indexer, date_encoder, 
                            assembler, log_reg_churn,
#                            company_encoder, company_indexer,
#                             location_indexer, location_encoder
                           ])

In [41]:
train_data, test_data = my_final_data.randomSplit([0.7, 0.3])

In [42]:
fit_model = pipeline.fit(train_data)

In [43]:
results = fit_model.transform(test_data)

In [44]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [45]:
myeval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Churn')

In [46]:
myeval.evaluate(results)

0.5

In [47]:
results.select('Churn', 'prediction').show()

+-----+----------+
|Churn|prediction|
+-----+----------+
|    0|       0.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
+-----+----------+
only showing top 20 rows



In [48]:
AUC = myeval.evaluate(results)

In [49]:
AUC

0.5