In [1]:
# Start (or reuse) the Spark session that every later cell works through
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('logistic_regression')
    .getOrCreate()
)

In [2]:
# Loading the dataset
# `spark.sql` expects a complete SQL statement; a bare table name is not a
# valid query and fails to parse, so select every column from the table.
df = spark.sql('SELECT * FROM customer_churn_csv')

In [3]:
# Preview the first rows of the loaded data (Spark's default: 20 rows,
# long cell values truncated)
df.show(n=20, truncate=True)

In [4]:
# Summary statistics of the loaded data
df.describe().show(n=20, truncate=True)

In [5]:
# Pack the predictor columns into the single vector column ('features')
# that Spark ML estimators read, then keep only that vector and the label
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['Age', 'Total_Purchase', 'Years', 'Num_Sites'],
)
final_df = (
    assembler
    .transform(df)
    .select('features', 'Churn')
)

In [6]:
# Train/test split (70% / 30%).
# A fixed seed keeps the split, and every metric computed from it,
# the same across Restart & Run All instead of changing on each run.
train_data, test_data = final_df.randomSplit([0.7, 0.3], seed=42)

In [7]:
# Configure a logistic regression on the assembled features with 'Churn'
# as the label, then fit it on the training split only
from pyspark.ml.classification import LogisticRegression
classifier = LogisticRegression(
    labelCol='Churn',
    featuresCol='features',
    predictionCol='prediction',
)
fitted_classifier = classifier.fit(train_data)

In [8]:
# Take the fitted model's summary and show descriptive statistics of the
# predictions it holds
summary = fitted_classifier.summary
summary.predictions.describe().show(n=20, truncate=True)

In [9]:
# Run the fitted model against the held-out test split and display the
# resulting prediction rows
pred_vs_actual = fitted_classifier.evaluate(test_data)
pred_vs_actual.predictions.show(n=20, truncate=True)

In [10]:
# Area under the ROC curve, evaluated on the test-set predictions
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# The ROC curve has to be built from the model's continuous scores
# ('rawPrediction'). Feeding the evaluator the hard 0/1 'prediction' column
# collapses the curve to a single threshold and misreports the AUC.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Churn',
    metricName='areaUnderROC',
)
evaluator.evaluate(pred_vs_actual.predictions)

In [11]:
# Predicting for customers that have no churn label yet:
# start by reading them from the `new_customers_csv` table
df_unlabeled = spark.sql(
    'SELECT * FROM new_customers_csv'
)

In [12]:
# Preview the unlabeled customers
df_unlabeled.show(n=20, truncate=True)

In [13]:
# Reuse the same assembler so the unlabeled rows get a 'features' vector
# built from the same input columns as the training data
final_df_unlabeled = assembler.transform(dataset=df_unlabeled)

In [14]:
# Inspect the unlabeled rows with the assembled 'features' column appended
final_df_unlabeled.show(n=20, truncate=True)

In [15]:
# Refit the same estimator on every labeled row (no hold-out), then use
# that model to score the unlabeled customers
classifier_all = classifier.fit(dataset=final_df)
results = classifier_all.transform(dataset=final_df_unlabeled)

In [16]:
# Show the predicted class next to each company name
results.select('Company', 'prediction').show(n=20, truncate=True)