In [None]:
#Copyright 2023 Google LLC.
#SPDX-License-Identifier: Apache-2.0
#Importing libraries
import pyspark
from pyspark.sql import SparkSession
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder as LE
import sys
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import LogisticRegression,\
                    RandomForestClassifier, GBTClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:

#Get the Spark session for this notebook: getOrCreate() reuses an existing
#session if there is one, otherwise it starts a new one named 'churn model'.
spark = (
    SparkSession.builder
    .appName('churn model')
    .getOrCreate()
)

In [None]:
#Deployment-specific settings used by the rest of the notebook.
#Each <<...>> token below is a template placeholder, not valid Python: replace it
#with a quoted string before running this cell (the values are concatenated with
#other strings further down, so they must be str).
#  project_name - set as the connector's "parentProject" and used to qualify BigQuery table names
#  dataset_name - BigQuery dataset part of every table name read or written
#  bucket_name  - GCS bucket used in the gs:// paths and as "temporaryGcsBucket"
#  user_name    - prefix of the BigQuery table names and of the saved model folder
project_name=<<your_project_name>>
dataset_name=<<your_bq_dataset_name>>
bucket_name=<<your_code_bucket>>
user_name=<<your_username_here>>

In [None]:
#Load the churn training CSV from the code bucket into a Spark DataFrame.
#The first line of the file is used as the header and Spark infers the column types.
churn_dataset_df = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv(f"gs://{bucket_name}/customer-churn-prediction-vertex-ai/01-datasets/customer_churn_train_data.csv")
)

In [None]:
#Clean the TotalCharges column.
#Entries that are a single space become null and the column is cast to float.
#After that, every row holding a null in ANY column is dropped: na.drop() is not
#restricted to TotalCharges.
#
#Only the two helpers the notebook actually uses are imported. A star import from
#pyspark.sql.functions would shadow Python builtins such as sum, min, max and round
#for every cell that runs afterwards.
from pyspark.sql.functions import col, when

dfWithEmptyReplaced = churn_dataset_df.withColumn(
    'TotalCharges',
    when(col('TotalCharges') == ' ', None).otherwise(col('TotalCharges')).cast("float"),
)
dfWithEmptyReplaced = dfWithEmptyReplaced.na.drop()

In [None]:
#Add-on service columns in which the value 'No internet service' is treated as 'No'
replace_cols = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

In [None]:
#For every add-on column, rewrite the value "No internet service" as "No" and keep
#all other values unchanged. dfWithEmptyReplaced is rebound on each pass so the
#replacements accumulate; the resulting frame is also available as dfwithNo.
for col_name in replace_cols:
    current_value = col(col_name)
    dfWithEmptyReplaced = dfWithEmptyReplaced.withColumn(
        col_name,
        when(current_value == "No internet service", "No").otherwise(current_value),
    )
    dfwithNo = dfWithEmptyReplaced
#Register the cleaned frame as the temp view "datawrangling" so Spark SQL can query it
dfwithNo.createOrReplaceTempView("datawrangling")

In [None]:
# Build the modelling table with Spark SQL.
# The query keeps the customer attributes from the "datawrangling" view, removes
# duplicate rows (select distinct) and adds tenure_group, a bucketed version of
# tenure (0-12, 12-24, 24-48, 48-60, >60). Rows matching none of the branches
# (e.g. a null tenure) get a null tenure_group because the CASE has no ELSE.
df_wrangling = spark.sql("""
select distinct
         customerID
        ,gender
        ,SeniorCitizen
        ,Partner
        ,Dependents
        ,tenure
        ,case when (tenure<=12) then "Tenure_0-12"
              when (tenure>12 and tenure <=24) then "Tenure_12-24"
              when (tenure>24 and tenure <=48) then "Tenure_24-48"
              when (tenure>48 and tenure <=60) then "Tenure_48-60"
              when (tenure>60) then "Tenure_gt_60"
        end as tenure_group
        ,PhoneService
        ,MultipleLines
        ,InternetService
        ,OnlineSecurity
        ,OnlineBackup
        ,DeviceProtection
        ,TechSupport
        ,StreamingTV
        ,StreamingMovies
        ,Contract
        ,PaperlessBilling
        ,PaymentMethod
        ,MonthlyCharges
        ,TotalCharges
        ,Churn
    from datawrangling
""")

# Seeded 70/30 split into training and test sets.
trainingData, testData = df_wrangling.randomSplit([0.7, 0.3], seed=200)

# Settings read by the BigQuery connector for the writes below.
# NOTE(review): here the temporary bucket value carries a folder suffix, while the
# later cells pass the bare bucket name - confirm which one is intended.
spark.conf.set("parentProject", project_name)
bucket = f"{bucket_name}/customer-churn-prediction-vertex-ai"
spark.conf.set("temporaryGcsBucket", bucket)

# Persist the training split, replacing the table contents if it already exists.
(
    trainingData.write
    .format('bigquery')
    .mode("overwrite")
    .option('table', f"{project_name}:{dataset_name}.{user_name}_training_data")
    .save()
)

# Persist the test split the same way.
(
    testData.write
    .format('bigquery')
    .mode("overwrite")
    .option('table', f"{project_name}:{dataset_name}.{user_name}_test_data")
    .save()
)

In [None]:
#Training
#Reload the training split from the BigQuery table written earlier in the notebook.
trainingData = (
    spark.read
    .format('bigquery')
    .load(f"{project_name}.{dataset_name}.{user_name}_training_data")
)

#Force the yes/no style columns and the Churn label to string.
#NOTE(review): presumably this guards against BigQuery returning a non-string
#type for these columns before they reach StringIndexer - confirm.
trainingData = (
    trainingData
    .withColumn("Partner", trainingData["Partner"].cast('string'))
    .withColumn("Dependents", trainingData["Dependents"].cast('string'))
    .withColumn("PhoneService", trainingData["PhoneService"].cast('string'))
    .withColumn("PaperlessBilling", trainingData["PaperlessBilling"].cast('string'))
    .withColumn("Churn", trainingData["Churn"].cast('string'))
)


In [None]:
#Reload the held-out test split from the BigQuery table written earlier in the notebook.
testData = (
    spark.read
    .format('bigquery')
    .load(f"{project_name}.{dataset_name}.{user_name}_test_data")
)

#Apply the same string casts that are applied to the training data so both
#frames reach the pipeline with identical column types.
testData = (
    testData
    .withColumn("Partner", testData["Partner"].cast('string'))
    .withColumn("Dependents", testData["Dependents"].cast('string'))
    .withColumn("PhoneService", testData["PhoneService"].cast('string'))
    .withColumn("PaperlessBilling", testData["PaperlessBilling"].cast('string'))
    .withColumn("Churn", testData["Churn"].cast('string'))
)


In [None]:
# Columns handled as categorical features: each one is string-indexed and then
# one-hot encoded.
categoricalColumns = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]
stages = []  # ordered stages of the ML Pipeline; later cells append to this list
for categoricalCol in categoricalColumns:
    index_col = categoricalCol + "Index"
    vector_col = categoricalCol + "classVec"
    # Map every category value to a numeric index (<column>Index).
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=index_col)
    # Turn the index into a one-hot vector column (<column>classVec).
    encoder = OneHotEncoder(inputCols=[index_col], outputCols=[vector_col])
    # Nothing is computed here: the stages only run when the pipeline is fitted.
    stages.extend([stringIndexer, encoder])

In [None]:
# Index the string Churn column into the numeric "label" column that the
# classifier and the evaluator read, and queue the indexer as a pipeline stage.
label_stringIdx = StringIndexer(inputCol="Churn", outputCol="label")
stages.append(label_stringIdx)

In [None]:
# Assemble the feature vector, train a random forest, score the test split,
# report accuracy, and persist both the predictions and the fitted pipeline.

# Combine the numeric columns and every one-hot vector into one "features" column.
numericCols = ['MonthlyCharges', 'TotalCharges']
assemblerInputs = numericCols + [c + "classVec" for c in categoricalColumns]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
IDcols = ['customerID']

# Accuracy of the "prediction" column against the indexed "label" column.
evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                              metricName='accuracy')

# Fix the seed (same value as the train/test split) so that re-running the
# notebook yields the same forest, accuracy and predictions.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=200)

# Build the pipeline from a NEW list rather than appending to `stages` in place:
# with in-place appends, running this cell a second time would add the assembler
# and the classifier again and the fit would fail on the duplicated output columns.
pipeline_rf = Pipeline(stages=stages + [assembler, rf])

# Fit all stages (indexers, encoders, assembler, classifier) on the training split.
rf_model = pipeline_rf.fit(trainingData)

# Score the held-out split and report its accuracy.
test_pred = rf_model.transform(testData)

accurac = evaluator.evaluate(test_pred)

print(accurac)

# Write the scored test split to BigQuery, replacing any previous table contents.
spark.conf.set("parentProject", project_name)
bucket = bucket_name
spark.conf.set("temporaryGcsBucket", bucket)
test_pred.write.format('bigquery') \
.mode("overwrite")\
.option('table', project_name+':'+dataset_name+'.'+user_name+'_predictions_data') \
.save()

# Save the fitted pipeline to GCS, overwriting an earlier save at the same path.
rf_model.write().overwrite().save('gs://'+bucket_name+'/customer-churn-prediction-vertex-ai/'+user_name+'_churn_model/model_files')


In [None]:
#Test
#Prepare a one-row input for a scoring smoke test and reload the saved pipeline.

#Read the model-test CSV (header row present, column types inferred by Spark).
sparkDF = spark.read.options(inferSchema = True, header= True).csv('gs://'+bucket_name+'/customer-churn-prediction-vertex-ai/01-datasets/customer_churn_test_model_data.csv')

#Same string casts as applied to the training data (this file is not cast on Churn).
sparkDF=sparkDF.withColumn("Partner",sparkDF.Partner.cast('string')).withColumn("Dependents",sparkDF.Dependents.cast('string')).withColumn("PhoneService",sparkDF.PhoneService.cast('string')).withColumn("PaperlessBilling",sparkDF.PaperlessBilling.cast('string'))

#Keep only the first row. limit(1) keeps sparkDF a DataFrame with its schema
#intact; the previous head(1) + createDataFrame round trip turned it into a list
#of Rows and back, re-inferring the schema, which fails if that row holds a null.
sparkDF = sparkDF.limit(1)

#Load the fitted pipeline saved by the training cell. The path is passed directly:
#the earlier os.path.join(...) wrapper had a single argument (a no-op) and raised
#NameError because os is never imported in this notebook.
from pyspark.ml import PipelineModel
rf_model = PipelineModel.load('gs://'+bucket_name+'/customer-churn-prediction-vertex-ai/'+user_name+'_churn_model/model_files')


In [None]:
#Add-on service columns in which the value 'No internet service' is treated as 'No'
#(same list as used for the training data)
replace_cols = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

In [None]:
#Normalise the scoring input, run the saved pipeline on it, and store/show the result.

#Rewrite "No internet service" as "No" in each add-on column, as done for the
#training data; sparkDF is rebound on each pass so the replacements accumulate.
for col_name in replace_cols:
    dfwithNo = sparkDF.withColumn(col_name, when(col(col_name)== "No internet service","No").otherwise(col(col_name)))
    sparkDF = dfwithNo

#Score the prepared row(s) with the reloaded pipeline.
predic = rf_model.transform(dfwithNo)


#Write the scored output to BigQuery, replacing any previous table contents.
spark.conf.set("parentProject", project_name)
bucket = bucket_name
spark.conf.set("temporaryGcsBucket",bucket)
predic.write.format('bigquery') \
.mode("overwrite")\
.option('table', project_name+':'+dataset_name+'.'+user_name+'_test_output') \
.save()

#show() prints the table itself and returns None, so it must not be wrapped in
#print(): that only appended a stray "None" line to the output.
predic.show(truncate=False)