In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Random_Forest_Customer_Banking")
    .getOrCreate()
)

# Read the customer banking CSV data; the first row is treated as the header
# and column types are inferred from the values.
customer_banking = spark.read.csv(
    "data-resources/customer_banking/",
    header=True,
    inferSchema=True,
)

In [None]:
# Quick look at the first five rows of the raw data.
customer_banking.show(n=5)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoderEstimator

In [None]:
# Keep only the columns used in the rest of the notebook.
df = customer_banking.select(
    'age', 'job', 'marital', 'education', 'default',
    'balance', 'housing', 'loan', 'contact', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome', 'deposit',
)

# Remember the selected column names so they can be re-selected later.
cols = df.columns
df.printSchema()

In [None]:
# String-valued columns that get a StringIndexer each.
categoricalColumns = [
    'job', 'marital', 'education', 'default',
    'housing', 'loan', 'contact', 'poutcome',
]

# Container for pipeline stages; starts out empty.
stages = list()

In [None]:
# One StringIndexer per categorical column; each writes to "<column>_index".
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_index")
    for name in categoricalColumns
]

# Run all indexers over the selected columns in a single pipeline.
pipeline = Pipeline(stages=indexers)
string_index_df = pipeline.fit(df).transform(df)

# Preview the first ten indexed rows as a pandas frame.
string_index_df.limit(num=10).toPandas()

In [None]:
# One-hot encoder over every "<column>_index" column; each result is written
# to a matching "<column>_index_encoded" column.
encoder = OneHotEncoderEstimator(
    inputCols=[ix.getOutputCol() for ix in indexers],
    outputCols=[ix.getOutputCol() + "_encoded" for ix in indexers],
)

In [None]:
# Fit the encoder on the indexed data, then apply it to the same data.
encode_vector = encoder.fit(dataset=string_index_df)
encode_vector_df = encode_vector.transform(dataset=string_index_df)

In [None]:
# Preview the first five encoded rows as a pandas frame.
encode_vector_df.limit(num=5).toPandas()

In [None]:
# Indexer that turns the 'deposit' column into the 'label' column.
labeling_deposit = StringIndexer(inputCol='deposit', outputCol='label')

In [None]:
# Fit the label indexer on the encoded data and add the 'label' column to it.
labeling_string = labeling_deposit.fit(dataset=encode_vector_df)
label_df = labeling_string.transform(dataset=encode_vector_df)

In [None]:
# Preview the first five labelled rows as a pandas frame.
label_df.limit(num=5).toPandas()

In [None]:
# Output column name of every categorical indexer, in indexer order.
string_indexer_cols = [indexer.getOutputCol() for indexer in indexers]

In [None]:
# Numeric columns that go into the feature vector as-is.
numericCols = [
    'age', 'balance', 'duration',
    'campaign', 'pdays', 'previous',
]

# Feature vector layout: indexed categorical columns first, then the numeric ones.
assemblerInputs = string_indexer_cols + numericCols
assembler = VectorAssembler(
    inputCols=assemblerInputs,
    outputCol="features",
)

In [None]:
# Add the 'features' vector column to the labelled data.
assemble_model = assembler.transform(dataset=label_df)

# Preview the first three assembled rows as a pandas frame.
assemble_model.limit(num=3).toPandas()

In [None]:
from pyspark.ml import Pipeline

# Build the real stage list. The pipeline used to be constructed from an empty
# `stages` list, so fitting it was a no-op and the select() below failed because
# neither 'label' nor 'features' existed.
# Stage order: categorical indexers -> label indexer -> feature assembler.
# The one-hot encoder is left out because the assembler's inputs are the
# "<column>_index" columns, so the encoded columns would never be used.
stages = indexers + [labeling_deposit, assembler]
pipeline = Pipeline(stages=stages)

# Fit and transform `df` (the untouched column selection) rather than
# `customer_banking`, which this cell overwrites. This keeps the cell
# re-runnable: a second run does not trip over already-existing output columns.
pipelineModel = pipeline.fit(df)

# Keep the label, the feature vector and the originally selected columns.
selectedCols = ['label', 'features'] + cols
customer_banking = pipelineModel.transform(df).select(selectedCols)
customer_banking.printSchema()

In [None]:
# Display the prepared data (20 rows, the default for show()).
customer_banking.show(n=20)

In [None]:
# 70/30 random split with a fixed seed.
train, test = customer_banking.randomSplit(weights=[0.7, 0.3], seed=2018)

# Report how many rows landed in each split.
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

## Random Forest Classifier

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest over the 'features' vector, predicting 'label'.
rf = RandomForestClassifier(featuresCol='features', labelCol='label')

# Fit on the training split and score the test split.
rfModel = rf.fit(train)
predictions = rfModel.transform(test)

# Show a few input columns next to the model outputs for the first ten rows.
predictions.select(
    'age', 'job', 'label',
    'rawPrediction', 'prediction', 'probability',
).show(10)

In [None]:
# BinaryClassificationEvaluator was used without ever being imported, which
# raises NameError on a fresh kernel; import it here next to its only use.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the test-set predictions by area under the ROC curve. The column names
# are spelled out so it is clear which prediction columns are being read.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions)))