In [None]:
from pyspark.sql import SparkSession
# Create a Spark session (or reuse one that already exists), running locally
# with the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
bank_train_location = "/FileStore/tables/bank_train.csv"
bank_test_location = "/FileStore/tables/bank_test.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","


def load_bank_table(location):
    """Read one of the bank data files into a Spark DataFrame.

    Both the training and the testing file are read through this single
    helper so that they always share exactly the same reader options.

    Args:
        location: Path handed to the reader's ``load`` call.

    Returns:
        The DataFrame produced by ``spark.read`` for ``location``.
    """
    # The applied options are for CSV files. For other file types, these will be ignored.
    return (
        spark.read.format(file_type)
        .option("inferSchema", infer_schema)
        .option("header", first_row_is_header)
        .option("sep", delimiter)
        .load(location)
    )


bank_train = load_bank_table(bank_train_location)
bank_test = load_bank_table(bank_test_location)

# Preview both tables to confirm the header and inferred columns look right.
bank_train.show()
bank_test.show()

+---+----------+--------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+-----+
|age|       job| marital|education|default|balance|housing|loan|  contact|day|month|duration|campaign|pdays|previous|poutcome|label|
+---+----------+--------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+-----+
| 45|    admin.| married|  unknown|     no|   2033|     no|  no| cellular| 28|  may|      48|       4|   -1|       0| unknown|    0|
| 56|    admin.| married|  primary|     no|    202|    yes|  no|  unknown|  9|  may|     178|       2|   -1|       0| unknown|    0|
| 50| housemaid|  single|secondary|     no|    799|     no|  no|telephone| 28|  jan|      63|       1|   -1|       0| unknown|    0|
| 58|    admin.| married|secondary|     no|   1464|    yes| yes|  unknown|  5|  jun|      53|      29|   -1|       0| unknown|    0|
| 43|management|  single| tertiary|     no|  11891|     no|  no| cell

Build an ML model to predict whether a customer will subscribe to the bank's deposit service. Train the model on the training set and evaluate its performance (e.g. accuracy) on the testing set.
* You can explore different methods to pre-process the data and select appropriate features
* You can utilize different machine learning models and tune model hyperparameters
* Present the final testing accuracy.

In [None]:
# data preparation (4m)
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.feature import ChiSqSelector

# String-valued columns. Each one is indexed to integers and then one-hot
# encoded; this list is the single source of truth for both steps.
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

# Every model input, in the order it is placed into the assembled vector.
# The order is kept exactly as before so feature indices seen by the selector
# and the classifier do not change.
feature_columns = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']

# Convert categorical variables into Integer values
# NOTE(review): no handleInvalid is set, so categories that appear only in the
# test data are handled by StringIndexer's default policy - confirm that is OK.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in categorical_columns
]

# OneHotEncoder
encoders = [
    OneHotEncoder(inputCol=column + "_index", outputCol=column + "_vec")
    for column in categorical_columns
]

# Combine all features: categorical columns contribute their "<name>_vec"
# encoding, every other column is used as-is.
assembler_inputs = [
    column + "_vec" if column in categorical_columns else column
    for column in feature_columns
]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="all_features")

# Select proper number of features (45 is only the starting value; the
# cross-validation grid may override numTopFeatures).
selector = ChiSqSelector(numTopFeatures=45, featuresCol="all_features", outputCol="selected_features", labelCol="label")


In [None]:
# model building (4m)
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Base random-forest classifier. numTrees and maxDepth given here are only
# starting values: the parameter grid below overrides both during the search.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="selected_features",
    numTrees=10,
    maxDepth=4,
    maxBins=32,
    impurity="gini",
    featureSubsetStrategy="auto",
    seed=42,
)

# Full workflow: index -> one-hot encode -> assemble -> select features -> classify.
pipeline = Pipeline(
    stages=indexers + encoders + [assembler, selector, rf]
)

# Accuracy is the metric used both to pick the best grid point and, later, to
# score the held-out test set.
accuracy_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

# Grid search over forest size, tree depth and the number of selected features.
# A first search over numTopFeatures in the 30-50 range reported 45 as optimal;
# this grid refines the search over 42-50.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 15, 20])
    .addGrid(rf.maxDepth, [4, 6, 8])
    .addGrid(selector.numTopFeatures, [42, 44, 46, 48, 50])
    .build()
)

# 3-fold cross validation of the whole pipeline over every grid combination.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=accuracy_evaluator,
    numFolds=3,
    seed=42,
)
cv_model = crossval.fit(bank_train)

# The feature selector is the second-to-last pipeline stage (the classifier is
# last), so read the winning numTopFeatures from the fitted stage at index -2.
best_selector_model = cv_model.bestModel.stages[-2]
optimal_num_top_features = best_selector_model.getNumTopFeatures()
print("Optimal number of top features: {}".format(optimal_num_top_features))


Optimal number of top features: 42


In [None]:
# model evaluation (2m)
# Score the held-out test set with the model returned by cross validation.
predictions = cv_model.transform(dataset=bank_test)

# Final testing accuracy, computed with the same evaluator used for tuning.
accuracy = accuracy_evaluator.evaluate(dataset=predictions)
print(accuracy)


0.8356471115091805
