# Machine Learning models in PySpark

## Data Loading and exploration

In [1]:
# importing requried packages to initiate spark session
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Setting application name and number of working processors
conf = SparkConf().setAppName("Assignment 2").setMaster("local[*]").set('spark.sql.session.timeZone', 'UTC')
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('ERROR')

### Defining Data schema

In [2]:
from pyspark.sql.types import *

# Defining the schema
pedestrian_schema = StructType([
    StructField('ID', IntegerType(), True),
    StructField('Date_Time', TimestampType(), True),
    StructField('Year', IntegerType(), True),
    StructField('Month', StringType(), True),
    StructField('Mdate', IntegerType(), True),
    StructField('Day', StringType(), True),
    StructField('Time', IntegerType(), True),
    StructField('Sensor_ID', IntegerType(), True),
    StructField('Sensor_Name', StringType(), True),
    StructField('Hourly_Counts', IntegerType(), True)
])

### Load the pedestrian count csv files into dataframe.

In [3]:
# Reading the csv file
pedestrian_df = spark.read.format("csv")\
    .option("header", "true")\
    .schema(pedestrian_schema)\
    .option("timestampFormat", "M/d/y h:m:s a")\
    .load("Pedestrian_Counting_System_-_Monthly__counts_per_hour.csv")

In [4]:
# Checking schema of the read file
pedestrian_df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Date_Time: timestamp (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: string (nullable = true)
 |-- Mdate: integer (nullable = true)
 |-- Day: string (nullable = true)
 |-- Time: integer (nullable = true)
 |-- Sensor_ID: integer (nullable = true)
 |-- Sensor_Name: string (nullable = true)
 |-- Hourly_Counts: integer (nullable = true)



In [5]:
# Checking example datas of the read file
pedestrian_df.show(5)

+-------+-------------------+----+--------+-----+------+----+---------+--------------------+-------------+
|     ID|          Date_Time|Year|   Month|Mdate|   Day|Time|Sensor_ID|         Sensor_Name|Hourly_Counts|
+-------+-------------------+----+--------+-----+------+----+---------+--------------------+-------------+
|2887628|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       34|Flinders St-Spark La|          300|
|2887629|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       39|        Alfred Place|          604|
|2887630|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       37|     Lygon St (East)|          216|
|2887631|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       40|Lonsdale St-Sprin...|          627|
|2887632|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       36|     Queen St (West)|          774|
+-------+-------------------+----+--------+-----+------+----+---------+--------------------+-------------+
only showing top 5 rows



### Create an additional column “above_threshold”

In [6]:
from pyspark.sql.functions import when

# Creating the column named above_threshold based on hourly counts
pedestrian_df = pedestrian_df.withColumn("above_threshold", when(pedestrian_df["Hourly_Counts"] >= 2000, 1.00).otherwise(0.00))

In [7]:
pedestrian_df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Date_Time: timestamp (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: string (nullable = true)
 |-- Mdate: integer (nullable = true)
 |-- Day: string (nullable = true)
 |-- Time: integer (nullable = true)
 |-- Sensor_ID: integer (nullable = true)
 |-- Sensor_Name: string (nullable = true)
 |-- Hourly_Counts: integer (nullable = true)
 |-- above_threshold: double (nullable = false)



In [8]:
pedestrian_df.show(5)

+-------+-------------------+----+--------+-----+------+----+---------+--------------------+-------------+---------------+
|     ID|          Date_Time|Year|   Month|Mdate|   Day|Time|Sensor_ID|         Sensor_Name|Hourly_Counts|above_threshold|
+-------+-------------------+----+--------+-----+------+----+---------+--------------------+-------------+---------------+
|2887628|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       34|Flinders St-Spark La|          300|            0.0|
|2887629|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       39|        Alfred Place|          604|            0.0|
|2887630|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       37|     Lygon St (East)|          216|            0.0|
|2887631|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       40|Lonsdale St-Sprin...|          627|            0.0|
|2887632|2019-11-01 17:00:00|2019|November|    1|Friday|  17|       36|     Queen St (West)|          774|            0.0|
+-------+-------

## Exploring the data

### Show basic statistics

In [9]:
# Creating list to store columns based on type
df_types = pedestrian_df.dtypes
num_cols = []
# Store columns of type integer
for x,y in df_types:
    if y == "int":
        num_cols.append(x)

        # Using describe to get the basic statistics of numerical datas
basic_statistics = pedestrian_df[num_cols]
basic_stats = basic_statistics.describe()
stats = basic_statistics.collect()

In [10]:
import numpy as np
from pyspark.sql import Row

# Creating lists for each numerical columns
ID = []
Year = []
Mdate = []
Time = []
Sensor = []
Hourly_Count = []

for row in stats:
    ID.append(row["ID"])
    Year.append(row["Year"])
    Mdate.append(row["Mdate"])
    Time.append(row["Time"])
    Sensor.append(row["Sensor_ID"])
    Hourly_Count.append(row["Hourly_Counts"])

percentiles = [25, 50, 75]
perc_list = []

# using the numpy package, calculate the percentile for numerical columns
for perc in percentiles:
    id_per = np.percentile(ID, perc)
    yr_per = np.percentile(Year, perc)
    mdt_per = np.percentile(Mdate, perc)
    tim_per = np.percentile(Time, perc)
    sen_per = np.percentile(Sensor, perc)
    hco_per = np.percentile(Hourly_Count, perc)
    perc_list.append(Row((str(perc)+"%"), str(id_per), str(yr_per), str(mdt_per), str(tim_per), str(sen_per), str(hco_per)))
    
percSchema = StructType([       
    StructField('summary', StringType(), True),
    StructField('ID', StringType(), True),
    StructField('Year', StringType(), True),
    StructField('Mdate', StringType(), True),
    StructField('Time', StringType(), True),
    StructField('Sensor_ID', StringType(), True),
    StructField('Hourly_Counts', StringType(), True)
])

perc_df = spark.createDataFrame(data=perc_list, schema = percSchema)

# Union the percentile dataframe with the result from .describe() method
statistics = basic_stats.union(perc_df)
statistics.toPandas()

Unnamed: 0,summary,ID,Year,Mdate,Time,Sensor_ID,Hourly_Counts
0,count,3435106.0,3435106.0,3435106.0,3435106.0,3435106.0,3435106.0
1,mean,1717553.5,2016.003233088004,15.751918863639142,11.459955238644746,22.978422791028866,560.7805942524044
2,stddev,991629.8312350252,3.1237869143646275,8.79918757461428,6.943473866829414,16.229792156265397,809.9942576353371
3,min,1.0,2009.0,1.0,0.0,1.0,0.0
4,max,3435106.0,2020.0,31.0,23.0,71.0,15979.0
5,25%,858777.25,2014.0,8.0,5.0,9.0,50.0
6,50%,1717553.5,2016.0,16.0,11.0,19.0,210.0
7,75%,2576329.75,2019.0,23.0,17.0,34.0,721.0


### Show the count of above threshold column

In [11]:
above = pedestrian_df.filter(pedestrian_df["above_threshold"] == "1").count()
below = pedestrian_df.filter(pedestrian_df["above_threshold"] == "0").count()

print("These are the count for above and below threshold values:")
print("Above: ", str(above))
print("Below: ", str(below))

These are the count for above and below threshold values:
Above:  250942
Below:  3184164


It seems that there is a class imbalance going on. The dataset is skewed such that the above to below ratio is roughly 1:12. In a imbalanced/skewed datset, the machine learning model might overfit to the class that is more respresented in our data set, in this case, the "below" threshold. This might cause the machine learning model to fail when placed into real life usage. For example, in our case, the model can just keep predicting that the hourly count will be below threshold everytime and get a good accuracy score as the chances of hourly counts being above threshold is low. However this same model might fail when its actually placed in "real world" situation where it might be expected to deal with well balanced real time data.

# Feature extraction and ML training

## Preparing Spark ML Transformers/Estimators for features, labels and models

In [12]:
from pyspark.sql.functions import month, dayofweek, weekofyear

# DDing the additional columns for month, day of week and week of year
pedestrian_df = pedestrian_df.withColumn("Month", month(pedestrian_df["Date_Time"]))
pedestrian_df = pedestrian_df.withColumn("day_of_week", dayofweek(pedestrian_df["Date_Time"]))
pedestrian_df = pedestrian_df.withColumn("week_of_year", weekofyear(pedestrian_df["Date_Time"]))

In [13]:
pedestrian_df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Date_Time: timestamp (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Mdate: integer (nullable = true)
 |-- Day: string (nullable = true)
 |-- Time: integer (nullable = true)
 |-- Sensor_ID: integer (nullable = true)
 |-- Sensor_Name: string (nullable = true)
 |-- Hourly_Counts: integer (nullable = true)
 |-- above_threshold: double (nullable = false)
 |-- day_of_week: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)



In [14]:
pedestrian_df["Month", "day_of_week", "week_of_year"].show(5)

+-----+-----------+------------+
|Month|day_of_week|week_of_year|
+-----+-----------+------------+
|   11|          6|          44|
|   11|          6|          44|
|   11|          6|          44|
|   11|          6|          44|
|   11|          6|          44|
+-----+-----------+------------+
only showing top 5 rows



In [15]:
# Filtering data between 2014 to 2019 only for hours between 9am to 11pm
# Filter from 2014 to 2019 first
pedes_df_14_19 = pedestrian_df.filter((pedestrian_df["Year"] >= 2014) & (pedestrian_df["Year"] <= 2019))

#Filter from 9am to 11pm
pedes_df_14_19 = pedes_df_14_19.filter((pedes_df_14_19["Time"] >= 9) & (pedes_df_14_19["Time"] <= 23))

pedes_df_14_19.describe().toPandas()

Unnamed: 0,summary,ID,Year,Month,Mdate,Day,Time,Sensor_ID,Sensor_Name,Hourly_Counts,above_threshold,day_of_week,week_of_year
0,count,1388451.0,1388451.0,1388451.0,1388451.0,1388451,1388451.0,1388451.0,1388451,1388451.0,1388451.0,1388451.0,1388451.0
1,mean,1856380.176237404,2016.7631367617585,6.626148852210124,15.736916174931633,,15.999957506602682,24.93814185736479,,855.0832171967178,0.1195202423420055,4.00004969566805,27.01301594366672
2,stddev,641221.2897014186,1.6902188308840698,3.4430627097752864,8.798087424359263,,4.320488603233932,15.201145123230535,,906.5378513311168,0.3243997993306639,2.0005723183741915,15.037502299412996
3,min,745919.0,2014.0,1.0,1.0,Friday,9.0,1.0,Alfred Place,0.0,0.0,1.0,1.0
4,max,2966838.0,2019.0,12.0,31.0,Wednesday,23.0,62.0,Webb Bridge,15979.0,1.0,7.0,53.0


### Fine tuning for best parameters

In [16]:
# Fine tuning parameters, this might take some time to run

# from pyspark.ml.tuning import ParamGridBuilder, CrossValidator,CrossValidatorModel
# from pyspark.ml.evaluation import BinaryClassificationEvaluator
# # Create ParamGrid for Cross Validation

# (train, test) = pedes_df_14_19.randomSplit([0.8, 0.2], seed=202)

# dt_pipeline = Pipeline(stages=[encoder, assembler, dt])
# train = train.withColumnRenamed('above_threshold', 'label')

# # hello = trans_pipeline.fit(train)

# dtparamGrid = (ParamGridBuilder()
#              .addGrid(dt.maxDepth, [1, 2, 3])
#              .addGrid(dt.maxBins, [2, 3, 5])
#              .build())

# dtevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# dtcv = CrossValidator(estimator = dt_pipeline,
#                       estimatorParamMaps = dtparamGrid,
#                       evaluator = dtevaluator,
#                       numFolds = 3)

# dtcvModel = dtcv.fit(train)

# bestModel= dtcvModel.bestModel
# print(bestModel.stages)
# print('Best Param for DT: ', bestModel.stages[-1]._java_obj.paramMap())

### Write code for tranformers/estimators

In [17]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier

# Using OHE on sensorID as sensorID is unique and should play no ordinal role
in_OHE = ["Sensor_ID"]
out_OHE = ["Sensor_ID_vec"]

encoder = OneHotEncoder(inputCols=in_OHE, outputCols=out_OHE).setHandleInvalid("keep")

in_features_vec = ["Month", "Mdate", "week_of_year", "day_of_week", "Time"] + out_OHE

assembler = VectorAssembler(inputCols = in_features_vec, outputCol = "features")

# Creating ML model estimators for Three ML methods, Logistic Regression, Decision Tree, Random Forest
lr = LogisticRegression(labelCol='label', featuresCol='features', maxIter=100, regParam = 0.0001)
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 4, maxBins = 3)
rf = RandomForestClassifier(labelCol='label', featuresCol='features', numTrees=4)

### Pipelines


In [18]:
from pyspark.ml import Pipeline

# Chain indexers and tree in a Pipeline
lr_pipeline = Pipeline(stages=[encoder, assembler, lr])
dt_pipeline = Pipeline(stages=[encoder, assembler, dt])
rf_pipeline = Pipeline(stages=[encoder, assembler, rf])

## Preparing the training data and testing data

In [19]:
pedes_df_14_19 = pedes_df_14_19.withColumnRenamed('above_threshold', 'label')

# Creating train and testing data set
train = pedes_df_14_19.filter((pedes_df_14_19["Year"] >= 2014) & (pedes_df_14_19["Year"] <= 2018))
test = pedes_df_14_19.filter(pedes_df_14_19["Year"] == 2019)

## Training and evaluating models
### Train Model and  Performing predictions

In [20]:
lr_model = lr_pipeline.fit(train)
dt_model = dt_pipeline.fit(train)
rf_model = rf_pipeline.fit(train)

In [21]:
# perform predictions on test
lr_predictions = lr_model.transform(test)
dt_predictions = dt_model.transform(test)
rf_predictions = rf_model.transform(test)

###  Measure classification performance
# Logistic Regression Model Performance

In [22]:
lr_predictions = lr_predictions.withColumnRenamed('label', 'above_threshold')
lr_predictions.groupBy('above_threshold', 'prediction').count().show()

+---------------+----------+------+
|above_threshold|prediction| count|
+---------------+----------+------+
|            1.0|       1.0| 11186|
|            0.0|       1.0|  7949|
|            1.0|       0.0| 19855|
|            0.0|       0.0|246542|
+---------------+----------+------+



In [23]:
from pyspark.mllib.evaluation import MulticlassMetrics

for_eval_lr = lr_predictions.select("prediction", "above_threshold").collect()

pre_and_label_lr = sc.parallelize(for_eval_lr)

lr_metrics = MulticlassMetrics(pre_and_label_lr)

print("Logistic Regression Model statistics:")
print("Accuracy: ", str(lr_metrics.accuracy))
print("Recall: ", str(lr_metrics.recall(1.0)))
print("Precision: ", str(lr_metrics.precision(1.0)))

Logistic Regression Model statistics:
Accuracy:  0.9026238740316321
Recall:  0.36036210173641314
Precision:  0.5845832244577999


# Decision Tree Model Performance

In [24]:
dt_predictions = dt_predictions.withColumnRenamed('label', 'above_threshold')
dt_predictions.groupBy('above_threshold', 'prediction').count().show()

+---------------+----------+------+
|above_threshold|prediction| count|
+---------------+----------+------+
|            1.0|       1.0|  4549|
|            0.0|       1.0|  2421|
|            1.0|       0.0| 26492|
|            0.0|       0.0|252070|
+---------------+----------+------+



In [25]:
for_eval_dt = dt_predictions.select("prediction", "above_threshold").collect()

pre_and_label_dt = sc.parallelize(for_eval_dt)

dt_metrics = MulticlassMetrics(pre_and_label_dt)

print("Decision Tree Model statistics:")
print("Accuracy: ", str(dt_metrics.accuracy))
print("Recall: ", str(dt_metrics.recall(1.0)))
print("Precision: ", str(dt_metrics.precision(1.0)))

Decision Tree Model statistics:
Accuracy:  0.8987398960536822
Recall:  0.14654811378499405
Precision:  0.6526542324246772


# Random Forest Model Performance

In [26]:
rf_predictions = rf_predictions.withColumnRenamed('label', 'above_threshold')
rf_predictions.groupBy('above_threshold', 'prediction').count().show()

+---------------+----------+------+
|above_threshold|prediction| count|
+---------------+----------+------+
|            1.0|       1.0|  4300|
|            0.0|       1.0|  2270|
|            1.0|       0.0| 26741|
|            0.0|       0.0|252221|
+---------------+----------+------+



In [27]:
for_eval_rf = rf_predictions.select("prediction", "above_threshold").collect()

pre_and_label_rf = sc.parallelize(for_eval_rf)

rf_metrics = MulticlassMetrics(pre_and_label_rf)

print("Random Forest Model statistics:")
print("Accuracy: ", str(rf_metrics.accuracy))
print("Recall: ", str(rf_metrics.recall(1.0)))
print("Precision: ", str(rf_metrics.precision(1.0)))

Random Forest Model statistics:
Accuracy:  0.8983966770799771
Recall:  0.138526464997906
Precision:  0.654490106544901


# Thoughts and Persisting the best model
According to the metrics, the better model in my case appears to be the logistic regression model. The overall scores for the logistic regression metric seems to be higher than the others. For random forest, it has a high accuracy score, but the recall and precision score is lower comapred to the rest. As for the Decision Tree metric, its score are similar to that of Logistic Regression, but the recall score is around 2 times lower than that of logistic regression. Logistic Regression metric might not have the best performance for some of the metric scores, but on average it seems to have a better overall score when compared to the rest. Therefore, the logistic regression model is persisted.

In [28]:
lr_model.write().overwrite().save('pedestrian_count_logistic_regression_models')

from pyspark.ml import PipelineModel
pipelineModel = PipelineModel.load('pedestrian_count_logistic_regression_models')
print(pipelineModel.stages[-1]._java_obj.paramMap())

{
	LogisticRegression_8fe6a33dbe10-featuresCol: features,
	LogisticRegression_8fe6a33dbe10-labelCol: label,
	LogisticRegression_8fe6a33dbe10-maxIter: 100,
	LogisticRegression_8fe6a33dbe10-regParam: 1.0E-4
}


## Write code to print out the leaf node splitting criteria

In [29]:
dtModel = dt_pipeline.fit(train)
va = dtModel.stages[-2]
tree = dtModel.stages[-1]

## Print leaf node splitting Criteria

In [30]:
print(tree.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1f47bacc6ece, depth=4, numNodes=17, numClasses=2, numFeatures=64
  If (feature 43 in {1.0})
   If (feature 4 <= 18.5)
    Predict: 1.0
   Else (feature 4 > 18.5)
    If (feature 3 <= 5.5)
     If (feature 0 <= 4.5)
      Predict: 1.0
     Else (feature 0 > 4.5)
      Predict: 0.0
    Else (feature 3 > 5.5)
     Predict: 1.0
  Else (feature 43 not in {1.0})
   If (feature 9 in {1.0})
    If (feature 4 <= 18.5)
     Predict: 1.0
    Else (feature 4 > 18.5)
     Predict: 0.0
   Else (feature 9 not in {1.0})
    If (feature 8 in {1.0})
     If (feature 4 <= 18.5)
      Predict: 1.0
     Else (feature 4 > 18.5)
      Predict: 0.0
    Else (feature 8 not in {1.0})
     Predict: 0.0



## Print top 3 features and their corresponding feature importance

In [31]:
top_3_feats = list(zip(va.getInputCols(), tree.featureImportances))

top_3_feats = sorted(top_3_feats, key=lambda tup:(-tup[1], tup[0]))

for i in range(3):
    print("Feature", str(i+1), ",", top_3_feats[i][0], ": " , str(top_3_feats[i][1]))

Feature 1 , Time :  0.15118188175825525
Feature 2 , day_of_week :  0.009186798183601566
Feature 3 , Month :  0.000793353004851594
