# Gradient Boosting with Pipeline

In [1]:
import numpy as np
import pandas as pd
import pyspark
import sys

In [2]:
import pyspark.sql.functions as fn
from pyspark.sql import SparkSession

In [3]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

In [None]:
# Local mode: no explicit master is set, so Spark uses its configured default.
# Run either this cell or the YARN cell below, not both: getOrCreate() hands
# back the already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("gbt")
    .getOrCreate()
)

In [None]:
# YARN mode: run on the cluster with an explicit executor count and memory.
# The app name matches the local-mode cell so the job is identifiable as this
# GBT notebook regardless of where it runs.
# NOTE(review): 99 executors x 4G is hard-coded — confirm it suits the target cluster.
spark = (
    SparkSession.builder
    .master("yarn")
    .config('spark.executor.instances', '99')
    .config('spark.executor.memory', '4G')
    .appName("gbt")
    .getOrCreate()
)

In [4]:
# Check spark app name (shows which Spark application this kernel is attached to)
spark.sparkContext.appName

'PySparkShell'

In [5]:
# Turn on Apache Arrow for Spark <-> pandas conversions.
# NOTE(review): none of the cells visible here convert to pandas, so this
# setting looks unused in this notebook — confirm before relying on it.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [6]:
# print runtime versions so the results can be tied to an environment
# Python version
sys.version

'3.8.5 (default, Jul 28 2020, 12:59:40) \n[GCC 9.3.0]'

In [7]:
# Spark version of the active session
spark.version

'3.0.1'

### Exploring Data

In [8]:
# Load the bike-sharing hourly data (hour.csv) into a Spark DataFrame.
# The first row is the header; column types are inferred from the data.
# Alternative location when running inside the Vagrant box:
#   'file:///vagrant/data/bike/hour.csv'
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('data/bike/hour.csv')
)

In [9]:
# validate the size of data: (row count, column count)
df.count(), len(df.columns)

(17379, 17)

In [10]:
# First 5 rows of the bike-sharing dataset
df.show(5)

+-------+----------+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+------+----------+---+
|instant|    dteday|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed|casual|registered|cnt|
+-------+----------+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+------+----------+---+
|      1|2011-01-01|     1|  0|   1|  0|      0|      6|         0|         1|0.24|0.2879|0.81|      0.0|     3|        13| 16|
|      2|2011-01-01|     1|  0|   1|  1|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0|     8|        32| 40|
|      3|2011-01-01|     1|  0|   1|  2|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0|     5|        27| 32|
|      4|2011-01-01|     1|  0|   1|  3|      0|      6|         0|         1|0.24|0.2879|0.75|      0.0|     3|        10| 13|
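|      5|2011-01-01|     1|  0|   1|  4|      0|      6|         0|         1|0.24|0.2879|0.75|      0.0|     0|         1|  1|
+-------+----------+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+------+----------+---+
only showing top 5 rows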

In [11]:
# Inspect the column types that inferSchema assigned
df.printSchema()

root
 |-- instant: integer (nullable = true)
 |-- dteday: string (nullable = true)
 |-- season: integer (nullable = true)
 |-- yr: integer (nullable = true)
 |-- mnth: integer (nullable = true)
 |-- hr: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weathersit: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- casual: integer (nullable = true)
 |-- registered: integer (nullable = true)
 |-- cnt: integer (nullable = true)



### Feature Engineering

In [12]:
# Compare the two rider counts with the total: in the rows shown,
# casual + registered equals cnt, so they must not be used as features.
df.select('casual', 'registered', 'cnt').show(10)

+------+----------+---+
|casual|registered|cnt|
+------+----------+---+
|     3|        13| 16|
|     8|        32| 40|
|     5|        27| 32|
|     3|        10| 13|
|     0|         1|  1|
|     0|         1|  1|
|     2|         0|  2|
|     1|         2|  3|
|     1|         7|  8|
|     8|         6| 14|
+------+----------+---+
only showing top 10 rows



In [13]:
# Remove the row id, the date string and the two rider counts that sum to the
# label, leaving only the predictor columns plus 'cnt'.
df = df.drop('instant', 'dteday', 'casual', 'registered')

In [14]:
# Confirm the four columns are gone
df.show(10)

+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+---+
|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed|cnt|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+---+
|     1|  0|   1|  0|      0|      6|         0|         1|0.24|0.2879|0.81|      0.0| 16|
|     1|  0|   1|  1|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0| 40|
|     1|  0|   1|  2|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0| 32|
|     1|  0|   1|  3|      0|      6|         0|         1|0.24|0.2879|0.75|      0.0| 13|
|     1|  0|   1|  4|      0|      6|         0|         1|0.24|0.2879|0.75|      0.0|  1|
|     1|  0|   1|  5|      0|      6|         0|         2|0.24|0.2576|0.75|   0.0896|  1|
|     1|  0|   1|  6|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0|  2|
|     1|  0|   1|  7|      0|      6|         0|         1| 0.2|0.2576|0.86|      0.0|  3|

In [15]:
# Schema after dropping columns: 12 predictors followed by the label 'cnt'
df.printSchema()

root
 |-- season: integer (nullable = true)
 |-- yr: integer (nullable = true)
 |-- mnth: integer (nullable = true)
 |-- hr: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weathersit: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- cnt: integer (nullable = true)



### Split Data - Train & Test sets

In [16]:
# Randomly split the data 70% / 30% into training and test sets;
# the fixed seed keeps the split repeatable.
train_df, test_df = df.randomSplit([0.70, 0.30], seed=42)

In [17]:
# Training set size: (rows, columns)
train_df.count(), len(train_df.columns)

(12234, 13)

In [18]:
# Test set size: (rows, columns)
test_df.count(), len(test_df.columns)

(5145, 13)

### Build GBT Regression Model using Pipeline 

In [19]:
# Every column except the label is a feature. Selecting by name (rather than
# slicing off the last column) keeps this correct if the column order changes.
featuresCols = [col for col in df.columns if col != "cnt"]

In [20]:
# Display the feature column names that feed the assembler
featuresCols

['season',
 'yr',
 'mnth',
 'hr',
 'holiday',
 'weekday',
 'workingday',
 'weathersit',
 'temp',
 'atemp',
 'hum',
 'windspeed']

In [21]:
# Pack the individual feature columns into one vector column named "features"
vectorAssembler = VectorAssembler(
    outputCol="features",
    inputCols=featuresCols,
)

In [22]:
# Gradient-boosted tree regressor: reads the assembled "features" vector
# (the library default, spelled out here) and learns to predict "cnt"
gbt = GBTRegressor(featuresCol="features", labelCol="cnt")

In [23]:
# Hyper-parameter grid: 2 tree depths x 2 iteration counts = 4 candidates
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [2, 5])
    .addGrid(gbt.maxIter, [10, 100])
    .build()
)

In [24]:
# Evaluation metric: root-mean-squared error between the true label and the
# model's prediction. The column names are taken from the regressor so the
# evaluator always agrees with it.
label_col = gbt.getLabelCol()
prediction_col = gbt.getPredictionCol()
evaluator = RegressionEvaluator(
    predictionCol=prediction_col,
    labelCol=label_col,
    metricName="rmse",
)

In [25]:
# Declare the CrossValidator, which runs model tuning for us.
# numFolds is spelled out (3 is the library default) and the seed is fixed so
# the fold assignment — and therefore the selected model — is repeatable.
cv = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

In [26]:
# Chain the stages into a single Pipeline: first assemble the feature vector,
# then run cross-validated GBT training on it.
pipeline_stages = [vectorAssembler, cv]
pipeline = Pipeline(stages=pipeline_stages)

In [27]:
%%time
# train the Pipeline: assembles the features, then cross-validates the
# 4-candidate GBT grid on the training set (the slow step; timed by %%time)
pipelineModel = pipeline.fit(train_df)

CPU times: user 386 ms, sys: 148 ms, total: 534 ms
Wall time: 2min 41s


In [28]:
# Tree depth chosen by cross-validation. stages[1] is the fitted
# CrossValidator; the public getter avoids the private _java_obj handle.
pipelineModel.stages[1].bestModel.getMaxDepth()

5

In [29]:
# Number of boosting iterations chosen by cross-validation. stages[1] is the
# fitted CrossValidator; the public getter avoids the private _java_obj handle.
pipelineModel.stages[1].bestModel.getMaxIter()

100

### Evaluate Model

In [30]:
# make predictions on test data (runs the assembler, then the best model
# selected by cross-validation)
predictions = pipelineModel.transform(test_df)

In [31]:
# Put true counts next to predicted counts for the first 10 test rows,
# printing the values untruncated
predictions.select('cnt', 'prediction').show(10, truncate=False)

+---+------------------+
|cnt|prediction        |
+---+------------------+
|33 |33.19305689960539 |
|5  |9.287642994994783 |
|7  |16.7675170064617  |
|12 |9.652341195079874 |
|7  |7.555943059568435 |
|17 |15.845011568803482|
|3  |16.058875542147785|
|14 |12.691276974128776|
|9  |11.87620282915386 |
|13 |26.12295702251729 |
+---+------------------+
only showing top 10 rows



In [32]:
# RMSE on the held-out test set, using the same evaluator as cross-validation
rmse = evaluator.evaluate(predictions)

In [33]:
# Display the test RMSE (in the same units as 'cnt')
rmse

44.073736367721025