# Hyperparameter tuning

## Spark

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Apache_Spark_logo.svg/1280px-Apache_Spark_logo.svg.png" width="400">

**Hardware**: 10 nodes, each an r5.8xlarge (32 CPUs, 256 GB RAM)

In [2]:
from ml_utils import MLUtils

# Project helper identifying this run (task / tool / model). The cells below
# read the data path, feature lists, grid-search values, and the timing and
# metric-logging helpers from it.
ml_utils = MLUtils(ml_task='tip', tool='spark', model='elastic_net')

# Load data and feature engineering

In [3]:
# NOTE(review): numpy, datetime, F and T are not referenced by any code cell
# visible in this notebook; they are left in place in case other cells use them.
import numpy as np
import datetime
# findspark.init() is called before the pyspark imports below; it is expected to
# make the local Spark installation importable, so keep this ordering.
import findspark
findspark.init()

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T


# Attach to an existing Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [4]:
%%time
# Load the training sample from parquet; count() is an action, so this is
# where Spark actually runs a job against the data.
train_path = f'{ml_utils.taxi_path}/data/ml/tip_train_sample'
tip_train = spark.read.parquet(train_path)
tip_train.count()

CPU times: user 3.09 ms, sys: 318 µs, total: 3.41 ms
Wall time: 9.37 s


10994647

In [5]:
# Show a single row to check the column names and value types.
tip_train.head()

Row(id='eba77ab3c8b746838627233a930715b8', pickup_datetime=datetime.datetime(2016, 7, 16, 18, 24, 38), dropoff_datetime=datetime.datetime(2016, 7, 16, 18, 28, 48), pickup_taxizone_id=230.0, dropoff_taxizone_id=162.0, pickup_weekday=5, pickup_weekofyear=28, pickup_hour=18, pickup_minute=24, pickup_week_hour=138, passenger_count=1.0, tip_fraction=0.23333333333333334)

<br>

Let's take the same sample we used in the single-node scikit-learn example

In [6]:
# Draw a 10% subsample without replacement, using a fixed seed, and report
# how many rows it contains.
sample = tip_train.sample(
    withReplacement=False,
    fraction=0.1,
    seed=42,
)
sample.count()

1098005

# Run grid search

In [7]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.pipeline import Pipeline

features = ml_utils.tip_vars.features
y_col = ml_utils.tip_vars.y_col

# Categorical features: index each column, then one-hot encode the index.
# handleInvalid='keep' sends categories not seen during fit to an extra index
# instead of failing at transform time (e.g. on the test set).
indexers = [
    StringIndexer(inputCol=c, outputCol=f'{c}_idx', handleInvalid='keep')
    for c in ml_utils.tip_vars.categorical_feat
]
encoders = [
    OneHotEncoder(
        inputCol=f'{c}_idx',
        outputCol=f'{c}_onehot',
    )
    for c in ml_utils.tip_vars.categorical_feat
]

# Numeric features: assemble into one vector, then scale that vector.
num_assembler = VectorAssembler(
    inputCols=ml_utils.tip_vars.numeric_feat,
    outputCol='num_features',
)
scaler = StandardScaler(inputCol='num_features', outputCol='num_features_scaled')

# Final feature vector: all one-hot columns followed by the scaled numerics.
assembler = VectorAssembler(
    inputCols=[f'{c}_onehot' for c in ml_utils.tip_vars.categorical_feat] + ['num_features_scaled'],
    outputCol='features',
)

# The numeric inputs are already scaled by the StandardScaler stage, so the
# regression's built-in standardization is switched off.
# featuresCol / labelCol are spelled out (they equal the defaults) to make the
# dependency on the 'label' column, added to `sample` before fitting, visible.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    standardization=False,
    maxIter=100,
)
pipeline = Pipeline(
    stages=indexers + encoders + [num_assembler, scaler, assembler, lr])

# The grid values come from ml_utils; the keys use scikit-learn-style names
# ('clf__l1_ratio', 'clf__alpha'), mapped here onto Spark's elasticNetParam
# and regParam respectively.
params = ml_utils.tip_vars.elastic_net_grid_search_params
grid = (
    ParamGridBuilder()
    .addGrid(lr.elasticNetParam, params['clf__l1_ratio'])
    .addGrid(lr.regParam, params['clf__alpha'])
    .build()
)

# 3-fold cross-validation over the whole pipeline, scored by RMSE on 'label'
# (the evaluator arguments equal RegressionEvaluator's defaults, stated
# explicitly). The seed pins the fold assignment so repeated runs compare
# hyperparameters on the same splits; 42 matches the seed used for sampling.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=grid,
    evaluator=RegressionEvaluator(
        labelCol='label',
        predictionCol='prediction',
        metricName='rmse',
    ),
    numFolds=3,
    seed=42,
)

In [8]:
# Expose the target under the name 'label' (the column the regression and the
# cross-validation evaluator read), then mark the frame for caching.
target_col = sample[y_col]
sample = sample.withColumn('label', target_col)
sample.cache()

DataFrame[id: string, pickup_datetime: timestamp, dropoff_datetime: timestamp, pickup_taxizone_id: double, dropoff_taxizone_id: double, pickup_weekday: bigint, pickup_weekofyear: bigint, pickup_hour: bigint, pickup_minute: bigint, pickup_week_hour: bigint, passenger_count: double, tip_fraction: double, label: double]

In [9]:
%%time
# Run the cross-validated grid search on the cached sample.
# time_fit() is a project helper; presumably it records the elapsed fit time
# that later appears as fit_seconds in the metrics table — verify in ml_utils.
with ml_utils.time_fit():
    fitted = crossval.fit(sample)

CPU times: user 1min 50s, sys: 27.5 s, total: 2min 17s
Wall time: 48min 9s


In [35]:
# The regression is the last pipeline stage, so the last stage of the best
# model is the fitted regression. Its hyperparameters are read through the
# wrapped JVM object.
# NOTE(review): _java_obj is a private attribute; newer PySpark versions may
# expose these getters on the Python model directly — confirm before changing.
best_lr_jvm = fitted.bestModel.stages[-1]._java_obj
print(f'regParam: {best_lr_jvm.getRegParam()}')
print(f'elasticNetParam: {best_lr_jvm.getElasticNetParam()}')

regParam: 0.5
elasticNetParam: 0.0


## Predict on test set

In [15]:
# Load the held-out test set and score it with the fitted cross-validator
# model, which applies the feature stages and the regression.
test_path = f'{ml_utils.taxi_path}/data/ml/tip_test'
tip_test = spark.read.parquet(test_path)
preds = fitted.transform(tip_test)

In [38]:
# Inspect one scored row: the original columns plus the intermediate feature
# columns and the final 'prediction'.
preds.head()

Row(id='11d37225291e4ba4aa7eebad9f7b272a', pickup_datetime=datetime.datetime(2019, 9, 9, 10, 19, 44), dropoff_datetime=datetime.datetime(2019, 9, 9, 10, 31, 26), pickup_taxizone_id=162.0, dropoff_taxizone_id=170.0, pickup_weekday=0, pickup_weekofyear=37, pickup_hour=10, pickup_minute=19, pickup_week_hour=10, passenger_count=1.0, tip_fraction=0.11764705882352941, pickup_taxizone_id_idx=3.0, dropoff_taxizone_id_idx=3.0, pickup_taxizone_id_onehot=SparseVector(253, {3: 1.0}), dropoff_taxizone_id_onehot=SparseVector(258, {3: 1.0}), num_features=DenseVector([0.0, 37.0, 10.0, 10.0, 19.0, 1.0]), num_features_scaled=DenseVector([0.0, 2.461, 1.5954, 0.2166, 1.0963, 0.7996]), features=SparseVector(517, {3: 1.0, 256: 1.0, 512: 2.461, 513: 1.5954, 514: 0.2166, 515: 1.0963, 516: 0.7996}), prediction=0.286677157032011)

In [17]:
# Predictions go under a folder named <task>__<tool>__<model>.
run_name = f'{ml_utils.ml_task}__{ml_utils.tool}__{ml_utils.model}'
path = f'{ml_utils.taxi_path}/ml_results/predictions/{run_name}'
path

's3://saturn-titan/nyc-taxi/ml_results/predictions/tip__spark__elastic_net'

In [37]:
# Preview id / actual / predicted for the first five rows.
preds.select(
    preds.id,
    preds[y_col].alias('actual'),
    preds.prediction.alias('predicted'),
).show(5)

+--------------------+-------------------+-------------------+
|                  id|             actual|          predicted|
+--------------------+-------------------+-------------------+
|11d37225291e4ba4a...|0.11764705882352941|  0.286677157032011|
|8bc943e88a654bb2a...| 0.2168421052631579|0.29062283606559014|
|3431b20f16c5456aa...|               0.15|0.29266696956827093|
|52cf7e80dd104f84a...|            0.10625|0.23165384510777337|
|e0ae8ce6dbff40b19...|                0.0|0.23370741592474725|
+--------------------+-------------------+-------------------+
only showing top 5 rows



In [19]:
%%time
# Persist only id / actual / predicted, replacing any earlier output at `path`.
pred_columns = [
    preds.id,
    preds[y_col].alias('actual'),
    preds.prediction.alias('predicted'),
]
preds.select(*pred_columns).write.parquet(path, mode='overwrite')

CPU times: user 7.7 ms, sys: 0 ns, total: 7.7 ms
Wall time: 6.14 s


In [36]:
# Score the test-set predictions against the original target column (the test
# frame has no 'label' column, so labelCol points at y_col), then hand the
# RMSE to the project's metric logger, whose return value is displayed.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol=y_col,
    predictionCol="prediction",
)
rmse = evaluator.evaluate(preds)
ml_utils.write_metric_df('rmse', rmse)

Unnamed: 0,ml_task,tool,model,metric,value,fit_seconds
0,tip,spark,elastic_net,rmse,14.763659,2889.182799
