<h1>Random Forest - Average Driver Pay</h1>

In [1]:
from tensorflow import keras
from tensorflow.keras.layers import Dense, Normalization

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import percent_rank
from pyspark.sql import Window
from pyspark.ml.tuning import CrossValidator
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import sum,avg,max,min,mean,count
import numpy as np
import pandas as pd

In [3]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [None]:
# Session-level Spark options, applied in the order listed:
#   - eager evaluation so DataFrames render directly in the notebook
#   - parquet metadata caching on, vectorized parquet reader off
#   - 8g of driver memory
#   - UTC as the SQL session time zone
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.driver.memory": "8g",
    "spark.sql.parquet.enableVectorizedReader": False,
    "spark.sql.session.timeZone": "Etc/UTC",
}

session_builder = SparkSession.builder.appName("MAST30034 Tutorial 1")
for option_key, option_value in spark_options.items():
    session_builder = session_builder.config(option_key, option_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

22/08/16 23:36:53 WARN Utils: Your hostname, Sens-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.0.11 instead (on interface en0)
22/08/16 23:36:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/16 23:36:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Read the curated, combined dataset (parquet) into a Spark DataFrame.
combined_data_path = '../data/curated/combined_data'
sdf = spark.read.parquet(combined_data_path)

In [None]:
# Collapse the rows into one record per (Date, Hour, PU_Location_ID): the
# weather readings and the driver pay are averaged, and the rows in each group
# are counted.
# NOTE(review): Day_of_week is averaged too, which presumably relies on it
# being constant within a Date -- confirm.
group_keys = ['Date', 'Hour', 'PU_Location_ID']
sdf = sdf.groupBy(*group_keys).agg(
    F.avg("Temperature_C").alias("Temperature_C"),
    F.avg("Humidity_%").alias("Humidity_%"),
    F.avg("Speed_kmh").alias("Speed_kmh"),
    F.avg("Precip_Rate_mm").alias("Precip_rate_mm"),
    F.avg("Driver_pay").alias("Avg_driver_pay"),
    F.avg("Day_of_week").alias("Day_of_week"),
    # Count rows, not a data column: count(<column>) skips nulls, so counting
    # Temperature_C would under-report trips whose temperature is missing.
    F.count(F.lit(1)).alias("Num_trips"),
)

In [None]:
# Arrange the aggregated rows by Date, then PU_Location_ID, then Hour.
sort_keys = ['Date', 'PU_Location_ID', 'Hour']
sdf = sdf.sort(*sort_keys)

In [None]:
# Every aggregated column is a model input except the date, the column later
# used as the regression label (Avg_driver_pay) and the trip count.
non_feature_columns = {'Date', 'Avg_driver_pay', 'Num_trips'}
feature_list = [name for name in sdf.columns if name not in non_feature_columns]

# Pack the selected columns into the single vector column "features".
assembler = VectorAssembler(inputCols=feature_list, outputCol="features")

In [None]:
# Discard rows containing any null before assembling the feature vector.
complete_rows_sdf = sdf.dropna('any')
model_sdf = assembler.transform(complete_rows_sdf)

In [None]:
# Index the assembled feature vector. With maxCategories=270, features having
# at most 270 distinct values are treated as categorical. The indexer is fitted
# on all assembled rows (before the train/test split).
indexer_stage = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=270,
)
featureIndexer = indexer_stage.fit(model_sdf)

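Split the data after ordering it by date, pick-up location and hour (first 80% of rows for training, remaining 20% for testing) so that the split is identical to the one used for the neural network models.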

In [None]:
# One unpartitioned window over all rows, ordered by Date, PU_Location_ID and
# Hour; percent_rank() then gives each row its relative position in [0, 1].
ordered_window = Window.partitionBy().orderBy('Date', 'PU_Location_ID', 'Hour')
model_sdf = model_sdf.withColumn("rank", percent_rank().over(ordered_window))

# Rows ranked at or below 0.8 form the training set, the rest the test set.
# The helper "rank" column is removed from both.
train_sdf = model_sdf.filter("rank <= .8").drop("rank")
test_sdf = model_sdf.filter("rank > .8").drop("rank")

In [None]:
# Fixed seed so the forest's random sampling -- and therefore the fitted model
# and its metrics -- is the same on every run.
RF_SEED = 42

# maxBins mirrors the maxCategories value (270) used when indexing features.
# NOTE(review): Spark requires maxBins to be at least the number of categories
# of every categorical feature -- confirm 270 covers the largest one.
rf = RandomForestRegressor(
    featuresCol="indexedFeatures",
    labelCol='Avg_driver_pay',
    maxBins=270,
    seed=RF_SEED,
)

# featureIndexer is already fitted, so the pipeline applies it as-is and only
# the random forest is trained when the pipeline is fitted.
pipeline = Pipeline(stages=[featureIndexer, rf])

In [None]:
# Train the pipeline (feature indexing + random forest) on the training rows.
model = pipeline.fit(dataset=train_sdf)

In [None]:
# Score the held-out rows, then bring only the predicted values back to the
# driver as a one-column pandas DataFrame.
scored_test_sdf = model.transform(test_sdf)
predictions = scored_test_sdf.select('prediction').toPandas()

In [None]:
# Collect the observed label values of the held-out rows as a one-column
# pandas DataFrame.
label_column = 'Avg_driver_pay'
y_test = test_sdf.select(label_column).toPandas()

In [None]:
# Residuals: predicted minus observed, converted to a plain NumPy array.
# NOTE(review): this pairs rows by position, so it assumes predictions and
# y_test were collected in the same row order -- confirm.
predicted_values = np.array(predictions)
errors = np.array(predicted_values - y_test)
squared_errors = np.square(errors)

# Mean of the squared residuals.
mean_squared_error = squared_errors.mean()

print(f'MSE: {mean_squared_error}')

In [None]:
# R^2 = 1 - SS_res / SS_tot, where SS_tot is the squared deviation of the
# observed values from their mean and SS_res is the sum of squared errors.
centered_targets = np.array(y_test - y_test.mean())
tot_sum_squares = np.square(centered_targets).sum()
residual_sum_squares = squared_errors.sum()
r2 = 1 - (residual_sum_squares / tot_sum_squares)
print(f'Model R^2: {r2:.4f}')

<h3>Save predictions for further analysis</h3>

In [None]:
# Write the predicted values to CSV (the DataFrame index is written as well).
predictions_csv_path = '../data/curated/model_data/avg_driver_pay_pred_rf.csv'
predictions_frame = pd.DataFrame(predictions)
predictions_frame.to_csv(predictions_csv_path)