# Gold - Machine Learning

## 0. Setup

In [1]:
import findspark

import shared

findspark.init()

In [2]:
from pyspark import StorageLevel
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

from utils import setup_spark

# .master("spark://group-11:7077")
spark = (
    SparkSession.builder.appName("DAT535-Taxi-Trips")
    .master("spark://group-11:7077")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "1g")
    .config("spark.log.level", "WARN")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/28 22:39:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "WARN".


In [3]:
from schemas import schema_merged_silver
from shared import PATH_SILVER_MERGED

FARE_AMOUNT_MIN, FARE_AMOUNT_MAX = 1, 1000
SPEED_MPH_MIN = 1
TIP_AMOUNT_MIN, TIP_AMOUNT_MAX = 1, 1000
DISTANCE_MILES_MIN, DISTANCE_MILES_MAX = 1, 1000
DURATION_MIN, DURATION_MAX = 1, 1000

df = spark.read.schema(schema_merged_silver).parquet(PATH_SILVER_MERGED)


## 1. Linear Regression

In [4]:


df_regression = df.filter(
    (f.col("payment_type") == shared.PaymentType.CREDIT_CARD.value)
    & (f.col("fare_amount").between(FARE_AMOUNT_MIN, FARE_AMOUNT_MAX))
    & (f.col("speed_mph") >= SPEED_MPH_MIN)
    & (f.col("tip_amount").between(TIP_AMOUNT_MIN, TIP_AMOUNT_MAX))
    & (f.col("distance_miles").between(DISTANCE_MILES_MIN, DISTANCE_MILES_MAX))
    & (f.col("duration_minutes").between(DURATION_MIN, DURATION_MAX))
).select("city", "fare_amount", "speed_mph", "tip_amount")

In [5]:
train, test = df_regression.randomSplit([80.0, 20.0], seed=42)
train, test = train.cache(), test.cache()

In [20]:
stages = Pipeline(
    stages=[
        StringIndexer(inputCols=["city"], outputCols=["cityIndex"]),
        OneHotEncoder(inputCols=["cityIndex"], outputCols=["cityVec"]),
        VectorAssembler(inputCols=["cityVec", "fare_amount", "speed_mph"], outputCol="features"),
        StandardScaler(inputCol="features", outputCol="featuresScaler"),
        LinearRegression(
            maxIter=50, regParam=0.1, elasticNetParam=0.5, labelCol="tip_amount", featuresCol="featuresScaler"
        ),
    ]
)

In [21]:
# Training
model = stages.fit(train)

                                                                                

In [22]:
# Testing
predictions_test = model.transform(test)

In [23]:
# predictions.select("tip_amount", "prediction").show(50)
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

metrics = RegressionEvaluator(predictionCol="prediction", labelCol="tip_amount", metricName="mse")

# metrics.evaluate(predictions_train)
metrics.evaluate(predictions_test)
#print(model.stages[-1].coefficients)
#model.stages[-1].summary.r2


                                                                                

2.408293317685801

In [24]:

df = df_regression  

# Basic distribution of tips
df.select("tip_amount").summary().show()

# What fraction of trips have zero tip?
zero_frac = df.filter(F.col("tip_amount") == 0).count() / df.count()
print("zero-tip fraction:", zero_frac)

# Correlation between tip and fare / speed
corrs = df.select(
    F.corr("tip_amount", "fare_amount").alias("corr_tip_fare"),
    F.corr("tip_amount", "speed_mph").alias("corr_tip_speed"),
).collect()[0]
print(corrs)


                                                                                

+-------+------------------+
|summary|        tip_amount|
+-------+------------------+
|  count|          28100022|
|   mean|3.2847423196486036|
| stddev| 2.845011419459388|
|    min|               1.0|
|    25%|              1.76|
|    50%|              2.34|
|    75%|              3.65|
|    max|             888.2|
+-------+------------------+



                                                                                

zero-tip fraction: 0.0




Row(corr_tip_fare=0.8374642747611836, corr_tip_speed=0.18070588705355897)


                                                                                

In [25]:
spark.stop()

#### Randomforest

In [16]:
from shared import City

stages = Pipeline(
    stages=[
        StringIndexer(
            inputCols=["city"],
            outputCols=["cityIndex"],
        ),
        VectorAssembler(inputCols=["cityIndex", "fare_amount", "speed_mph"], outputCol="features"),
        RandomForestRegressor(
            featuresCol="features",
            labelCol="tip_amount",
            numTrees=80,
            maxDepth=10,
            minInstancesPerNode=50,
            subsamplingRate=0.7,
            featureSubsetStrategy="sqrt",
            maxBins=64,
            seed=42,
        ),
    ]
)


model = stages.fit(train)
predictions_test = model.transform(test).show()


25/11/28 20:50:22 WARN DAGScheduler: Broadcasting large task binary with size 1542.2 KiB
25/11/28 20:52:08 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/11/28 20:54:07 WARN DAGScheduler: Broadcasting large task binary with size 4.9 MiB
25/11/28 20:55:36 ERROR TaskSchedulerImpl: Lost executor 0 on 192.168.11.69: Command exited with code 137
25/11/28 20:55:36 WARN TaskSetManager: Lost task 8.0 in stage 61.0 (TID 777) (192.168.11.69 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Command exited with code 137
25/11/28 20:55:36 WARN TaskSetManager: Lost task 7.0 in stage 61.0 (TID 776) (192.168.11.69 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Command exited with code 137
25/11/28 20:55:36 WARN TaskSetManager: Lost task 6.0 in stage 61.0 (TID 775) (192.168.11.69 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Command exi

+-------+-----------+---------+----------+---------+--------------------+------------------+
|   city|fare_amount|speed_mph|tip_amount|cityIndex|            features|        prediction|
+-------+-----------+---------+----------+---------+--------------------+------------------+
|Chicago|       3.25|      6.0|       2.0|      1.0|      [1.0,3.25,6.0]|2.6256583083349265|
|Chicago|       3.25|     8.25|       2.0|      1.0|     [1.0,3.25,8.25]| 2.279885332968586|
|Chicago|       3.25|      9.5|      8.25|      1.0|      [1.0,3.25,9.5]|2.1765616285716662|
|Chicago|       3.25|10.285715|       3.0|      1.0|[1.0,3.25,10.2857...|2.0119863039988495|
|Chicago|       3.25|     12.0|       1.0|      1.0|     [1.0,3.25,12.0]|2.0116219029323816|
|Chicago|       3.25|12.900001|       3.0|      1.0|[1.0,3.25,12.9000...|2.0131405727776297|
|Chicago|       3.25|14.416666|       3.0|      1.0|[1.0,3.25,14.4166...| 2.018669171744324|
|Chicago|       3.25|21.795918|       5.0|      1.0|[1.0,3.25,21.7959.

                                                                                

In [17]:
predictions_test = model.transform(test)


In [23]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator_mae = RegressionEvaluator(labelCol="tip_amount", predictionCol="prediction", metricName="r2")
evaluator_mse = RegressionEvaluator(labelCol="tip_amount", predictionCol="prediction", metricName="mse")
evaluator_r2 = RegressionEvaluator(labelCol="tip_amount", predictionCol="prediction", metricName="mae")


print("RF R2:", evaluator_r2.evaluate(predictions_test))
print("RF MSE:", evaluator_mse.evaluate(predictions_test))
print("RF MAE:", evaluator_mae.evaluate(predictions_test))



                                                                                

RF R2: 0.7174909327966459


                                                                                

RF MSE: 2.4729636745931662




RF MAE: 0.6945249205987529


                                                                                