In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd

In [2]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [3]:
sdf_data = spark.read.parquet('../data/curated/result_data/')
sdf_predict = spark.read.parquet('../data/curated/result_predict/')
sdf_data.limit(5)

PULocationID,day,shift,fare,temp,Date
43,Tue,Night,6.8,25,2022-02-01
166,Tue,Night,5.8,25,2022-02-01
89,Tue,Night,50.3,25,2022-02-01
7,Tue,Night,25.05,25,2022-02-01
166,Tue,Night,9.8,25,2022-02-01


In [4]:
# Get Borough data to link to location IDs
sdf_zone = spark.read.option("header",True).csv("../data/raw/taxi+_zone_lookup.csv")
sdf_zone = sdf_zone.withColumnRenamed('LocationID', 'PULocationID')
sdf_zone = sdf_zone.select('PULocationID', 'Borough')
sdf_zone.limit(5)

PULocationID,Borough
1,EWR
2,Queens
3,Bronx
4,Manhattan
5,Staten Island


In [5]:
sdf_merge = sdf_data.join(sdf_zone, on='PULocationID', how='left')
sdf_pred = sdf_predict.join(sdf_zone, on='PULocationID', how='left')
sdf_merge.limit(5)

PULocationID,day,shift,fare,temp,Date,Borough
43,Tue,Night,6.8,25,2022-02-01,Manhattan
166,Tue,Night,5.8,25,2022-02-01,Manhattan
89,Tue,Night,50.3,25,2022-02-01,Brooklyn
7,Tue,Night,25.05,25,2022-02-01,Queens
166,Tue,Night,9.8,25,2022-02-01,Manhattan


In [6]:
sdf_merge = sdf_merge.select('Borough', 'day', 'shift', 'temp', 'fare')
sdf_pred = sdf_pred.select('Borough', 'day', 'shift', 'temp', 'fare')
sdf_merge.limit(5)

Borough,day,shift,temp,fare
Manhattan,Tue,Night,25,6.8
Manhattan,Tue,Night,25,5.8
Brooklyn,Tue,Night,25,50.3
Queens,Tue,Night,25,25.05
Manhattan,Tue,Night,25,9.8


In [7]:
# In order to fulfill full rank model for linear regression, 
#  we combine and get the fare average for duplicates
sdf_agg = sdf_merge \
            .groupBy('Borough', 'day', 'shift','temp') \
            .agg(
                F.round(F.mean("fare"),2).alias("fare"),
            ) 
sdf_agg.limit(5)

Borough,day,shift,temp,fare
Bronx,Sun,Night,27,17.49
Brooklyn,Mon,Morning,32,15.76
Staten Island,Wed,Night,44,72.85
Brooklyn,Mon,Night,45,17.14
Manhattan,Thu,Morning,56,14.9


In [8]:
# Change to indexes from strings in order to one hot encode
string_indexer = StringIndexer(inputCols=['Borough','day','shift'], outputCols=['Borough_idx','day_idx','shift_idx']).fit(sdf_agg)
sdf_agg = string_indexer.transform(sdf_agg)
sdf_pred = string_indexer.transform(sdf_pred)
sdf_agg.show()

+-------------+---+-------+----+-----+-----------+-------+---------+
|      Borough|day|  shift|temp| fare|Borough_idx|day_idx|shift_idx|
+-------------+---+-------+----+-----+-----------+-------+---------+
|        Bronx|Sun|  Night|  27|17.49|        0.0|    4.0|      1.0|
|     Brooklyn|Mon|Morning|  32|15.76|        1.0|    1.0|      0.0|
|Staten Island|Wed|  Night|  44|72.85|        4.0|    6.0|      1.0|
|     Brooklyn|Mon|  Night|  45|17.14|        1.0|    1.0|      1.0|
|    Manhattan|Thu|Morning|  56| 14.9|        2.0|    3.0|      0.0|
|       Queens|Sat|  Night|  45|16.89|        3.0|    2.0|      1.0|
|    Manhattan|Sat|Morning|  45|13.48|        2.0|    2.0|      0.0|
|        Bronx|Fri|  Night|  68|22.58|        0.0|    0.0|      1.0|
|    Manhattan|Thu|  Night|  73|14.61|        2.0|    3.0|      1.0|
|    Manhattan|Tue|Morning|  80|14.59|        2.0|    5.0|      0.0|
|        Bronx|Sun|Morning|  78|34.18|        0.0|    4.0|      0.0|
|       Queens|Thu|  Night|  41|14

In [9]:
# Now we can one hot encode
OHE = (OneHotEncoder()
       .setInputCols(['Borough_idx','day_idx','shift_idx'])
       .setOutputCols(['Borough_ohe','day_ohe','shift_ohe']))
model = OHE.fit(sdf_agg)
sdf_ohe = model.transform(sdf_agg)
sdf_ohe_pred = model.transform(sdf_pred)
sdf_ohef = sdf_ohe.select('Borough_ohe', 'day_ohe','shift_ohe', 'temp', 'fare')
sdf_ohef_pred = sdf_ohe_pred.select('Borough_ohe', 'day_ohe','shift_ohe', 'temp', 'fare')


sdf_ohef.limit(5)

Borough_ohe,day_ohe,shift_ohe,temp,fare
"(4,[0],[1.0])","(6,[4],[1.0])","(1,[],[])",27,17.49
"(4,[1],[1.0])","(6,[1],[1.0])","(1,[0],[1.0])",32,15.76
"(4,[],[])","(6,[],[])","(1,[],[])",44,72.85
"(4,[1],[1.0])","(6,[1],[1.0])","(1,[],[])",45,17.14
"(4,[2],[1.0])","(6,[3],[1.0])","(1,[0],[1.0])",56,14.9


In [10]:
# Check to see that the test and train data are in sync
sdf_ohe.select('Borough','Borough_ohe').distinct().sort('Borough_ohe').show()

+-------------+-------------+
|      Borough|  Borough_ohe|
+-------------+-------------+
|Staten Island|    (4,[],[])|
|        Bronx|(4,[0],[1.0])|
|     Brooklyn|(4,[1],[1.0])|
|    Manhattan|(4,[2],[1.0])|
|       Queens|(4,[3],[1.0])|
+-------------+-------------+



In [11]:
sdf_ohe_pred.select('Borough','Borough_ohe').distinct().sort('Borough_ohe').show()

+-------------+-------------+
|      Borough|  Borough_ohe|
+-------------+-------------+
|Staten Island|    (4,[],[])|
|        Bronx|(4,[0],[1.0])|
|     Brooklyn|(4,[1],[1.0])|
|    Manhattan|(4,[2],[1.0])|
|       Queens|(4,[3],[1.0])|
+-------------+-------------+



In [12]:
sdf_ohe.select('day','day_ohe').distinct().sort('day_ohe').show()

+---+-------------+
|day|      day_ohe|
+---+-------------+
|Wed|    (6,[],[])|
|Fri|(6,[0],[1.0])|
|Mon|(6,[1],[1.0])|
|Sat|(6,[2],[1.0])|
|Thu|(6,[3],[1.0])|
|Sun|(6,[4],[1.0])|
|Tue|(6,[5],[1.0])|
+---+-------------+



In [13]:
sdf_ohe_pred.select('day','day_ohe').distinct().sort('day_ohe').show()

+---+-------------+
|day|      day_ohe|
+---+-------------+
|Wed|    (6,[],[])|
|Fri|(6,[0],[1.0])|
|Mon|(6,[1],[1.0])|
|Sat|(6,[2],[1.0])|
|Thu|(6,[3],[1.0])|
|Sun|(6,[4],[1.0])|
|Tue|(6,[5],[1.0])|
+---+-------------+



In [14]:
sdf_ohe.select('shift','shift_ohe').distinct().sort('shift_ohe').show()

+-------+-------------+
|  shift|    shift_ohe|
+-------+-------------+
|  Night|    (1,[],[])|
|Morning|(1,[0],[1.0])|
+-------+-------------+



In [15]:
sdf_ohe_pred.select('shift','shift_ohe').distinct().sort('shift_ohe').show()

+-------+-------------+
|  shift|    shift_ohe|
+-------+-------------+
|  Night|    (1,[],[])|
|Morning|(1,[0],[1.0])|
+-------+-------------+



In [16]:
# We want to normalize temp so it can be compared to dummy variables for data consistency in scale
mean_temp, sd_temp = sdf_ohef.select(F.mean("temp"), F.stddev("temp")).first()
sdf_ohef = sdf_ohef.withColumn("temp_norm", (F.col("temp") - mean_temp) / sd_temp)
sdf_ohef_pred = sdf_ohef_pred.withColumn("temp_norm", (F.col("temp") - mean_temp) / sd_temp)

In [17]:
# How much data we are working with for model
sdf_ohef.count()

1718

In [19]:
# Create assember object for predictors
features = 'features'
input_cols = ['Borough_ohe', 'day_ohe','shift_ohe', 'temp_norm']

assembler = VectorAssembler(
                                inputCols=input_cols,
                                outputCol= features
                            )
sdf_model = assembler.transform(sdf_ohef)
sdf_model_pred = assembler.transform(sdf_ohef_pred)
sdf_model.select('features').head(5), sdf_model.select('fare').head(5)

([Row(features=SparseVector(12, {0: 1.0, 8: 1.0, 11: -1.5367})),
  Row(features=SparseVector(12, {1: 1.0, 5: 1.0, 10: 1.0, 11: -1.2704})),
  Row(features=SparseVector(12, {11: -0.6314})),
  Row(features=SparseVector(12, {1: 1.0, 5: 1.0, 11: -0.5782})),
  Row(features=SparseVector(12, {2: 1.0, 7: 1.0, 10: 1.0, 11: 0.0076}))],
 [Row(fare=17.49),
  Row(fare=15.76),
  Row(fare=72.85),
  Row(fare=17.14),
  Row(fare=14.9)])

In [20]:
# Fit linear model
lm = LinearRegression(featuresCol='features', labelCol='fare').fit(sdf_model)

In [21]:
# Access coefficients

# 'EWR', 'Sun', 'Night' are not included as they are the reference group
coef_cols = ['Bronx', 'Brooklyn','Manhattan', 'Queens', 
             'Fri', 'Mon', 'Sat', 'Thu', 'Sun', 'Tue', 
             'Morning', 
             'temp']

pd.DataFrame(
    data=[lm.intercept] + list(lm.coefficients),
    index=['intercept'] + coef_cols,
    columns=['coefficient']
)

Unnamed: 0,coefficient
intercept,67.507894
Bronx,-44.407587
Brooklyn,-50.388662
Manhattan,-56.249388
Queens,-51.53678
Fri,-0.001549
Mon,1.037794
Sat,-0.499644
Thu,-0.108038
Sun,0.588683


In [22]:
# Test on test data and check error analysis MAE and R^2
fare_pred = lm.transform(sdf_model_pred)

In [23]:
# Get r-squared
lm.summary.r2adj

0.8618600203813602

In [24]:
# Get the mean absolute error
lm.summary.meanAbsoluteError

4.31155213569164