In [69]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd

In [70]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [71]:
sdf_data = spark.read.parquet('../data/curated/result_data/')
sdf_predict = spark.read.parquet('../data/curated/result_predict/')
sdf_data.limit(5)

PULocationID,day,shift,fare,temp,Date
43,Tue,Night,6.8,25,2022-02-01
166,Tue,Night,5.8,25,2022-02-01
89,Tue,Night,50.3,25,2022-02-01
7,Tue,Night,25.05,25,2022-02-01
166,Tue,Night,9.8,25,2022-02-01


In [72]:
sdf_zone = spark.read.option("header",True).csv("../data/taxi_zones/taxi+_zone_lookup.csv")
sdf_zone = sdf_zone.withColumnRenamed('LocationID', 'PULocationID')
sdf_zone = sdf_zone.select('PULocationID', 'Borough')
sdf_zone.limit(5)

PULocationID,Borough
1,EWR
2,Queens
3,Bronx
4,Manhattan
5,Staten Island


In [73]:
sdf_merge = sdf_data.join(sdf_zone, on='PULocationID', how='left')
sdf_pred = sdf_predict.join(sdf_zone, on='PULocationID', how='left')
sdf_merge.limit(5)

PULocationID,day,shift,fare,temp,Date,Borough
43,Tue,Night,6.8,25,2022-02-01,Manhattan
166,Tue,Night,5.8,25,2022-02-01,Manhattan
89,Tue,Night,50.3,25,2022-02-01,Brooklyn
7,Tue,Night,25.05,25,2022-02-01,Queens
166,Tue,Night,9.8,25,2022-02-01,Manhattan


In [93]:
sdf_merge = sdf_merge.select('Borough', 'day', 'shift', 'temp', 'fare')
sdf_pred = sdf_pred.select('Borough', 'day', 'shift', 'temp', 'fare')
sdf_merge.limit(5)

Borough,day,shift,temp,fare
Manhattan,Tue,Night,25,6.8
Manhattan,Tue,Night,25,5.8
Brooklyn,Tue,Night,25,50.3
Queens,Tue,Night,25,25.05
Manhattan,Tue,Night,25,9.8


In [94]:
# In order to fulfill full rank model for linear regression, we combine and get the fare average for duplicates
sdf_agg = sdf_merge \
            .groupBy('Borough', 'day', 'shift','temp') \
            .agg(
                F.round(F.mean("fare"),2).alias("fare"),
            ) 
sdf_agg.limit(5)

Borough,day,shift,temp,fare
Bronx,Sun,Night,27,17.49
Brooklyn,Mon,Morning,32,15.76
Staten Island,Wed,Night,44,72.85
Brooklyn,Mon,Night,45,17.14
Manhattan,Thu,Morning,56,14.9


In [96]:
# Change to indexes from strings to one hot encode
sdf_agg = StringIndexer(inputCols=['Borough','day','shift'], outputCols=['Borough_idx','day_idx','shift_idx']).fit(sdf_agg).transform(sdf_agg)
sdf_pred = StringIndexer(inputCols=['Borough','day','shift'], outputCols=['Borough_idx','day_idx','shift_idx']).fit(sdf_pred).transform(sdf_pred)
sdf_agg.show()

IllegalArgumentException: requirement failed: Output column Borough_idx already exists.

In [78]:
# One Hot Encode
OHE = OneHotEncoder(inputCols=['Borough_idx','day_idx','shift_idx'], 
                    outputCols= ['Borough_ohe','day_ohe','shift_ohe'])
OHE_pred = OneHotEncoder(inputCols=['Borough_idx','day_idx','shift_idx'], 
                    outputCols= ['Borough_ohe','day_ohe','shift_ohe'])
OHE = OHE.fit(sdf_agg)
OHE_pred = OHE_pred.fit(sdf_pred)

sdf_ohe = OHE.transform(sdf_agg)
sdf_ohe_pred = OHE_pred.transform(sdf_pred)

sdf_ohef = sdf_ohe.select('Borough_ohe', 'day_ohe','shift_ohe', 'temp', 'fare')
sdf_ohef_pred = sdf_ohe_pred.select('Borough_ohe', 'day_ohe','shift_ohe', 'temp', 'fare')


sdf_ohef.limit(5)

Borough_ohe,day_ohe,shift_ohe,temp,fare
"(5,[0],[1.0])","(6,[4],[1.0])","(1,[],[])",27,17.49
"(5,[1],[1.0])","(6,[1],[1.0])","(1,[0],[1.0])",32,15.76
"(5,[4],[1.0])","(6,[],[])","(1,[],[])",44,72.85
"(5,[1],[1.0])","(6,[1],[1.0])","(1,[],[])",45,17.14
"(5,[2],[1.0])","(6,[3],[1.0])","(1,[0],[1.0])",56,14.9


In [79]:
sdf_ohe.select('Borough','Borough_ohe').distinct().sort('Borough_ohe').show()

+-------------+-------------+
|      Borough|  Borough_ohe|
+-------------+-------------+
|          EWR|    (5,[],[])|
|        Bronx|(5,[0],[1.0])|
|     Brooklyn|(5,[1],[1.0])|
|    Manhattan|(5,[2],[1.0])|
|       Queens|(5,[3],[1.0])|
|Staten Island|(5,[4],[1.0])|
+-------------+-------------+



In [80]:
sdf_ohe.select('day','day_ohe').distinct().sort('day_ohe').show()

+---+-------------+
|day|      day_ohe|
+---+-------------+
|Wed|    (6,[],[])|
|Fri|(6,[0],[1.0])|
|Mon|(6,[1],[1.0])|
|Sat|(6,[2],[1.0])|
|Thu|(6,[3],[1.0])|
|Sun|(6,[4],[1.0])|
|Tue|(6,[5],[1.0])|
+---+-------------+



In [81]:
sdf_ohe.select('shift','shift_ohe').distinct().sort('shift_ohe').show()

+-------+-------------+
|  shift|    shift_ohe|
+-------+-------------+
|  Night|    (1,[],[])|
|Morning|(1,[0],[1.0])|
+-------+-------------+



In [82]:
# We want to normalize temp so it can be compared to dummy variables
mean_temp, sd_temp = sdf_ohef.select(F.mean("temp"), F.stddev("temp")).first()
sdf_ohef = sdf_ohef.withColumn("temp_norm", (F.col("temp") - mean_temp) / sd_temp)
sdf_ohef_pred = sdf_ohef_pred.withColumn("temp_norm", (F.col("temp") - mean_temp) / sd_temp)

In [83]:
# Create assember object for predictors
features = 'features'
input_cols = ['Borough_ohe', 'day_ohe','shift_ohe', 'temp_norm']

assembler = VectorAssembler(
                                inputCols=input_cols,
                                outputCol= features
                            )
sdf_model = assembler.transform(sdf_ohef)
sdf_model.select('features').head(5), sdf_model.select('fare').head(5)

([Row(features=SparseVector(13, {0: 1.0, 9: 1.0, 12: -1.536})),
  Row(features=SparseVector(13, {1: 1.0, 6: 1.0, 11: 1.0, 12: -1.2696})),
  Row(features=SparseVector(13, {4: 1.0, 12: -0.6304})),
  Row(features=SparseVector(13, {1: 1.0, 6: 1.0, 12: -0.5771})),
  Row(features=SparseVector(13, {2: 1.0, 8: 1.0, 11: 1.0, 12: 0.0089}))],
 [Row(fare=17.49),
  Row(fare=15.76),
  Row(fare=72.85),
  Row(fare=17.14),
  Row(fare=14.9)])

In [84]:
# Create assember object for predictors (pred)
features = 'features'
input_cols = ['Borough_ohe', 'day_ohe','shift_ohe', 'temp_norm']

assembler_pred = VectorAssembler(
                                inputCols=input_cols,
                                outputCol= features
                            )
sdf_model_pred = assembler_pred.transform(sdf_ohef_pred)

In [85]:
lm = LinearRegression(featuresCol='features', labelCol='fare').fit(sdf_model)

In [86]:
# Access coefficients

# 'EWR', 'Sun', 'Night' are not included as they are the reference group
coef_cols = ['Manhattan', 'Queens', 'Brooklyn', 'Bronx', 'Staten Island', 
             'Fri', 'Thu', 'Wed', 'Mon', 'Tue', 'Sat', 
             'Morning', 
             'temp']

pd.DataFrame(
    data=[lm.intercept] + list(lm.coefficients),
    index=['intercept'] + coef_cols,
    columns=['coefficient']
)

Unnamed: 0,coefficient
intercept,85.120777
Manhattan,-62.225126
Queens,-68.206201
Brooklyn,-74.066927
Bronx,-69.35432
Staten Island,-17.795055
Fri,0.18262
Thu,1.281058
Wed,-0.277878
Mon,0.141586


In [87]:
# Test on test data and check error analysis RMSE and R^2
fare_pred = lm.transform(sdf_model_pred)

In [88]:
lm.summary.meanSquaredError

56.03839903117522

In [89]:
lm.summary.r2adj

0.8628067020053637

Use together with the graphs for demand.