In [8]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [4]:
sdf_data = spark.read.parquet('../data/curated/result_data/')
sdf_predict = spark.read.parquet('../data/curated/result_predict/')

sdf_zone = spark.read.option("header",True).csv("../data/taxi_zones/taxi+_zone_lookup.csv")
sdf_zone = sdf_zone.withColumnRenamed('LocationID', 'PULocationID')
sdf_zone = sdf_zone.select('PULocationID', 'Borough')

In [5]:
sdf_merge = sdf_data.join(sdf_zone, on='PULocationID', how='left')
sdf_pred = sdf_predict.join(sdf_zone, on='PULocationID', how='left')
sdf_merge.limit(5)

PULocationID,day,shift,fare,temp,Date,Borough
43,Tue,Night,6.8,25,2022-02-01,Manhattan
166,Tue,Night,5.8,25,2022-02-01,Manhattan
89,Tue,Night,50.3,25,2022-02-01,Brooklyn
7,Tue,Night,25.05,25,2022-02-01,Queens
166,Tue,Night,9.8,25,2022-02-01,Manhattan


In [6]:
sdf_merge = sdf_merge.select('Borough', 'day', 'shift', 'temp', 'fare')
sdf_pred = sdf_pred.select('Borough', 'day', 'shift', 'temp', 'fare')
sdf_merge.limit(5)

Borough,day,shift,temp,fare
Manhattan,Tue,Night,25,6.8
Manhattan,Tue,Night,25,5.8
Brooklyn,Tue,Night,25,50.3
Queens,Tue,Night,25,25.05
Manhattan,Tue,Night,25,9.8


In [7]:
# In order to fulfill full rank model for linear regression, we combine and get the fare average for duplicates
sdf_agg = sdf_merge \
            .groupBy('Borough', 'day', 'shift','temp') \
            .agg(
                F.round(F.mean("fare"),2).alias("fare"),
            ) 
sdf_agg.limit(5)

Borough,day,shift,temp,fare
Bronx,Sun,Night,27,17.49
Brooklyn,Mon,Morning,32,15.76
Staten Island,Wed,Night,44,72.85
Brooklyn,Mon,Night,45,17.14
Manhattan,Thu,Morning,56,14.9


In [10]:
# Auto identify categorical features to index (>7 distinct values are treated continuous)
inputCol = ['Borough', 'day','shift', 'temp']
featureIndexer = VectorIndexer(
                    inputCol=inputCol, 
                    outputCol="indexedFeatures", 
                    maxCategories=7
                ).fit(sdf_agg)

TypeError: Invalid param value given for param "inputCol". Could not convert <class 'list'> to string type