# Introduction to Taxi ETL Job
This is the Taxi ETL job to generate the input datasets for the Taxi XGBoost job.

## Prerequisites
### 1. Download data
All data can be found at https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

### 2. Download needed jar
* [rapids-4-spark_2.12-23.06.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.06.0/rapids-4-spark_2.12-23.06.0.jar)

### 3. Start Spark Standalone
Before running the script, please set up Spark in standalone mode.

### 4. Add ENV
```
$ export SPARK_JARS=rapids-4-spark_2.12-23.06.0.jar

```

### 5. Start Jupyter Notebook with spylon-kernel or toree

```
$ jupyter notebook --allow-root --notebook-dir=${your-dir} --config=${your-configs}
```

## Import Libs

In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DataTypes.{DoubleType, IntegerType, StringType}
import org.apache.spark.sql.types.{FloatType, StructField, StructType}

## Script Settings

### 1. File Path Settings
* Define input file path

In [6]:
// Input and output locations. The DATA_ROOT environment variable, when set,
// replaces the default root directory "/data".
val dataRoot = sys.env.getOrElse("DATA_ROOT", "/data")
val rawPath = s"$dataRoot/taxi/taxi-etl-input-small.csv"
val outPath = s"$dataRoot/taxi/output"

lastException = null
dataRoot = /data
rawPath = /data/taxi/taxi-etl-input-small.csv
outPath = /data/taxi/output


/data/taxi/output

## Function and Object Definitions
### Define the constants

* Define input file schema

In [7]:
// Schema applied when reading the raw taxi CSV: one (column name, type) pair
// per field, in declaration order. Every field uses StructField's defaults.
val rawSchema = StructType(
  Seq(
    "vendor_id" -> StringType,
    "pickup_datetime" -> StringType,
    "dropoff_datetime" -> StringType,
    "passenger_count" -> IntegerType,
    "trip_distance" -> DoubleType,
    "pickup_longitude" -> DoubleType,
    "pickup_latitude" -> DoubleType,
    "rate_code" -> StringType,
    "store_and_fwd_flag" -> StringType,
    "dropoff_longitude" -> DoubleType,
    "dropoff_latitude" -> DoubleType,
    "payment_type" -> StringType,
    "fare_amount" -> DoubleType,
    "surcharge" -> DoubleType,
    "mta_tax" -> DoubleType,
    "tip_amount" -> DoubleType,
    "tolls_amount" -> DoubleType,
    "total_amount" -> DoubleType
  ).map { case (fieldName, fieldType) => StructField(fieldName, fieldType) }
)

rawSchema = StructType(StructField(vendor_id,StringType,true), StructField(pickup_datetime,StringType,true), StructField(dropoff_datetime,StringType,true), StructField(passenger_count,IntegerType,true), StructField(trip_distance,DoubleType,true), StructField(pickup_longitude,DoubleType,true), StructField(pickup_latitude,DoubleType,true), StructField(rate_code,StringType,true), StructField(store_and_fwd_flag,StringType,true), StructField(dropoff_longitude,DoubleType,true), StructField(dropoff_latitude,DoubleType,true), StructField(payment_type,StringType,true), StructField(fare_amount,DoubleType,true), StructField(surcharge,DoubleType,true), StructField(mta_tax,DoubleType,true), StructField(tip_amount,DoubleType,true), StructField(tolls_amount,Doubl...


StructType(StructField(vendor_id,StringType,true), StructField(pickup_datetime,StringType,true), StructField(dropoff_datetime,StringType,true), StructField(passenger_count,IntegerType,true), StructField(trip_distance,DoubleType,true), StructField(pickup_longitude,DoubleType,true), StructField(pickup_latitude,DoubleType,true), StructField(rate_code,StringType,true), StructField(store_and_fwd_flag,StringType,true), StructField(dropoff_longitude,DoubleType,true), StructField(dropoff_latitude,DoubleType,true), StructField(payment_type,StringType,true), StructField(fare_amount,DoubleType,true), StructField(surcharge,DoubleType,true), StructField(mta_tax,DoubleType,true), StructField(tip_amount,DoubleType,true), StructField(tolls_amount,Doubl...

In [8]:
/**
 * Split percentages as `(train, eval, trainEval)`.
 *
 * The first two values are fixed; the third is whatever remains of 100
 * after subtracting them.
 */
def dataRatios: (Int, Int, Int) = {
  val trainPercent = 80
  val evalPercent = 20
  val remainderPercent = 100 - trainPercent - evalPercent
  (trainPercent, evalPercent, remainderPercent)
}
val (trainRatio, evalRatio, trainEvalRatio) = dataRatios

trainRatio = 80
evalRatio = 20
trainEvalRatio = 0


dataRatios: (Int, Int, Int)


0

* Build the spark session and dataframe

In [9]:
// Get (or create) the Spark session for this job, then define the DataFrame
// that reads the raw CSV with a header row and the explicit schema.
val sparkSession = SparkSession.builder().appName("taxi-etl").getOrCreate()
val df = {
  val csvReader = sparkSession.read.schema(rawSchema).option("header", true)
  csvReader.csv(rawPath)
}

sparkSession = org.apache.spark.sql.SparkSession@68530eb7
df = [vendor_id: string, pickup_datetime: string ... 16 more fields]


[vendor_id: string, pickup_datetime: string ... 16 more fields]

* Define some ETL functions

In [10]:
/**
 * Removes the columns that the rest of this ETL pipeline does not use.
 *
 * @param dataFrame input rows
 * @return the same rows without the listed columns
 */
def dropUseless(dataFrame: DataFrame): DataFrame = {
  val unusedColumns = Seq(
    "dropoff_datetime",
    "payment_type",
    "surcharge",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "total_amount")
  dataFrame.drop(unusedColumns: _*)
}

dropUseless: (dataFrame: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


In [11]:
/**
 * Replaces each categorical column with the `hash` of its value, then
 * renames `store_and_fwd_flag` to `store_and_fwd`.
 *
 * @param dataFrame input rows containing the categorical columns
 * @return rows with hashed categorical columns
 */
def encodeCategories(dataFrame: DataFrame): DataFrame = {
  val categoricalColumns = Seq("vendor_id", "rate_code", "store_and_fwd_flag")

  // Overwrite each column in place with its hashed value.
  val hashed = categoricalColumns.foldLeft(dataFrame) { (current, name) =>
    current.withColumn(name, hash(col(name)))
  }
  hashed.withColumnRenamed("store_and_fwd_flag", "store_and_fwd")
}

encodeCategories: (dataFrame: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


In [12]:
/**
 * Replaces missing values with -1 through `DataFrame.na.fill`.
 *
 * NOTE(review): the integral `fill` overload presumably touches numeric
 * columns only, leaving null strings as they are -- confirm against Spark docs.
 */
def fillNa(dataFrame: DataFrame): DataFrame = {
  val missingValueMarker = -1L
  dataFrame.na.fill(missingValueMarker)
}

fillNa: (dataFrame: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


In [13]:
/**
 * Keeps only the rows whose value lies strictly between the lower and upper
 * bound of every checked column.
 *
 * Bounds are held as typed `(column, lower, upper)` tuples and applied as
 * `Column` expressions, so nothing is widened to `Any` and no predicate text
 * goes through locale-sensitive number formatting.
 *
 * @param dataFrame input rows
 * @return rows that satisfy `lower < value < upper` for all listed columns
 */
def removeInvalid(dataFrame: DataFrame): DataFrame = {
  // (column name, exclusive lower bound, exclusive upper bound)
  val bounds: Seq[(String, Int, Int)] = Seq(
    ("fare_amount", 0, 500),
    ("passenger_count", 0, 6),
    ("pickup_longitude", -75, -73),
    ("dropoff_longitude", -75, -73),
    ("pickup_latitude", 40, 42),
    ("dropoff_latitude", 40, 42))

  // One filter per column, applied in the order listed above.
  bounds.foldLeft(dataFrame) { case (remaining, (name, lower, upper)) =>
    remaining.filter(col(name) > lower && col(name) < upper)
  }
}

removeInvalid: (dataFrame: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


In [14]:
/**
 * Converts `pickup_datetime` to a timestamp, derives calendar features from
 * it, and then removes the original column.
 *
 * Added columns: `year`, `month`, `day`, `day_of_week`, `is_weekend`
 * (1 when `day_of_week` is 1 or 7, otherwise 0) and `hour`.
 *
 * @param dataFrame input rows with a `pickup_datetime` column
 * @return rows with the derived columns and without `pickup_datetime`
 */
def convertDatetime(dataFrame: DataFrame): DataFrame = {
  // The name is kept as a plain string so the final drop does not depend on
  // how Column.toString happens to render the column.
  val pickupName = "pickup_datetime"
  val pickup = col(pickupName)

  // Order matters: the timestamp conversion comes first, and `is_weekend`
  // reads the `day_of_week` column added just before it.
  val derivedColumns = Seq(
    pickupName -> to_timestamp(pickup),
    "year" -> year(pickup),
    "month" -> month(pickup),
    "day" -> dayofmonth(pickup),
    "day_of_week" -> dayofweek(pickup),
    "is_weekend" -> col("day_of_week").isin(1, 7).cast(IntegerType), // 1: Sunday, 7: Saturday
    "hour" -> hour(pickup)
  )

  derivedColumns
    .foldLeft(dataFrame) { case (current, (name, expression)) =>
      current.withColumn(name, expression)
    }
    .drop(pickupName)
}

convertDatetime: (dataFrame: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


In [15]:
/**
 * Adds an `h_distance` column: the haversine distance between the pickup and
 * drop-off coordinates.
 *
 * @param dataFrame input rows with pickup/drop-off latitude and longitude in degrees
 * @return the input rows plus `h_distance`
 */
def addHDistance(dataFrame: DataFrame): DataFrame = {
  val degToRad = math.Pi / 180
  val pickupLat = col("pickup_latitude")
  val pickupLon = col("pickup_longitude")
  val dropoffLat = col("dropoff_latitude")
  val dropoffLon = col("dropoff_longitude")

  // Haversine term: 0.5 - cos(dLat)/2 + cos(lat1) * cos(lat2) * (1 - cos(dLon)) / 2
  val latitudeTerm = cos((dropoffLat - pickupLat) * degToRad) / 2
  val longitudeTerm =
    cos(pickupLat * degToRad) * cos(dropoffLat * degToRad) *
      (lit(1) - cos((dropoffLon - pickupLon) * degToRad)) / 2
  val haversine = lit(0.5) - latitudeTerm + longitudeTerm

  // NOTE(review): 12734 is presumably the Earth's diameter in km (2R);
  // the commonly quoted value is about 12742 -- confirm before changing.
  val distance = lit(12734) * asin(sqrt(haversine))
  dataFrame.withColumn("h_distance", distance)
}

addHDistance: (dataFrame: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


* Define main ETL function

In [19]:
/**
 * Runs every ETL stage over the input and splits the result at random.
 *
 * @param dataFrame raw input rows
 * @param splits    relative weight of each output split, passed to `randomSplit` as doubles
 * @return one DataFrame per entry of `splits`, in the same order
 */
def preProcess(dataFrame: DataFrame, splits: Array[Int]): Array[DataFrame] = {
  // Stages are applied first to last.
  val stages = Seq[DataFrame => DataFrame](
    dropUseless,
    encodeCategories,
    fillNa,
    removeInvalid,
    convertDatetime,
    addHDistance
  )

  val cleaned = Function.chain(stages)(dataFrame)
  val weights = splits.map(_.toDouble)
  cleaned.randomSplit(weights)
}

preProcess: (dataFrame: org.apache.spark.sql.DataFrame, splits: Array[Int])Array[org.apache.spark.sql.DataFrame]


In [20]:
// Split weights are given in the order (train, trainEval, eval); the export
// step labels the resulting splits "train", "eval", "trans" by position.
// NOTE(review): trainEvalRatio is 0 here, so the second split is presumably
// empty -- confirm that this is intended.
val dataset = preProcess(dataFrame = df, splits = Array(trainRatio, trainEvalRatio, evalRatio))

dataset = Array([vendor_id: int, passenger_count: int ... 15 more fields], [vendor_id: int, passenger_count: int ... 15 more fields], [vendor_id: int, passenger_count: int ... 15 more fields])


Array([vendor_id: int, passenger_count: int ... 15 more fields], [vendor_id: int, passenger_count: int ... 15 more fields], [vendor_id: int, passenger_count: int ... 15 more fields])

## Run ETL Process and Save the Result

In [21]:
// Write each split as both Parquet and CSV under outPath (overwriting any
// earlier output), report the wall-clock time taken, then stop the session.
val t0 = System.currentTimeMillis
Seq("train", "eval", "trans").zipWithIndex.foreach { case (name, index) =>
  val split = dataset(index)
  split.write.mode("overwrite").parquet(s"$outPath/parquet/$name")
  split.write.mode("overwrite").csv(s"$outPath/csv/$name")
}
val t1 = System.currentTimeMillis
println(s"Elapsed time : ${(t1 - t0).toFloat / 1000}s")
sparkSession.stop()

Elapsed time : 4.371s


t0 = 1654139600797
t1 = 1654139605168


1654139605168