In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, monotonically_increasing_id, row_number
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from itertools import chain
from pyspark.sql.types import StringType, MapType, IntegerType
mapCol = MapType(IntegerType(),StringType(),False)

spark=SparkSession.builder.appName("dataETL").getOrCreate()

In [None]:
uberData=spark.read.parquet("C:/Users/hgarg/OneDrive/Desktop/PySpark/yellow_tripdata_2022-01.parquet")

#Adding a new Column as index
uberRideData = uberData.withColumn("UberId", row_number().over(Window.partitionBy("VendorId", "tpep_pickup_datetime").orderBy("VendorId","tpep_pickup_datetime")))
uberRawDataDf = uberRideData.select("UberId", "VendorId")
uberRideData.persist()

In [None]:

#PickUp and DropOff time details
weekDays = {
    1: "Monday",
    2: "Tuesday",
    3: "Wednesday",
    4: "Thursday",
    5: "Friday",
    6: "Saturday",
    7: "Sunday"
}
months = {
    1: "January",
    2: "February",
    3: "March",
    4: "April",
    5: "May",
    6: "June",
    7: "July",
    8: "August",
    9: "September",
    10: "October",
    11: "November",
    12: "December"
}
mapWeek = F.create_map(*[F.lit(x) for x in chain(*weekDays.items())])
mapMonth = F.create_map(*[F.lit(x) for x in chain(*months.items())])
pickDropDetails = uberRideData.select("tpep_pickup_datetime", "tpep_dropoff_datetime").distinct()
pickDropDf = pickDropDetails.withColumn("UberPickUpId", monotonically_increasing_id())\
    .withColumn("pickUpHour", F.hour(col("tpep_pickup_datetime"))).withColumn("pickUpMin", F.minute(col("tpep_pickup_datetime")))\
    .withColumn("pickUpSec", F.second(col("tpep_pickup_datetime"))).withColumn("pickUpDay", mapWeek[F.dayofweek(col("tpep_pickup_datetime"))])\
    .withColumn("pickUpMonth", mapMonth[F.month(col("tpep_pickup_datetime"))]).withColumn("pickUpYear", F.year(col("tpep_pickup_datetime")))\
    .withColumn("dropOffHour", F.hour(col("tpep_dropoff_datetime"))).withColumn("dropOffMin", F.minute(col("tpep_dropoff_datetime")))\
    .withColumn("dropOffSec", F.second(col("tpep_dropoff_datetime"))).withColumn("dropOffDay", mapWeek[F.dayofweek(col("tpep_dropoff_datetime"))])\
    .withColumn("dropOffMonth", mapMonth[F.month(col("tpep_dropoff_datetime"))]).withColumn("dropOffYear", F.year(col("tpep_dropoff_datetime")))


In [None]:
#Passenger Details
passengerDetails = uberRideData.select("passenger_count").distinct().filter(col("passenger_count").isNotNull())
passengerDetailsDF = passengerDetails.withColumn("UberPassengerId", monotonically_increasing_id())\
    .withColumn("passenger_count", col("passenger_count").cast("integer"))

In [None]:
#Trip Details
tripDetails = uberRideData.select("trip_distance").distinct().filter(col("trip_distance").isNotNull())
tripDetailsDF = tripDetails.withColumn("UberTripId", monotonically_increasing_id())\
    .withColumnRenamed("trip_distance", "TripDistance(km)")

In [None]:
#Creating Rate Card Table
from itertools import chain
rateDic = {
    1: "Standard Rate",
    2: "JFK",
    3: "Newark",
    4: "Nassau or WestChester",
    5: "Negotiated Fair",
    6: "Group Ride",
    99: "Premium"}

rateMap = F.create_map(*[F.lit(x) for x in chain(*rateDic.items())])

tripRateDetails = uberRideData.select("RateCodeId").distinct().filter(col("RateCodeId").isNotNull())
tripRateDetailsDf = tripRateDetails.withColumn("UberFairId", monotonically_increasing_id())\
    .withColumn("RateCodeId", col("RateCodeId").cast("integer"))

fairCardDetails = tripRateDetailsDf.withColumn("FairId", rateMap[col("RateCodeId")])

In [184]:
#Payment Mode Storage
from itertools import chain
paymentMode = {
    0: "Credit Card",
    1: "Cash",
    2: "No Charge",
    3: "Dispute",
    4: "Unknown",
    5: "Voided Trip"}

paymentModeMap = F.create_map(*[F.lit(x) for x in chain(*paymentMode.items())])

paymentModeDetails = uberRideData.select("payment_Type").distinct().filter(col("payment_Type").isNotNull())
paymentModeDetailsDf = paymentModeDetails.withColumn("UberPaymentId", monotonically_increasing_id())\
    .withColumn("paymentModeId", col("payment_Type").cast("integer"))

paymentModeDetailsFinalDf = paymentModeDetailsDf.withColumn("paymentMode", paymentModeMap[col("payment_Type")])\
    .select("UberPaymentId", "paymentModeId", "paymentMode")

paymentModeDetailsFinalDf.show()


+-------------+-------------+-----------+
|UberPaymentId|paymentModeId|paymentMode|
+-------------+-------------+-----------+
|            0|            0|Credit Card|
|            1|            1|       Cash|
|            2|            3|    Dispute|
|            3|            2|  No Charge|
|            4|            4|    Unknown|
|            5|            5|Voided Trip|
+-------------+-------------+-----------+



In [186]:
#Location Table
locationDetails = uberRideData.selectExpr("PULocationID as PickUpLocationId", "DOLocationID as DropOffLocationId")\
    .distinct().filter(col("PULocationID").isNotNull()).distinct().filter(col("DOLocationID").isNotNull())
locationDetailsDf = locationDetails.withColumn( "UberLocationId", monotonically_increasing_id())
locationDetailsDf.select("PickUpLocationId").distinct().count()


257

In [187]:
#Joining all the dataframes
finalFactRawData = uberRideData.join(pickDropDf, uberRideData["UberId"] == pickDropDf["UberPickUpId"], "left")\
    .join(passengerDetailsDF, uberRideData["UberId"] == passengerDetailsDF["UberPassengerId"], "left")\
    .join(tripDetailsDF, uberRideData["UberId"] == tripDetailsDF["UberTripId"], "left")\
    .join(fairCardDetails, uberRideData["UberId"] == fairCardDetails["UberFairId"], "left")\
    .join(paymentModeDetailsFinalDf, uberRideData["UberId"] == paymentModeDetailsFinalDf["UberPaymentId"], "left")\
    .join(locationDetailsDf, uberRideData["UberId"] == locationDetailsDf["UberLocationId"], "left")

finalData = finalFactRawData.select("UberId", "UberPickUpId", "UberPassengerId", "UberTripId", "UberFairId", "UberPaymentId", "UberLocationId", "fare_amount",
                     "extra", "mta_tax", "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount")

finalData.show()

+------+------------+---------------+----------+----------+-------------+--------------+-----------+-----+-------+----------+------------+---------------------+------------+
|UberId|UberPickUpId|UberPassengerId|UberTripId|UberFairId|UberPaymentId|UberLocationId|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|
+------+------------+---------------+----------+----------+-------------+--------------+-----------+-----+-------+----------+------------+---------------------+------------+
|     1|           1|              1|         1|         1|            1|             1|        8.0|  3.0|    0.5|       0.0|         0.0|                  0.3|        11.8|
|     1|           1|              1|         1|         1|            1|             1|       13.5|  3.0|    0.5|      3.46|         0.0|                  0.3|       20.76|
|     1|           1|              1|         1|         1|            1|             1|        4.5|  0.5|    0.5|       4.0|     