In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Display the filesystem path of the imported pyspark package, to confirm
# which installation this kernel picked up.
pyspark.__file__

'/usr/local/spark/python/pyspark/__init__.py'

In [3]:
# Create (or reuse) a Spark session for this notebook. The "local[*]" master
# runs Spark on this machine only.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [4]:
# NOTE(review): Parquet is a binary format, so `wc -l` counts newline bytes in
# the file rather than trip records; use df.count() once the file has been
# loaded into a DataFrame to get the real row count.
!wc -l fhvhv_tripdata_2021-01.parquet

1006794 fhvhv_tripdata_2021-01.parquet


In [4]:
# Load the FHVHV trip file into a DataFrame. Parquet files embed their own
# schema and column names, so the CSV-only "header" reader option that used
# to be set here had no effect and has been dropped.
df = spark.read.parquet("fhvhv_tripdata_2021-01.parquet")

In [5]:
# Redistribute the rows across 24 partitions. Note that this rebinds `df`, so
# re-running the cell repartitions the already repartitioned frame.
df = df.repartition(24)

In [6]:
# Print a tabular preview of the first rows to check the columns and values.
df.show()

+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+-------------------+-----------------+------------------+----------------+--------------+
|hvfhs_license_num|dispatching_base_num|originating_base_num|   request_datetime|  on_scene_datetime|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee|tips|driver_pay|shared_request_flag|shared_match_flag|access_a_ride_flag|wav_request_flag|wav_match_flag|
+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+--

In [7]:
# Persist the repartitioned frame as Parquet under fhvhv/2021/01/, replacing
# anything already stored at that path.
df.write.mode('overwrite').parquet('fhvhv/2021/01/')

In [None]:
# Read back the Parquet directory written above; from here on `df` refers to
# that stored copy rather than the original input file.
df = spark.read.parquet('fhvhv/2021/01/')

In [None]:
# Print the column names and data types of the reloaded DataFrame.
df.printSchema()

In [None]:
from pyspark.sql import functions as F

In [None]:
def crazy_stuff(base_num):
    """Convert a base number such as 'B02884' into a prefixed hex code.

    The first character is discarded and the remainder is parsed as a decimal
    integer. The result is '<prefix>/<HEX>', where the prefix is 's' when the
    number is divisible by 7, otherwise 'a' when it is divisible by 3,
    otherwise 'e', and HEX is the number in uppercase hexadecimal, zero-padded
    to at least three digits.

    Args:
        base_num: String whose characters after the first form a decimal
            integer, or None.

    Returns:
        The code string, or None when base_num is None. Spark passes None to
        a Python UDF for NULL cells, so this keeps a NULL input as a NULL
        output instead of failing the whole job.

    Raises:
        ValueError: If the text after the first character is not an integer.
    """
    if base_num is None:
        return None

    num = int(base_num[1:])

    # Divisibility by 7 takes precedence over divisibility by 3.
    if num % 7 == 0:
        prefix = 's'
    elif num % 3 == 0:
        prefix = 'a'
    else:
        prefix = 'e'
    return f'{prefix}/{num:03X}'

In [None]:
# Quick manual check on a sample base number: 2884 is divisible by 7, so the
# expected result is 's/B44'.
crazy_stuff('B02884')

In [None]:
from pyspark.sql import types

In [53]:
# Wrap crazy_stuff as a Spark UDF with a string return type so it can be
# applied to DataFrame columns.
crazy_stuff_udf = F.udf(crazy_stuff, returnType=types.StringType())

In [55]:
# Derive date-only pickup/dropoff columns and the UDF-generated base_id, then
# preview them alongside the pickup and dropoff location ids.
(
    df
    .withColumn('pickup_date', F.to_date(df['pickup_datetime']))
    .withColumn('dropoff_date', F.to_date(df['dropoff_datetime']))
    .withColumn('base_id', crazy_stuff_udf(df['dispatching_base_num']))
    .select('base_id', 'pickup_date', 'dropoff_date', 'PULocationID', 'DOLocationID')
    .show()
)

+-------+-----------+------------+------------+------------+
|base_id|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-------+-----------+------------+------------+------------+
|  e/9CE| 2021-01-14|  2021-01-14|          17|          61|
|  e/9CE| 2021-01-14|  2021-01-14|          61|          71|
|  e/ACC| 2021-01-15|  2021-01-15|         142|         107|
|  a/B49| 2021-01-30|  2021-01-30|         159|          78|
|  e/9CE| 2021-01-07|  2021-01-07|         119|         235|
|  e/ACC| 2021-01-12|  2021-01-12|         158|          87|
|  e/B32| 2021-01-09|  2021-01-09|         118|           6|
|  e/B38| 2021-01-28|  2021-01-28|         116|          50|
|  e/A39| 2021-01-23|  2021-01-23|          17|         189|
|  e/9CE| 2021-01-16|  2021-01-16|         117|          86|
|  e/ACC| 2021-01-29|  2021-01-29|         205|         215|
|  e/9CE| 2021-01-24|  2021-01-24|          36|         198|
|  e/9CE| 2021-01-19|  2021-01-19|          87|          33|
|  e/B30| 2021-01-31|  2

In [36]:
# Preview trips whose hvfhs_license_num equals "HV0003". The filter runs
# before the projection because hvfhs_license_num is not one of the selected
# columns; filtering after the select only worked because Spark silently
# resolves the missing column from the parent plan.
(
    df
    .filter(df.hvfhs_license_num == "HV0003")
    .select('pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID')
    .show()
)

+-------------------+-------------------+------------+------------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|
+-------------------+-------------------+------------+------------+
|2021-01-15 21:11:07|2021-01-15 21:41:51|         142|         107|
|2021-01-30 01:20:30|2021-01-30 01:32:03|         159|          78|
|2021-01-12 04:49:41|2021-01-12 05:00:20|         158|          87|
|2021-01-09 14:49:14|2021-01-09 15:02:16|         118|           6|
|2021-01-28 00:56:55|2021-01-28 01:11:04|         116|          50|
|2021-01-23 22:15:15|2021-01-23 22:25:35|          17|         189|
|2021-01-29 23:05:00|2021-01-29 23:20:57|         205|         215|
|2021-01-31 06:46:53|2021-01-31 07:00:42|          69|         212|
|2021-01-29 04:56:06|2021-01-29 04:59:55|          20|          78|
|2021-01-08 03:32:43|2021-01-08 03:53:50|          82|          75|
|2021-01-27 03:35:42|2021-01-27 03:45:22|          47|          59|
|2021-01-21 21:07:53|2021-01-21 21:21:52|       