In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types

In [2]:
# Start (or reuse, via getOrCreate) a Spark session running in local mode;
# "local[*]" asks for as many worker threads as the machine has cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/24 15:18:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [12]:
# Show which Spark release the session is running on.
spark.version

'3.3.1'

In [13]:
# Check that the raw CSV is present in the working directory and see its size.
!ls -lh fhvhv_tripdata_2021-01.csv

-rw-rw-r-- 1 carlos carlos 718M Jul 14 16:23 fhvhv_tripdata_2021-01.csv


In [14]:
# Explicit schema for the FHVHV trip CSV, passed to the reader instead of
# letting Spark infer column types. Every column is declared nullable.
schema = types.StructType([
    types.StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('hvfhs_license_num', types.StringType()),
        ('dispatching_base_num', types.StringType()),
        ('pickup_datetime', types.TimestampType()),
        ('dropoff_datetime', types.TimestampType()),
        ('PULocationID', types.IntegerType()),
        ('DOLocationID', types.IntegerType()),
        ('SR_Flag', types.StringType()),
    )
])

In [16]:
# Read the raw CSV (first line is a header) with the explicit schema, then
# redistribute the rows over 24 partitions before writing.
df = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv('fhvhv_tripdata_2021-01.csv')
    .repartition(24)
)

# Persist as parquet. The writer's default mode raises an error when the
# target directory already exists, which breaks re-running this cell (or a
# Restart & Run All); 'overwrite' makes the cell idempotent.
df.write.parquet('data/pq/fhvhv/2021/01/', mode='overwrite')

                                                                                

In [17]:
# List the files Spark wrote: a _SUCCESS marker plus the snappy-compressed part files.
!ls -lh ./data/pq/fhvhv/2021/01

total 215M
-rw-r--r-- 1 carlos carlos    0 Nov 24 15:33 _SUCCESS
-rw-r--r-- 1 carlos carlos 9.0M Nov 24 15:33 part-00000-c9835e52-b040-4806-b173-25fda8a4a1f3-c000.snappy.parquet
-rw-r--r-- 1 carlos carlos 9.0M Nov 24 15:33 part-00001-c9835e52-b040-4806-b173-25fda8a4a1f3-c000.snappy.parquet
-rw-r--r-- 1 carlos carlos 9.0M Nov 24 15:33 part-00002-c9835e52-b040-4806-b173-25fda8a4a1f3-c000.snappy.parquet
-rw-r--r-- 1 carlos carlos 9.0M Nov 24 15:33 part-00003-c9835e52-b040-4806-b173-25fda8a4a1f3-c000.snappy.parquet
-rw-r--r-- 1 carlos carlos 9.0M Nov 24 15:33 part-00004-c9835e52-b040-4806-b173-25fda8a4a1f3-c000.snappy.parquet
-rw-r--r-- 1 carlos carlos 9.0M Nov 24 15:33 part-00005-c9835e52-b040-4806-b173-25fda8a4a1f3-c000.snappy.parquet
-rw-r--r-- 1 carlos carlos 9.0M Nov 24 15:33 part-00006-c9835e52-b040-4806-b173-25fda8a4a1f3-c000.snappy.parquet
-rw-r--r-- 1 carlos carlos 9.0M Nov 24 15:33 part-00007-c9835e52-b040-4806-b173-25fda8a4a1f3-c000.snappy.parquet
-rw-r--r-- 1 carlos c

In [18]:
# Rebind `df` to the parquet copy so the queries below read from it rather than from the CSV.
df = spark.read.parquet('data/pq/fhvhv/2021/01/')

**Q3**: How many taxi trips were there on January 15?

In [19]:
from pyspark.sql import functions as F

In [23]:
# Q3 (DataFrame API): derive the calendar date of each pickup, keep only
# trips picked up on 2021-01-15, and count them.
(
    df
    .withColumn('pickup_date', F.to_date(df['pickup_datetime']))
    .filter("pickup_date = '2021-01-15'")
    .count()
)

                                                                                

443059

In [24]:
# Expose the trips DataFrame to Spark SQL under the name `fhvhv_2021_01`.
# createOrReplaceTempView is the supported replacement for registerTempTable
# (deprecated since Spark 2.0) and simply replaces the view on re-run.
df.createOrReplaceTempView('fhvhv_2021_01')

In [25]:
# Q3 (Spark SQL): the same count as the DataFrame version, run against the
# `fhvhv_2021_01` temp view — rows whose pickup date is 2021-01-15.
spark.sql("""
SELECT
    COUNT(1)
FROM 
    fhvhv_2021_01
WHERE
    to_date(pickup_datetime) = '2021-01-15';
""").show()


[Stage 13:>                                                         (0 + 4) / 4]

+--------+
|count(1)|
+--------+
|  443059|
+--------+





                                                                                

**Q4**: Longest trip for each day

In [26]:
# List the trip columns available for the duration calculation.
df.columns

['hvfhs_license_num',
 'dispatching_base_num',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'SR_Flag']

In [27]:
# Q4 (DataFrame API): longest trip per pickup date, five largest days.
# Casting a timestamp to long yields epoch seconds, so `duration` (and the
# resulting `max(duration)` column) is expressed in seconds.
(
    df
    .withColumn(
        'duration',
        df['dropoff_datetime'].cast('long') - df['pickup_datetime'].cast('long'),
    )
    .withColumn('pickup_date', F.to_date(df['pickup_datetime']))
    .groupBy('pickup_date')
    .max('duration')
    .orderBy('max(duration)', ascending=False)
    .limit(5)
    .show()
)



+-----------+-------------+
|pickup_date|max(duration)|
+-----------+-------------+
| 2021-01-27|        59143|
| 2021-01-05|        45012|
| 2021-01-30|        41193|
| 2021-01-04|        39967|
| 2021-01-06|        38417|
+-----------+-------------+




                                                                                

In [30]:
# Q4 (Spark SQL): longest trip per pickup date, ten largest days.
# The difference of the two timestamps cast to LONG is in seconds; dividing
# by 60 reports the duration in minutes.
# GROUP BY 1 / ORDER BY 2 refer to the SELECT columns by position.
spark.sql("""
SELECT
    to_date(pickup_datetime) AS pickup_date,
    MAX((CAST(dropoff_datetime AS LONG) - CAST(pickup_datetime AS LONG)) / 60) AS duration
FROM 
    fhvhv_2021_01
GROUP BY
    1
ORDER BY
    2 DESC
LIMIT 10;
""").show()


[Stage 22:>                                                         (0 + 4) / 4]

+-----------+------------------+
|pickup_date|          duration|
+-----------+------------------+
| 2021-01-27| 985.7166666666667|
| 2021-01-05|             750.2|
| 2021-01-30|            686.55|
| 2021-01-04| 666.1166666666667|
| 2021-01-06| 640.2833333333333|
| 2021-01-17| 583.1166666666667|
| 2021-01-07| 577.1166666666667|
| 2021-01-29|508.21666666666664|
| 2021-01-18|497.18333333333334|
| 2021-01-03|486.51666666666665|
+-----------+------------------+




                                                                                

**Q5**: Most frequent `dispatching_base_num`

How many stages does this Spark job have?



In [33]:
# Q5 (Spark SQL): number of trips per dispatching_base_num, keeping the five
# bases with the most trips. GROUP BY 1 / ORDER BY 2 are positional.
spark.sql("""
SELECT
    dispatching_base_num,
    COUNT(1)
FROM 
    fhvhv_2021_01
GROUP BY
    1
ORDER BY
    2 DESC
LIMIT 5;
""").show()


[Stage 28:>                                                         (0 + 4) / 4]


+--------------------+--------+
|dispatching_base_num|count(1)|
+--------------------+--------+
|              B02510| 3091000|
|              B02764| 1009388|
|              B02872|  924960|
|              B02875|  735450|
|              B02765|  591242|
+--------------------+--------+




                                                                                

In [34]:
# Q5 (DataFrame API): count trips per dispatching base and show the five
# bases with the highest counts.
(
    df
    .groupBy('dispatching_base_num')
    .count()
    .orderBy('count', ascending=False)
    .limit(5)
    .show()
)


[Stage 31:>                                                         (0 + 4) / 4]

+--------------------+-------+
|dispatching_base_num|  count|
+--------------------+-------+
|              B02510|3091000|
|              B02764|1009388|
|              B02872| 924960|
|              B02875| 735450|
|              B02765| 591242|
+--------------------+-------+




                                                                                

**Q6**: Most common location pair

In [35]:
# Load the zone lookup table from the local `zones` parquet directory.
df_zones = spark.read.parquet('zones')

In [36]:
# Inspect the lookup columns to find the key to join on.
df_zones.columns

['LocationID', 'Borough', 'Zone', 'service_zone']

In [37]:
# Re-list the trip columns; presumably to pick the pickup/drop-off ID columns for the zone join.
df.columns

['hvfhs_license_num',
 'dispatching_base_num',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'SR_Flag']

In [38]:
# Expose the zone lookup to Spark SQL under the name `zones`.
# createOrReplaceTempView is the supported replacement for registerTempTable
# (deprecated since Spark 2.0) and simply replaces the view on re-run.
df_zones.createOrReplaceTempView('zones')

In [40]:
# Q6 (Spark SQL): most common pickup / drop-off zone pairs.
# The trips view is joined to `zones` twice — once on PULocationID and once
# on DOLocationID — and grouped on the concatenated zone names.
# LEFT JOIN keeps trips whose location ID has no match in `zones`.
# The query is already limited to 5 rows; .take(5) returns them as a list of Row objects.
spark.sql("""
SELECT
    CONCAT(pul.Zone, ' / ', dol.Zone) AS pu_do_pair,
    COUNT(1)
FROM 
    fhvhv_2021_01 fhv LEFT JOIN zones pul ON fhv.PULocationID = pul.LocationID
                      LEFT JOIN zones dol ON fhv.DOLocationID = dol.LocationID
GROUP BY 
    1
ORDER BY
    2 DESC
LIMIT 5;
""").take(5)

                                                                                

[Row(pu_do_pair='East New York / East New York', count(1)=47637),
 Row(pu_do_pair='Borough Park / Borough Park', count(1)=30920),
 Row(pu_do_pair='Canarsie / Canarsie', count(1)=29897),
 Row(pu_do_pair='Crown Heights North / Crown Heights North', count(1)=28851),
 Row(pu_do_pair='Central Harlem North / Central Harlem North', count(1)=17379)]