# Analysing New York City Taxi Data with Spark

In [3]:
 pip install shapely

Note: you may need to restart the kernel to use updated packages.


In [67]:
import json
from collections import defaultdict

import pyspark
from delta import configure_spark_with_delta_pip
# NOTE: `sum` here is the Spark aggregate and shadows the Python builtin.
from pyspark.sql.functions import (
    avg, col, lag, lead, sum, udf, unix_timestamp, when
)
from pyspark.sql.types import StringType
from pyspark.sql.window import Window
from shapely import *
# Explicit imports so Point / shape / unary_union do not depend on the star import.
from shapely.geometry import Point, shape
from shapely.ops import unary_union


# Session builder for the "DF2_Practice" app, configured with the Delta Lake
# SQL extension and the Delta catalog as the default Spark catalog.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("DF2_Practice")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Pass the builder through the Delta helper, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

## Loading data

In [68]:
# Load the borough boundaries. GeoJSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open('nyc-boroughs.geojson', 'r', encoding='utf-8') as f:
    boroughs_geojson = json.load(f)

# Several features may carry the same borough name; collect their geometries
# per borough before merging.
geoms_by_borough = defaultdict(list)
for feature in boroughs_geojson['features']:
    name = feature['properties']['borough']
    geoms_by_borough[name].append(shape(feature['geometry']))

# Merge each borough's geometries into a single geometry. The result is a
# plain dict (borough name -> merged geometry), so `boroughs` only ever holds
# one kind of value and a lookup of an unknown name cannot silently add a key.
boroughs = {
    name: unary_union(geoms)
    for name, geoms in geoms_by_borough.items()
}

In [74]:
# Read the trip sample: the first row is the header and column types are
# inferred from the data.
iso_df = spark.read.csv("sample.csv", header=True, inferSchema=True)

## Preprocessing

In [75]:
# Gap, in seconds, at or above which a trip is flagged 'N' in session_ind.
SESSION_GAP_SECONDS = 14400  # 4 hours

# One window per medallion, rows ordered by pickup time.
window = Window.partitionBy("medallion").orderBy("pickup_datetime")

# Despite its name, "prev_pickup" holds the *drop-off* time of the previous
# row in the window; "delay" is the number of seconds between that drop-off
# and the current pickup.
previous_dropoff = lag("dropoff_datetime").over(window)
seconds_since_previous = (
    unix_timestamp("pickup_datetime") - unix_timestamp("prev_pickup")
)
# Rows with no previous trip have a null delay and fall into the 'Y' branch.
session_flag = when(col("delay") >= SESSION_GAP_SECONDS, 'N').otherwise('Y')

iso_df = (
    iso_df
    .withColumn("prev_pickup", previous_dropoff)
    .withColumn("delay", seconds_since_previous)
    .withColumn("session_ind", session_flag)
)

In [76]:
def lkp_dict(long, lat):
    """Return the name of the borough that contains a coordinate.

    Parameters
    ----------
    long, lat : float or None
        Longitude and latitude of the location, handed to ``Point`` in
        (x, y) order.

    Returns
    -------
    str or None
        The key of the first entry in the module-level ``boroughs`` mapping
        whose geometry contains the point. ``None`` when no geometry
        contains it, or when either coordinate is missing.
    """
    # SQL NULLs reach a Python UDF as None; a Point cannot be built from a
    # missing coordinate, so answer "no borough" instead of failing the job.
    if long is None or lat is None:
        return None

    point = Point(long, lat)
    return next(
        (name for name, geom in boroughs.items() if geom.contains(point)),
        None,
    )

# Wrap the borough lookup as a Spark UDF returning a string column.
lkp_udf = udf(lkp_dict, StringType())

# Borough for each end of the trip, looked up from (longitude, latitude).
pickup_borough = lkp_udf(col("pickup_longitude"), col("pickup_latitude"))
dropoff_borough = lkp_udf(col("dropoff_longitude"), col("dropoff_latitude"))

iso_df = (
    iso_df
    .withColumn("pickup_bur", pickup_borough)
    .withColumn("dropoff_bur", dropoff_borough)
)

## Query 1

In [77]:
# Total delay per medallion over rows flagged 'Y'. Medallions whose sum is
# null (no non-null delay among their rows) are dropped before display.
(
    iso_df
    .where(col("session_ind") == 'Y')
    .groupBy("medallion")
    .agg(sum("delay").alias("driver_delay"))
    .where(col("driver_delay").isNotNull())
    .show()
)

+--------------------+------------+
|           medallion|driver_delay|
+--------------------+------------+
|000318C2E3E638158...|       17400|
|002E3B405B6ABEA23...|       16140|
|0030AD2648D81EE87...|         720|
|0036961468659D0BF...|       19740|
|0038EF45118925A51...|       15120|
|0053334C798EC6C8E...|       22440|
|005DED7D6E6C45441...|       11760|
|005F00B38F46E2100...|       42180|
|00790C7BAD30B7A9E...|       25320|
|0094A03FFE6BAFBE0...|        5400|
|009D3CCA83486B03F...|       40920|
|00BD5D1AD3A96C997...|       12540|
|00FB3D49C3DE5E002...|       14580|
|012B65864B3BE97D6...|        6360|
|012F172C0351A4767...|       40680|
|01389E9CF7758ECAC...|        5460|
|019AFB33C3153481B...|        9840|
|01BD10395EF30144C...|        5940|
|01C905F5CF4CD4D36...|        8340|
|01D13A056D9A26F84...|       15480|
+--------------------+------------+
only showing top 20 rows



## Query 2

In [78]:
# Average delay grouped by drop-off borough, over rows flagged 'Y' that have
# a known drop-off borough. Boroughs with a null average are dropped.
in_session = col("session_ind") == 'Y'
has_dropoff_borough = col("dropoff_bur").isNotNull()

(
    iso_df
    .where(in_session & has_dropoff_borough)
    .groupBy("dropoff_bur")
    .agg(avg("delay").alias("borough_delay"))
    .where(col("borough_delay").isNotNull())
    .show()
)

+-------------+------------------+
|  dropoff_bur|     borough_delay|
+-------------+------------------+
|       Queens| 2021.051451187335|
|     Brooklyn|1904.5495761150019|
|Staten Island|             780.0|
|    Manhattan| 1111.947757613042|
|        Bronx| 2192.818791946309|
+-------------+------------------+



## Query 3

In [79]:
# Count trips whose pickup and drop-off boroughs are equal. Rows where either
# borough is null do not satisfy the comparison and are not counted.
same_borough = col("dropoff_bur") == col("pickup_bur")
qry3 = iso_df.where(same_borough).count()

print(f"Amount of trips that start and end in the same borough: {qry3}")

Amount of trips that start and end in the same borough: 86074


## Query 4

In [80]:
# Count trips whose pickup and drop-off boroughs differ. Rows where either
# borough is null do not satisfy the comparison, so they are excluded here
# as well as from the same-borough count.
different_borough = col("dropoff_bur") != col("pickup_bur")
qry4 = iso_df.where(different_borough).count()

print(f"Amount of trips that do not start and end in the same borough: {qry4}")

Amount of trips that do not start and end in the same borough: 11433
