In [72]:
 pip install shapely

Note: you may need to restart the kernel to use updated packages.


In [73]:

import json

import pyspark
from pyspark.sql.window import Window
from pyspark.sql.functions import lead
from pyspark.sql.functions import lag
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import when
from pyspark.sql.functions import *
from delta import *
from shapely import *
from shapely.geometry import shape

# Explicit imports for names the notebook uses, so they do not depend on what
# the wildcard imports above happen to re-export. Kept last so they take
# precedence over any same-named wildcard binding.
from pyspark.sql.functions import avg, udf
from pyspark.sql.types import StringType
from shapely.geometry import Point
from shapely.ops import unary_union


# Session builder for the "DF2_Practice" app with the Delta SQL extension and
# the Delta catalog registered as `spark_catalog`.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("DF2_Practice")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip() decorates the builder before the session is
# created (or an existing one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [74]:
# Load the GeoJSON file. GeoJSON is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform default.
with open('nyc-boroughs.geojson', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Collect every feature geometry under its borough name. A borough may be
# described by several features, hence a list per borough.
geoms_by_borough = {}
for feature in data['features']:
    borough = feature['properties']['borough']
    geoms_by_borough.setdefault(borough, []).append(shape(feature['geometry']))

# Final lookup table: borough name -> {'geom': union of all its geometries}.
# It is built in one step from `geoms_by_borough`, so `boroughs` never holds a
# half-converted mix of 'geoms' and 'geom' entries and `data` keeps pointing at
# the parsed GeoJSON document.
boroughs = {
    name: {'geom': unary_union(geoms)}
    for name, geoms in geoms_by_borough.items()
}


In [75]:

# Read sample.csv, treating the first line as the header and letting Spark
# infer column types.
csv_reader = spark.read.options(header="true", inferSchema="true")
iso_df = csv_reader.csv("sample.csv")


In [76]:
# One partition per medallion; rows inside a partition are ordered by drop-off time.
window = Window.partitionBy("medallion").orderBy("dropoff_datetime")

In [77]:
# Rows whose delay is at least this many seconds are flagged session_ind = 'N'
# (4 hours = 14400 s).
MAX_SESSION_GAP_SECONDS = 4 * 60 * 60

# NOTE: despite its name, "prev_pickup" holds the *drop-off* time of the row
# that precedes the current one in `window`. The column name is kept because it
# is selected by name elsewhere in the notebook.
iso_df = iso_df.withColumn("prev_pickup", lag("dropoff_datetime").over(window))

# Seconds between that previous drop-off and this row's pickup. It is null when
# there is no preceding row in the window partition.
iso_df = iso_df.withColumn("delay", unix_timestamp("pickup_datetime") - unix_timestamp("prev_pickup"))

# A null delay does not satisfy the >= test, so those rows fall through to 'Y'.
iso_df = iso_df.withColumn("session_ind", when(iso_df.delay >= MAX_SESSION_GAP_SECONDS, 'N').otherwise('Y'))

# Mean delay per medallion over the rows flagged 'Y' (avg ignores null delays).
mean_delay_df = iso_df.filter(iso_df.session_ind == 'Y').groupBy("medallion").agg(avg("delay").alias("driver_delay"))

# Drop any driver_delay left over from a previous run of this cell before
# joining: otherwise a re-run would attach a second column with the same name
# and every later reference to "driver_delay" would be ambiguous. drop() is a
# no-op when the column does not exist yet.
iso_df = iso_df.drop("driver_delay").join(mean_delay_df, on="medallion", how="left")

In [78]:
def lkp_dict(long, lat):
    """Return the name of the borough whose geometry contains the given point.

    Args:
        long: x coordinate, passed as the first argument to ``Point``.
        lat: y coordinate, passed as the second argument to ``Point``.

    Returns:
        The key of the first entry in the module-level ``boroughs`` dict whose
        ``'geom'`` contains the point, or ``None`` when no entry contains it or
        when either coordinate is missing.
    """
    # SQL NULLs reach a Python UDF as None, and Point() cannot be built from
    # None. Report "no borough" instead of failing the whole Spark task.
    if long is None or lat is None:
        return None

    point = Point(long, lat)
    for borough, data in boroughs.items():
        if data['geom'].contains(point):
            return borough
    return None

# Wrap the borough lookup as a Spark UDF that returns a string column.
lkp_udf = udf(lkp_dict, StringType())

# Add the pickup and drop-off borough of every row as two new columns.
df_iso_with_dict_values = (
    iso_df
    .withColumn("pickup_bur", lkp_udf(iso_df["pickup_longitude"], iso_df["pickup_latitude"]))
    .withColumn("dropoff_bur", lkp_udf(iso_df["dropoff_longitude"], iso_df["dropoff_latitude"]))
)


In [79]:
# Mean delay per drop-off borough, computed over the rows flagged session_ind == 'Y'.
bur_delay_df = (
    df_iso_with_dict_values
    .where(df_iso_with_dict_values["session_ind"] == 'Y')
    .groupBy("dropoff_bur")
    .agg(avg("delay").alias("borough_delay"))
)

# Attach the borough-level mean to every row; rows without a matching
# drop-off borough keep a null borough_delay (left join).
bur_iso = df_iso_with_dict_values.join(bur_delay_df, how="left", on="dropoff_bur")

In [99]:
# Row-level projection of the derived columns, for ad-hoc inspection.
sel = df_iso_with_dict_values.select("medallion", "pickup_datetime", "dropoff_datetime", "prev_pickup", "delay","session_ind","driver_delay","pickup_bur","dropoff_bur")

# Query 1 input: mean delay per medallion.
qry1 = df_iso_with_dict_values.select("medallion",  "driver_delay")

# Query 2 input: mean delay per drop-off borough.
qry2 = bur_iso.select( "dropoff_bur","borough_delay")

# Query 3 / Query 4 input: pickup and drop-off borough of every row.
qry3 = df_iso_with_dict_values.select("pickup_bur",  "dropoff_bur")

## Query1

In [100]:
# One row per distinct (medallion, driver_delay) pair.
qry1.dropDuplicates().show()

+--------------------+------------------+
|           medallion|      driver_delay|
+--------------------+------------------+
|5C9E006BB8CCE89AC...|             480.0|
|380CC0362AA13BE55...| 849.2307692307693|
|8107B2DEB1021A2F8...|             590.0|
|B73668FE07B7EB48C...|368.57142857142856|
|D1F80808FCF8729A0...|1797.2727272727273|
|ABD4E623646A8BC94...|            1642.5|
|F6BD3BC7A18920DB6...| 971.5384615384615|
|1066F50D87BD0F9D8...|2146.6666666666665|
|FCB1BF2054823AB4F...|            1252.0|
|185D92FA910D5DD1E...| 859.0909090909091|
|B6B2422ACEE398BEA...|1211.4285714285713|
|54FB9ED06D1E0F0AF...|            4620.0|
|6B6565187F637156C...| 729.4736842105264|
|A47A97A6E57264A1C...|            1102.5|
|98671307F40E392C4...|            5370.0|
|7376BAC10BB8455E4...| 783.3333333333334|
|6667FF102AFCA0CA9...|            1640.0|
|7F32B2AA49B5E0F95...|            4440.0|
|40DC4146B748648E9...|             650.0|
|7DEB25123AE57111F...| 967.8260869565217|
+--------------------+------------------+

## Query2

In [90]:
# One row per distinct (dropoff_bur, borough_delay) pair, leaving out rows
# whose drop-off borough is null.
qry2.dropDuplicates().where(qry2["dropoff_bur"].isNotNull()).show()

+-------------+------------------+
|  dropoff_bur|     borough_delay|
+-------------+------------------+
|Staten Island|             780.0|
|    Manhattan| 1111.947757613042|
|        Bronx| 2192.818791946309|
|       Queens| 2021.051451187335|
|     Brooklyn|1904.5495761150019|
+-------------+------------------+



## Query3

In [101]:
# Count rows whose pickup and drop-off borough are equal. Rows where either
# borough is null do not satisfy the comparison and are not counted.
same_borough_trips = qry3.where(qry3["dropoff_bur"] == qry3["pickup_bur"]).count()
print(f"Amount of trips that start and end in the same borough: {same_borough_trips}")


Amount of trips that start and end in the same borough: 86074


## Query4

In [102]:
# Count rows whose pickup and drop-off borough differ. Rows where either
# borough is null do not satisfy the comparison either, so they are excluded
# from this count as well as from the "same borough" count.
cross_borough_trips = qry3.where(qry3["dropoff_bur"] != qry3["pickup_bur"]).count()
print(f"Amount of trips that do not start and end in the same borough: {cross_borough_trips}")


Amount of trips that do not start and end in the same borough: 11433
