# Pre-processing_4
This notebook merges 3 preprocessed datasets and prepares for EDA_hourly and model analysis

In [2]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium 

In [3]:
from pyspark.sql import SparkSession

# Create a spark session with increased memory allocation
spark = (
    SparkSession.builder.appName("ADS Project1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "8g")  # Set the driver memory to 8GB
    .config("spark.executor.memory", "8g")  # Set the executor memory to 8GB
    .getOrCreate()
)

24/08/31 22:47:50 WARN Utils: Your hostname, Hanshis-Laptop.local resolves to a loopback address: 127.0.0.1; using 10.12.218.66 instead (on interface en0)
24/08/31 22:47:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/31 22:47:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
spark.conf.set("spark.sql.parquet.compression.codec","gzip")

## Datasets 

In [5]:
# read the data from preprocessed tlc data
tdf = spark.read.parquet("../data/raw/tlc_df.parquet") 

                                                                                

In [6]:
# read the preprocessed weather data
wdf = spark.read.csv("../data/raw/NYC_weather_raw.csv", header=True, inferSchema=True)


In [7]:
# read the preprocessed event data
edf = spark.read.parquet("../data/raw/NYC_Permitted_Event_Information_Historical.parquet") 

In [8]:
# show 5 rows of the tlc data
tdf.show(5)

+------------+------------+-------------+-----------+------------+-----------+------------+-------------+
|PULocationID|DOLocationID|trip_distance|pickup_hour|dropoff_hour|pickup_date|dropoff_date|trip_duration|
+------------+------------+-------------+-----------+------------+-----------+------------+-------------+
|         249|         179|          7.1|          0|           1| 2023-07-01|  2023-07-01|         32.0|
|         132|         230|         18.9|          0|           1| 2023-07-01|  2023-07-01|         48.0|
|         164|         230|         1.24|          0|           0| 2023-07-01|  2023-07-01|          8.0|
|         132|         131|         14.8|          0|           0| 2023-07-01|  2023-07-01|         21.0|
|         144|         198|          9.6|          0|           0| 2023-07-01|  2023-07-01|         28.0|
+------------+------------+-------------+-----------+------------+-----------+------------+-------------+
only showing top 5 rows



In [9]:
# use 4 dcimal places for the float values in weather data
wdf = wdf.withColumn("CIG", round(col("CIG"), 4))
wdf = wdf.withColumn("WND", round(col("WND"), 4))
wdf = wdf.withColumn("TMP", round(col("TMP"), 4))
wdf = wdf.withColumn("DEW", round(col("DEW"), 4))
wdf = wdf.withColumn("SLP", round(col("SLP"), 4))
wdf = wdf.withColumn("VIS", round(col("VIS"), 4))   


In [10]:
# show 5 rows of the weather data
wdf.show(5)

+----------+----+-------+-----+-----+----+----+------+
|      DATE|HOUR|    CIG|  WND|  VIS| TMP| DEW|   SLP|
+----------+----+-------+-----+-----+----+----+------+
|2023-07-01|   0|22000.0|2.632|965.6|23.9|13.3|1017.1|
|2023-07-01|   1|22000.0|2.632|965.6|23.3|13.3|1017.6|
|2023-07-01|   2|22000.0|2.632|965.6|23.3|12.8|1017.8|
|2023-07-01|   3|22000.0|  3.1|965.6|22.8|12.8|1017.7|
|2023-07-01|   4|22000.0|  1.5|965.6|22.8|11.7|1017.4|
+----------+----+-------+-----+-----+----+----+------+
only showing top 5 rows



In [11]:
# show 5 rows of the event data
edf.show(5)

+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|Event ID|Start Date|Start Hour|  End Date|End Hour|   Event Type|Event Borough|    Location Details|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|  725004|2023-09-05|         9|2023-09-05|      20|Sport - Youth|     Brooklyn|Commodore Barry Park|
|  720627|2023-09-25|        16|2023-09-25|      20|Sport - Adult|     Brooklyn|Red Hook Recreati...|
|  723466|2023-09-26|         9|2023-09-26|      15|Sport - Youth|        Bronx|       Colgate Close|
|  732945|2023-09-19|        15|2023-09-19|      18|Sport - Adult|     Brooklyn|   Calvert Vaux Park|
|  715391|2023-09-25|         8|2023-09-25|      19|Sport - Youth|     Brooklyn|Red Hook Recreati...|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
only showing top 5 rows



## Aggregation for hourly

### Map taxi zones to boroughs 

In [12]:
# Load the zones DataFrame
zones = spark.read.csv("../data/landing/external/taxi_zones.csv", header=True, inferSchema=True)

In [13]:
# Join the tdf DataFrame with zones to get the borough for PULocationID
tdf = tdf.join(zones.select('LocationID', 'borough').withColumnRenamed('borough', 'PUBorough'),
               tdf['PULocationID'] == zones['LocationID'], 'left').drop('LocationID')

# Join the tdf DataFrame with zones to get the borough for DOLocationID
tdf = tdf.join(zones.select('LocationID', 'borough').withColumnRenamed('borough', 'DOBorough'),
               tdf['DOLocationID'] == zones['LocationID'], 'left').drop('LocationID')


In [14]:
# show 15 rows of the tlc data
tdf.show(15)

+------------+------------+-------------+-----------+------------+-----------+------------+-------------+---------+---------+
|PULocationID|DOLocationID|trip_distance|pickup_hour|dropoff_hour|pickup_date|dropoff_date|trip_duration|PUBorough|DOBorough|
+------------+------------+-------------+-----------+------------+-----------+------------+-------------+---------+---------+
|         249|         179|          7.1|          0|           1| 2023-07-01|  2023-07-01|         32.0|Manhattan|   Queens|
|         132|         230|         18.9|          0|           1| 2023-07-01|  2023-07-01|         48.0|   Queens|Manhattan|
|         164|         230|         1.24|          0|           0| 2023-07-01|  2023-07-01|          8.0|Manhattan|Manhattan|
|         132|         131|         14.8|          0|           0| 2023-07-01|  2023-07-01|         21.0|   Queens|   Queens|
|         144|         198|          9.6|          0|           0| 2023-07-01|  2023-07-01|         28.0|Manhattan|   

In [15]:
# show percentage of missing values in PUBorough column of the tlc data
tdf.select((F.sum(F.col("PUBorough").isNull().cast("int"))/F.count(F.col("PUBorough"))).alias("PUBorough")).show()

+-------------------+
|          PUBorough|
+-------------------+
|0.00734753204628976|
+-------------------+



In [16]:
# show percentage of missing values in all columns of the tlc data
tdf.select([(F.sum(F.col(c).isNull().cast("int"))/F.count(F.col(c))).alias(c) for c in tdf.columns]).show() 



+------------+------------+-------------+-----------+------------+-----------+------------+-------------+-------------------+-------------------+
|PULocationID|DOLocationID|trip_distance|pickup_hour|dropoff_hour|pickup_date|dropoff_date|trip_duration|          PUBorough|          DOBorough|
+------------+------------+-------------+-----------+------------+-----------+------------+-------------+-------------------+-------------------+
|         0.0|         0.0|          0.0|        0.0|         0.0|        0.0|         0.0|          0.0|0.00734753204628976|0.01304270366266107|
+------------+------------+-------------+-----------+------------+-----------+------------+-------------+-------------------+-------------------+



                                                                                

In [17]:
# drop rows with missing values in PUBorough and DOBorough column
tdf = tdf.filter(F.col("PUBorough").isNotNull())
tdf = tdf.filter(F.col("DOBorough").isNotNull())

In [18]:
# Convert pickup_date and pickup_hour to a timestamp and create Time column
tdf = tdf.withColumn(
    'Time', 
    to_timestamp(concat_ws(' ', col('pickup_date'), col('pickup_hour')))
) 

In [19]:
df_hourly_agg = tdf.groupBy(['pickup_date', 'pickup_hour', "PUBorough"]).agg({
    '*': 'count', 
}).withColumnRenamed('count(1)', 'hourly_trip_count') 

In [21]:
# tdf drop the Time column 
tdf = tdf.drop('Time')

In [22]:
# sort the hourly aggregated data by date pickup_date and pickup_hour
df_hourly_agg = df_hourly_agg.sort('pickup_date', 'pickup_hour')

In [23]:
 # drop pickup_date if it's outside of the pickup_date range of 2023-07-01 to 2023-12-31
df_hourly_agg = df_hourly_agg.filter((col('pickup_date') >= '2023-07-01') & (col('pickup_date') <= '2023-12-31'))

In [24]:
# check percentage of missing values in each column for df_hourly_agg
df_hourly_agg.agg(*[
    (1 - (F.count(c) / F.count('*'))).alias(c + '_missing')
    for c in df_hourly_agg.columns
]).show()


[Stage 30:>                                                         (0 + 8) / 8]

+-------------------+-------------------+-----------------+-------------------------+
|pickup_date_missing|pickup_hour_missing|PUBorough_missing|hourly_trip_count_missing|
+-------------------+-------------------+-----------------+-------------------------+
|                0.0|                0.0|              0.0|                      0.0|
+-------------------+-------------------+-----------------+-------------------------+



                                                                                

### Aggregation of number of events

In [25]:
# Aggregate the number of events per date, hour, and borough
edf_hourly = edf.groupBy('Start Date', 'Start Hour', 'Event Borough').agg(count('Event ID').alias('Number of Events'))

In [26]:
# show edf
edf.show(5)

+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|Event ID|Start Date|Start Hour|  End Date|End Hour|   Event Type|Event Borough|    Location Details|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
|  725004|2023-09-05|         9|2023-09-05|      20|Sport - Youth|     Brooklyn|Commodore Barry Park|
|  720627|2023-09-25|        16|2023-09-25|      20|Sport - Adult|     Brooklyn|Red Hook Recreati...|
|  723466|2023-09-26|         9|2023-09-26|      15|Sport - Youth|        Bronx|       Colgate Close|
|  732945|2023-09-19|        15|2023-09-19|      18|Sport - Adult|     Brooklyn|   Calvert Vaux Park|
|  715391|2023-09-25|         8|2023-09-25|      19|Sport - Youth|     Brooklyn|Red Hook Recreati...|
+--------+----------+----------+----------+--------+-------------+-------------+--------------------+
only showing top 5 rows



In [27]:
edf_hourly.show(5)

+----------+----------+-------------+----------------+
|Start Date|Start Hour|Event Borough|Number of Events|
+----------+----------+-------------+----------------+
|2023-09-28|         9|     Brooklyn|             292|
|2023-09-16|        13|        Bronx|             122|
|2023-09-17|        12|        Bronx|              96|
|2023-11-04|         8|       Queens|            2430|
|2023-11-23|         8|       Queens|             947|
+----------+----------+-------------+----------------+
only showing top 5 rows



## Join hourly datasets

In [28]:
# show 5 rows for the weather data
wdf.show(5)

+----------+----+-------+-----+-----+----+----+------+
|      DATE|HOUR|    CIG|  WND|  VIS| TMP| DEW|   SLP|
+----------+----+-------+-----+-----+----+----+------+
|2023-07-01|   0|22000.0|2.632|965.6|23.9|13.3|1017.1|
|2023-07-01|   1|22000.0|2.632|965.6|23.3|13.3|1017.6|
|2023-07-01|   2|22000.0|2.632|965.6|23.3|12.8|1017.8|
|2023-07-01|   3|22000.0|  3.1|965.6|22.8|12.8|1017.7|
|2023-07-01|   4|22000.0|  1.5|965.6|22.8|11.7|1017.4|
+----------+----+-------+-----+-----+----+----+------+
only showing top 5 rows



In [29]:
# show 5 rows for the df_hourly_agg data
df_hourly_agg.show(5)

+-----------+-----------+---------+-----------------+
|pickup_date|pickup_hour|PUBorough|hourly_trip_count|
+-----------+-----------+---------+-----------------+
| 2023-07-01|          0|Manhattan|             3092|
| 2023-07-01|          0|   Queens|              361|
| 2023-07-01|          0| Brooklyn|               32|
| 2023-07-01|          1|   Queens|              163|
| 2023-07-01|          1| Brooklyn|               33|
+-----------+-----------+---------+-----------------+
only showing top 5 rows



                                                                                

In [30]:
# merge the weather data with the df_hourly_agg data by pickup_date and pickup_hour
merged1 = df_hourly_agg.join(wdf, (df_hourly_agg['pickup_date'] == wdf['DATE']) & (df_hourly_agg['pickup_hour'] == wdf['HOUR']), 'left')

In [31]:
# sort by pickup_date and pickup_hour
merged1 = merged1.sort('pickup_date', 'pickup_hour')

In [32]:
# drop DATE and HOUR columns
merged1 = merged1.drop('DATE', 'HOUR')

In [33]:
# show 5 rows of the merged data
merged1.show(15)



+-----------+-----------+---------+-----------------+-------+-----+-----+----+----+------+
|pickup_date|pickup_hour|PUBorough|hourly_trip_count|    CIG|  WND|  VIS| TMP| DEW|   SLP|
+-----------+-----------+---------+-----------------+-------+-----+-----+----+----+------+
| 2023-07-01|          0|Manhattan|             3092|22000.0|2.632|965.6|23.9|13.3|1017.1|
| 2023-07-01|          0|   Queens|              361|22000.0|2.632|965.6|23.9|13.3|1017.1|
| 2023-07-01|          0| Brooklyn|               32|22000.0|2.632|965.6|23.9|13.3|1017.1|
| 2023-07-01|          1|   Queens|              163|22000.0|2.632|965.6|23.3|13.3|1017.6|
| 2023-07-01|          1| Brooklyn|               33|22000.0|2.632|965.6|23.3|13.3|1017.6|
| 2023-07-01|          1|    Bronx|                3|22000.0|2.632|965.6|23.3|13.3|1017.6|
| 2023-07-01|          1|Manhattan|             2685|22000.0|2.632|965.6|23.3|13.3|1017.6|
| 2023-07-01|          2|Manhattan|             1996|22000.0|2.632|965.6|23.3|12.8|1017.8|

                                                                                

In [34]:
# check missing values in the merged data
merged1.agg(*[
    (1 - (F.count(c) / F.count('*'))).alias(c + '_missing')
    for c in merged1.columns
]).show()



+-------------------+-------------------+-----------------+-------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|pickup_date_missing|pickup_hour_missing|PUBorough_missing|hourly_trip_count_missing|         CIG_missing|         WND_missing|         VIS_missing|         TMP_missing|         DEW_missing|         SLP_missing|
+-------------------+-------------------+-----------------+-------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                0.0|                0.0|              0.0|                      0.0|0.004823355193568912|0.004823355193568912|0.004823355193568912|0.004823355193568912|0.004823355193568912|0.004823355193568912|
+-------------------+-------------------+-----------------+-------------------------+--------------------+--------------------+--------------------+----

                                                                                

In [35]:
# Define a window specification with proper partitioning
window_spec = Window.partitionBy("pickup_date").orderBy("pickup_hour").rowsBetween(-sys.maxsize, 0)

# Apply forward fill to the missing weather columns
merged1 = merged1.withColumn("CIG", last(col("CIG"), ignorenulls=True).over(window_spec))
merged1 = merged1.withColumn("WND", last(col("WND"), ignorenulls=True).over(window_spec))
merged1 = merged1.withColumn("VIS", last(col("VIS"), ignorenulls=True).over(window_spec))
merged1 = merged1.withColumn("TMP", last(col("TMP"), ignorenulls=True).over(window_spec))
merged1 = merged1.withColumn("DEW", last(col("DEW"), ignorenulls=True).over(window_spec))
merged1 = merged1.withColumn("SLP", last(col("SLP"), ignorenulls=True).over(window_spec))

In [36]:
# check missing values in merged1
merged1.agg(*[
    (1 - (F.count(c) / F.count('*'))).alias(c + '_missing')
    for c in merged1.columns
]).show()

                                                                                

+-------------------+-------------------+-----------------+-------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|pickup_date_missing|pickup_hour_missing|PUBorough_missing|hourly_trip_count_missing|         CIG_missing|         WND_missing|         VIS_missing|         TMP_missing|         DEW_missing|         SLP_missing|
+-------------------+-------------------+-----------------+-------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                0.0|                0.0|              0.0|                      0.0|0.004188703194415111|0.004188703194415111|0.004188703194415111|0.004188703194415111|0.004188703194415111|0.004188703194415111|
+-------------------+-------------------+-----------------+-------------------------+--------------------+--------------------+--------------------+----

In [37]:
# impute missing values with the mean of the column including CIG, WND, VIS, TMP, DEW, and SLP
merged1 = merged1.fillna(merged1.agg(*[
    F.mean(c).alias(c)
    for c in ["CIG", "WND", "VIS", "TMP", "DEW", "SLP"]
]).first().asDict())

                                                                                

In [38]:
# check missing values in merged1
merged1.agg(*[
    (1 - (F.count(c) / F.count('*'))).alias(c + '_missing')
    for c in merged1.columns
]).show()

+-------------------+-------------------+-----------------+-------------------------+-----------+-----------+-----------+-----------+-----------+-----------+
|pickup_date_missing|pickup_hour_missing|PUBorough_missing|hourly_trip_count_missing|CIG_missing|WND_missing|VIS_missing|TMP_missing|DEW_missing|SLP_missing|
+-------------------+-------------------+-----------------+-------------------------+-----------+-----------+-----------+-----------+-----------+-----------+
|                0.0|                0.0|              0.0|                      0.0|        0.0|        0.0|        0.0|        0.0|        0.0|        0.0|
+-------------------+-------------------+-----------------+-------------------------+-----------+-----------+-----------+-----------+-----------+-----------+



In [39]:
# show 15 rows of the merged1 data
merged1.show(15)

                                                                                

+-----------+-----------+---------+-----------------+-------+-----+-----+----+----+------+
|pickup_date|pickup_hour|PUBorough|hourly_trip_count|    CIG|  WND|  VIS| TMP| DEW|   SLP|
+-----------+-----------+---------+-----------------+-------+-----+-----+----+----+------+
| 2023-07-01|          0|Manhattan|             3092|22000.0|2.632|965.6|23.9|13.3|1017.1|
| 2023-07-01|          0|   Queens|              361|22000.0|2.632|965.6|23.9|13.3|1017.1|
| 2023-07-01|          0| Brooklyn|               32|22000.0|2.632|965.6|23.9|13.3|1017.1|
| 2023-07-01|          1|   Queens|              163|22000.0|2.632|965.6|23.3|13.3|1017.6|
| 2023-07-01|          1|    Bronx|                3|22000.0|2.632|965.6|23.3|13.3|1017.6|
| 2023-07-01|          1| Brooklyn|               33|22000.0|2.632|965.6|23.3|13.3|1017.6|
| 2023-07-01|          1|Manhattan|             2685|22000.0|2.632|965.6|23.3|13.3|1017.6|
| 2023-07-01|          2|Manhattan|             1996|22000.0|2.632|965.6|23.3|12.8|1017.8|

In [40]:
# sort the edf_hourly data by Start Date and Start Hour
edf_hourly = edf_hourly.sort('Start Date', 'Start Hour')

In [41]:
# check missing values in edf_hourly
edf_hourly.agg(*[
    (1 - (F.count(c) / F.count('*'))).alias(c + '_missing')
    for c in edf_hourly.columns
]).show()

+------------------+------------------+---------------------+------------------------+
|Start Date_missing|Start Hour_missing|Event Borough_missing|Number of Events_missing|
+------------------+------------------+---------------------+------------------------+
|               0.0|               0.0|                  0.0|                     0.0|
+------------------+------------------+---------------------+------------------------+



In [42]:
# join the edf_hourly data with the merged1 data by pickup_date, and pickup_hour and PUBorough
merged2 = merged1.join(edf_hourly, (merged1['pickup_date'] == edf_hourly['Start Date']) 
                       & (merged1['pickup_hour'] == edf_hourly['Start Hour']) 
                       & (merged1['PUBorough'] == edf_hourly['Event Borough']), 'left')


In [43]:
# sort by pickup_date and pickup_hour
merged2 = merged2.sort('pickup_date', 'pickup_hour')

In [44]:
# check missing values in merged2
merged2.agg(*[
    (1 - (F.count(c) / F.count('*'))).alias(c + '_missing')
    for c in merged2.columns
]).show()

                                                                                

+-------------------+-------------------+-----------------+-------------------------+-----------+-----------+-----------+-----------+-----------+-----------+------------------+------------------+---------------------+------------------------+
|pickup_date_missing|pickup_hour_missing|PUBorough_missing|hourly_trip_count_missing|CIG_missing|WND_missing|VIS_missing|TMP_missing|DEW_missing|SLP_missing|Start Date_missing|Start Hour_missing|Event Borough_missing|Number of Events_missing|
+-------------------+-------------------+-----------------+-------------------------+-----------+-----------+-----------+-----------+-----------+-----------+------------------+------------------+---------------------+------------------------+
|                0.0|                0.0|              0.0|                      0.0|        0.0|        0.0|        0.0|        0.0|        0.0|        0.0|0.3958112968055849|0.3958112968055849|   0.3958112968055849|      0.3958112968055849|
+-------------------+-------

In [45]:
# chcck when are the missing values in the merged2 data accoringd to borough
merged2.filter(col('Number of Events').isNull()).groupBy('PUBorough').count().show()

                                                                                

+-------------+-----+
|    PUBorough|count|
+-------------+-----+
|       Queens| 2504|
|          EWR|  196|
|     Brooklyn| 2162|
|Staten Island|  208|
|    Manhattan| 1953|
|        Bronx| 2332|
+-------------+-----+



In [46]:
# missing values in the Number of Events column by hour and sorted by hour
merged2.filter(col('Number of Events').isNull()).groupBy('pickup_hour').count().sort('pickup_hour').show()


+-----------+-----+
|pickup_hour|count|
+-----------+-----+
|          0|  199|
|          1|  596|
|          2|  745|
|          3|  822|
|          4| 1547|
|          5|  976|
|          6|  618|
|          7|  278|
|          8|    8|
|          9|   12|
|         10|   15|
|         11|  107|
|         12|  104|
|         13|   95|
|         14|   67|
|         15|   54|
|         16|   32|
|         17|  116|
|         18|   87|
|         19|  317|
+-----------+-----+
only showing top 20 rows



### Handling missing data
The assumption is no events occured if number of event in Null


In [47]:
# impute missing values with 0 for the Number of Events column and 
merged2 = merged2.fillna(0, subset=['Number of Events'])

In [48]:
# drop the Start Date, Start Hour, and Event Borough columns
merged2 = merged2.drop('Start Date', 'Start Hour', 'Event Borough')

In [49]:
# show 10 rows of the merged2 data
merged2.show(50)

                                                                                

+-----------+-----------+-------------+-----------------+----------+-----+---------+-------+-------+---------+----------------+
|pickup_date|pickup_hour|    PUBorough|hourly_trip_count|       CIG|  WND|      VIS|    TMP|    DEW|      SLP|Number of Events|
+-----------+-----------+-------------+-----------------+----------+-----+---------+-------+-------+---------+----------------+
| 2023-07-01|          0|    Manhattan|             3092|   22000.0|2.632|    965.6|   23.9|   13.3|   1017.1|              77|
| 2023-07-01|          0|       Queens|              361|   22000.0|2.632|    965.6|   23.9|   13.3|   1017.1|              20|
| 2023-07-01|          0|     Brooklyn|               32|   22000.0|2.632|    965.6|   23.9|   13.3|   1017.1|              25|
| 2023-07-01|          1|       Queens|              163|   22000.0|2.632|    965.6|   23.3|   13.3|   1017.6|               0|
| 2023-07-01|          1|        Bronx|                3|   22000.0|2.632|    965.6|   23.3|   13.3|   1

In [50]:
# check missing values in merged2
merged2.agg(*[
    (1 - (F.count(c) / F.count('*'))).alias(c + '_missing')
    for c in merged2.columns
]).show()

+-------------------+-------------------+-----------------+-------------------------+-----------+-----------+-----------+-----------+-----------+-----------+------------------------+
|pickup_date_missing|pickup_hour_missing|PUBorough_missing|hourly_trip_count_missing|CIG_missing|WND_missing|VIS_missing|TMP_missing|DEW_missing|SLP_missing|Number of Events_missing|
+-------------------+-------------------+-----------------+-------------------------+-----------+-----------+-----------+-----------+-----------+-----------+------------------------+
|                0.0|                0.0|              0.0|                      0.0|        0.0|        0.0|        0.0|        0.0|        0.0|        0.0|                     0.0|
+-------------------+-------------------+-----------------+-------------------------+-----------+-----------+-----------+-----------+-----------+-----------+------------------------+



In [51]:
# count the number of rows in the merged2 data
merged2.count()

23635

In [52]:
# check schema
merged2.printSchema()

root
 |-- pickup_date: date (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- PUBorough: string (nullable = true)
 |-- hourly_trip_count: long (nullable = false)
 |-- CIG: double (nullable = false)
 |-- WND: double (nullable = false)
 |-- VIS: double (nullable = false)
 |-- TMP: double (nullable = false)
 |-- DEW: double (nullable = false)
 |-- SLP: double (nullable = false)
 |-- Number of Events: long (nullable = true)



## Export for EDA_1 and model analysis

In [343]:
# save the merged data and overwrite
merged2.write.mode("overwrite").parquet("../data/curated/merged_data/first_cleaned.parquet")