In [None]:
import duckdb

# Glob pattern selecting every parquet file in the merged output directory;
# it is interpolated into the DuckDB queries in this notebook.
parquet_path = "data/merged_output_parquet/*.parquet"

First we test that we can read our parquet files with DuckDB by checking that all columns are present.

In [None]:
# Open the parquet files through DuckDB and show the column names as a quick
# check that the files are readable.
parquet_relation = duckdb.read_parquet(parquet_path)
parquet_relation.columns

We do some initial data analysis by calculating the average, minimum, and maximum values for the most interesting columns.

We also measure the execution time so we can later compare it to Dask SQL.

We see that we should do some filtering, because the minimum values of trip_distance and total_amount are negative, which should not be possible. We should also cap total_amount, because we doubt that anyone would pay 1000003.8 for a taxi ride.

In [None]:
%%time

# Average / minimum / maximum of the main numeric columns across all parquet files.
summary_query = f""" select 
                avg(passenger_count) avg_passengers, 
                avg(trip_distance),
                min(trip_distance),
                max(trip_distance),
                avg(total_amount),
                min(total_amount),
                max(total_amount) 
           from '{parquet_path}'
"""
# Leaving the relation as the last expression lets the notebook render it.
duckdb.sql(summary_query)

Calculate the average trip distance of rides for each day of the year; we group the results by month and day so we can visualize them.

In [None]:
%%time

# Mean trip distance (rounded to two decimals) per month/day, ordered by month
# and then day; the rows are fetched as a list of tuples.
daily_distance_query = f""" select month(tpep_pickup_datetime), day(tpep_pickup_datetime), round(mean(trip_distance), 2) as avg_distance from  
           '{parquet_path}'
           group by day(tpep_pickup_datetime), month(tpep_pickup_datetime) 
           order by month(tpep_pickup_datetime), day(tpep_pickup_datetime)
"""
daily_distance_relation = duckdb.sql(daily_distance_query)
result = daily_distance_relation.fetchall()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.ticker as ticker

We are going to visualize the results we calculated earlier, so we get a better understanding of the trips taken during the year. As we can see, for the first four months of the year, people usually take shorter trips compared to the rest of the year.

In [None]:
# Build a frame from the (month, day, avg_distance) tuples returned by the query.
df = pd.DataFrame(result, columns=['month', 'day', 'avg_distance'])
# Reset the index after sorting so every row's index equals its bar position on
# the categorical x-axis; the tick locator below relies on that equality.
df = df.sort_values(by=['month', 'day']).reset_index(drop=True)
# "day-month" label for each bar, built with vectorized string operations.
df['date'] = df['day'].astype(int).astype(str) + "-" + df['month'].astype(int).astype(str)

fig, ax = plt.subplots(figsize=(18, 7))

ax.bar(df['date'], df['avg_distance'], label='Daily Avg Distance')

ax.set_title('Average Trip Distance Over the Year', fontsize=18)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Average Trip Distance (miles)', fontsize=14)

# Only label the first day of each month; labelling every day would be unreadable.
month_start_positions = df.index[df['day'] == 1]
ax.xaxis.set_major_locator(ticker.FixedLocator(month_start_positions))
ax.tick_params(axis='x', labelrotation=45)

# Adding a grid for better readability
ax.grid(True, linestyle='--', alpha=0.7)

# Adding a legend
ax.legend(fontsize=12)

# Show the plot
fig.tight_layout()  # Adjusts plot to ensure everything fits without overlapping
plt.show()

Next we take a look at the average fare amount during different hours of the day. As we can see, the taxi drivers earn the most in the early morning, during hours 4, 5 and 6. There are also some inconsistencies which should be looked at: we have a negative average fare amount for the 8th hour of the day.

In [None]:
# Mean fare amount (rounded to two decimals) per pickup hour, ordered by hour.
result =  duckdb.sql(f""" select hour(tpep_pickup_datetime), round(mean(fare_amount), 2) as avg_amount from  
           '{parquet_path}'
           group by hour(tpep_pickup_datetime)
           order by hour(tpep_pickup_datetime)
""").fetchall()

df = pd.DataFrame(result, columns=['hour', 'fare_amount'])
df = df.sort_values(by=['hour'])

fig, ax = plt.subplots(figsize=(18, 7))

ax.bar(df['hour'], df['fare_amount'], label='Hourly Average Fare Amount')

ax.set_title('Hourly Average Fare Amount', fontsize=18)
ax.set_xlabel('Hour', fontsize=14)
ax.set_ylabel('Average Fare Amount', fontsize=14)

# The bars are drawn at the numeric hour values, so the ticks are placed at those
# values rather than at the row index (the two only coincide when every hour
# is present in the result).
ax.xaxis.set_major_locator(ticker.FixedLocator(df['hour'].tolist()))
ax.tick_params(axis='x', labelrotation=45)

# Adding a grid for better readability
ax.grid(True, linestyle='--', alpha=0.7)

# Adding a legend
ax.legend(fontsize=12)

# Show the plot
fig.tight_layout()  # Adjusts plot to ensure everything fits without overlapping
plt.show()

We take a look at the boroughs that people usually take trips between. We also measure the time needed to run this query in order to compare it to Dask SQL.

In [None]:
%%time

# Number of trips for every (pickup borough, dropoff borough) pair, skipping rows
# where either borough is null, most frequent pair first.
borough_pairs_query = f""" select borough_pickup, borough_dropoff, count(*) as trips_count from  
           '{parquet_path}'
           where borough_pickup is not null and borough_dropoff is not null
           group by borough_pickup, borough_dropoff
           order by trips_count desc
"""
borough_pairs_relation = duckdb.sql(borough_pairs_query)
borough_pairs_relation.fetchall()

In the following cells, we try to draw some conclusions based on the data we have added to the original data. We look at whether the day is rainy and whether it is a holiday.

In [None]:
# Average trip distance and average tip, split by the holiday flag.
# The tip column is computed with mean(), so it is labelled avg_tip
# (it was previously mislabelled as a median).
duckdb.sql(f""" select is_holiday, round(avg(trip_distance), 2), round(mean(tip_amount), 2) avg_tip from  
           '{parquet_path}'
           group by is_holiday 
""")

In [None]:
# Count the trips on each holiday date (inner query), then average those daily
# counts (outer query).
holiday_trips_query = f""" select round(avg(no_trips)) avg_trips_holiday from
           (
                select year(tpep_pickup_datetime), month(tpep_pickup_datetime), day(tpep_pickup_datetime), count(*) as no_trips from  
                '{parquet_path}'
                where is_holiday is true
                group by year(tpep_pickup_datetime), month(tpep_pickup_datetime), day(tpep_pickup_datetime)
           )
"""
duckdb.sql(holiday_trips_query)

In [None]:
# Count the trips on each non-holiday date (inner query), then average those
# daily counts (outer query). count(*) is already an integer, so it is not
# wrapped in round(); this matches the holiday query.
duckdb.sql(f""" select round(avg(no_trips)) as avg_trips_non_holiday from
           (
                select year(tpep_pickup_datetime), month(tpep_pickup_datetime), day(tpep_pickup_datetime), count(*) as no_trips from  
                '{parquet_path}'
                where is_holiday is false
                group by year(tpep_pickup_datetime), month(tpep_pickup_datetime), day(tpep_pickup_datetime)
           )
""")

In [None]:
# Average trip distance and average tip for rows with at least 0.5 in "rain (mm)".
# The tip column is computed with mean(), so it is labelled avg_tip
# (it was previously mislabelled as a median).
duckdb.sql(f""" select round(avg(trip_distance), 2), round(mean(tip_amount), 2) avg_tip from  
           '{parquet_path}'
           where "rain (mm)" >= 0.5
""")

In [None]:
# Average trip distance and average tip for rows with less than 0.5 in "rain (mm)".
# The tip column is computed with mean(), so it is labelled avg_tip
# (it was previously mislabelled as a median).
duckdb.sql(f""" select round(avg(trip_distance), 2), round(mean(tip_amount), 2) avg_tip from  
           '{parquet_path}'
           where "rain (mm)" < 0.5
""")

We create a Dask client in order to run the queries using Dask SQL.

In [None]:
# A trailing comma after the imported name is only legal inside parentheses;
# without them the import line is a SyntaxError, so the comma is removed.
from dask.distributed import Client
from dask_sql import Context

# Start a Dask client with 4 single-threaded workers, each limited to 8GB.
client = Client(n_workers=4, threads_per_worker=1, memory_limit='8GB')
print(f"Dask Dashboard link: {client.dashboard_link}")

In [None]:
# Directory holding the parquet files for Dask. A separate name is used so the
# glob pattern stored in `parquet_path` for the DuckDB cells is not overwritten,
# which keeps those cells re-runnable after this one has executed.
dask_parquet_dir = "data/merged_output_parquet"
c = Context()


# Register the parquet directory with the dask-sql context under a table name.
dask_table_name = "taxi_data"
c.create_table(dask_table_name, dask_parquet_dir, format="parquet")


# Same borough-pair query as the DuckDB version, pointed at the Dask table.
sql_query = f""" select borough_pickup, borough_dropoff, count(*) as trips_count from  
           '{dask_table_name}'
           where borough_pickup is not null and borough_dropoff is not null
           group by borough_pickup, borough_dropoff
           order by trips_count desc
"""

# Same summary-statistics query as the DuckDB version, pointed at the Dask table.
sql_query_initial = f"""
    select 
                avg(passenger_count) avg_passengers, 
                avg(trip_distance),
                min(trip_distance),
                max(trip_distance),
                avg(total_amount),
                min(total_amount),
                max(total_amount) 
           from '{dask_table_name}'
"""


For this initial query we can see that using DuckDB saves a lot of time compared to the Dask SQL implementation.
On my local machine, DuckDB needs 2.11 ms to compute this query, whereas Dask SQL requires 6.48 s.

In [None]:
%%time 
# Submit the summary-statistics query through dask-sql, then materialize the result.
initial_stats = c.sql(sql_query_initial)
initial_stats.compute()

The same can be seen in this example: DuckDB requires significantly less time to run the same query compared to Dask SQL.
Locally, Dask SQL takes 46.4 s to compute this query, while DuckDB needs only 669 ms, which is more than 50 times faster.

In [None]:
%%time

# Submit the borough-pair query through dask-sql, then materialize the result.
borough_pairs = c.sql(sql_query)
borough_pairs.compute()