## Module 1 Dataset Exploration and Homework Solution

### Setting up the environment

In [1]:
import pandas as pd
from typing import List


In [2]:
# this is to simulate calling a script from cli with arguments and env
ARGS: List[str] = ['', '1']
ZONE_FILE_PATH: str = 'data/taxi_zone_lookup.csv'
TRIP_FILE_PATH: str = 'data/green_tripdata_2025-11.parquet' 

In [3]:
day: int = int(ARGS[1])
print(f"Running pipeline for day {day}")

Running pipeline for day 1


### Reading the data

In [4]:
taxi_zlu = pd.read_csv(ZONE_FILE_PATH)
taxi_zlu.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [5]:
taxi_zlu.dtypes

LocationID       int64
Borough         object
Zone            object
service_zone    object
dtype: object

In [6]:
trip_data = pd.read_parquet(TRIP_FILE_PATH)
trip_data.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,cbd_congestion_fee
0,2,2025-11-01 00:34:48,2025-11-01 00:41:39,N,1.0,74,42,1.0,0.74,7.2,...,0.5,1.94,0.0,,1.0,11.64,1.0,1.0,0.0,0.0
1,2,2025-11-01 00:18:52,2025-11-01 00:24:27,N,1.0,74,42,2.0,0.95,7.2,...,0.5,0.0,0.0,,1.0,9.7,2.0,1.0,0.0,0.0
2,2,2025-11-01 01:03:14,2025-11-01 01:15:24,N,1.0,83,160,1.0,2.19,13.5,...,0.5,5.0,0.0,,1.0,21.0,1.0,1.0,0.0,0.0
3,2,2025-11-01 00:10:57,2025-11-01 00:24:53,N,1.0,166,127,1.0,5.44,24.7,...,0.5,0.5,0.0,,1.0,27.7,1.0,1.0,0.0,0.0
4,1,2025-11-01 00:03:48,2025-11-01 00:19:38,N,1.0,166,262,1.0,3.2,18.4,...,1.5,1.0,0.0,,1.0,24.65,1.0,1.0,2.75,0.0


### Exploring November only data

In [7]:
filter_col_name = 'lpep_pickup_datetime'
nov_trips = trip_data.loc[
    (trip_data[filter_col_name] >= '2025-11-01')
    & 
    (trip_data[filter_col_name] < '2025-12-01')
]
nov_trips.describe()[filter_col_name]

count                         46891
mean     2025-11-15 20:07:20.005118
min             2025-11-01 00:00:13
25%      2025-11-08 13:17:39.500000
50%             2025-11-15 18:50:56
75%      2025-11-22 17:46:27.500000
max             2025-11-30 23:58:28
std                             NaN
Name: lpep_pickup_datetime, dtype: object

In [8]:
nov_trips.dtypes

VendorID                          int32
lpep_pickup_datetime     datetime64[us]
lpep_dropoff_datetime    datetime64[us]
store_and_fwd_flag               object
RatecodeID                      float64
PULocationID                      int32
DOLocationID                      int32
passenger_count                 float64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                    float64
trip_type                       float64
congestion_surcharge            float64
cbd_congestion_fee              float64
dtype: object

In [9]:
nov_trips.shape

(46891, 21)

In [10]:
null_status = nov_trips.isnull().sum()
null_status

VendorID                     0
lpep_pickup_datetime         0
lpep_dropoff_datetime        0
store_and_fwd_flag        5569
RatecodeID                5569
PULocationID                 0
DOLocationID                 0
passenger_count           5569
trip_distance                0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
ehail_fee                46891
improvement_surcharge        0
total_amount                 0
payment_type              5569
trip_type                 5570
congestion_surcharge      5569
cbd_congestion_fee           0
dtype: int64

In [11]:
for col, nulls in null_status.items():
    if nulls > 0:
        print(f'{col} '+ '=' * 20)
        print(nov_trips[col].value_counts())

store_and_fwd_flag
N    41196
Y      126
Name: count, dtype: int64
RatecodeID
1.0     38704
5.0      2432
2.0       112
4.0        53
3.0        20
99.0        1
Name: count, dtype: int64
passenger_count
1.0    33831
2.0     4707
5.0      954
0.0      612
3.0      502
6.0      442
4.0      257
9.0        7
8.0        5
7.0        5
Name: count, dtype: int64
Series([], Name: count, dtype: int64)
payment_type
1.0    31612
2.0     9336
3.0      286
4.0       86
5.0        2
Name: count, dtype: int64
trip_type
1.0    39030
2.0     2291
Name: count, dtype: int64
congestion_surcharge
 0.00    28025
 2.75    13267
 2.50       24
-2.75        6
Name: count, dtype: int64


### Solving Homework Questions


##### Question 3. Counting short trips

For the trips in November 2025 (lpep_pickup_datetime between '2025-11-01' and '2025-12-01', exclusive of the upper bound), how many trips had a trip_distance of less than or equal to 1 mile?

- 7,853
- 8,007
- 8,254
- 8,421


In [12]:
(nov_trips['trip_distance'] <= 1).sum()

np.int64(8007)

We need to find out what unit of measure is the column `trip_distance` in.
assuming it is in miles because this is a dataset collected from the US is a good a assumption, but we have to double check it.

luckily there is a data dictionary here: https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf

which states it is in fact in miles

> trip_distance | The elapsed trip distance in *_miles_* reported by the taximeter.


##### Question 4. Longest trip for each day

Which was the pick up day with the longest trip distance? Only consider trips with trip_distance less than 100 miles (to exclude data errors).

Use the pick up time for your calculations.

- 2025-11-14
- 2025-11-20
- 2025-11-23
- 2025-11-25


In [13]:
valid_trips = nov_trips[nov_trips['trip_distance'] < 100]
print(valid_trips['trip_distance'].max())
valid_trips.loc[valid_trips['trip_distance'].idxmax(), :]

# Warning: don't use iloc here
# idxmax returns the DataFrame index, not an integer index

88.03


VendorID                                   2
lpep_pickup_datetime     2025-11-14 15:36:27
lpep_dropoff_datetime    2025-11-14 18:40:48
store_and_fwd_flag                         N
RatecodeID                               4.0
PULocationID                             130
DOLocationID                             265
passenger_count                          2.0
trip_distance                          88.03
fare_amount                            610.6
extra                                    0.0
mta_tax                                  0.5
tip_amount                               0.0
tolls_amount                             0.0
ehail_fee                                NaN
improvement_surcharge                    1.0
total_amount                           612.1
payment_type                             2.0
trip_type                                1.0
congestion_surcharge                     0.0
cbd_congestion_fee                       0.0
Name: 18867, dtype: object


##### Question 5. Biggest pickup zone

Which was the pickup zone with the largest total_amount (sum of all trips) on November 18th, 2025?

- East Harlem North
- East Harlem South
- Morningside Heights
- Forest Hills


In [14]:
taxi_zlu.dtypes

LocationID       int64
Borough         object
Zone            object
service_zone    object
dtype: object

In [15]:
merged_df = nov_trips.merge(taxi_zlu, how='left', left_on='PULocationID', right_on='LocationID')
merged_df.groupby('Zone')['total_amount'].agg('sum').sort_values(ascending=False)

Zone
East Harlem North            257446.40
East Harlem South            126754.10
Morningside Heights           49118.26
Jamaica                       46490.74
Central Park                  45595.64
                               ...    
Mariners Harbor                  27.23
Battery Park City                21.04
Roosevelt Island                 16.50
Saint George/New Brighton        13.75
Forest Park/Highland Park         4.50
Name: total_amount, Length: 231, dtype: float64


##### Question 6. Largest tip

For the passengers picked up in the zone named "East Harlem North" in November 2025, which was the drop off zone that had the largest tip?

Note: it's tip , not trip. We need the name of the zone, not the ID.

- JFK Airport
- Yorkville West
- East Harlem North
- LaGuardia Airport


In [16]:
ehn_pickups = merged_df[merged_df['Zone'] == 'East Harlem North']
ehn_pickups['Zone'].value_counts()

Zone
East Harlem North    12041
Name: count, dtype: int64

In [17]:
ehn_max_tip_record = ehn_pickups.loc[ehn_pickups.tip_amount.idxmax(), :]
ehn_max_tip_record

VendorID                                   2
lpep_pickup_datetime     2025-11-30 16:30:27
lpep_dropoff_datetime    2025-11-30 16:41:29
store_and_fwd_flag                         N
RatecodeID                               1.0
PULocationID                              74
DOLocationID                             263
passenger_count                          1.0
trip_distance                           2.63
fare_amount                             14.2
extra                                    0.0
mta_tax                                  0.5
tip_amount                             81.89
tolls_amount                             0.0
ehail_fee                                NaN
improvement_surcharge                    1.0
total_amount                          100.34
payment_type                             1.0
trip_type                                1.0
congestion_surcharge                    2.75
cbd_congestion_fee                       0.0
LocationID                                74
Borough   

In [18]:
taxi_zlu.loc[taxi_zlu['LocationID'] == ehn_max_tip_record['DOLocationID'], :]

Unnamed: 0,LocationID,Borough,Zone,service_zone
262,263,Manhattan,Yorkville West,Yellow Zone
