In [1]:
from datetime import datetime   
from dateutil.relativedelta import relativedelta

import pandas as pd
pd.set_option("display.max_columns", 30) 
import requests

In [2]:
""""
Steps for the Transform Load Lambda function
1. Get data from S3 (taxi, weather)
2. Weather data transformations
3. Taxi data transformations
4. Update dim_payment_type 
5. Update dim_company
6. Update fact_taxi_trips with the ids from the dim_payment_type and dim_company
7. Upload dim_weather to S3
8. Upload fact_taxi_trips to S3
9. Upload dim_payment_type and dim_company (current, and previous version)

"""

'"\nSteps for the Transform Load Lambda function\n1. Get data from S3 (taxi, weather)\n2. Weather data transformations\n3. Taxi data transformations\n4. Update dim_payment_type \n5. Update dim_company\n6. Update fact_taxi_trips with the ids from the dim_payment_type and dim_company\n7. Upload dim_weather to S3\n8. Upload fact_taxi_trips to S3\n9. Upload dim_payment_type and dim_company (current, and previous version)\n\n'

In [3]:
# Target day: the calendar date two months before the moment this cell runs.
# NOTE: despite its name, current_datetime holds that shifted moment, not "now".
current_datetime = datetime.now() - relativedelta(months=2)
formatted_datetime = current_datetime.strftime("%Y-%m-%d")

# SoQL query: every trip whose start timestamp falls on the target day,
# capped at 30000 rows by $limit.
url = (
    f"https://data.cityofchicago.org/resource/ajtu-isnz.json?"
    f"$where=trip_start_timestamp >= '{formatted_datetime}T00:00:00' "
    f"AND trip_start_timestamp <= '{formatted_datetime}T23:59:59'"
    f"&$limit=30000"
)

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on HTTP errors instead of feeding an error payload to pandas.
response.raise_for_status()
data = response.json()

# Number of taxi trips returned for the queried day (the date is computed at
# run time, so report it together with the count).
print(f"{len(data)} taxi trips fetched for {formatted_datetime}")

taxi_trips = pd.DataFrame(data)
taxi_trips.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
0,5887b84e350521b3d1c9b31f6e9a5a4ef39be5d6,3618045f9110d4d88482266ade23659c1a50d32ac37f20...,2025-09-26T23:45:00.000,2025-09-27T00:00:00.000,783,2.99,22,16,10.39,3.31,0,0,14.2,Mobile,Tac - Yellow Cab Association,41.92276062,-87.699155343,"{'type': 'Point', 'coordinates': [-87.69915534...",41.953582125,-87.72345239,"{'type': 'Point', 'coordinates': [-87.72345239...",,
1,006d64de27bc9d408ccb3fda78ff2d643334aee7,6c1e4e8e25a1b47575b359c5a0844cf23c50e540a86ecd...,2025-09-26T23:45:00.000,2025-09-26T23:45:00.000,630,1.39,8,8,8.0,3.0,0,0,11.5,Credit Card,Flash Cab,41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",,
2,03fddd4b286099fe548254b40e2ee57bb6865375,8123ec5a70681a139fcd93fd477eec0ee05dda82987748...,2025-09-26T23:45:00.000,2025-09-27T00:00:00.000,751,4.18,8,6,15.77,0.0,0,0,16.27,Mobile,Globe Taxi,41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",41.944226601,-87.655998182,"{'type': 'Point', 'coordinates': [-87.65599818...",,
3,043c9bde3029eb5a436ba33acfe379c89a49bf36,a0f44bc0a273e49230e4abe4d0a8d3a1e8305945f8fa0b...,2025-09-26T23:45:00.000,2025-09-27T00:15:00.000,1464,8.65,33,23,35.52,0.0,0,0,36.02,Mobile,Globe Taxi,41.857183858,-87.620334624,"{'type': 'Point', 'coordinates': [-87.62033462...",41.900069603,-87.720918238,"{'type': 'Point', 'coordinates': [-87.72091823...",,
4,052b170224311f9b4f4eff285dcfe23a03fcc68d,d032016cdde07ebbac8ab34b98a534aed192e1c86d610f...,2025-09-26T23:45:00.000,2025-09-27T00:00:00.000,452,2.81,76,76,9.75,2.0,0,4,16.25,Credit Card,Medallion Leasin,41.980264315,-87.913624596,"{'type': 'Point', 'coordinates': [-87.91362459...",41.980264315,-87.913624596,"{'type': 'Point', 'coordinates': [-87.91362459...",,


#### Taxi data transformations

In [8]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """Perform transformations on the taxi data.

    1. Drop selected columns (census tracts and centroid locations), if present.
    2. Drop rows containing NULL values in any remaining column.
    3. Rename the community area columns to ``*_id``.
    4. Convert "trip_start_timestamp" to datetime.
    5. Create the "datetime_for_weather" helper column (trip start floored to
       the hour, for the dim_weather join).

    The input DataFrame is not modified; a new DataFrame is returned. The
    original row index labels of the surviving rows are kept.

    :param taxi_trips: The DataFrame holding the daily taxi trips.
    :return: Transformed taxi trips DataFrame.
    :raises ValueError: If ``taxi_trips`` is not a pandas DataFrame.
    """
    if not isinstance(taxi_trips, pd.DataFrame):
        raise ValueError("Taxi trips is not a valid DataFrame.")

    columns_to_drop = [
        "pickup_census_tract",
        "dropoff_census_tract",
        "pickup_centroid_location",
        "dropoff_centroid_location",
    ]
    renamed_columns = {
        "pickup_community_area": "pickup_community_area_id",
        "dropoff_community_area": "dropoff_community_area_id",
    }

    # Every step returns a new frame, so the caller's raw data stays intact and
    # the function can be re-run on the same input with the same result.
    return (
        taxi_trips
        .drop(columns=columns_to_drop, errors="ignore")
        .dropna()
        .rename(columns=renamed_columns)
        .assign(
            trip_start_timestamp=lambda df: pd.to_datetime(df["trip_start_timestamp"]),
            # Evaluated after the conversion above, so .dt is available here.
            datetime_for_weather=lambda df: df["trip_start_timestamp"].dt.floor("h"),
        )
    )

In [9]:
# Apply the taxi transformations to the fetched trips and preview the first rows.
taxi_trips_transformed = taxi_trips_transformations(taxi_trips=taxi_trips)
taxi_trips_transformed.head(5)

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
0,5887b84e350521b3d1c9b31f6e9a5a4ef39be5d6,3618045f9110d4d88482266ade23659c1a50d32ac37f20...,2025-09-26 23:45:00,2025-09-27T00:00:00.000,783,2.99,22,16,10.39,3.31,0,0,14.2,Mobile,Tac - Yellow Cab Association,41.92276062,-87.699155343,41.953582125,-87.72345239,2025-09-26 23:00:00
1,006d64de27bc9d408ccb3fda78ff2d643334aee7,6c1e4e8e25a1b47575b359c5a0844cf23c50e540a86ecd...,2025-09-26 23:45:00,2025-09-26T23:45:00.000,630,1.39,8,8,8.0,3.0,0,0,11.5,Credit Card,Flash Cab,41.899602111,-87.633308037,41.899602111,-87.633308037,2025-09-26 23:00:00
2,03fddd4b286099fe548254b40e2ee57bb6865375,8123ec5a70681a139fcd93fd477eec0ee05dda82987748...,2025-09-26 23:45:00,2025-09-27T00:00:00.000,751,4.18,8,6,15.77,0.0,0,0,16.27,Mobile,Globe Taxi,41.899602111,-87.633308037,41.944226601,-87.655998182,2025-09-26 23:00:00
3,043c9bde3029eb5a436ba33acfe379c89a49bf36,a0f44bc0a273e49230e4abe4d0a8d3a1e8305945f8fa0b...,2025-09-26 23:45:00,2025-09-27T00:15:00.000,1464,8.65,33,23,35.52,0.0,0,0,36.02,Mobile,Globe Taxi,41.857183858,-87.620334624,41.900069603,-87.720918238,2025-09-26 23:00:00
4,052b170224311f9b4f4eff285dcfe23a03fcc68d,d032016cdde07ebbac8ab34b98a534aed192e1c86d610f...,2025-09-26 23:45:00,2025-09-27T00:00:00.000,452,2.81,76,76,9.75,2.0,0,4,16.25,Credit Card,Medallion Leasin,41.980264315,-87.913624596,41.980264315,-87.913624596,2025-09-26 23:00:00


In [10]:
# Print the column dtypes and non-null counts of the transformed frame.
taxi_trips_transformed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21003 entries, 0 to 23173
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   trip_id                     21003 non-null  object        
 1   taxi_id                     21003 non-null  object        
 2   trip_start_timestamp        21003 non-null  datetime64[ns]
 3   trip_end_timestamp          21003 non-null  object        
 4   trip_seconds                21003 non-null  object        
 5   trip_miles                  21003 non-null  object        
 6   pickup_community_area_id    21003 non-null  object        
 7   dropoff_community_area_id   21003 non-null  object        
 8   fare                        21003 non-null  object        
 9   tips                        21003 non-null  object        
 10  tolls                       21003 non-null  object        
 11  extras                      21003 non-null  object        
