In [1]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

from typing import Dict

import pandas as pd
pd.set_option("display.max_columns", 30)

import requests

In [2]:
""""
Steps for the Transform Load Lambda function
1. Get data from S3 (taxi, weather)
2. Weather data transformations -  DONE
3. Taxi data transformations  -  DONE
4. Update dim_payment_type   - DONE
5. Update dim_company  - DONE
6. Update fact_taxi_trips with the ids from the dim_payment_type and dim_company - DONE 
7. Upload dim_weather to S3
8. Upload fact_taxi_trips to S3
9. Upload dim_payment_type and dim_company (current, and previous version)

"""

'"\nSteps for the Transform Load Lambda function\n1. Get data from S3 (taxi, weather)\n2. Weather data transformations -  DONE\n3. Taxi data transformations  -  DONE\n4. Update dim_payment_type   - DONE\n5. Update dim_company  - DONE\n6. Update fact_taxi_trips with the ids from the dim_payment_type and dim_company - DONE \n7. Upload dim_weather to S3\n8. Upload fact_taxi_trips to S3\n9. Upload dim_payment_type and dim_company (current, and previous version)\n\n'

In [3]:
current_datetime = datetime.now() - relativedelta(months=2)
formatted_datetime = current_datetime.strftime("%Y-%m-%d")

url = (
    f"https://data.cityofchicago.org/resource/ajtu-isnz.json?"
    f"$where=trip_start_timestamp >= '{formatted_datetime}T00:00:00' " 
    f"AND trip_start_timestamp <= '{formatted_datetime}T23:59:59' "
    f"&$limit=50000"
)

response = requests.get(url)
data = response.json()

taxi_trips = pd.DataFrame(data)
taxi_trips.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,pickup_census_tract,dropoff_census_tract
0,840c5dfca48d77a66fdd41a00b6b7236099205f1,9400757c30cc4592f54bcd4d10f8cb2a5db5bd507f285f...,2025-10-14T23:45:00.000,2025-10-14T23:45:00.000,342,1.12,8.0,8.0,6.25,3.0,0,1,10.75,Credit Card,Flash Cab,41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",41.899602111,-87.633308037,"{'type': 'Point', 'coordinates': [-87.63330803...",,
1,7cefd96797e7d1d1f7a2cfe5658967ba467a1406,6898e40854937399e0ef25dad63740d21b205934390907...,2025-10-14T23:45:00.000,2025-10-15T00:15:00.000,1676,23.83,76.0,39.0,57.25,0.0,0,4,61.75,Credit Card,Flash Cab,41.980264315,-87.913624596,"{'type': 'Point', 'coordinates': [-87.91362459...",41.808916283,-87.596183344,"{'type': 'Point', 'coordinates': [-87.59618334...",,
2,7be8bc20391bc53469fb62ce58c66c9586a11c8c,847cf962bd6f62040673e6c24c24940aeb2d7fdaa54677...,2025-10-14T23:45:00.000,2025-10-14T23:45:00.000,120,0.3,,,4.25,0.0,0,1,5.25,Cash,Transit Administrative Center Inc,,,,,,,,
3,798078d9e0a5402ba9c013b57b50786f93a8d114,477e2191cd2213e2db9413d810936b2f24a73c8e527059...,2025-10-14T23:45:00.000,2025-10-14T23:45:00.000,240,0.7,28.0,32.0,5.25,0.0,0,0,5.25,Cash,Transit Administrative Center Inc,41.874005383,-87.66351755,"{'type': 'Point', 'coordinates': [-87.66351754...",41.878865584,-87.625192142,"{'type': 'Point', 'coordinates': [-87.62519214...",,
4,7920f91c3dd22288fee8c515567d759d170b652c,3f46ef398d3308fb9794b8c5de450a88439d16c47b77b7...,2025-10-14T23:45:00.000,2025-10-15T00:00:00.000,1076,7.69,76.0,,21.25,5.15,0,4,30.9,Credit Card,Sun Taxi,41.980264315,-87.913624596,"{'type': 'Point', 'coordinates': [-87.91362459...",,,,,


#### Taxi data transformations

In [4]:
def taxi_trips_transformations(taxi_trips: pd.DataFrame) -> pd.DataFrame:
    """Perform tranformations on the taxi data.

    1. Drop selected columns.
    2. Drop NULL values accross all columns.
    3. Rename selected columns.
    4. Create "datetime_for_weather" helper column (for dim_weather join).

    :param taxi_trips:  The DataFrame holding the daily taxi trips.
    :raises TypeError:  When taxi_trips parameter is not a valid pandas DataFrame.
    :return:            Transformed taxi trips DataFrame.
    """
    if not isinstance(taxi_trips, pd.DataFrame):
        raise TypeError("taxi_trips is not a valid pandas DataFrame.")

    taxi_trips.drop(["pickup_census_tract", "dropoff_census_tract", 
                 "pickup_centroid_location", "dropoff_centroid_location"], axis=1, inplace=True)
    taxi_trips.dropna(inplace=True)

    taxi_trips.rename(columns={"pickup_community_area": "pickup_community_area_id",
                            "dropoff_community_area": "dropoff_community_area_id"}, inplace=True)

    taxi_trips["trip_start_timestamp"] = pd.to_datetime(taxi_trips["trip_start_timestamp"])
    taxi_trips["datetime_for_weather"] = taxi_trips["trip_start_timestamp"].dt.floor("h")

    return taxi_trips

In [5]:
taxi_trips_transformed = taxi_trips_transformations(taxi_trips)

taxi_trips_transformed.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather
0,840c5dfca48d77a66fdd41a00b6b7236099205f1,9400757c30cc4592f54bcd4d10f8cb2a5db5bd507f285f...,2025-10-14 23:45:00,2025-10-14T23:45:00.000,342,1.12,8,8,6.25,3.0,0,1,10.75,Credit Card,Flash Cab,41.899602111,-87.633308037,41.899602111,-87.633308037,2025-10-14 23:00:00
1,7cefd96797e7d1d1f7a2cfe5658967ba467a1406,6898e40854937399e0ef25dad63740d21b205934390907...,2025-10-14 23:45:00,2025-10-15T00:15:00.000,1676,23.83,76,39,57.25,0.0,0,4,61.75,Credit Card,Flash Cab,41.980264315,-87.913624596,41.808916283,-87.596183344,2025-10-14 23:00:00
3,798078d9e0a5402ba9c013b57b50786f93a8d114,477e2191cd2213e2db9413d810936b2f24a73c8e527059...,2025-10-14 23:45:00,2025-10-14T23:45:00.000,240,0.7,28,32,5.25,0.0,0,0,5.25,Cash,Transit Administrative Center Inc,41.874005383,-87.66351755,41.878865584,-87.625192142,2025-10-14 23:00:00
5,71944a1cea8fc9c478c7e9e33a666570ad80302b,a0144e27e6b11720292c08c38cc696f14e6254bbbad971...,2025-10-14 23:45:00,2025-10-15T00:00:00.000,1320,0.0,76,5,45.0,9.05,0,0,54.05,Credit Card,Transit Administrative Center Inc,41.980264315,-87.913624596,41.947791586,-87.683834942,2025-10-14 23:00:00
6,6eb76227d860198005307fe53f507e184882fafb,d07c53f34c119447e9b081f774d7dc37076befc5d997ee...,2025-10-14 23:45:00,2025-10-15T00:15:00.000,1273,17.05,76,32,42.25,9.35,0,4,56.1,Credit Card,5 Star Taxi,41.97907082,-87.903039661,41.884987192,-87.620992913,2025-10-14 23:00:00


In [6]:
taxi_trips_transformed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21249 entries, 0 to 23496
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   trip_id                     21249 non-null  object        
 1   taxi_id                     21249 non-null  object        
 2   trip_start_timestamp        21249 non-null  datetime64[ns]
 3   trip_end_timestamp          21249 non-null  object        
 4   trip_seconds                21249 non-null  object        
 5   trip_miles                  21249 non-null  object        
 6   pickup_community_area_id    21249 non-null  object        
 7   dropoff_community_area_id   21249 non-null  object        
 8   fare                        21249 non-null  object        
 9   tips                        21249 non-null  object        
 10  tolls                       21249 non-null  object        
 11  extras                      21249 non-null  object        


#### Dim Compyna and Dim Payment Type update

In [7]:
def update_dim_table(taxi_trips: pd.DataFrame, dim_df: pd.DataFrame, value_col: str) -> pd.DataFrame:
    """Extend the dimension DataFrame with new values if there are any.

    :param taxi_trips:  DataFrame with the daily taxi trips.
    :param dim_df:      DataFrame with the dimension data (company, payment_type).
    :param value_col:   Name of the column in dimension DataFrame containing the values.
    :return:            The updated dimension data, if new values are in the taxi data, they will be loaded to it.
    """
    id_col = f"{value_col}_id"

    todays_dim_data = pd.DataFrame(taxi_trips[value_col].unique(), columns=[value_col])
    new_dim_data = todays_dim_data[~todays_dim_data[value_col].isin(dim_df[value_col])]

    if not new_dim_data.empty:
        max_id = dim_df[id_col].max()
        new_dim_data[id_col] = range(max_id + 1, max_id + 1 + len(new_dim_data))
        dim_df = pd.concat([dim_df, new_dim_data], ignore_index=True)

    return dim_df

In [8]:
dim_payment_type = taxi_trips["payment_type"].drop_duplicates().reset_index(drop=True)
dim_payment_type = pd.DataFrame(
    {
        "payment_type_id": range(1, len(dim_payment_type) + 1),
        "payment_type": dim_payment_type
    }
)

dim_company = taxi_trips["company"].drop_duplicates().reset_index(drop=True)
dim_company = pd.DataFrame(
    {
        "company_id": range(1, len(dim_company) + 1),
        "company": dim_company
    }
)

dummy_company_data = [
    {"company": "Metro Jet Taxi A."},
    {"company": "X"},
    {"company": "Y"},
    {"company": "X"},
]
dummy_company_data_df = pd.DataFrame(dummy_company_data)

dummy_payment_type_data = [
    {"payment_type": "Credit Card"},
    {"payment_type": "X"},
    {"payment_type": "Z"},
    {"payment_type": "Z"},
]
dummy_payment_type_data_df = pd.DataFrame(dummy_payment_type_data)

In [9]:
dim_payment_type_updated = update_dim_table(dummy_payment_type_data_df, dim_payment_type, "payment_type")
dim_company_updated = update_dim_table(dummy_company_data_df, dim_company, "company")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_dim_data[id_col] = range(max_id + 1, max_id + 1 + len(new_dim_data))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_dim_data[id_col] = range(max_id + 1, max_id + 1 + len(new_dim_data))


In [10]:
dim_payment_type_updated


Unnamed: 0,payment_type_id,payment_type
0,1,Credit Card
1,2,Cash
2,3,Unknown
3,4,Prcard
4,5,Mobile
5,6,No Charge
6,7,Dispute
7,8,X
8,9,Z


In [11]:
dim_company_updated

Unnamed: 0,company_id,company
0,1,Flash Cab
1,2,Transit Administrative Center Inc
2,3,5 Star Taxi
3,4,Chicago Independents
4,5,City Service
5,6,Wolley Taxi
6,7,Sun Taxi
7,8,Globe Taxi
8,9,Medallion Leasin
9,10,Blue Ribbon Taxi Association


#### Update fact_taxi_trips with company and payment_type ids

In [12]:
def update_fact_taxi_trips_with_dimension_data(taxi_trips: pd.DataFrame, dim_payment_type: pd.DataFrame, dim_company: pd.DataFrame) -> pd.DataFrame:
    """Update the fact_taxi_trips DataFrame with the dim_company and dim_payment_type ids, and delete the string columns.

    :param taxi_trips:          The DataFrame with the daily taxi trips.
    :param dim_payment_type:    The payment type dimension table.
    :param dim_company:         The company dimension table.
    :return:                    The taxi trips data, with only payment_type_id and company_id, without company or
                                payment_type values.
    """

    fact_taxi_trips = taxi_trips.merge(dim_payment_type, on="payment_type")
    fact_taxi_trips = fact_taxi_trips.merge(dim_company, on="company")
    fact_taxi_trips.drop(["payment_type", "company"], axis=1, inplace=True)

    return fact_taxi_trips

In [13]:
taxi_trips_transformed_with_dim_ids = update_fact_taxi_trips_with_dimension_data(taxi_trips_transformed, dim_payment_type, dim_company)

taxi_trips_transformed_with_dim_ids.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,dropoff_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,payment_type_id,company_id
0,840c5dfca48d77a66fdd41a00b6b7236099205f1,9400757c30cc4592f54bcd4d10f8cb2a5db5bd507f285f...,2025-10-14 23:45:00,2025-10-14T23:45:00.000,342,1.12,8,8,6.25,3.0,0,1,10.75,41.899602111,-87.633308037,41.899602111,-87.633308037,2025-10-14 23:00:00,1,1
1,7cefd96797e7d1d1f7a2cfe5658967ba467a1406,6898e40854937399e0ef25dad63740d21b205934390907...,2025-10-14 23:45:00,2025-10-15T00:15:00.000,1676,23.83,76,39,57.25,0.0,0,4,61.75,41.980264315,-87.913624596,41.808916283,-87.596183344,2025-10-14 23:00:00,1,1
2,798078d9e0a5402ba9c013b57b50786f93a8d114,477e2191cd2213e2db9413d810936b2f24a73c8e527059...,2025-10-14 23:45:00,2025-10-14T23:45:00.000,240,0.7,28,32,5.25,0.0,0,0,5.25,41.874005383,-87.66351755,41.878865584,-87.625192142,2025-10-14 23:00:00,2,2
3,71944a1cea8fc9c478c7e9e33a666570ad80302b,a0144e27e6b11720292c08c38cc696f14e6254bbbad971...,2025-10-14 23:45:00,2025-10-15T00:00:00.000,1320,0.0,76,5,45.0,9.05,0,0,54.05,41.980264315,-87.913624596,41.947791586,-87.683834942,2025-10-14 23:00:00,1,2
4,6eb76227d860198005307fe53f507e184882fafb,d07c53f34c119447e9b081f774d7dc37076befc5d997ee...,2025-10-14 23:45:00,2025-10-15T00:15:00.000,1273,17.05,76,32,42.25,9.35,0,4,56.1,41.97907082,-87.903039661,41.884987192,-87.620992913,2025-10-14 23:00:00,1,3


#### Weather transformations


In [14]:
def transform_weather(data: Dict) -> pd.DataFrame:
    """Select and transform weather data.

    :param weather_data:    The daily weather data from the Open Meteo API.
    :return:                Transformed weather pandas DataFrame.
    """

    weather_data_dict = {
        "datetime": data["hourly"]["time"],
        "temperature": data["hourly"]["temperature_2m"],
        "wind_speed": data["hourly"]["wind_speed_10m"],
        "rain": data["hourly"]["rain"],
        "precipitation": data["hourly"]["precipitation"]
    }

    weather_df = pd.DataFrame(weather_data_dict)
    weather_df["datetime"] = pd.to_datetime(weather_df["datetime"])

    return weather_df

In [15]:
current_datetime = datetime.now() - relativedelta(months=2)
formatted_datetime = current_datetime.strftime("%Y-%m-%d")

url = "https://archive-api.open-meteo.com/v1/era5"

params = {
    "latitude": 41.85,
    "longitude": -87.65,
    "start_date": formatted_datetime,
    "end_date": formatted_datetime,
    "hourly": "temperature_2m,wind_speed_10m,rain,precipitation"
}

response = requests.get(url, params=params)
weather_raw_data = response.json()

weather_raw_data

{'latitude': 41.862915,
 'longitude': -87.64877,
 'generationtime_ms': 0.7274150848388672,
 'utc_offset_seconds': 0,
 'timezone': 'GMT',
 'timezone_abbreviation': 'GMT',
 'elevation': 179.0,
 'hourly_units': {'time': 'iso8601',
  'temperature_2m': 'Â°C',
  'wind_speed_10m': 'km/h',
  'rain': 'mm',
  'precipitation': 'mm'},
 'hourly': {'time': ['2025-10-14T00:00',
   '2025-10-14T01:00',
   '2025-10-14T02:00',
   '2025-10-14T03:00',
   '2025-10-14T04:00',
   '2025-10-14T05:00',
   '2025-10-14T06:00',
   '2025-10-14T07:00',
   '2025-10-14T08:00',
   '2025-10-14T09:00',
   '2025-10-14T10:00',
   '2025-10-14T11:00',
   '2025-10-14T12:00',
   '2025-10-14T13:00',
   '2025-10-14T14:00',
   '2025-10-14T15:00',
   '2025-10-14T16:00',
   '2025-10-14T17:00',
   '2025-10-14T18:00',
   '2025-10-14T19:00',
   '2025-10-14T20:00',
   '2025-10-14T21:00',
   '2025-10-14T22:00',
   '2025-10-14T23:00'],
  'temperature_2m': [18.3,
   17.8,
   16.5,
   16.3,
   16.7,
   16.9,
   16.4,
   16.4,
   16.3,
   15

In [16]:
weather_df = transform_weather(weather_raw_data)

weather_df

Unnamed: 0,datetime,temperature,wind_speed,rain,precipitation
0,2025-10-14 00:00:00,18.3,5.3,0.0,0.0
1,2025-10-14 01:00:00,17.8,4.3,0.0,0.0
2,2025-10-14 02:00:00,16.5,2.8,0.0,0.0
3,2025-10-14 03:00:00,16.3,2.2,0.0,0.0
4,2025-10-14 04:00:00,16.7,3.4,0.0,0.0
5,2025-10-14 05:00:00,16.9,5.5,0.0,0.0
6,2025-10-14 06:00:00,16.4,8.4,0.0,0.0
7,2025-10-14 07:00:00,16.4,9.6,0.0,0.0
8,2025-10-14 08:00:00,16.3,10.3,0.0,0.0
9,2025-10-14 09:00:00,15.9,9.6,0.0,0.0


In [17]:
weather_df

Unnamed: 0,datetime,temperature,wind_speed,rain,precipitation
0,2025-10-14 00:00:00,18.3,5.3,0.0,0.0
1,2025-10-14 01:00:00,17.8,4.3,0.0,0.0
2,2025-10-14 02:00:00,16.5,2.8,0.0,0.0
3,2025-10-14 03:00:00,16.3,2.2,0.0,0.0
4,2025-10-14 04:00:00,16.7,3.4,0.0,0.0
5,2025-10-14 05:00:00,16.9,5.5,0.0,0.0
6,2025-10-14 06:00:00,16.4,8.4,0.0,0.0
7,2025-10-14 07:00:00,16.4,9.6,0.0,0.0
8,2025-10-14 08:00:00,16.3,10.3,0.0,0.0
9,2025-10-14 09:00:00,15.9,9.6,0.0,0.0
