In [1]:
import boto3
import json
import time
from datetime import datetime

# Initialize the Lambda client
lambda_client = boto3.client('lambda')

# Lambda function name
function_name = 'etl_fetch_raw_taxi_data'

# Parameters
year = 2024
bucket_name = 'taximd'
s3_key_prefix = 'taxi/raw'
glue_job_name = 'iadFilterAndTransform'

# Create a timestamp for logging
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"Starting Lambda invocations at {timestamp}")

# Loop through all months of 2024
for month in range(1, 13):
    # Create the payload for the Lambda function
    payload = {
        'year': year,
        'month': month,
        'bucket_name': bucket_name,
        's3_key_prefix': s3_key_prefix,
        'glue_job_name': glue_job_name
    }

    print(f"\nInvoking Lambda for {year}-{month:02d}...")

    try:
        # Invoke the Lambda function
        response = lambda_client.invoke(
            FunctionName=function_name,
            InvocationType='RequestResponse',  # Use 'Event' for asynchronous invocation
            Payload=json.dumps(payload)
        )

        # Parse the response
        response_payload = json.loads(response['Payload'].read().decode())

        # Print the response
        print(f"Status code: {response['StatusCode']}")
        print(f"Response: {json.dumps(response_payload, indent=2)}")

    except Exception as e:
        print(f"Error invoking Lambda for {year}-{month:02d}: {str(e)}")

    # Add a delay between invocations to avoid throttling
    if month < 12:
        print("Waiting 1 second before next invocation...")
        time.sleep(20)

print(f"\nCompleted all Lambda invocations at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Starting Lambda invocations at 2025-04-20 03:58:54

Invoking Lambda for 2024-01...
Status code: 200
Response: {
  "statusCode": 200,
  "body": "{\"fetch_result\": \"File already exists in S3: s3://taximd/taxi/raw/year=2024/month=01/yellow_tripdata_2024-01.parquet\", \"glue_job\": \"Started Glue job iadFilterAndTransform with run ID: jr_ebbf2807459151de23f6ef3d11f8891ed5e828c9ab87384e142fbf881852db31\"}"
}
Waiting 1 second before next invocation...

Invoking Lambda for 2024-02...
Status code: 200
Response: {
  "statusCode": 200,
  "body": "{\"fetch_result\": \"File already exists in S3: s3://taximd/taxi/raw/year=2024/month=02/yellow_tripdata_2024-02.parquet\", \"glue_job\": \"Started Glue job iadFilterAndTransform with run ID: jr_4f1d56a19c8a5cb563b18d2e5fb3c79879a24703f0ee3fa3d7b284b82e2cd189\"}"
}
Waiting 1 second before next invocation...

Invoking Lambda for 2024-03...
Status code: 200
Response: {
  "statusCode": 200,
  "body": "{\"fetch_result\": \"File already exists in S3: s3://t

In [2]:
import boto3
import pandas as pd
import time

# Initialize Athena client
athena_client = boto3.client('athena')

# Set your S3 output location where query results will be stored
s3_output = 's3://taximd/athena/'

# Your query
query = """
SELECT
    pickup_hour,
    pickup_location_id,
    rides,
    month
FROM glue_transformed
WHERE (CAST(year AS INT) = 2023 AND CAST(month AS INT) BETWEEN 1 AND 12)
   OR (CAST(year AS INT) = 2024 AND CAST(month AS INT) = 1)
ORDER BY pickup_hour;
"""

# Start the query execution
response = athena_client.start_query_execution(
    QueryString=query,
    QueryExecutionContext={
        'Database': 'nyc_taxi_transformeddb'
    },
    ResultConfiguration={
        'OutputLocation': s3_output,
    }
)

# Get the query execution ID
query_execution_id = response['QueryExecutionId']

# Wait for the query to complete
state = 'RUNNING'
while state in ['RUNNING', 'QUEUED']:
    response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
    state = response['QueryExecution']['Status']['State']
    if state in ['RUNNING', 'QUEUED']:
        time.sleep(1)

# If query executed successfully, fetch the results
if state == 'SUCCEEDED':
    # Get the results
    results = athena_client.get_query_results(QueryExecutionId=query_execution_id)

    # Extract column names
    columns = [col['Label'] for col in results['ResultSet']['ResultSetMetadata']['ColumnInfo']]

    # Extract data rows
    rows = []
    for row in results['ResultSet']['Rows'][1:]:  # Skip the header row
        data = [field.get('VarCharValue', '') for field in row['Data']]
        rows.append(data)

    # Create pandas DataFrame
    df = pd.DataFrame(rows, columns=columns)

    # Convert data types as needed
    df['pickup_hour'] = pd.to_datetime(df['pickup_hour'])
    df['pickup_location_id'] = df['pickup_location_id'].astype(int)
    df['rides'] = df['rides'].astype(int)

    print(f"Query results loaded into DataFrame with {len(df)} rows")
    print(df.head())
else:
    print(f"Query failed with state: {state}")
    print(response['QueryExecution']['Status']['StateChangeReason'])

Query results loaded into DataFrame with 999 rows
  pickup_hour  pickup_location_id  rides month
0  2023-01-01                 202      0    01
1  2023-01-01                  95      0    01
2  2023-01-01                  10      0    01
3  2023-01-01                 236    137    01
4  2023-01-01                  39      1    01


In [3]:
import boto3
import pandas as pd
import sqlite3
from io import BytesIO

# Initialize S3 client
s3 = boto3.client('s3')

# S3 bucket and paths
bucket_name = "taximd"
input_prefix = "taxi/glue-transformed/"
output_prefix = "taxi/sqlite3db/"
sqlite_db_name = "taxi_data.db"

# Function to list all Parquet files in the specified year and month
def list_parquet_files(bucket, prefix, year, month):
    folder_prefix = f"{prefix}year={year}/month={month:02d}/"
    response = s3.list_objects_v2(Bucket=bucket, Prefix=folder_prefix)
    if 'Contents' in response:
        return [obj['Key'] for obj in response['Contents'] if obj['Key'].endswith('.parquet')]
    return []

# Function to read a Parquet file from S3 into a Pandas DataFrame
def read_parquet_from_s3(bucket, key):
    response = s3.get_object(Bucket=bucket, Key=key)
    return pd.read_parquet(BytesIO(response['Body'].read()))

# Function to upload SQLite database to S3
def upload_sqlite_to_s3(bucket, key, db_path):
    with open(db_path, 'rb') as f:
        s3.upload_fileobj(f, bucket, key)


# Create an SQLite database
conn = sqlite3.connect(sqlite_db_name)
cursor = conn.cursor()

# Loop through years and months
for year in [2023, 2024]:
    for month in range(1, 13):
        print(f"Processing year={year}, month={month:02d}...")
        parquet_files = list_parquet_files(bucket_name, input_prefix, year, month)

        for file_key in parquet_files:
            print(f"Reading file: {file_key}")
            # Read Parquet file into a DataFrame
            df = read_parquet_from_s3(bucket_name, file_key)

            # Append data to SQLite database
            df.to_sql('taxi_data', conn, if_exists='append', index=False)

# Create an index on the `pickup_hour` column
print("Creating index on pickup_hour...")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_pickup_hour ON taxi_data (pickup_hour);")

# Create an index on the `pickup_location_id` column
print("Creating index on pickup_location_id...")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_pickup_location_id ON taxi_data (pickup_location_id);")

# Commit and close the SQLite database
conn.commit()
conn.close()

# Upload the SQLite database to S3
output_key = f"{output_prefix}{sqlite_db_name}"
print(f"Uploading SQLite database to S3: {output_key}")
upload_sqlite_to_s3(bucket_name, output_key, sqlite_db_name)
print("Upload complete.")

Processing year=2023, month=01...
Reading file: taxi/glue-transformed/year=2023/month=01/part-00000-6a5aebc6-3d37-4aed-aac2-d7b4c8ea1aac-c000.snappy.parquet
Reading file: taxi/glue-transformed/year=2023/month=01/part-00001-6a5aebc6-3d37-4aed-aac2-d7b4c8ea1aac-c000.snappy.parquet
Reading file: taxi/glue-transformed/year=2023/month=01/part-00002-6a5aebc6-3d37-4aed-aac2-d7b4c8ea1aac-c000.snappy.parquet
Reading file: taxi/glue-transformed/year=2023/month=01/part-00003-6a5aebc6-3d37-4aed-aac2-d7b4c8ea1aac-c000.snappy.parquet
Reading file: taxi/glue-transformed/year=2023/month=01/part-00004-6a5aebc6-3d37-4aed-aac2-d7b4c8ea1aac-c000.snappy.parquet
Reading file: taxi/glue-transformed/year=2023/month=01/part-00005-6a5aebc6-3d37-4aed-aac2-d7b4c8ea1aac-c000.snappy.parquet
Reading file: taxi/glue-transformed/year=2023/month=01/part-00006-6a5aebc6-3d37-4aed-aac2-d7b4c8ea1aac-c000.snappy.parquet
Reading file: taxi/glue-transformed/year=2023/month=01/part-00007-6a5aebc6-3d37-4aed-aac2-d7b4c8ea1aac-c0

In [4]:
import boto3
import sqlite3
import pandas as pd
import tempfile

# S3 bucket and database path
bucket_name = "taximd"
sqlite_db_key = "taxi/sqlite3db/taxi_data.db"

# Initialize S3 client
s3 = boto3.client('s3')

# Download the SQLite database from S3 to a temporary file
with tempfile.NamedTemporaryFile(suffix=".db") as temp_file:
    # Download the database file from S3
    s3.download_file(bucket_name, sqlite_db_key, temp_file.name)

    # Connect to the SQLite database
    conn = sqlite3.connect(temp_file.name)

    # Query the database to read the data into a Pandas DataFrame
    query = "SELECT * FROM taxi_data;"  # Replace with your desired query if needed
    df = pd.read_sql_query(query, conn)

    # Close the connection
    conn.close()

# Display the DataFrame
print(df.head())  # Display the first few rows of the DataFrame

           pickup_hour  pickup_location_id  rides
0  2023-01-01 00:00:00                   2      0
1  2023-01-01 01:00:00                   2      0
2  2023-01-01 02:00:00                   2      0
3  2023-01-01 03:00:00                   2      0
4  2023-01-01 04:00:00                   2      0


In [5]:
df.shape

(4394088, 3)

In [6]:
import pandas as pd
# Filter the DataFrame for the specified conditions
filtered_df = (
    df.loc[
        (df['pickup_hour'] >= '2023-01-01 00:00:00') &
        (df['pickup_hour'] <= '2023-12-31 23:59:59'),
        ['pickup_hour', 'pickup_location_id', 'rides']
    ]
    .sort_values(by='pickup_hour')
)

# Display the filtered DataFrame
print(filtered_df)

                 pickup_hour  pickup_location_id  rides
0        2023-01-01 00:00:00                   2      0
26040    2023-01-01 00:00:00                  38      0
154008   2023-01-01 00:00:00                 220      0
119040   2023-01-01 00:00:00                 171      0
25296    2023-01-01 00:00:00                  37      1
...                      ...                 ...    ...
2186831  2023-12-31 23:00:00                 245      0
2082671  2023-12-31 23:00:00                  95      0
2081927  2023-12-31 23:00:00                  94      0
2080439  2023-12-31 23:00:00                  92      0
2200223  2023-12-31 23:00:00                 263     79

[2200224 rows x 3 columns]


In [7]:
# Check if there are any fully duplicated rows
duplicates = filtered_df[filtered_df.duplicated()]
print(f"🔁 Total duplicate rows: {len(duplicates)}")
print(duplicates.head())


🔁 Total duplicate rows: 0
Empty DataFrame
Columns: [pickup_hour, pickup_location_id, rides]
Index: []


In [8]:
import pandas as pd

# Filter the DataFrame for the specified conditions
filtered_df = (
    df.loc[
        (df['pickup_hour'] >= '2023-01-01 00:00:00') &
        (df['pickup_hour'] <= '2024-01-31 23:59:59') &
        (df['pickup_location_id'] == 43),
        ['pickup_hour', 'pickup_location_id', 'rides']
    ]
    .sort_values(by='pickup_hour')
)

# Display the filtered DataFrame
print(filtered_df)

                 pickup_hour  pickup_location_id  rides
29760    2023-01-01 00:00:00                  43     87
29761    2023-01-01 01:00:00                  43     72
29762    2023-01-01 02:00:00                  43     27
29763    2023-01-01 03:00:00                  43     13
29764    2023-01-01 04:00:00                  43      3
...                      ...                 ...    ...
2230723  2024-01-31 19:00:00                  43     75
2230724  2024-01-31 20:00:00                  43     47
2230725  2024-01-31 21:00:00                  43     46
2230726  2024-01-31 22:00:00                  43     37
2230727  2024-01-31 23:00:00                  43     13

[9504 rows x 3 columns]


# Model 1 – LightGBM with 28-day Lag Features

In [11]:
def create_time_series_features(df, n_lags=24*28):
    """
    Creates features and targets for the provided DataFrame, assuming it contains data for a single location.

    Parameters:
    - df: DataFrame with columns [pickup_hour, rides]
    - n_lags: Number of lag features to create

    Returns:
    - Xy: Features DataFrame with target column
    """
    # Check if enough data
    if len(df) <= n_lags:
        raise ValueError(f"Insufficient data ({len(df)} rows)")

    # Sort the data by pickup_hour
    df = df.sort_values('pickup_hour')
    rides_series = df['rides'].reset_index(drop=True)

    # Create lag features
    lag_data = {}
    for lag in range(1, n_lags + 1):
        lag_data[f'lag_{lag}'] = rides_series.shift(lag)

    # Create features DataFrame
    features_df = pd.DataFrame(lag_data)
    features_df['target'] = rides_series
    features_df['pickup_hour'] = df['pickup_hour'].reset_index(drop=True)

    # Drop rows with NaN values
    features_df = features_df.dropna()

    if len(features_df) == 0:
        raise ValueError("No valid data after dropping NaNs")

    # Create final dataset
    column_order = [f'lag_{i}' for i in range(n_lags, 0, -1)]
    Xy = features_df[column_order].copy()
    Xy['pickup_hour'] = features_df['pickup_hour']
    Xy['target'] = features_df['target'].copy()

    print(f"\nCreated dataset with {len(Xy)} rows")
    print(f"Date range: {Xy['pickup_hour'].min()} to {Xy['pickup_hour'].max()}")

    return Xy

In [12]:
import pandas as pd

def build_dataset_for_location(df, location_id, start_date, end_date, n_lags=24*28):
    """
    Filters 'df' for one location + date range, then calls 'create_time_series_features'
    to build lag-based features.

    Assumes 'df' has columns: [pickup_hour, pickup_location_id, rides].
    Uses the 'create_time_series_features' function you provided.

    Returns:
        (X, y): Feature matrix and target array.
    """

    loc_df = df.loc[
        (df['pickup_hour'] >= start_date) &
        (df['pickup_hour'] <= end_date) &
        (df['pickup_location_id'] == location_id),
        ['pickup_hour', 'rides']  # We only need these columns for the TS function
    ].copy()

    # Create lag features using your existing function
    Xy = create_time_series_features(loc_df, n_lags=n_lags)

    # Separate features and target
    X = Xy.drop(columns=['pickup_hour','target'])
    y = Xy['target'].values

    return X, y


In [13]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

def train_single_lgbm_model(X, y):
    """
    Trains a single LightGBM model on the provided X, y.
    Returns the fitted model plus basic metrics (MAE, RMSE).
    """
    model = lgb.LGBMRegressor()
    model.fit(X, y)

    # Evaluate on the same data (or do a proper train/test split by date if needed)
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = mean_squared_error(y, preds, squared=False)

    return model, mae, rmse


In [14]:
# Example usage:

# Suppose your main DataFrame is called 'df' and has columns:
# [pickup_hour, pickup_location_id, rides]

# Define the locations you want, e.g., 3 total, or as many as you like
locations = [43, 50, 79]

# Define date range and lags
START_DATE = '2023-01-01 00:00:00'
END_DATE   = '2024-01-31 23:59:59'
N_LAGS     = 24 * 28  # 28 days

# Dictionary to store the models
location_models = {}

for loc_id in locations:
    print(f"\n=== Training one model for Location {loc_id} ===")

    # Build dataset for this location
    X, y = build_dataset_for_location(
        df,
        location_id=loc_id,
        start_date=START_DATE,
        end_date=END_DATE,
        n_lags=N_LAGS
    )
    
    if len(X) == 0:
        print("  No data found for this location, skipping.")
        continue

    # Train a single LightGBM model
    model, mae, rmse = train_single_lgbm_model(X, y)
    print(f"  MAE={mae:.2f}, RMSE={rmse:.2f} (rows={len(X)})")

    # Store the model and metrics
    location_models[loc_id] = {
        "model": model,
        "MAE": mae,
        "RMSE": rmse
    }



=== Training one model for Location 43 ===

Created dataset with 8832 rows
Date range: 2023-01-29 00:00:00 to 2024-01-31 23:00:00
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066880 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156487
[LightGBM] [Info] Number of data points in the train set: 8832, number of used features: 672
[LightGBM] [Info] Start training from score 67.247849




  MAE=5.86, RMSE=8.35 (rows=8832)

=== Training one model for Location 50 ===

Created dataset with 8832 rows
Date range: 2023-01-29 00:00:00 to 2024-01-31 23:00:00
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022520 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 59812
[LightGBM] [Info] Number of data points in the train set: 8832, number of used features: 672
[LightGBM] [Info] Start training from score 24.705276




  MAE=3.39, RMSE=4.57 (rows=8832)

=== Training one model for Location 79 ===

Created dataset with 8832 rows
Date range: 2023-01-29 00:00:00 to 2024-01-31 23:00:00
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.055110 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171360
[LightGBM] [Info] Number of data points in the train set: 8832, number of used features: 672
[LightGBM] [Info] Start training from score 92.872283
  MAE=6.92, RMSE=10.13 (rows=8832)




In [15]:
import pandas as pd



# Filter the DataFrame for the specified conditions
filtered_df = (
    df.loc[
        (df['pickup_hour'] >= '2023-12-04 00:00:00') &
        (df['pickup_hour'] <= '2024-12-31 23:59:59') &
        (df['pickup_location_id'] == 43),
        ['pickup_hour', 'pickup_location_id', 'rides']
    ]
    .sort_values(by='pickup_hour')
)

# Display the filtered DataFrame
print(filtered_df)

                 pickup_hour  pickup_location_id  rides
2044056  2023-12-04 00:00:00                  43      2
2044057  2023-12-04 01:00:00                  43      2
2044058  2023-12-04 02:00:00                  43      0
2044059  2023-12-04 03:00:00                  43      1
2044060  2023-12-04 04:00:00                  43      2
...                      ...                 ...    ...
4238587  2024-12-31 19:00:00                  43     70
4238588  2024-12-31 20:00:00                  43     45
4238589  2024-12-31 21:00:00                  43     37
4238590  2024-12-31 22:00:00                  43     22
4238591  2024-12-31 23:00:00                  43     44

[9456 rows x 3 columns]


In [16]:
df_2024_ts = create_time_series_features(filtered_df)


Created dataset with 8784 rows
Date range: 2024-01-01 00:00:00 to 2024-12-31 23:00:00


In [17]:
X_2024 = df_2024_ts.copy()
X_2024 = X_2024.drop(columns=['pickup_hour', 'target'])
y_2024 = df_2024_ts['target']

In [18]:
location_models

{43: {'model': LGBMRegressor(),
  'MAE': 5.862633429810342,
  'RMSE': 8.3480343553471},
 50: {'model': LGBMRegressor(),
  'MAE': 3.3931056602672043,
  'RMSE': 4.570935382506919},
 79: {'model': LGBMRegressor(),
  'MAE': 6.920997608160822,
  'RMSE': 10.13344702398812}}

In [19]:
import numpy as np

In [20]:
y_pred = np.ceil(location_models[43]['model'].predict(X_2024))
test_mae = mean_absolute_error(y_2024, y_pred)
print(f"{test_mae:.4f}")

9.6477


In [21]:
# Create a DataFrame with named columns for better readability
df_result = pd.DataFrame({
    'pickup_hour': df_2024_ts['pickup_hour'],
    'actual': y_2024,
    'predicted': y_pred
})

# Display the resulting DataFrame
print(df_result)

              pickup_hour  actual  predicted
672   2024-01-01 00:00:00     155       33.0
673   2024-01-01 01:00:00      77       62.0
674   2024-01-01 02:00:00      34       32.0
675   2024-01-01 03:00:00      11       13.0
676   2024-01-01 04:00:00       4        4.0
...                   ...     ...        ...
9451  2024-12-31 19:00:00      70       55.0
9452  2024-12-31 20:00:00      45       52.0
9453  2024-12-31 21:00:00      37       41.0
9454  2024-12-31 22:00:00      22       32.0
9455  2024-12-31 23:00:00      44       17.0

[8784 rows x 3 columns]


In [22]:
import plotly.graph_objects as go

def plot_ride_prediction(df_result, final_prediction=None):
    """
    Create a Plotly visualization of actual vs predicted time series with an optional final prediction point.

    Parameters:
    - df_result: DataFrame with columns ['pickup_hour', 'actual', 'predicted']
    - final_prediction: Optional dictionary with keys ['pickup_hour', 'predicted'] for the next prediction
    """
    # Create figure
    fig = go.Figure()

    # Add actual rides trace
    fig.add_trace(go.Scatter(
        x=df_result['pickup_hour'],
        y=df_result['actual'],
        mode='lines',
        name='Actual Rides',
        line=dict(color='blue', width=2)
    ))

    # Add predicted rides trace
    fig.add_trace(go.Scatter(
        x=df_result['pickup_hour'],
        y=df_result['predicted'],
        mode='lines',
        name='Predicted Rides',
        line=dict(color='green', width=2, dash='dash')
    ))

    # Add final prediction point if provided
    if final_prediction:
        fig.add_trace(go.Scatter(
            x=[final_prediction['pickup_hour']],
            y=[final_prediction['predicted']],
            mode='markers',
            name='Next Hour Prediction',
            marker=dict(color='red', size=12, symbol='star')
        ))

        # Add annotation for the prediction
        fig.add_annotation(
            x=final_prediction['pickup_hour'],
            y=final_prediction['predicted'],
            text=f"Prediction: {final_prediction['predicted']} rides",
            showarrow=True,
            arrowhead=1,
            ax=40,
            ay=-40
        )

    # Add vertical line at the end of historical data
    last_historical_time = df_result['pickup_hour'].iloc[-1]
    fig.add_shape(
        type="line",
        x0=last_historical_time,
        y0=0,
        x1=last_historical_time,
        y1=max(df_result['actual'].max(), df_result['predicted'].max()) * 1.1,
        line=dict(color="gray", width=2, dash="dot")
    )

    # Update layout
    fig.update_layout(
        title="Ride Prediction",
        xaxis_title="Date & Time",
        yaxis_title="Number of Rides",
        legend_title="Data Series",
        hovermode="x unified",
        template="plotly_white"
    )

    # Add range selector
    fig.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(
            buttons=list([
                dict(count=12, label="12h", step="hour", stepmode="backward"),
                dict(count=1, label="1d", step="day", stepmode="backward"),
                dict(count=7, label="1w", step="day", stepmode="backward"),
                dict(step="all")
            ])
        )
    )

    return fig

In [23]:
plot_ride_prediction(df_result)

In [24]:
import os
import sqlite3
import boto3
import numpy as np
import pandas as pd
from io import StringIO
from datetime import datetime
from sklearn.metrics import mean_absolute_error

In [25]:
def save_predictions_to_s3_partitioned(
    df_predictions, 
    location_id, 
    s3_bucket, 
    model_id="lightgbm"
):
    """
    Saves each row of 'df_predictions' to a partitioned path in S3, as well as inserts them into a local SQLite DB.

    Args:
        df_predictions (DataFrame): Must have columns [prediction_datetime, predicted_rides].
        location_id (int): Pickup location ID
        s3_bucket (str): Name of the S3 bucket
        model_id (str): Tag inserted into 'model=<model_id>' for S3 partition path
    """
    s3_client = boto3.client('s3') if s3_bucket else None

    # SQLite DB file
    sqlite_db_path = "predicted.db"
    conn = sqlite3.connect(sqlite_db_path)
    cursor = conn.cursor()

    cursor.execute("""
    CREATE TABLE IF NOT EXISTS predictions (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        location_id INTEGER,
        prediction_datetime TEXT,
        predicted_rides INTEGER
    )
    """)

    # Loop over each row in the predictions DF
    for _, row in df_predictions.iterrows():
        dt = pd.to_datetime(row['prediction_datetime'])
        year = dt.year
        month = dt.month
        day = dt.day
        hour = dt.hour

        # Build the S3 partition path:
        # taxi/predictions/model=<model_id>/location_id=<loc_id>/year=<YYYY>/month=<MM>/day=<DD>/hour=<HH>/
        s3_key = (
            f"taxi/predictions/"
            f"model={model_id}/"
            f"location_id={location_id}/"
            f"year={year}/month={month:02d}/day={day:02d}/hour={hour}/prediction.csv"
        )

        # Create a 1-row DataFrame for CSV upload
        pred_df = pd.DataFrame([row])
        csv_buffer = StringIO()
        pred_df.to_csv(csv_buffer, index=False)

        # Upload to S3
        if s3_bucket:
            s3_client.put_object(
                Bucket=s3_bucket,
                Key=s3_key,
                Body=csv_buffer.getvalue()
            )
            print(f"Saved to s3://{s3_bucket}/{s3_key}")

        # Insert into SQLite
        cursor.execute("""
        INSERT INTO predictions (location_id, prediction_datetime, predicted_rides)
        VALUES (?, ?, ?)
        """, (location_id, row['prediction_datetime'], row['predicted_rides']))

    conn.commit()
    conn.close()
    print(f"Predictions also stored in SQLite3 at {sqlite_db_path}")

In [26]:
def predict_for_all_locations(df, location_models, s3_bucket, model_id="lightgbm"):
    """
    1) For each location in 'location_models',
       - Filter df for the unseen data range (e.g., 2023-12-04 to 2024-12-31).
       - Create time-series features.
       - Predict using that location's model.
       - Calculate MAE (optional).
       - Save predictions to S3 + SQLite partitioned path.
    """

    # Unseen data range
    UNSEEN_START = '2023-12-04 00:00:00'
    UNSEEN_END   = '2024-12-31 23:59:59'

    for loc_id, info in location_models.items():
        print(f"\n=== Generating predictions for Location {loc_id} ===")

        # 1) Filter unseen data
        unseen_df = df.loc[
            (df['pickup_hour'] >= UNSEEN_START) &
            (df['pickup_hour'] <= UNSEEN_END) &
            (df['pickup_location_id'] == loc_id),
            ['pickup_hour', 'pickup_location_id', 'rides']
        ].sort_values(by='pickup_hour')

        if unseen_df.empty:
            print(f"  No unseen data for location {loc_id}, skipping...")
            continue

        print(f"  Unseen rows: {len(unseen_df)}")

        # 2) Build time-series features (same function as training)
        unseen_ts = create_time_series_features(unseen_df)

        # 3) Separate features and actual
        X_unseen = unseen_ts.drop(columns=['pickup_hour','target'])
        y_unseen = unseen_ts['target']

        # 4) Make predictions
        model = info['model']  # your location_models dict might store {'model':..., 'MAE':..., etc.}
        y_pred = np.ceil(model.predict(X_unseen))  # rounding up, as in your snippet

        # Evaluate for reference
        mae_unseen = mean_absolute_error(y_unseen, y_pred)
        print(f"  MAE on unseen data = {mae_unseen:.4f}")

        # 5) Build a DataFrame of predictions
        df_predictions = pd.DataFrame({
            'prediction_datetime': unseen_ts['pickup_hour'],
            'predicted_rides': y_pred.astype(int)
        })

        save_predictions_to_s3_partitioned(
            df_predictions,
            location_id=loc_id,
            s3_bucket=s3_bucket,
            model_id=model_id
        )




In [27]:
df.shape

(4394088, 3)

In [28]:
bucket_name = "taximd"
predict_for_all_locations(df, location_models, bucket_name, model_id="lightgbm")


=== Generating predictions for Location 43 ===
  Unseen rows: 9456

Created dataset with 8784 rows
Date range: 2024-01-01 00:00:00 to 2024-12-31 23:00:00
  MAE on unseen data = 9.6477
Saved to s3://taximd/taxi/predictions/model=lightgbm/location_id=43/year=2024/month=01/day=01/hour=0/prediction.csv
Saved to s3://taximd/taxi/predictions/model=lightgbm/location_id=43/year=2024/month=01/day=01/hour=1/prediction.csv
Saved to s3://taximd/taxi/predictions/model=lightgbm/location_id=43/year=2024/month=01/day=01/hour=2/prediction.csv
Saved to s3://taximd/taxi/predictions/model=lightgbm/location_id=43/year=2024/month=01/day=01/hour=3/prediction.csv
Saved to s3://taximd/taxi/predictions/model=lightgbm/location_id=43/year=2024/month=01/day=01/hour=4/prediction.csv
Saved to s3://taximd/taxi/predictions/model=lightgbm/location_id=43/year=2024/month=01/day=01/hour=5/prediction.csv
Saved to s3://taximd/taxi/predictions/model=lightgbm/location_id=43/year=2024/month=01/day=01/hour=6/prediction.csv
Sav

# Lightgbm with features

In [70]:
import pandas as pd
import numpy as np
from datetime import timedelta

def build_full_feature_set(df, n_lags=24*28):
    """
    1) Creates many lag features (up to n_lags).
    2) Adds 'avg_same_hour_4w' = average rides in the preceding 4 weeks for that specific hour_of_day.
       (Naive approach: for each row, we find rows in [t-28 days, t) with the same hour_of_day, average the 'rides'.)

    Returns a DataFrame with columns:
      [lag_1 ... lag_n, pickup_hour, target, hour_of_day, avg_same_hour_4w, ...]
    """

    # Sort & reset index
    df = df.sort_values('pickup_hour').reset_index(drop=True)

    # Step A: create standard lag features from your existing logic
    #   - we assume "df" has columns: ['pickup_hour', 'rides']
    rides_series = df['rides'].copy()
    if len(rides_series) <= n_lags:
        raise ValueError(f"Insufficient data: {len(rides_series)} rows, need more than {n_lags}")

    lag_dict = {}
    for lag in range(1, n_lags + 1):
        lag_dict[f'lag_{lag}'] = rides_series.shift(lag)

    features_df = pd.DataFrame(lag_dict)
    features_df['target'] = rides_series
    features_df['pickup_hour'] = df['pickup_hour']
    features_df.dropna(inplace=True)

    # Step B: Extract hour_of_day
    features_df['hour_of_day'] = features_df['pickup_hour'].dt.hour

    # Step C: compute avg_same_hour_4w
    #   For each row i, we look for rows in [t-28 days, t), with same hour_of_day
    #   and average their 'target' (rides).
    avg_4w_list = []
    for i, row in features_df.iterrows():
        current_time = row['pickup_hour']
        hour = row['hour_of_day']
        start_time = current_time - timedelta(days=28)

        # Filter up to the current row for times in [start_time, current_time),
        # matching hour_of_day
        subset = features_df.loc[
            (features_df['pickup_hour'] < current_time) &
            (features_df['pickup_hour'] >= start_time) &
            (features_df['hour_of_day'] == hour)
        ]
        if len(subset) == 0:
            avg_4w_list.append(np.nan)
        else:
            avg_4w_list.append(subset['target'].mean())

    features_df['avg_same_hour_4w'] = avg_4w_list
    features_df.fillna(features_df['target'].mean(), inplace=True)  # fill any leftover NaNs

    return features_df


In [71]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

REQUIRED_FEATURES = [
    'lag_1', 'lag_24', 'lag_48', 'lag_72', 'lag_96',
    'avg_same_hour_4w'
]
# We have 6 mandatory. We'll add 4 more from feature importance = total 10.
# If you truly want 5 more, that’s 11 total.

def train_lgbm_with_feature_selection(X_full, y_full):
    """
    1) Train a temporary model on all features to get feature importances.
    2) Keep only REQUIRED_FEATURES plus the top 4 from feature importance (excluding required).
    3) Train a final LightGBM on that subset of features, return model + metrics + used feature list.
    """

    # A) TEMP model on all features for importance
    temp_model = lgb.LGBMRegressor(random_state=42)
    temp_model.fit(X_full, y_full)

    # Get feature importances
    importances = temp_model.feature_importances_
    feature_names = X_full.columns.tolist()
    feat_imp_pairs = sorted(
        zip(feature_names, importances),
        key=lambda x: x[1],
        reverse=True
    )

    # B) Build the final feature list:
    #  - Always keep the REQUIRED_FEATURES (if they exist in X).
    #  - Then select top 4 from the rest, ignoring any that are in REQUIRED_FEATURES already.
    required = [f for f in REQUIRED_FEATURES if f in X_full.columns]
    # Gather all non-required features sorted by importance
    non_required_sorted = [f for f, imp in feat_imp_pairs if f not in required]
    # Pick top 4 from these
    extra_4 = non_required_sorted[:4]  # pick however many you need to get total = 10
    final_features = required + extra_4

    # Filter X to final columns
    X_final = X_full[final_features].copy()

    # C) Train final model
    final_model = lgb.LGBMRegressor(random_state=42)
    final_model.fit(X_final, y_full)

    # Evaluate
    preds = final_model.predict(X_final)
    mae = mean_absolute_error(y_full, preds)
    rmse = mean_squared_error(y_full, preds, squared=False)

    return final_model, final_features, mae, rmse


In [72]:
def train_lightgbm_per_location(df, locations, start_date, end_date):
    """
    For each location in 'locations':
      - Filter data to (start_date, end_date),
      - Build the full feature set (with all lags + avg_same_hour_4w),
      - Keep only 10 features (6 required + 4 best from importance),
      - Train a final LightGBM,
      - Return a dict of {loc_id: {'model': model, 'features': [...], 'MAE':..., 'RMSE':...}}.
    """

    location_models = {}
    for loc_id in locations:
        print(f"\n=== Training LightGBM for Location {loc_id} ===")

        # A) Filter data for that location & date range
        loc_data = df.loc[
            (df['pickup_location_id'] == loc_id) &
            (df['pickup_hour'] >= start_date) &
            (df['pickup_hour'] <= end_date),
            ['pickup_hour','rides']
        ].copy()
        if loc_data.empty:
            print(f"  No data for {loc_id}, skipping...")
            continue

        # B) Build big feature set
        try:
            big_features = build_full_feature_set(loc_data, n_lags=24*28)
        except ValueError as e:
            print(f"  Skipping {loc_id}: {e}")
            continue

        # C) Separate X,y
        X_all = big_features.drop(columns=['pickup_hour','target'])
        y_all = big_features['target'].values

        # D) Train final model with feature selection
        model, used_feats, mae, rmse = train_lgbm_with_feature_selection(X_all, y_all)
        print(f"  => Used {len(used_feats)} features: {used_feats}")
        print(f"  => MAE={mae:.2f}, RMSE={rmse:.2f}, rows={len(X_all)}")

        location_models[loc_id] = {
            "model": model,
            "features": used_feats,
            "MAE": mae,
            "RMSE": rmse
        }
    return location_models


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4394088 entries, 0 to 4394087
Data columns (total 3 columns):
 #   Column              Dtype         
---  ------              -----         
 0   pickup_hour         datetime64[ns]
 1   pickup_location_id  int64         
 2   rides               int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 100.6 MB


In [74]:
import pandas as pd

# Suppose your DataFrame is named 'df' and 'pickup_hour' is currently a string
df['pickup_hour'] = pd.to_datetime(df['pickup_hour'])

# Now 'pickup_hour' is a Pandas datetime64[ns] type
print(df.dtypes)


pickup_hour           datetime64[ns]
pickup_location_id             int64
rides                          int64
dtype: object


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4394088 entries, 0 to 4394087
Data columns (total 3 columns):
 #   Column              Dtype         
---  ------              -----         
 0   pickup_hour         datetime64[ns]
 1   pickup_location_id  int64         
 2   rides               int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 100.6 MB


In [76]:

locations = [43, 50, 79]  # e.g., 3 location IDs
START_DATE = '2023-01-01 00:00:00'
END_DATE   = '2024-01-31 23:59:59'

location_models = train_lightgbm_per_location(df, locations, START_DATE, END_DATE)

# 'location_models' now has your final LightGBM for each location,
# trained on exactly 10 features: the 6 required + 4 best from importance.



=== Training LightGBM for Location 43 ===
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051693 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156766
[LightGBM] [Info] Number of data points in the train set: 8832, number of used features: 674
[LightGBM] [Info] Start training from score 67.247849
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000360 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2351
[LightGBM] [Info] Number of data points in the train set: 8832, number of used features: 10
[LightGBM] [Info] Start training from score 67.247849
  => Used 10 features: ['lag_1', 'lag_24', 'lag_48', 'lag_72', 'lag_96', 'avg_same_hour_4w', 'lag_3', 'lag_335', 'lag_168', 'lag_2']
  => MAE=8.24, RMSE=12.14, rows=8832

=== Training LightGBM for Location 50 ===



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 60091
[LightGBM] [Info] Number of data points in the train set: 8832, number of used features: 674
[LightGBM] [Info] Start training from score 24.705276
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1056
[LightGBM] [Info] Number of data points in the train set: 8832, number of used features: 10
[LightGBM] [Info] Start training from score 24.705276
  => Used 10 features: ['lag_1', 'lag_24', 'lag_48', 'lag_72', 'lag_96', 'avg_same_hour_4w', 'lag_2', 'lag_336', 'lag_4', 'lag_144']
  => MAE=4.63, RMSE=6.45, rows=8832

=== Training LightGBM for Location 79 ===



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059091 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171639
[LightGBM] [Info] Number of data points in the train set: 8832, number of used features: 674
[LightGBM] [Info] Start training from score 92.872283
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000214 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 8832, number of used features: 10
[LightGBM] [Info] Start training from score 92.872283
  => Used 10 features: ['lag_1', 'lag_24', 'lag_48', 'lag_72', 'lag_96', 'avg_same_hour_4w', 'lag_2', 'lag_168', 'lag_362', 'lag_167']
  => MAE=9.38, RMSE=13.96, rows=8832



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



In [77]:
location_models

{43: {'model': LGBMRegressor(random_state=42),
  'features': ['lag_1',
   'lag_24',
   'lag_48',
   'lag_72',
   'lag_96',
   'avg_same_hour_4w',
   'lag_3',
   'lag_335',
   'lag_168',
   'lag_2'],
  'MAE': 8.241173471074546,
  'RMSE': 12.137380945197188},
 50: {'model': LGBMRegressor(random_state=42),
  'features': ['lag_1',
   'lag_24',
   'lag_48',
   'lag_72',
   'lag_96',
   'avg_same_hour_4w',
   'lag_2',
   'lag_336',
   'lag_4',
   'lag_144'],
  'MAE': 4.629577434019678,
  'RMSE': 6.446459844355163},
 79: {'model': LGBMRegressor(random_state=42),
  'features': ['lag_1',
   'lag_24',
   'lag_48',
   'lag_72',
   'lag_96',
   'avg_same_hour_4w',
   'lag_2',
   'lag_168',
   'lag_362',
   'lag_167'],
  'MAE': 9.382510418587712,
  'RMSE': 13.95911547507249}}

In [78]:
df

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2023-01-01 00:00:00,2,0
1,2023-01-01 01:00:00,2,0
2,2023-01-01 02:00:00,2,0
3,2023-01-01 03:00:00,2,0
4,2023-01-01 04:00:00,2,0
...,...,...,...
4394083,2024-12-31 19:00:00,263,122
4394084,2024-12-31 20:00:00,263,194
4394085,2024-12-31 21:00:00,263,196
4394086,2024-12-31 22:00:00,263,150


In [79]:
import sqlite3
import boto3
import pandas as pd
from io import StringIO

def save_predictions_to_s3_partitioned_newdb(
    df_predictions,
    location_id,
    s3_bucket,
    model_id="lightgbm"
):
    """
    Saves each row of 'df_predictions' to:
      s3://<bucket>/taxi/predictions/model=<model_id>/location_id=<ID>/year=<YYYY>/month=<MM>/day=<DD>/hour=<HH>/prediction.csv
    AND inserts them into a local SQLite DB named 'predicted_v3.db' with a table 'predictions_v3'.
    
    The 'prediction_datetime' column is declared as DATETIME in SQLite.
    
    Args:
        df_predictions (DataFrame): Must have columns [prediction_datetime, predicted_rides].
        location_id (int): Pickup location ID
        s3_bucket (str): Name of the S3 bucket
        model_id (str): Tag used in the S3 key (e.g., 'lightgbm')
    """

    # Create S3 client if bucket specified
    s3_client = boto3.client('s3') if s3_bucket else None

    # SQLite DB name
    sqlite_db_path = "predicted_v3.db"

    # Connect to the new DB
    conn = sqlite3.connect(sqlite_db_path)
    cursor = conn.cursor()

    # Create the table with DATETIME for prediction_datetime
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS predictions_v3 (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        location_id INTEGER,
        prediction_datetime DATETIME,
        predicted_rides INTEGER
    )
    """)

    # Insert each row + upload to S3
    for _, row in df_predictions.iterrows():
        dt = pd.to_datetime(row['prediction_datetime'])
        year = dt.year
        month = dt.month
        day = dt.day
        hour = dt.hour

        # Build S3 key (partition path)
        s3_key = (
            f"taxi/predictions/"
            f"model={model_id}/"
            f"location_id={location_id}/"
            f"year={year}/month={month:02d}/day={day:02d}/hour={hour}/prediction.csv"
        )

        # Convert single row to CSV
        pred_df = pd.DataFrame([row])
        csv_buffer = StringIO()
        pred_df.to_csv(csv_buffer, index=False)

        # Upload to S3 if a bucket was specified
        if s3_client:
            s3_client.put_object(
                Bucket=s3_bucket,
                Key=s3_key,
                Body=csv_buffer.getvalue()
            )
            print(f"Saved to s3://{s3_bucket}/{s3_key}")

        # Insert into the new DB table
        # Convert row['prediction_datetime'] to a string
        time_str = row['prediction_datetime'].strftime('%Y-%m-%d %H:%M:%S')
        
        cursor.execute("""
        INSERT INTO predictions_v3 (location_id, prediction_datetime, predicted_rides)
        VALUES (?, ?, ?)
        """, (location_id, time_str, row['predicted_rides']))


    conn.commit()
    conn.close()

    print(f"Predictions stored in {sqlite_db_path} (table: predictions_v3) with DATETIME column.")


In [80]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error

def predict_for_all_locations_with_features(
    df,
    location_models,
    s3_bucket,
    model_id="lightgbm_feature_engineering",
    unseen_start='2023-12-04 00:00:00',
    unseen_end='2024-12-31 23:59:59'
):
    """
    For each location in 'location_models':
      1) Filter df for the unseen date range (unseen_start to unseen_end)
      2) Build the same full feature set used during training (via build_full_feature_set)
      3) Subset the columns to those the model was trained on
      4) Predict using the stored model
      5) Compute and print MAE for reference
      6) Save predictions to S3 + SQLite partitioned path using save_predictions_to_s3_partitioned
    """

    for loc_id, info in location_models.items():
        print(f"\n=== Generating predictions for Location {loc_id} ===")

        # 1) Filter the unseen data for this location
        unseen_df = df.loc[
            (df['pickup_hour'] >= unseen_start) &
            (df['pickup_hour'] <= unseen_end) &
            (df['pickup_location_id'] == loc_id)
        ].copy()

        # Convert pickup_hour to datetime if still string
        unseen_df['pickup_hour'] = pd.to_datetime(unseen_df['pickup_hour'])

        if unseen_df.empty:
            print(f"  No unseen data for location {loc_id}; skipping...")
            continue

        # 2) Build the same full feature set used at training
        try:
            big_unseen_features = build_full_feature_set(unseen_df, n_lags=24*28)
        except ValueError as e:
            print(f"  Skipping {loc_id} due to insufficient data: {e}")
            continue

        print(f"  Unseen rows (after dropping NaN lags): {len(big_unseen_features)}")

        # 3) Subset columns to match what the model was trained on
        final_feature_list = info['features']  # e.g. from your training dict: { "model":..., "features":..., ...}
        X_unseen = big_unseen_features[final_feature_list].copy()

        # True target (if you want to measure error)
        y_unseen = big_unseen_features['target'].values

        # 4) Predict
        model = info['model']
        y_pred = model.predict(X_unseen)
        y_pred_rounded = np.ceil(y_pred)  # or np.round, as you prefer

        # 5) Compute MAE
        mae_unseen = mean_absolute_error(y_unseen, y_pred_rounded)
        print(f"  MAE on unseen data = {mae_unseen:.4f}")

        # 6) Build DataFrame of predictions and save
        df_predictions = pd.DataFrame({
            'prediction_datetime': big_unseen_features['pickup_hour'],
            'predicted_rides': y_pred_rounded.astype(int)
        })

        # Optionally: df_predictions['actual_rides'] = y_unseen

        # Save predictions to S3 + SQLite
        save_predictions_to_s3_partitioned_newdb(
            df_predictions,
            location_id=loc_id,
            s3_bucket=s3_bucket,
            model_id=model_id
        )


In [81]:
bucket_name = "taximd"
predict_for_all_locations_with_features(df, location_models, bucket_name, model_id="lightgbm_feature_engineering")


=== Generating predictions for Location 43 ===
  Unseen rows (after dropping NaN lags): 8784
  MAE on unseen data = 10.5636
Saved to s3://taximd/taxi/predictions/model=lightgbm_feature_engineering/location_id=43/year=2024/month=01/day=01/hour=0/prediction.csv
Saved to s3://taximd/taxi/predictions/model=lightgbm_feature_engineering/location_id=43/year=2024/month=01/day=01/hour=1/prediction.csv
Saved to s3://taximd/taxi/predictions/model=lightgbm_feature_engineering/location_id=43/year=2024/month=01/day=01/hour=2/prediction.csv
Saved to s3://taximd/taxi/predictions/model=lightgbm_feature_engineering/location_id=43/year=2024/month=01/day=01/hour=3/prediction.csv
Saved to s3://taximd/taxi/predictions/model=lightgbm_feature_engineering/location_id=43/year=2024/month=01/day=01/hour=4/prediction.csv
Saved to s3://taximd/taxi/predictions/model=lightgbm_feature_engineering/location_id=43/year=2024/month=01/day=01/hour=5/prediction.csv
Saved to s3://taximd/taxi/predictions/model=lightgbm_featur