In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import src.config as config

In [3]:
from src.inference import get_feature_store

In [4]:
from datetime import datetime, timedelta
import pandas as pd  

# Get the current datetime64[us, Etc/UTC]  
current_date = pd.Timestamp.now(tz='Etc/UTC')
feature_store = get_feature_store()

# read time-series data from the feature store
fetch_data_to = current_date - timedelta(hours=1)
fetch_data_from = current_date - timedelta(days=1*29)
print(f"Fetching data from {fetch_data_from} to {fetch_data_to}")
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME, version=config.FEATURE_VIEW_VERSION
)

ts_data = feature_view.get_batch_data(
    start_time=(fetch_data_from - timedelta(days=1)),
    end_time=(fetch_data_to + timedelta(days=1)),
)
ts_data = ts_data[ts_data.pickup_hour.between(fetch_data_from, fetch_data_to)]

2025-03-04 09:23:24,849 INFO: Initializing external client
2025-03-04 09:23:24,850 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 09:23:25,493 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214690
Fetching data from 2025-02-03 14:23:24.848113+00:00 to 2025-03-04 13:23:24.848113+00:00
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.85s) 


In [5]:
ts_data.sort_values(["pickup_location_id", "pickup_hour"]).reset_index(drop=True)

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2025-02-04 14:00:00+00:00,2,0
1,2025-02-04 15:00:00+00:00,2,0
2,2025-02-04 16:00:00+00:00,2,0
3,2025-02-04 17:00:00+00:00,2,0
4,2025-02-04 18:00:00+00:00,2,0
...,...,...,...
168667,2025-03-04 09:00:00+00:00,263,134
168668,2025-03-04 10:00:00+00:00,263,116
168669,2025-03-04 11:00:00+00:00,263,98
168670,2025-03-04 12:00:00+00:00,263,100


In [6]:
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168672 entries, 0 to 168671
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype                  
---  ------              --------------   -----                  
 0   pickup_hour         168672 non-null  datetime64[us, Etc/UTC]
 1   pickup_location_id  168672 non-null  int32                  
 2   rides               168672 non-null  int32                  
dtypes: datetime64[us, Etc/UTC](1), int32(2)
memory usage: 2.6 MB


In [7]:
ts_data["pickup_hour"] = ts_data["pickup_hour"].dt.tz_localize(None)

In [8]:
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168672 entries, 0 to 168671
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   pickup_hour         168672 non-null  datetime64[us]
 1   pickup_location_id  168672 non-null  int32         
 2   rides               168672 non-null  int32         
dtypes: datetime64[us](1), int32(2)
memory usage: 2.6 MB


In [10]:
# First, let's analyze data availability per location
location_data_counts = ts_data.groupby('pickup_location_id').size()
print(f"Locations with data: {len(location_data_counts)}")

# Calculate optimal window size based on data availability
min_records = location_data_counts.min()
median_records = location_data_counts.median()
print(f"Minimum records per location: {min_records}")
print(f"Median records per location: {median_records}")

# Dynamically adjust window size
optimal_window = min(24 * 7, int(min_records * 0.8))  # Use 80% of minimum records or 7 days
optimal_step = max(1, optimal_window // 24)  # Ensure at least 24 steps

print(f"\nUsing optimized parameters:")
print(f"Window size: {optimal_window} hours ({optimal_window/24:.1f} days)")
print(f"Step size: {optimal_step} hours")

# Filter locations with sufficient data
sufficient_locations = location_data_counts[location_data_counts >= optimal_window].index
ts_data_filtered = ts_data[ts_data.pickup_location_id.isin(sufficient_locations)]

# Transform with optimized parameters
features = transform_ts_data_info_features(
    ts_data_filtered,
    window_size=optimal_window,
    step_size=optimal_step
)

print(f"\nFeatures generated: {features.shape}")
print(f"Locations included: {features['pickup_location_id'].nunique()}")

Locations with data: 251
Minimum records per location: 672
Median records per location: 672.0

Using optimized parameters:
Window size: 168 hours (7.0 days)
Step size: 7 hours

Features generated: (18072, 170)
Locations included: 251


In [11]:
features

Unnamed: 0,rides_t-168,rides_t-167,rides_t-166,rides_t-165,rides_t-164,rides_t-163,rides_t-162,rides_t-161,rides_t-160,rides_t-159,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_location_id,pickup_hour
0,3,0,0,0,2,1,0,0,3,2,...,2,0,0,0,2,1,4,0,216,2025-03-02 00:00:00
1,0,3,2,0,1,1,1,1,1,0,...,0,2,4,1,8,1,0,0,216,2025-02-17 20:00:00
2,1,1,0,1,1,2,1,4,1,2,...,0,4,0,2,3,0,2,1,216,2025-02-28 05:00:00
3,4,1,2,2,1,0,0,1,1,0,...,1,2,0,2,1,0,0,0,216,2025-02-28 06:00:00
4,1,1,0,0,1,1,0,3,0,1,...,0,1,1,3,0,2,0,1,216,2025-02-26 16:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18067,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,202,2025-02-27 20:00:00
18068,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,202,2025-02-19 06:00:00
18069,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,2,1,0,202,2025-02-24 18:00:00
18070,1,1,0,0,0,0,0,0,0,1,...,0,0,1,0,1,0,0,0,202,2025-02-26 04:00:00


In [13]:
from src.inference import load_batch_of_features_from_store, get_feature_store
import pandas as pd
from datetime import timedelta
import src.config as config

# Get current date and feature store
current_date = pd.Timestamp.now(tz='Etc/UTC')
feature_store = get_feature_store()

try:
    # Step 1: Get raw data
    fetch_data_to = current_date - timedelta(hours=1)
    fetch_data_from = current_date - timedelta(days=14)  # Reduced to 14 days
    
    feature_view = feature_store.get_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION
    )
    
    # Step 2: Fetch and prepare data
    ts_data = feature_view.get_batch_data(
        start_time=fetch_data_from,
        end_time=fetch_data_to
    )
    
    # Step 3: Analyze data availability
    location_data_counts = ts_data.groupby('pickup_location_id').size()
    min_records = location_data_counts.min()
    print(f"Minimum records per location: {min_records}")
    
    # Step 4: Calculate optimal window size
    optimal_window = min(24 * 7, int(min_records * 0.8))  # Use 80% of minimum records or 7 days
    optimal_step = max(1, optimal_window // 24)  # Ensure at least 24 steps
    
    print(f"\nOptimized parameters:")
    print(f"Window size: {optimal_window} hours ({optimal_window/24:.1f} days)")
    print(f"Step size: {optimal_step} hours")
    
    # Step 5: Transform with optimized parameters
    from src.data_utils import transform_ts_data_info_features
    features = transform_ts_data_info_features(
        ts_data,
        window_size=optimal_window,
        step_size=optimal_step
    )
    
    print(f"\nFeatures generated successfully:")
    print(f"Shape: {features.shape}")
    print(f"Locations: {features['pickup_location_id'].nunique()}")
    
except Exception as e:
    print(f"Error: {str(e)}")
    print("\nDebug Information:")
    if 'ts_data' in locals():
        print(f"Data shape: {ts_data.shape}")
        print(f"Date range: {ts_data.pickup_hour.min()} to {ts_data.pickup_hour.max()}")
    features = None

# Return the features
features


2025-03-04 09:26:48,098 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 09:26:48,106 INFO: Initializing external client
2025-03-04 09:26:48,106 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 09:26:48,601 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214690
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.98s) 
Minimum records per location: 335

Optimized parameters:
Window size: 168 hours (7.0 days)
Step size: 7 hours

Features generated successfully:
Shape: (6024, 170)
Locations: 251


Unnamed: 0,rides_t-168,rides_t-167,rides_t-166,rides_t-165,rides_t-164,rides_t-163,rides_t-162,rides_t-161,rides_t-160,rides_t-159,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_location_id,pickup_hour
0,3,0,0,1,3,0,1,1,1,1,...,1,1,1,3,1,2,0,1,216,2025-02-23 11:00:00
1,1,1,1,1,1,1,1,0,1,0,...,1,0,0,1,1,0,0,3,216,2025-02-18 17:00:00
2,0,1,0,1,1,3,0,1,2,0,...,3,1,1,3,0,0,2,0,216,2025-03-01 00:00:00
3,1,2,0,1,0,0,0,1,0,6,...,0,0,0,0,0,2,2,0,216,2025-02-19 16:00:00
4,1,0,6,0,0,0,3,1,1,2,...,0,1,1,0,2,1,2,4,216,2025-02-21 23:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6019,246,72,56,91,43,18,96,102,3,56,...,44,36,72,47,63,53,89,100,79,2025-02-19 04:00:00
6020,102,3,56,74,164,59,44,125,22,12,...,100,20,131,266,49,10,26,22,79,2025-02-26 12:00:00
6021,125,22,12,39,77,98,45,329,67,168,...,22,56,28,56,29,63,11,111,79,2025-02-25 20:00:00
6022,329,67,168,93,12,155,63,244,10,46,...,111,274,114,35,17,81,44,550,79,2025-03-02 18:00:00


In [14]:
current_date

Timestamp('2025-03-04 14:26:48.098685+0000', tz='Etc/UTC')

In [15]:
features

Unnamed: 0,rides_t-168,rides_t-167,rides_t-166,rides_t-165,rides_t-164,rides_t-163,rides_t-162,rides_t-161,rides_t-160,rides_t-159,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_location_id,pickup_hour
0,3,0,0,1,3,0,1,1,1,1,...,1,1,1,3,1,2,0,1,216,2025-02-23 11:00:00
1,1,1,1,1,1,1,1,0,1,0,...,1,0,0,1,1,0,0,3,216,2025-02-18 17:00:00
2,0,1,0,1,1,3,0,1,2,0,...,3,1,1,3,0,0,2,0,216,2025-03-01 00:00:00
3,1,2,0,1,0,0,0,1,0,6,...,0,0,0,0,0,2,2,0,216,2025-02-19 16:00:00
4,1,0,6,0,0,0,3,1,1,2,...,0,1,1,0,2,1,2,4,216,2025-02-21 23:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6019,246,72,56,91,43,18,96,102,3,56,...,44,36,72,47,63,53,89,100,79,2025-02-19 04:00:00
6020,102,3,56,74,164,59,44,125,22,12,...,100,20,131,266,49,10,26,22,79,2025-02-26 12:00:00
6021,125,22,12,39,77,98,45,329,67,168,...,22,56,28,56,29,63,11,111,79,2025-02-25 20:00:00
6022,329,67,168,93,12,155,63,244,10,46,...,111,274,114,35,17,81,44,550,79,2025-03-02 18:00:00


In [16]:
from src.inference import load_model_from_registry

model = load_model_from_registry()

2025-03-04 09:27:08,549 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 09:27:08,551 INFO: Initializing external client
2025-03-04 09:27:08,552 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 09:27:09,235 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214690
Downloading model artifact (0 dirs, 1 files)... DONE

In [56]:
from src.inference import load_model_from_registry, get_model_predictions, get_feature_store
import pandas as pd
from datetime import timedelta
import src.config as config
import lightgbm as lgb

try:
    # Step 1: Get feature store and data
    feature_store = get_feature_store()
    current_date = pd.Timestamp.now(tz='Etc/UTC')
    fetch_data_to = current_date - timedelta(hours=1)
    fetch_data_from = fetch_data_to - timedelta(days=25)
    
    feature_view = feature_store.get_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION
    )
    
    ts_data = feature_view.get_batch_data(
        start_time=fetch_data_from,
        end_time=fetch_data_to
    )
    
    ts_data['pickup_hour'] = ts_data['pickup_hour'].dt.tz_localize(None)
    ts_data = ts_data.sort_values(['pickup_location_id', 'pickup_hour'])
    
    print(f"Data loaded: {len(ts_data)} records")
    
    # Step 2: Generate features
    from src.data_utils import transform_ts_data_info_features
    features = transform_ts_data_info_features(
        ts_data,
        window_size=504,
        step_size=24
    )
    
    # Step 3: Add missing required columns with zeros
    features['rides_t-672'] = 0
    
    # Step 4: Load model and modify its parameters
    model = load_model_from_registry()
    if isinstance(model, lgb.Booster):
        model.params['predict_disable_shape_check'] = True
    elif hasattr(model, 'steps') and isinstance(model.steps[-1][1], lgb.LGBMRegressor):
        model.steps[-1][1].set_params(predict_disable_shape_check=True)
    
    # Step 5: Generate predictions
    predictions = get_model_predictions(model, features)
    
    if predictions is not None and not predictions.empty:
        results = predictions.sort_values("predicted_demand", ascending=False)
        print("\nTop 10 locations by predicted demand:")
        print(results[["pickup_location_id", "predicted_demand"]].head(10))
        print(f"\nTotal predictions: {len(predictions)}")
        
        # Save predictions
        results.to_csv('/tmp/predictions.csv', index=False)
        print("\nPredictions saved to /tmp/predictions.csv")

except Exception as e:
    print(f"Error: {str(e)}")
    print("\nDebug Info:")
    if 'features' in locals():
        print(f"Available features shape: {features.shape}")
        print(f"Available columns: {features.columns.tolist()[:5]}")
    predictions = None

# Display predictions
predictions

2025-03-04 10:05:37,650 INFO: Closing external client and cleaning up certificates.


Connection closed.
2025-03-04 10:05:37,655 INFO: Initializing external client
2025-03-04 10:05:37,656 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 10:05:38,404 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214690
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.76s) 
Data loaded: 150349 records
2025-03-04 10:05:46,921 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 10:05:46,924 INFO: Initializing external client
2025-03-04 10:05:46,924 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 10:05:47,528 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214690
Downloading model artifact (0 dirs, 1 files)... DONE
Top 10 locations by predicted demand:
     pickup_location_id  predicted_demand
489                 132               8.0
607                 162               4.0
490          

Unnamed: 0,pickup_location_id,predicted_demand
0,2,0.0
1,2,0.0
2,2,0.0
3,2,0.0
4,3,0.0
...,...,...
999,262,-6.0
1000,263,-5.0
1001,263,-4.0
1002,263,-3.0


In [57]:
predictions

Unnamed: 0,pickup_location_id,predicted_demand
0,2,0.0
1,2,0.0
2,2,0.0
3,2,0.0
4,3,0.0
...,...,...
999,262,-6.0
1000,263,-5.0
1001,263,-4.0
1002,263,-3.0


In [58]:
predictions.sort_values("predicted_demand", ascending=False).head(10)["pickup_location_id"].values

array([132, 162, 132, 209,  74,  45,  45,  45,  45, 209], dtype=int32)