In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Make the parent of the notebook's working directory importable.
# Guarded so that re-running this cell does not append the same entry to
# sys.path a second time.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
import src.config as config

In [3]:
from src.inference import get_feature_store

In [4]:
from datetime import datetime, timedelta
import pandas as pd  

# Reference "now" as a tz-aware timestamp (Etc/UTC); the window is derived from it.
current_date = pd.Timestamp.now(tz='Etc/UTC')
feature_store = get_feature_store()

# Rows to keep: from 29 days ago up to one hour ago.
fetch_data_to = current_date - timedelta(hours=1)
fetch_data_from = current_date - timedelta(days=29)
print(f"Fetching data from {fetch_data_from} to {fetch_data_to}")

feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION,
)

# Request one extra day on each side of the window, then trim the result
# back to the inclusive [fetch_data_from, fetch_data_to] range.
ts_data = feature_view.get_batch_data(
    start_time=fetch_data_from - timedelta(days=1),
    end_time=fetch_data_to + timedelta(days=1),
)
ts_data = ts_data.loc[ts_data["pickup_hour"].between(fetch_data_from, fetch_data_to)]

2025-03-04 09:06:23,428 INFO: Initializing external client
2025-03-04 09:06:23,429 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 09:06:24,209 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214690
Fetching data from 2025-02-03 14:06:23.427453+00:00 to 2025-03-04 13:06:23.427453+00:00
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.31s) 


In [5]:
# Display the series ordered by location and then by hour, with a fresh 0..n-1 index.
ts_data.sort_values(by=["pickup_location_id", "pickup_hour"], ignore_index=True)

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2025-02-04 14:00:00+00:00,2,0
1,2025-02-04 15:00:00+00:00,2,0
2,2025-02-04 16:00:00+00:00,2,0
3,2025-02-04 17:00:00+00:00,2,0
4,2025-02-04 18:00:00+00:00,2,0
...,...,...,...
168667,2025-03-04 09:00:00+00:00,263,134
168668,2025-03-04 10:00:00+00:00,263,116
168669,2025-03-04 11:00:00+00:00,263,98
168670,2025-03-04 12:00:00+00:00,263,100


In [6]:
# Check column dtypes and non-null counts of the loaded time series.
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168672 entries, 0 to 168671
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype                  
---  ------              --------------   -----                  
 0   pickup_hour         168672 non-null  datetime64[us, Etc/UTC]
 1   pickup_location_id  168672 non-null  int32                  
 2   rides               168672 non-null  int32                  
dtypes: datetime64[us, Etc/UTC](1), int32(2)
memory usage: 2.6 MB


In [7]:
# Drop the timezone from pickup_hour, keeping the same wall-clock values as tz-naive datetimes.
ts_data = ts_data.assign(pickup_hour=ts_data["pickup_hour"].dt.tz_localize(None))

In [8]:
# Re-check dtypes now that the timezone has been dropped from pickup_hour.
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168672 entries, 0 to 168671
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   pickup_hour         168672 non-null  datetime64[us]
 1   pickup_location_id  168672 non-null  int32         
 2   rides               168672 non-null  int32         
dtypes: datetime64[us](1), int32(2)
memory usage: 2.6 MB


In [10]:
from datetime import datetime, timedelta
import pandas as pd
import hopsworks
from hsfs.client.exceptions import RestAPIError

def _build_sample_data(location_ids=(1, 2, 3), days=30):
    """Build one sample row per (location, hour) for the last `days` days, ending at the current UTC hour."""
    # Naive UTC timestamp truncated to the hour.
    end_hour = (
        pd.Timestamp.now(tz="UTC")
        .tz_localize(None)
        .replace(minute=0, second=0, microsecond=0, nanosecond=0)
    )
    # A Timedelta frequency avoids the "H"/"h" alias difference between pandas versions.
    hours = pd.date_range(end=end_hour, periods=24 * days, freq=pd.Timedelta(hours=1))

    # Every location gets the full hourly range, so all columns have equal
    # length and each (pickup_location_id, pickup_hour) pair occurs once.
    per_location = [
        pd.DataFrame({
            "pickup_location_id": location_id,
            "pickup_hour": hours,
            "rides": list(range(len(hours))),
        })
        for location_id in location_ids
    ]
    return pd.concat(per_location, ignore_index=True)


def get_or_create_feature_group():
    """Return the hourly time-series feature group, creating and seeding it if missing.

    Logs in to Hopsworks and looks up version 1 of
    ``time_series_hourly_feature_group``. If the lookup raises
    ``RestAPIError`` (or yields ``None``), the group is created with HUDI
    time travel, keyed on ``pickup_location_id`` + ``pickup_hour``, and
    seeded with 30 days of hourly sample rows for locations 1, 2 and 3.

    Returns:
        The feature group object obtained from the feature store.
    """
    project = hopsworks.login()
    fs = project.get_feature_store()

    try:
        existing = fs.get_feature_group(
            name="time_series_hourly_feature_group",
            version=1,
        )
    except RestAPIError:
        existing = None
    # NOTE(review): some client versions may return None for a missing group
    # instead of raising - confirm; both cases are treated as "missing" here.
    if existing is not None:
        return existing

    fg = fs.create_feature_group(
        name="time_series_hourly_feature_group",
        version=1,
        primary_key=["pickup_location_id", "pickup_hour"],
        event_time="pickup_hour",
        time_travel_format="HUDI",
        description="Hourly taxi ride time-series data",
    )

    # Insert 30 days of sample data
    fg.insert(_build_sample_data())
    return fg

def transform_features_safely(ts_data, max_window_days=7):
    """Compute mean ride counts over sliding windows, separately per location.

    The window length is ``24 * max_window_days`` rows, shrunk to the longest
    single-location history when no location has that many rows. Each
    location's rows are ordered by ``pickup_hour`` before windowing, and a
    new window starts every 24 rows.

    Args:
        ts_data: DataFrame with ``pickup_location_id``, ``pickup_hour`` and
            ``rides`` columns. Assumed to hold one row per location per hour
            (TODO confirm - gaps in the series are not detected here).
        max_window_days: Upper bound on the window length, in days.

    Returns:
        DataFrame with columns ``location_id``, ``window_start`` and
        ``rides_avg``, one row per window.

    Raises:
        ValueError: If ``ts_data`` is empty, ``max_window_days`` is below 1,
            or no location has at least 24 rows.
    """
    if ts_data.empty:
        raise ValueError("Input DataFrame is empty")
    if max_window_days < 1:
        raise ValueError(f"max_window_days must be ≥ 1. Got {max_window_days}")

    # Size the window from per-location history, not the total row count:
    # the total spans all locations and says nothing about how many hours any
    # single location actually has.
    rows_per_location = ts_data["pickup_location_id"].value_counts()
    longest_history = int(rows_per_location.max()) if not rows_per_location.empty else 0
    window_size = min(24 * max_window_days, longest_history)
    if window_size < 24:
        raise ValueError(
            f"Need ≥24h data per location. Got at most {longest_history} rows"
        )

    records = []
    for location_id in ts_data["pickup_location_id"].unique():
        # Positional slicing below is only meaningful on time-ordered rows,
        # and the input order is not guaranteed.
        loc_data = ts_data.loc[
            ts_data["pickup_location_id"] == location_id
        ].sort_values("pickup_hour")

        if len(loc_data) < window_size:
            print(f"Skipping location {location_id}: Insufficient data")
            continue

        # Slide in steps of 24 rows (one day when the data is hourly).
        for start in range(0, len(loc_data) - window_size + 1, 24):
            window = loc_data.iloc[start:start + window_size]
            records.append({
                "location_id": location_id,
                "window_start": window["pickup_hour"].min(),
                "rides_avg": window["rides"].mean(),
            })

    return pd.DataFrame(records, columns=["location_id", "window_start", "rides_avg"])

def run_pipeline():
    """Read the feature group as of its most recent commit and build window features.

    Returns:
        The DataFrame produced by ``transform_features_safely``.

    Raises:
        ValueError: If the feature group has no commits or the read returns
            no rows.
    """
    # Local import: only `datetime` and `timedelta` are imported at cell level.
    from datetime import timezone

    # 1. Initialize feature group
    fg = get_or_create_feature_group()

    # 2. Get valid time range
    commits = fg.commit_details()
    if not commits:
        raise ValueError("Feature group has no commits; insert data before reading")

    # Keys are used as epoch milliseconds (as the original code did). Take the
    # largest one explicitly instead of relying on dict ordering.
    latest_commit_ms = max(int(commit_id) for commit_id in commits)
    # Use a tz-aware UTC datetime. A naive local-time value can be read as
    # UTC by the client, which moves the point in time before the commit.
    # NOTE(review): this would explain the "No data is available for feature
    # group with this commit date" error - confirm against hsfs.
    latest_commit = datetime.fromtimestamp(latest_commit_ms / 1000, tz=timezone.utc)

    # 3. Fetch data as of that commit.
    # NOTE(review): read_options asks for Spark-based reading; there is no
    # fallback logic here - confirm the option is honoured by the engine in use.
    ts_data = fg.select_all().as_of(latest_commit).read(
        read_options={"use_spark": True}
    )

    # 4. Validate and transform
    if ts_data.empty:
        # No retry loop exists, so the message reports what actually happened.
        raise ValueError("Feature group read returned no rows")

    features = transform_features_safely(ts_data)
    if features.empty:
        print("Warning: No features generated. Check data requirements")
    return features

if __name__ == "__main__":
    try:
        features = run_pipeline()
    except Exception as e:
        print(f"❌ Critical error: {e}")
        print("Troubleshooting:")
        print("- Verify feature group exists in Hopsworks UI")
        print("- Check data insertion in feature group")
        # The tunable parameter of transform_features_safely is max_window_days.
        print("- Reduce max_window_days parameter if needed")
        # Re-raise so the cell fails visibly with a traceback. Swallowing the
        # error leaves `features` undefined and later cells fail with a
        # confusing NameError instead.
        raise
    else:
        print(f"✅ Success! Generated {len(features)} features")
        print(features.head())


2025-03-04 09:11:19,773 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 09:11:19,788 INFO: Initializing external client
2025-03-04 09:11:19,788 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 09:11:20,590 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214690
❌ Critical error: Metadata operation error: (url: https://c.app.hopsworks.ai/hopsworks-api/api/project/1214690/featurestores/query). Server response: 
HTTP code: 404, HTTP reason: Not Found, body: b'{"errorCode":270118,"usrMsg":"featureGroup: time_series_hourly_feature_group version 1","errorMsg":"No data is available for feature group with this commit date"}', error code: 270118, error msg: No data is available for feature group with this commit date, user msg: featureGroup: time_series_hourly_feature_group version 1
Troubleshooting:
- Verify feature group exists in Hopsworks UI
- Check data insertion in feature group
- R

In [11]:
# Show the pipeline result; `features` is only assigned when run_pipeline() in the previous cell succeeds.
features

NameError: name 'features' is not defined

In [None]:
from src.inference import load_batch_of_features_from_store
# Load the batch of inference features as of the current UTC time.
current_date = pd.Timestamp.now(tz='Etc/UTC')
# NOTE(review): presumably returns one feature row per pickup location - confirm against src.inference.
features = load_batch_of_features_from_store(current_date)

2025-02-13 17:37:17,628 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-02-13 17:37:17,638 INFO: Initializing external client
2025-02-13 17:37:17,639 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-02-13 17:37:19,110 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1211551
Fetching data from 2025-01-15 22:37:17.628670+00:00 to 2025-02-13 21:37:17.628670+00:00
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (9.11s) 
Skipping location_id 109: Not enough data to create even one window.


In [None]:
# Timestamp that was passed to load_batch_of_features_from_store.
current_date

Timestamp('2025-02-13 22:37:17.628670+0000', tz='Etc/UTC')

In [None]:
# Inspect the feature batch loaded from the feature store.
features

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_location_id,pickup_hour
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,2,2025-02-12 23:00:00
1,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,3,2025-02-12 23:00:00
2,0,0,0,0,0,0,0,5,5,8,...,4,5,6,10,5,5,9,5,4,2025-02-12 23:00:00
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5,2025-02-12 23:00:00
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,2025-02-12 23:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,1,0,1,0,259,2025-02-12 23:00:00
252,0,0,0,0,1,0,0,0,2,3,...,3,1,1,1,0,4,0,1,260,2025-02-12 23:00:00
253,0,0,0,0,0,0,4,2,6,12,...,36,27,50,60,41,27,30,17,261,2025-02-12 23:00:00
254,0,0,0,0,2,4,14,75,138,162,...,126,90,127,162,96,56,47,22,262,2025-02-12 23:00:00


In [None]:
from src.inference import load_model_from_registry

# Load the model via src.inference.load_model_from_registry.
model = load_model_from_registry()

2025-02-13 17:41:32,318 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-02-13 17:41:32,334 INFO: Initializing external client
2025-02-13 17:41:32,337 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-02-13 17:41:33,715 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1211551




Downloading model artifact (0 dirs, 1 files)... DONE

In [None]:
from src.inference import get_model_predictions
# Run the loaded model on the feature batch.
predictions = get_model_predictions(model, features)

In [None]:
# Inspect the predictions.
predictions

Unnamed: 0,pickup_location_id,predicted_demand
0,2,0.0
1,3,0.0
2,4,2.0
3,5,0.0
4,6,0.0
...,...,...
251,259,0.0
252,260,1.0
253,261,10.0
254,262,10.0


In [None]:
# IDs of the ten locations with the highest predicted demand, highest first.
predictions.sort_values(by="predicted_demand", ascending=False)["pickup_location_id"].head(10).values

array([132, 249, 230, 161,  79, 142,  48, 114, 138, 163])