In [151]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [152]:
import src.config as config

In [153]:
import hopsworks

# connect to the project
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

# connect to the feature store
feature_store = project.get_feature_store()

# connect to the feature group
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)

2024-12-21 20:12:08,911 INFO: Closing external client and cleaning up certificates.
Connection closed.
2024-12-21 20:12:08,917 INFO: Initializing external client


2024-12-21 20:12:08,917 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-21 20:12:10,139 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1192104


In [154]:
# create feature view (if it doesn't exist yet)
# This feature view only uses on feature group, so the query is trivial
try:
    # create feature view if it doesn't exist yet
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already existed. Skip creation.')


# get feature view
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1192104/fs/1181777/fv/time_series_hourly_feature_view/version/1


In [155]:
ts_data, _ = feature_view.training_data(
    description='Time-series hourly taxi rides',
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (110.61s) 



Incremented version to `1`.



In [None]:
# drop `pickup_ts` column
ts_data.drop('pickup_ts', axis=1, inplace=True)

# sort by `pickup_location_id` and `pickup_hour`
ts_data.sort_values(by=['pickup_location_id', 'pickup_hour'], inplace=True)
ts_data

In [159]:
print(ts_data.dtypes)

print("Missing values in DataFrame A's 'pickup_hour' column:")
print(ts_data['pickup_hour'].isnull().sum())

print("Unique values in DataFrame A's 'pickup_hour' column:")
print(ts_data['pickup_hour'].unique())

print("Shape of DataFrame A:", ts_data.shape)

pickup_hour           object
rides                  int64
pickup_location_id     int64
dtype: object
Missing values in DataFrame A's 'pickup_hour' column:
0
Unique values in DataFrame A's 'pickup_hour' column:
['2023-10-31 00:00:00+00:00' '2023-10-31 01:00:00+00:00'
 '2023-10-31 02:00:00+00:00' ... '2024-10-30 22:00:00+00:00'
 '2024-10-30 23:00:00+00:00' '2024-10-31 00:00:00+00:00']
Shape of DataFrame A: (2310455, 3)


In [163]:
# from src.plot import plot_ts
from typing import Optional, List
import pandas as pd
import plotly.express as px 

def plot_ts(
    ts_data: pd.DataFrame,
    locations: Optional[List[int]] = None
    ):
    """
    Plot time-series data
    """
    ts_data_to_plot = ts_data[ts_data.pickup_location_id.isin(locations)] if locations else ts_data

    fig = px.line(
        ts_data_to_plot,
        x="pickup_hour",
        y="rides",
        color='pickup_location_id',
        template='none',
    )

    fig.show()

plot_ts(ts_data, locations=[43])

In [164]:
from src.data import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    ts_data,
    input_seq_len=24*28*1, # one month
    step_size=24,
)

features

100%|██████████| 263/263 [00:21<00:00, 12.19it/s]


Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id
0,0.0,0.0,0.0,1.0,0.0,3.0,1.0,0.0,0.0,0.0,...,4.0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,2023-11-28 00:00:00+00:00,1
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,2023-11-29 00:00:00+00:00,1
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,2023-11-30 00:00:00+00:00,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2023-12-01 00:00:00+00:00,1
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-12-02 00:00:00+00:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88889,3.0,1.0,1.0,9.0,3.0,2.0,1.0,2.0,7.0,2.0,...,1.0,0.0,9.0,3.0,2.0,2.0,1.0,0.0,2024-10-26 00:00:00+00:00,265
88890,3.0,8.0,6.0,0.0,5.0,5.0,2.0,2.0,1.0,2.0,...,0.0,8.0,3.0,3.0,3.0,4.0,3.0,6.0,2024-10-27 00:00:00+00:00,265
88891,0.0,2.0,2.0,1.0,0.0,2.0,3.0,2.0,1.0,3.0,...,8.0,4.0,0.0,3.0,7.0,3.0,7.0,3.0,2024-10-28 00:00:00+00:00,265
88892,2.0,2.0,3.0,0.0,1.0,1.0,4.0,4.0,1.0,1.0,...,5.0,5.0,1.0,2.0,5.0,6.0,6.0,2.0,2024-10-29 00:00:00+00:00,265


In [165]:
features_and_target = features.copy()
features_and_target['target_rides_next_hour'] = targets

print(f'{features_and_target.shape=}')

features_and_target.shape=(88894, 675)


In [180]:
features_and_target

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,1.0,0.0,3.0,1.0,0.0,0.0,0.0,...,1.0,3.0,0.0,1.0,0.0,0.0,0.0,2023-11-28 00:00:00+00:00,1,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,2023-11-29 00:00:00+00:00,1,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,1.0,0.0,0.0,0.0,2023-11-30 00:00:00+00:00,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2023-12-01 00:00:00+00:00,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-12-02 00:00:00+00:00,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88889,3.0,1.0,1.0,9.0,3.0,2.0,1.0,2.0,7.0,2.0,...,0.0,9.0,3.0,2.0,2.0,1.0,0.0,2024-10-26 00:00:00+00:00,265,1.0
88890,3.0,8.0,6.0,0.0,5.0,5.0,2.0,2.0,1.0,2.0,...,8.0,3.0,3.0,3.0,4.0,3.0,6.0,2024-10-27 00:00:00+00:00,265,7.0
88891,0.0,2.0,2.0,1.0,0.0,2.0,3.0,2.0,1.0,3.0,...,4.0,0.0,3.0,7.0,3.0,7.0,3.0,2024-10-28 00:00:00+00:00,265,0.0
88892,2.0,2.0,3.0,0.0,1.0,1.0,4.0,4.0,1.0,1.0,...,5.0,1.0,2.0,5.0,6.0,6.0,2.0,2024-10-29 00:00:00+00:00,265,9.0


In [202]:
from datetime import datetime, timedelta
from pytz import timezone
import pandas as pd
from src.data_split import train_test_split

# training data -> from January 2023 up until 2 months ago
# test data -> last 2 months
current_date = pd.Timestamp('2024-10-31 00:00:00')
cutoff_date = (current_date - timedelta(days=28*1)).to_pydatetime()
# .replace(tzinfo=None)

print(f'{cutoff_date=}')
print(type(cutoff_date))


cutoff_date=datetime.datetime(2024, 10, 3, 0, 0)
<class 'datetime.datetime'>


In [203]:
ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'], errors='coerce').dt.tz_localize(None)

In [204]:
ts_data.tail()

Unnamed: 0,pickup_hour,rides,pickup_location_id
393648,2024-10-30 20:00:00,3,265
2255834,2024-10-30 21:00:00,4,265
1232950,2024-10-30 22:00:00,3,265
267720,2024-10-30 23:00:00,6,265
2290406,2024-10-31 00:00:00,0,265


In [205]:
ts_data.pickup_hour

1027820   2023-10-31 00:00:00
2074810   2023-10-31 01:00:00
587718    2023-10-31 02:00:00
489830    2023-10-31 03:00:00
2033508   2023-10-31 04:00:00
                  ...        
393648    2024-10-30 20:00:00
2255834   2024-10-30 21:00:00
1232950   2024-10-30 22:00:00
267720    2024-10-30 23:00:00
2290406   2024-10-31 00:00:00
Name: pickup_hour, Length: 2310455, dtype: datetime64[ns]

In [206]:
features_and_target['pickup_hour'] = pd.to_datetime(features_and_target['pickup_hour'], errors='coerce').dt.tz_localize(None)
features_and_target.pickup_hour

0       2023-11-28
1       2023-11-29
2       2023-11-30
3       2023-12-01
4       2023-12-02
           ...    
88889   2024-10-26
88890   2024-10-27
88891   2024-10-28
88892   2024-10-29
88893   2024-10-30
Name: pickup_hour, Length: 88894, dtype: datetime64[ns]

In [207]:
X_train, y_train, X_test, y_test = train_test_split(
   features_and_target,
    cutoff_date,
    target_column_name='target_rides_next_hour'   
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(81530, 674)
y_train.shape=(81530,)
X_test.shape=(7364, 674)
y_test.shape=(7364,)


In [208]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    }
    
    # sort X_train by `pikup_hour` inplace
    # so the TimeSeriesSplit will split the data in a consistent way
    X_train.sort_values('pickup_hour', inplace=True)

    tss = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [216]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

[I 2024-12-21 20:42:01,921] A new study created in memory with name: no-name-0982c2fc-56c8-4047-aded-3db9e7933375


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [217]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 5, 'feature_fraction': 0.9904213488145986, 'bagging_fraction': 0.9938247583773121, 'min_child_samples': 99}


In [218]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

# Error is still insanely high - Need to look into this...

In [220]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=20.6411


In [221]:
import joblib
from src.paths import MODELS_DIR

joblib.dump(pipeline, MODELS_DIR / 'model.pkl')

['/Users/keshansharp/poetry_taxi_demand_predictor/models/model.pkl']

In [222]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

In [228]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)
model_registry = project.get_model_registry()

model = model_registry.sklearn.create_model(
    name="taxi_demand_predictor_next_hour",
    metrics={"test_mae": test_mae},
    description="LightGBM regressor with a bit of hyper-parameter tuning",
    input_example=X_train.sample(),
    model_schema=model_schema
)


model.save(str(MODELS_DIR / 'model.pkl'))

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/111522 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/3410 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/58131 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1192104/models/taxi_demand_predictor_next_hour/1


Model(name: 'taxi_demand_predictor_next_hour', version: 1)