In [4]:
%load_ext autoreload
%autoreload 2

In [19]:
import warnings
warnings.filterwarnings("ignore")

In [39]:
import numpy as np
import pandas as pd
from datetime import date, timedelta
from pytz import timezone
import optuna

from hsml.schema import Schema
from hsml.model_schema import ModelSchema

from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline as PPLN
from sklearn.metrics import mean_absolute_error as MAE

import os
import sys
sys.path.append("../src/")
sys.path.append("../")

import config
import data
import data_split
import pipe
import paths

In [2]:
import hopsworks

#Connect to the Project
Project = hopsworks.login(project = config.HopsworksProjectName, api_key_value = config.HOPSWORKSAPIKEY)

#Connect to the Feature Store
FeatureStore = Project.get_feature_store()

#Connect to the Feature Group
FeatureGroup = FeatureStore.get_feature_group(name = config.FeatureGroupName, version = config.FeatureGroupVersion)

  from .autonotebook import tqdm as notebook_tqdm


Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/104605
Connected. Call `.close()` to terminate connection gracefully.


In [3]:
#Create Feature View (if it doesn't exists yet)
#This Feature View will only use the Feature Group, so the Query is Trivial
try:
    #Create Feature View if it doesn't exist
    FeatureStore.create_feature_view(name = config.FeatureViewName, version = config.FeatureViewVersion, query = FeatureGroup.select_all())
    
except:
    print("Feature View already Exists.")
    
#Get the Feature View
FeatureView = FeatureStore.get_feature_view(name = config.FeatureViewName, version = config.FeatureViewVersion)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/104605/fs/104524/fv/ts_hourly_featureview/version/1


In [5]:
TS_Data = FeatureView.training_data(description = "TimeSeries Hourly Taxi Rides")



2023-09-17 11:17:50,847 INFO: USE `taxidemandprediction_featurestore`
2023-09-17 11:17:51,270 INFO: SELECT `fg0`.`pickup_hour` `pickup_hour`, `fg0`.`numrides` `numrides`, `fg0`.`pickup_location_id` `pickup_location_id`
FROM `taxidemandprediction_featurestore`.`ts_hourly_featuregroup_1` `fg0`
WHERE `fg0`.`pickup_hour` >= TIMESTAMP '1970-01-01 12:16:40.000' AND `fg0`.`pickup_hour` < TIMESTAMP '2023-09-17 09:17:48.000'




In [15]:
TS_Data[0]

Unnamed: 0,pickup_hour,numrides,pickup_location_id
0,2022-11-25 14:00:00,0,26
1,2023-03-18 05:00:00,0,69
2,2023-03-10 06:00:00,0,76
3,2023-06-04 17:00:00,0,159
4,2022-04-15 11:00:00,0,22
...,...,...,...
3737687,2022-12-06 06:00:00,2,51
3737688,2022-12-14 07:00:00,0,27
3737689,2022-08-22 13:00:00,0,180
3737690,2022-08-06 06:00:00,0,134


In [17]:
TS_Data[0].rename(columns = {"pickup_hour":"PickupHour", "numrides":"NumOfRides", "pickup_location_id":"PickupLocationID"}, inplace = True)

In [25]:
#Transforming Data into Features and Targets

Features, Targets = data.TransformALL(tsData = TS_Data[0], nFeatures = 24*7*4, SlidingFactor = 23)

FeaturesAndTargets = Features.copy()
FeaturesAndTargets["Target Rides Next Hour"] = Targets

print(f"{FeaturesAndTargets.shape=}")

100%|█████████████████████████████████████████| 262/262 [01:14<00:00,  3.50it/s]


FeaturesAndTarget.shape=(154842, 675)


In [26]:
#Training Data -> From January 2022 until 2 Months Ago
#Test Data -> Last 2 Months
CutoffDate = pd.to_datetime(date.today() - timedelta(days=28*2))

print(f"{CutoffDate=}")

xTrain, yTrain, xTest, yTest = data_split.TrainTestSplit(FeaturesAndTargets, CutoffDate, "Target Rides Next Hour")

print(f"{xTrain.shape=}")
print(f"{yTrain.shape=}")
print(f"{xTest.shape=}")
print(f"{yTest.shape=}")

CutoffDate=Timestamp('2023-07-23 00:00:00')
xTrain.shape=(141742, 674)
yTrain.shape=(141742,)
xTest.shape=(13100, 674)
yTest.shape=(13100,)


In [28]:
from pipe import MakePipeline

#Given a Set of Hyperparameters it Trains a Model and Computes an Avg Validation Error Based on TimeSeriesSplit

def Objective(T:optuna.trial.Trial) -> float:
    
    Hyperparams = {"metric":"mae",
                   "verbose":-1,
                   "num_leaves":T.suggest_int("num_leaves", 2, 256),
                   "feature_fraction":T.suggest_float("feature_fraction", 0.2, 1.0),
                   "bagging_fraction":T.suggest_float("bagging_fraction", 0.2, 1.0),
                   "min_child_samples":T.suggest_int("min_child_samples", 3, 100),
                  }
    
    tss = TimeSeriesSplit(n_splits=2)
    Scores = []
    
    for trainIndex, valIndex in tss.split(xTrain):
        
        #Split Data for Training and Validation
        xTrain_, xVal_ = xTrain.iloc[trainIndex, :], xTrain.iloc[valIndex, :]
        yTrain_, yVal_ = yTrain.iloc[trainIndex], yTrain.iloc[valIndex]
        
        #Train the Model
        ppln = MakePipeline(**Hyperparams)
        ppln.fit(xTrain_, yTrain_)
        
        #Evaluate the Model
        yPred = ppln.predict(xVal_)
        mae = MAE(yVal_, yPred)
        
        Scores.append(mae)
        
    #Return Avg Score
    return np.array(Scores).mean()

In [29]:
#Creating an Optuna Study (That's how the Optuna API works)
Trials = 1

Study = optuna.create_study(direction="minimize")
Study.optimize(Objective, n_trials=Trials)

[I 2023-09-17 22:31:30,475] A new study created in memory with name: no-name-24e7e61f-e289-422d-8957-d70736f1f473
[I 2023-09-17 22:39:27,784] Trial 0 finished with value: 2.820189606522046 and parameters: {'num_leaves': 49, 'feature_fraction': 0.37016183145046777, 'bagging_fraction': 0.5903624028735266, 'min_child_samples': 84}. Best is trial 0 with value: 2.820189606522046.


In [30]:
best = Study.best_trial.params
best

{'num_leaves': 49,
 'feature_fraction': 0.37016183145046777,
 'bagging_fraction': 0.5903624028735266,
 'min_child_samples': 84}

In [31]:
Pipe = MakePipeline(**best)
Pipe.fit(xTrain, yTrain)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 171901
[LightGBM] [Info] Number of data points in the train set: 141742, number of used features: 676
[LightGBM] [Info] Start training from score 17.579497


In [32]:
Preds = Pipe.predict(xTest)
testMae = MAE(yTest, Preds)
print(f"{testMae = :.4f}")

testMae = 4.3904


In [33]:
import joblib

In [37]:
joblib.dump(Pipe, paths.MODELS_DIR / "Model.pkl")

['/home/poppy/taxi_project/models/Model.pkl']

In [41]:
InputSchema = Schema(xTrain)
OutputSchema = Schema(yTrain)
ModelSchema_ = ModelSchema(input_schema = InputSchema, output_schema = OutputSchema)

In [None]:
#Uploading the Model to the Model Registry on the Cloud
ModelRegistry_ = Project.get_model_registry()

Model = ModelRegistry_.sklearn.create_model(
    name = "taxi_demand_predictor_next_hour",
    metrics = {"test_mae":testMae},
    description = "LightGMB Regressor with a bit of HyperParameter Tuning",
    input_example = xTrain.sample(),
    model_schema = ModelSchema_
)

Model.save(paths.MODELS_DIR / "Model.pkl")

Connected. Call `.close()` to terminate connection gracefully.


Waiting for model registration:  67%|████████▋    | 4/6 [00:34<00:13,  6.94s/it]