In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by the libraries used below — consider narrowing it.
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd
from datetime import date, timedelta
from pytz import timezone
import optuna

from hsml.schema import Schema
from hsml.model_schema import ModelSchema

from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline as PPLN
from sklearn.metrics import mean_absolute_error as MAE

import os
import sys
sys.path.append("../src/")
sys.path.append("../")

import config
import data
import data_split
import pipe
import paths

In [4]:
import hopsworks

# Log in to the Hopsworks project; the project name and API key come from `config`.
Project = hopsworks.login(
    project=config.HopsworksProjectName,
    api_key_value=config.HOPSWORKSAPIKEY,
)

# Handle to the project's feature store.
FeatureStore = Project.get_feature_store()

# Feature group selected by the name and version configured in `config`.
FeatureGroup = FeatureStore.get_feature_group(
    name=config.FeatureGroupName,
    version=config.FeatureGroupVersion,
)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/104605
Connected. Call `.close()` to terminate connection gracefully.


In [5]:
#Create the Feature View (if it doesn't exist yet)
#This Feature View only uses the Feature Group, so the Query is Trivial
try:
    FeatureStore.create_feature_view(
        name = config.FeatureViewName,
        version = config.FeatureViewVersion,
        query = FeatureGroup.select_all(),
    )

except Exception as creationError:
    # Creation stays best-effort, but only ordinary errors are caught (a bare
    # `except:` would also trap KeyboardInterrupt/SystemExit) and the real cause
    # is shown, so an auth or network failure is not mistaken for "already exists".
    # If the view truly is unavailable, the lookup below raises.
    print(
        "Feature View was not created (it probably already exists): "
        f"{type(creationError).__name__}: {creationError}"
    )

#Get the Feature View
FeatureView = FeatureStore.get_feature_view(name = config.FeatureViewName, version = config.FeatureViewVersion)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/104605/fs/104524/fv/ts_hourly_featureview/version/1


In [6]:
# Request the feature view's data as training data. The result is indexed with
# [0] in later cells to obtain the table of pickup_hour / numrides /
# pickup_location_id rows.
TS_Data = FeatureView.training_data(description = "TimeSeries Hourly Taxi Rides")



2023-09-21 11:40:25,437 INFO: USE `taxidemandprediction_featurestore`
2023-09-21 11:40:26,051 INFO: SELECT `fg0`.`pickup_hour` `pickup_hour`, `fg0`.`numrides` `numrides`, `fg0`.`pickup_location_id` `pickup_location_id`
FROM `taxidemandprediction_featurestore`.`ts_hourly_featuregroup_1` `fg0`
WHERE `fg0`.`pickup_hour` >= TIMESTAMP '1970-01-01 12:16:40.000' AND `fg0`.`pickup_hour` < TIMESTAMP '2023-09-21 09:40:23.000'




In [7]:
# Display the first element of TS_Data (the raw hourly rides table).
TS_Data[0]

Unnamed: 0,pickup_hour,numrides,pickup_location_id
0,2022-04-11 19:00:00,0,81
1,2022-01-02 22:00:00,2,181
2,2022-12-12 05:00:00,4,129
3,2022-12-07 20:00:00,0,106
4,2022-09-24 06:00:00,0,96
...,...,...,...
3762839,2022-07-22 20:00:00,0,6
3762840,2022-01-09 00:00:00,0,84
3762841,2022-08-13 23:00:00,0,44
3762842,2022-02-24 17:00:00,0,29


In [8]:
# Convert the raw time series into a feature table plus a target vector.
# nFeatures is 24*7*4 = 672, i.e. the number of hours in four weeks.
# NOTE(review): presumably these are lagged hourly ride counts, with
# SlidingFactor as the step between windows — confirm in data.TransformALL.
Features, Targets = data.TransformALL(
    tsData=TS_Data[0],
    nFeatures=24 * 7 * 4,
    SlidingFactor=23,
)

# Work on a copy so `Features` itself stays free of the target column.
FeaturesAndTargets = Features.copy()
FeaturesAndTargets["target_rides_next_hour"] = Targets

print(f"{FeaturesAndTargets.shape=}")

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for

FeaturesAndTargets.shape=(156152, 675)


In [9]:
# Training data -> everything up to the cutoff date
# Test data     -> the most recent 8 weeks (56 days)
# The cutoff is derived from today's date, so it moves with the day of the run.
CutoffDate = pd.to_datetime(date.today() - timedelta(weeks=8))

print(f"{CutoffDate=}")

xTrain, yTrain, xTest, yTest = data_split.TrainTestSplit(
    FeaturesAndTargets,
    CutoffDate,
    "target_rides_next_hour",
)

# Report the size of each partition.
print(f"{xTrain.shape=}")
print(f"{yTrain.shape=}")
print(f"{xTest.shape=}")
print(f"{yTest.shape=}")

CutoffDate=Timestamp('2023-07-27 00:00:00')
xTrain.shape=(141742, 674)
yTrain.shape=(141742,)
xTest.shape=(14410, 674)
yTest.shape=(14410,)


In [10]:
from pipe import MakePipeline

def Objective(T: optuna.trial.Trial) -> float:
    """Optuna objective: mean validation MAE for one sampled hyperparameter set.

    Four hyperparameters (num_leaves, feature_fraction, bagging_fraction,
    min_child_samples) are drawn from the trial, and "metric" and "verbose"
    are held fixed. A fresh pipeline is fitted on each of the two
    TimeSeriesSplit folds of the notebook-level ``xTrain`` / ``yTrain`` and
    scored with mean absolute error on that fold's validation rows.

    Args:
        T: The Optuna trial used to sample hyperparameter values.

    Returns:
        The MAE averaged over the two folds (lower is better).

    NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
    rows of xTrain are in chronological order — TODO confirm.
    """
    # Fixed settings first, then the tuned ones; suggestions are requested in
    # the same order as the keys appear.
    Hyperparams = dict(
        metric="mae",
        verbose=-1,
        num_leaves=T.suggest_int("num_leaves", 2, 256),
        feature_fraction=T.suggest_float("feature_fraction", 0.2, 1.0),
        bagging_fraction=T.suggest_float("bagging_fraction", 0.2, 1.0),
        min_child_samples=T.suggest_int("min_child_samples", 3, 100),
    )

    def _foldError(fitRows, holdoutRows):
        # Fit a new pipeline on the fold's training rows and score it on the held-out rows.
        model = MakePipeline(**Hyperparams)
        model.fit(xTrain.iloc[fitRows, :], yTrain.iloc[fitRows])
        holdoutPred = model.predict(xTrain.iloc[holdoutRows, :])
        return MAE(yTrain.iloc[holdoutRows], holdoutPred)

    splitter = TimeSeriesSplit(n_splits=2)
    foldErrors = [_foldError(fitRows, holdoutRows) for fitRows, holdoutRows in splitter.split(xTrain)]

    # Average error across folds
    return np.array(foldErrors).mean()

In [11]:
#Creating an Optuna Study (That's how the Optuna API works)
Trials = 1

# Seed the sampler so a Restart & Run All draws the same hyperparameters.
# TPESampler is what create_study uses by default for a single-objective study,
# so only the randomness is pinned, not the search algorithm.
SamplerSeed = 42

Study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SamplerSeed),
)
Study.optimize(Objective, n_trials=Trials)

[I 2023-09-21 11:55:02,689] A new study created in memory with name: no-name-eba3f36a-6910-45b9-af5e-e048c465237a
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)




See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)




See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
[I 2023-09-21 11:57:37,476] Trial 0 finished with value: 2.537232602324083 and parameters: {'num_leaves': 250, 'feature_fraction': 0.6554503122796482, 'bagging_fraction': 0.5139002535541383, 'min_child_samples': 40}. Best is trial 0 with value: 2.537232602324083.


In [12]:
# Hyperparameters of the best (lowest-MAE) trial, as a plain dict.
best = Study.best_params
best

{'num_leaves': 250,
 'feature_fraction': 0.6554503122796482,
 'bagging_fraction': 0.5139002535541383,
 'min_child_samples': 40}

In [13]:
# Build a pipeline from the best trial's hyperparameters and fit it on the
# whole training partition.
# NOTE(review): the fixed "metric"/"verbose" settings used inside Objective are
# not passed here — confirm MakePipeline's defaults make that irrelevant.
Pipe = MakePipeline(**best)
Pipe.fit(xTrain, yTrain)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)




In [14]:
# Score the fitted pipeline on the held-out test partition with mean absolute error.
Preds = Pipe.predict(xTest)
testMae = MAE(yTest, Preds)
# Printed with four decimals.
print(f"{testMae = :.4f}")

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)


testMae = 2.6675


In [15]:
import joblib

In [16]:
# Serialize the fitted pipeline to Model.pkl inside the project's models directory.
joblib.dump(Pipe, paths.MODELS_DIR / "Model.pkl")

['/home/zero/Scrivania/taxiproject/models/Model.pkl']

In [17]:
# Describe the model's interface for the registry: the input schema is built
# from the training features, the output schema from the training targets.
InputSchema = Schema(xTrain)
OutputSchema = Schema(yTrain)
ModelSchema_ = ModelSchema(input_schema = InputSchema, output_schema = OutputSchema)

In [18]:
#Uploading the Model to the Model Registry on the Cloud
ModelRegistry_ = Project.get_model_registry()

Model = ModelRegistry_.sklearn.create_model(
    name = config.ModelName,
    # Test-set MAE computed earlier in the notebook.
    metrics = {"test_mae":testMae},
    # Spelling fixed ("LightGMB" -> "LightGBM"): this text is stored with the model.
    description = "LightGBM Regressor with a bit of HyperParameter Tuning",
    # One training row as the example input; seeded so re-runs register the same row.
    input_example = xTrain.sample(n = 1, random_state = 42),
    model_schema = ModelSchema_
)

# Upload the pickled pipeline written to the models directory.
Model.save(paths.MODELS_DIR / "Model.pkl")

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)


Model created, explore it at https://c.app.hopsworks.ai:443/p/104605/models/taxi_demand_predictor_next_hour/1


Model(name: 'taxi_demand_predictor_next_hour', version: 1)