In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import warnings
#Suppress every Python warning (no category or module filter is given) for the rest of the kernel session.
#NOTE(review): this also hides deprecation/convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
from datetime import date, timedelta
from pytz import timezone
import optuna

from hsml.schema import Schema
from hsml.model_schema import ModelSchema

from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline as PPLN
from sklearn.metrics import mean_absolute_error as MAE

import os
import sys
sys.path.append("../src/")
sys.path.append("../")

import config
import data
import data_split
import pipe
import paths

In [None]:
import hopsworks

#Log in to the Hopsworks project with the project name and API key kept in config
Project = hopsworks.login(
    project = config.HopsworksProjectName,
    api_key_value = config.HOPSWORKSAPIKEY,
)

#Handle to the project's Feature Store
FeatureStore = Project.get_feature_store()

#Handle to the Feature Group (name and version both come from config)
FeatureGroup = FeatureStore.get_feature_group(
    name = config.FeatureGroupName,
    version = config.FeatureGroupVersion,
)

In [None]:
#Create the Feature View if it doesn't exist yet.
#This Feature View only uses the Feature Group, so the Query is trivial (select everything).
try:
    FeatureStore.create_feature_view(
        name = config.FeatureViewName,
        version = config.FeatureViewVersion,
        query = FeatureGroup.select_all(),
    )

except Exception as CreationError:
    #Best-effort on purpose: creation is expected to fail when the Feature View already exists.
    #It can fail for other reasons too, so report the real error instead of assuming the cause.
    #Catching Exception (not a bare except) keeps KeyboardInterrupt/SystemExit working.
    print(f"Feature View was not created (it may already exist): {CreationError!r}")

#Get the Feature View (raises here if it neither existed nor could be created)
FeatureView = FeatureStore.get_feature_view(name = config.FeatureViewName, version = config.FeatureViewVersion)

In [None]:
#Fetch the training data from the Feature View.
#The result is indexed by the following cells: element 0 is used as the time-series table.
TS_Data = FeatureView.training_data(description = "TimeSeries Hourly Taxi Rides")

In [None]:
#Show the first element of TS_Data as the cell output
TS_Data[0]

In [None]:
#Rename the feature-store column names (presumably to the names data.TransformALL expects - confirm).
#inplace=True mutates TS_Data[0] directly, which is why later cells can keep reading TS_Data[0].
TS_Data[0].rename(columns = {"pickup_hour":"PickupHour", "numrides":"NumOfRides", "pickup_location_id":"PickupLocationID"}, inplace = True)

In [None]:
#Turn the raw time series into a feature table and a matching target vector.
#nFeatures = 24*7*4 = 672 (presumably four weeks of hourly values - confirm against data.TransformALL).
Features, Targets = data.TransformALL(
    tsData = TS_Data[0],
    nFeatures = 24*7*4,
    SlidingFactor = 23,
)

#One table holding the features together with the target column (Features itself is left untouched)
FeaturesAndTargets = Features.assign(**{"Target Rides Next Hour": Targets})

print(f"{FeaturesAndTargets.shape=}")

In [None]:
#Training Data -> From January 2022 until the cutoff date
#Test Data -> The most recent 8 weeks (56 days, i.e. roughly 2 months) counted back from today
CutoffDate = pd.to_datetime(date.today() - timedelta(weeks=8))

print(f"{CutoffDate=}")

#Split on the cutoff and separate the target column from the features
xTrain, yTrain, xTest, yTest = data_split.TrainTestSplit(
    FeaturesAndTargets,
    CutoffDate,
    "Target Rides Next Hour",
)

print(f"{xTrain.shape=}")
print(f"{yTrain.shape=}")
print(f"{xTest.shape=}")
print(f"{yTest.shape=}")

In [None]:
from pipe import MakePipeline

#Given a Set of Hyperparameters it Trains a Model and Computes an Avg Validation Error Based on TimeSeriesSplit

def Objective(T:optuna.trial.Trial) -> float:
    """Score one Optuna trial.

    Samples a hyperparameter set from the trial, then trains and evaluates a fresh
    pipeline on each fold of a 2-split TimeSeriesSplit over the notebook-level
    xTrain / yTrain.

    Args:
        T: The Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the per-fold mean absolute errors on the validation folds.
    """
    #Fixed settings first, then the search space (sampled in this exact order)
    params = dict(
        metric = "mae",
        verbose = -1,
        num_leaves = T.suggest_int("num_leaves", 2, 256),
        feature_fraction = T.suggest_float("feature_fraction", 0.2, 1.0),
        bagging_fraction = T.suggest_float("bagging_fraction", 0.2, 1.0),
        min_child_samples = T.suggest_int("min_child_samples", 3, 100),
    )

    def foldMae(fitIdx, valIdx):
        #Fit a brand-new pipeline on the fold's training rows, score it on its validation rows
        model = MakePipeline(**params)
        model.fit(xTrain.iloc[fitIdx, :], yTrain.iloc[fitIdx])
        predictions = model.predict(xTrain.iloc[valIdx, :])
        return MAE(yTrain.iloc[valIdx], predictions)

    splitter = TimeSeriesSplit(n_splits=2)
    foldScores = [foldMae(fitIdx, valIdx) for fitIdx, valIdx in splitter.split(xTrain)]

    #Average over the folds
    return np.array(foldScores).mean()

In [None]:
#Optuna runs the search through a Study: it calls Objective once per trial
#and keeps track of the lowest value returned (direction="minimize").
Trials = 1

#Seed the sampler so the sampled hyperparameters are the same after a kernel restart.
#TPESampler is the sampler Optuna uses by default for single-objective studies,
#so only the randomness is pinned, not the search strategy.
SamplerSeed = 42

Study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SamplerSeed),
)
Study.optimize(Objective, n_trials=Trials)

In [None]:
#Hyperparameters of the best (lowest-score) trial; shown as the cell output
best = Study.best_params
best

In [None]:
#Build a fresh pipeline from the best sampled hyperparameters and fit it on the full training set.
#NOTE(review): `best` only contains the parameters sampled inside Objective, so the fixed
#"metric"/"verbose" settings used during tuning are not passed here - confirm that is intended.
Pipe = MakePipeline(**best)
Pipe.fit(xTrain, yTrain)

In [None]:
#Score the fitted pipeline on the held-out test set using mean absolute error
Preds = Pipe.predict(xTest)
testMae = MAE(yTest, Preds)
print(f"{testMae = :.4f}")

In [None]:
import joblib

In [None]:
#Serialise the fitted pipeline to MODELS_DIR/Model.pkl (the same path is uploaded to the model registry in the last cell)
joblib.dump(Pipe, paths.MODELS_DIR / "Model.pkl")

In [None]:
#Build the model schema from the training features (inputs) and training targets (outputs);
#it is attached to the registry entry when the model is created below.
InputSchema = Schema(xTrain)
OutputSchema = Schema(yTrain)
ModelSchema_ = ModelSchema(input_schema = InputSchema, output_schema = OutputSchema)

In [None]:
#Uploading the Model to the Model Registry on the Cloud
ModelRegistry_ = Project.get_model_registry()

#Register the model's metadata: name, test metric, description, an example input row and the schema.
Model = ModelRegistry_.sklearn.create_model(
    name = config.ModelName,
    metrics = {"test_mae":testMae},
    description = "LightGBM Regressor with a bit of HyperParameter Tuning",
    #Fixed random_state so the stored example row is the same on every run
    input_example = xTrain.sample(random_state = 0),
    model_schema = ModelSchema_
)

#Upload the pickled pipeline previously written with joblib.dump
Model.save(paths.MODELS_DIR / "Model.pkl")