In [None]:
import sys

sys.path.append("../src/")

import data
import paths
import plot
import data_split

import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.metrics import mean_absolute_error as MAE
from sklearn.model_selection import KFold, TimeSeriesSplit

import lightgbm as lgb

import optuna

In [None]:
#Load the transformed tabular dataset from the project's data directory
tabular_data_path = paths.TRANSFORMED_DATA_DIR / "TabularData.parquet"
df = pd.read_parquet(tabular_data_path)
df

In [None]:
#Keep only the rows whose pickup hour is strictly before 1 Nov 2022
before_cutoff = df["PickupHour"] < datetime(2022, 11, 1)
ddf = df.loc[before_cutoff]

ddf

In [None]:
#Split features/target around 1 Jun 2022 (presumably earlier rows -> train, later rows -> test; see data_split.TrainTestSplit)
xTrain, yTrain, xTest, yTest = data_split.TrainTestSplit(
    ddf,
    cutoff_date=datetime(2022, 6, 1),
    target_column_name="Target Rides Next Hour",
)

In [None]:
#Report the shape of each of the four splits, one per line
print(
    f"{xTrain.shape = }",
    f"{yTrain.shape = }",
    f"{xTest.shape = }",
    f"{yTest.shape = }",
    sep="\n",
)

In [None]:
def AverageRidesLast4Weeks(x:pd.DataFrame) -> pd.DataFrame:
    """
    Add the average of the ride counts observed 1, 2, 3 and 4 weeks earlier.

    Reads the four lag columns "Rides 168 Hours Before", "Rides 336 Hours Before",
    "Rides 504 Hours Before" and "Rides 672 Hours Before" and stores their
    arithmetic mean in the column "Avg Rides Last 4 Weeks".

    Args:
        x: feature frame that contains the four weekly lag columns.

    Returns:
        A new DataFrame equal to `x` plus the "Avg Rides Last 4 Weeks" column.
        The frame passed in is left unmodified.

    Raises:
        KeyError: if any of the four lag columns is missing from `x`.
    """
    hours_per_week = 7 * 24
    lag1, lag2, lag3, lag4 = (
        x[f"Rides {hours_per_week * week} Hours Before"] for week in range(1, 5)
    )

    # Build a new frame instead of assigning into `x`: writing the column in place
    # would silently alter the caller's data every time the transformer runs and
    # can raise SettingWithCopyWarning when `x` is a slice of another frame.
    return x.assign(**{"Avg Rides Last 4 Weeks": (lag1 + lag2 + lag3 + lag4) / 4})

In [None]:
#Wrapping the function in sklearn's FunctionTransformer so we can call the fit and transform methods directly on the data

from sklearn.preprocessing import FunctionTransformer

#validate=False lets the DataFrame through untouched instead of converting it to an array
add_feature_avgrideslast4weeks = FunctionTransformer(
    func=AverageRidesLast4Weeks,
    validate=False,
)
#Preview the transformer's output on the training features
add_feature_avgrideslast4weeks.fit_transform(xTrain)

In [None]:
#Getting Day of Week and Hour of Day from the Datetime Pickup Hour

from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineering(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that replaces the "PickupHour" timestamp with two features.

    "Hour" is the hour of the day and "DoW" the day of the week, both read from
    "PickupHour"; the timestamp column itself is removed from the output.
    """

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, x, y=None):
        """Return a new frame with "Hour" and "DoW" added and "PickupHour" dropped; `x` is not modified."""
        pickup = x["PickupHour"].dt
        enriched = x.assign(Hour=pickup.hour, DoW=pickup.dayofweek)

        return enriched.drop(columns=["PickupHour"])

#Preview the transformer's output on the training features
add_temporalfeatures = TemporalFeatureEngineering()
add_temporalfeatures.fit_transform(xTrain)

In [None]:
from sklearn.pipeline import make_pipeline as PPLN

#Baseline model: both feature steps followed by a LightGBM regressor with default settings
baseline_steps = (
    add_feature_avgrideslast4weeks,
    add_temporalfeatures,
    lgb.LGBMRegressor(),
)
Pipeline = PPLN(*baseline_steps)

Pipeline.fit(xTrain, yTrain)

In [None]:
#Score the baseline pipeline on the held-out test split
Preds = Pipeline.predict(xTest)

Mae = MAE(y_true=yTest, y_pred=Preds)
print(f"{Mae = :.4f}")

# Hyperparameter Tuning and Validation Data

In [None]:
import warnings
#Silences every Python warning for the rest of the session (presumably to keep the tuning output short).
#NOTE(review): this also hides deprecation and convergence warnings - consider narrowing it to a specific category.
warnings.filterwarnings("ignore")

from pipe import MakePipeline

In [None]:
#Given a Set of Hyperparameters it Trains a Model and Computes an Avg Validation Error Based on TimeSeriesSplit

def Objective(T:optuna.trial.Trial) -> float:
    """
    Optuna objective: average validation MAE for one sampled set of hyperparameters.

    Samples LightGBM hyperparameters from the trial, then walks a 2-split
    TimeSeriesSplit over the notebook-level xTrain / yTrain. For every split a
    fresh pipeline from MakePipeline is fitted on the training rows and scored
    on the validation rows that follow them.

    Args:
        T: the Optuna trial used to sample the hyperparameters.

    Returns:
        The MAE averaged over all splits (lower is better).
    """

    Hyperparams = {"metric":"mae",
                   "verbose":-1,
                   "num_leaves":T.suggest_int("num_leaves", 2, 256),
                   "feature_fraction":T.suggest_float("feature_fraction", 0.2, 1.0),
                   "bagging_fraction":T.suggest_float("bagging_fraction", 0.2, 1.0),
                   #LightGBM only applies bagging_fraction when bagging_freq is non-zero
                   #(its default is 0), so without this the tuned bagging_fraction has no
                   #effect. It is registered through the trial (fixed at 1) so that it is
                   #also part of Study.best_trial.params when the final model is rebuilt.
                   #Assumes MakePipeline forwards these keywords to LightGBM - TODO confirm.
                   "bagging_freq":T.suggest_int("bagging_freq", 1, 1),
                   "min_child_samples":T.suggest_int("min_child_samples", 3, 100),
                  }

    tss = TimeSeriesSplit(n_splits=2)
    Scores = []

    for trainIndex, valIndex in tss.split(xTrain):

        #Split Data for Training and Validation
        xTrain_, xVal_ = xTrain.iloc[trainIndex, :], xTrain.iloc[valIndex, :]
        yTrain_, yVal_ = yTrain.iloc[trainIndex], yTrain.iloc[valIndex]

        #Train a fresh Model so nothing carries over between splits
        ppln = MakePipeline(**Hyperparams)
        ppln.fit(xTrain_, yTrain_)

        #Evaluate the Model
        yPred = ppln.predict(xVal_)
        Scores.append(MAE(yVal_, yPred))

    #Return Avg Score as a plain Python float
    return float(np.mean(Scores))

In [None]:
#Creating an Optuna Study (That's how the Optuna API works)

#Seed the sampler so that a fresh "Restart & Run All" proposes the same trials again.
#TPESampler is the sampler create_study uses by default, so only the seed changes.
SAMPLER_SEED = 42

Study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
Study.optimize(Objective, n_trials=5)

In [None]:
#Hyperparameters sampled in the trial with the lowest average validation MAE
best = Study.best_params
best

In [None]:
#Rebuild the pipeline with the best hyperparameters and refit it on the whole training split
#NOTE(review): `best` only holds the values sampled through the trial, so the fixed
#"metric" and "verbose" entries used inside Objective are not passed here - confirm this is intended.
Pipe = MakePipeline(**best)
Pipe.fit(xTrain, yTrain)

In [None]:
#Score the tuned pipeline on the held-out test split
Preds = Pipe.predict(xTest)
testMae = MAE(y_true=yTest, y_pred=Preds)
print(f"{testMae = :.4f}")

In [None]:
# The following import and renderer setting are needed to make the Plotly figure show up
import plotly.io as pio
pio.renderers.default = "iframe" # or 'colab' or 'iframe' or 'iframe_connected' or 'sphinx_gallery'

#Plot test example 2979 together with the tuned model's predictions
plot.PlotOneRidesSample(
    features=xTest,
    targets=yTest,
    exampleID=2979,
    predictions=pd.Series(Preds),
)

In [None]:
#Same plot for a second test example (3979)
plot.PlotOneRidesSample(
    features=xTest,
    targets=yTest,
    exampleID=3979,
    predictions=pd.Series(Preds),
)