# Importing Modules from other Files

In [None]:
import sys

sys.path.append("../src/")

import data
import paths
import plot
import data_split

import pandas as pd
import numpy as np
from datetime import datetime

# Downloading, Loading and Validating Raw Data

In [None]:
# Load the raw ride records for 2022 through the project's `data` module.
rides = data.LoadRawData(year = 2022)
# Bare expression: rendered as the cell output when run in a notebook.
rides

# Transforming Raw Data into Time-Series Tabular Data

In [None]:
# Convert the raw rides into the time-series table consumed by the cells below.
TS_Data = data.TransformRawDataIntoTSData(rides)

In [None]:
# Display the time-series table.
TS_Data

In [None]:
# Build the feature table and the targets from the time series.
# nFeatures = 24*7*4 = 672 past hourly values, i.e. four weeks ("one month").
# NOTE(review): SlidingFactor = 24 is presumably the step, in hours, between
# consecutive examples -- confirm against data.TransformALL.
Features, Targets = data.TransformALL(TS_Data, nFeatures = 24*7*4, SlidingFactor = 24)  # one month of features

In [None]:
# Show the dimensions of the features and of the targets, one per line.
print(Features.shape, Targets.shape, sep="\n")

In [None]:
# Dumping the data to disk.
# Work on a copy: a plain `TabularData = Features` only creates a second name
# for the same DataFrame, so adding the target column would also modify
# `Features`.
TabularData = Features.copy()
TabularData["target_rides_next_hour"] = Targets

# Persist the combined feature/target table as a parquet file.
TabularData.to_parquet(paths.TRANSFORMED_DATA_DIR / "TabularData.parquet")

In [None]:
# Display the combined feature/target table.
TabularData

# Data Visualization

In [None]:
# Separate the table into the target column and the remaining feature columns.
Targets = TabularData["target_rides_next_hour"]
Features = TabularData.drop(columns=["target_rides_next_hour"])

In [None]:
# The following import and renderer setting are needed for the Plotly figure to be displayed.
import plotly.io as pio
pio.renderers.default = "iframe" # alternatives: 'colab', 'iframe_connected' or 'sphinx_gallery'

# Plot the example with ID 0 from the feature/target tables.
plot.PlotOneRidesSample(features = Features, targets = Targets, exampleID = 0)

# Building Some Baseline Models that will be Compared against ML Models

In [None]:
# Reload the tabular data written to disk earlier; it is split into train and
# test sets in the next cell.

df = pd.read_parquet(paths.TRANSFORMED_DATA_DIR / "TabularData.parquet")
# Bare expression: rendered as the cell output when run in a notebook.
df

In [None]:
# Split into train/test features and targets around the cutoff 2022-06-01 00:00:00.
# NOTE(review): which side rows exactly at the cutoff fall on is decided inside
# data_split.TrainTestSplit -- confirm there.
xTrain, yTrain, xTest, yTest = data_split.TrainTestSplit(df, cutoff_date = datetime(2022, 6, 1, 0 , 0, 0), target_column_name = "target_rides_next_hour")

In [None]:
# Report the shape of each of the four split pieces; the `=` specifier prints
# the expression text followed by its value.
print(f"{xTrain.shape = }")
print(f"{yTrain.shape = }")
print(f"{xTest.shape = }")
print(f"{yTest.shape = }")

In [None]:
class BaselineModel:
    """Naive baseline: predict that the next hour repeats the previous hour.

    Nothing is learned; the prediction is read directly from the
    ``rides_1_hours_before`` feature column.
    """

    def fit(self, xTrain: pd.DataFrame, yTrain: pd.Series) -> None:
        """Do nothing; exists so the class offers the usual fit/predict calls."""

    def predict(self, xTest: pd.DataFrame) -> pd.Series:
        """Return the ``rides_1_hours_before`` column of ``xTest`` as predictions.

        Raises:
            KeyError: if ``xTest`` has no ``rides_1_hours_before`` column.
        """
        # The result is a pandas Series (a column selection), hence the
        # ``pd.Series`` annotation rather than the ``np.array`` function.
        return xTest["rides_1_hours_before"]

In [None]:
# Predict with the previous-hour baseline; no fit call is made because the
# baseline's fit does nothing.
blmodel = BaselineModel()
predictions = blmodel.predict(xTest)

# Bare expression: rendered as the cell output when run in a notebook.
predictions

In [None]:
from sklearn.metrics import mean_absolute_error as MAE

# Mean absolute error of the previous-hour baseline on the test set.
test_mae = MAE(yTest, predictions)
print(f"{test_mae = :.4f}")

In [None]:
class BaselineModelWeeklySeasonality:
    """Naive baseline: predict the value observed exactly one week earlier.

    Nothing is learned; the prediction is read directly from the
    ``rides_168_hours_before`` feature column (7 * 24 = 168 hours).
    """

    def fit(self, xTrain: pd.DataFrame, yTrain: pd.Series) -> None:
        """Do nothing; exists so the class offers the usual fit/predict calls."""

    def predict(self, xTest: pd.DataFrame) -> pd.Series:
        """Return the one-week-lag column of ``xTest`` as predictions.

        Raises:
            KeyError: if ``xTest`` has no ``rides_168_hours_before`` column.
        """
        # The result is a pandas Series (a column selection), hence the
        # ``pd.Series`` annotation rather than the ``np.array`` function.
        return xTest[f"rides_{7*24}_hours_before"]

In [None]:
# Predict with the one-week-lag baseline.
blweeklymodel = BaselineModelWeeklySeasonality()
predictions = blweeklymodel.predict(xTest)

# Bare expression: rendered as the cell output when run in a notebook.
predictions

In [None]:
# Mean absolute error of the most recently computed `predictions` on the test set.
test_mae = MAE(yTest, predictions)
print(f"{test_mae = :.4f}")

In [None]:
class BaselineModelMonthlySeasonality:
    """Naive baseline: average of the values seen 1, 2, 3 and 4 weeks earlier.

    Nothing is learned; the prediction is the mean of the four columns
    ``rides_168_hours_before``, ``rides_336_hours_before``,
    ``rides_504_hours_before`` and ``rides_672_hours_before``.
    """

    def fit(self, xTrain: pd.DataFrame, yTrain: pd.Series) -> None:
        """Do nothing; exists so the class offers the usual fit/predict calls."""

    def predict(self, xTest: pd.DataFrame) -> pd.Series:
        """Return the row-wise mean of the four weekly-lag columns of ``xTest``.

        Raises:
            KeyError: if any of the four weekly-lag columns is missing.
        """
        weekly_columns = [f"rides_{7*24*week}_hours_before" for week in range(1, 5)]

        # Add the columns left to right, then divide by the number of weeks.
        total = xTest[weekly_columns[0]]
        for column in weekly_columns[1:]:
            total = total + xTest[column]
        return total / 4

In [None]:
# Predict with the four-week-average baseline.
blmonthlymodel = BaselineModelMonthlySeasonality()
predictions = blmonthlymodel.predict(xTest)

# Bare expression: rendered as the cell output when run in a notebook.
predictions

In [None]:
# Mean absolute error of the most recently computed `predictions` on the test set.
test_mae = MAE(yTest, predictions)
print(f"{test_mae = :.4f}")

# Building the Machine Learning Model

In [None]:
import xgboost as xgb

# Use only the columns whose names end in "hours_before" as model inputs.
PastRidesColumns = [c for c in xTrain.columns if c.endswith("hours_before")]
xTrainOnlyNumeric = xTrain[PastRidesColumns]

In [None]:
# Train an XGBoost regressor, constructed with no arguments, on the selected columns.
Model = xgb.XGBRegressor()
Model.fit(xTrainOnlyNumeric, yTrain)

In [None]:
# Select the same columns from the test set and predict with the fitted model.
xTestOnlyNumeric = xTest[PastRidesColumns]
Predictions = Model.predict(xTestOnlyNumeric)
# Bare expression: rendered as the cell output when run in a notebook.
Predictions

In [None]:
# Mean absolute error of the current `Predictions` on the test set.
model_mae = MAE(yTest, Predictions)
print(f"{model_mae = :.4f}")

In [None]:
import lightgbm as lgb

# Rebind `Model` to a LightGBM regressor, constructed with no arguments, and
# train it on the same columns.
Model = lgb.LGBMRegressor()
Model.fit(xTrainOnlyNumeric, yTrain)

In [None]:
# Predict on the test columns with whichever model `Model` currently refers to.
Predictions = Model.predict(xTestOnlyNumeric)
# Bare expression: rendered as the cell output when run in a notebook.
Predictions

In [None]:
# Mean absolute error of the current `Predictions` on the test set.
# (A stray, indented copy of the `avg_rides_last_4_weeks` assignment used to
# follow this cell; it referenced an undefined `x` and was a syntax error at
# top level, so it has been removed.)
model_mae = MAE(yTest, Predictions)
print(f"{model_mae = :.4f}")

# Feature Engineering to Expand Information in our Dataset

In [None]:
def AverageRidesLast4Weeks(x:pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``x`` with an added ``avg_rides_last_4_weeks`` column.

    The new column is the row-wise mean of ``rides_168_hours_before``,
    ``rides_336_hours_before``, ``rides_504_hours_before`` and
    ``rides_672_hours_before`` (the values 1, 2, 3 and 4 weeks back).

    The input frame is not modified, so applying this to a train or test set
    (directly or inside a pipeline) leaves the caller's data unchanged.

    Raises:
        KeyError: if any of the four weekly-lag columns is missing from ``x``.
    """
    weekly_columns = [f"rides_{7*24*week}_hours_before" for week in range(1, 5)]

    # Add the columns left to right, then divide by the number of weeks.
    total = x[weekly_columns[0]]
    for column in weekly_columns[1:]:
        total = total + x[column]

    # `assign` builds a new DataFrame instead of writing into `x`.
    return x.assign(avg_rides_last_4_weeks=total / 4)

In [None]:
# Wrap the function in sklearn's FunctionTransformer so that fit and transform
# can be called on the data directly.

from sklearn.preprocessing import FunctionTransformer

add_feature_avgrideslast4weeks = FunctionTransformer(AverageRidesLast4Weeks, validate = False)

In [None]:
# Apply the wrapped function to the training features; the result is rendered as the cell output.
add_feature_avgrideslast4weeks.fit_transform(xTrain)

In [None]:
#Getting Day of Week and Hour of Day from the Datetime Pickup Hour

from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineering(BaseEstimator, TransformerMixin):
    """Transformer that replaces ``pickup_hour`` with ``hour`` and ``dow`` columns."""

    def fit(self, x, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, x, y=None):
        """Return a new frame without ``pickup_hour`` but with ``hour`` and ``dow``.

        ``hour`` is the hour of day and ``dow`` the day of week taken from the
        datetime column ``pickup_hour``. The input frame is left untouched.
        """
        pickup = x["pickup_hour"]
        without_timestamp = x.drop(columns=["pickup_hour"])
        return without_timestamp.assign(
            hour=pickup.dt.hour,
            dow=pickup.dt.dayofweek,
        )

In [None]:
# Instantiate the temporal-feature transformer and apply it to the training
# features; the result is rendered as the cell output.
add_temporalfeatures = TemporalFeatureEngineering()
add_temporalfeatures.fit_transform(xTrain)

In [None]:
# Same call as in the previous cell; the transformer does not modify xTrain, so
# the output matches.
add_temporalfeatures.fit_transform(xTrain)

# Building a ML Pipeline

In [None]:
from sklearn.pipeline import make_pipeline as PPLN

# Chain the two feature-engineering steps with a LightGBM regressor as the
# final estimator.
Pipeline = PPLN(add_feature_avgrideslast4weeks, add_temporalfeatures, lgb.LGBMRegressor())

# Fit every step on the full training features (not only the lag columns).
Pipeline.fit(xTrain, yTrain)

In [None]:
# Predict on the test features with the fitted pipeline.
Preds = Pipeline.predict(xTest)

# Mean absolute error of the pipeline predictions on the test set.
Mae = MAE(yTest, Preds)
print(f"{Mae = :.4f}")