In [6]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.model_selection import TimeSeriesSplit
from scipy import signal
import glob
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import mlflow
import mlflow.sklearn
import mlflow.pyfunc
import mlflow.pyfunc.model
from sklearn.ensemble import RandomForestRegressor
import os
import glob


# Some data preparation

In [7]:
notebook_path = os.path.abspath("5. Testing.ipynb")
notebook_directory = os.path.dirname(notebook_path)

replacement_string = "data\\interim\\Cluster_Data_All_Month\\*.csv"
index = notebook_directory.find("\\notebooks")
modified_path = notebook_directory[:index]

modified_path += "\\" + replacement_string

In [8]:
csv_files = glob.glob(
    modified_path
)

In [9]:
weekday_file_paths = [path for path in csv_files if 'weekday' in path]
weekend_file_paths = [path for path in csv_files if 'weekend' in path]

In [10]:
weekday_dict = {}

for index, file in enumerate(weekday_file_paths):
    current_df = pd.read_csv(file, index_col=[0])
    current_df['Date'] =  pd.to_datetime(current_df['Date'])
    current_df.index = pd.to_datetime(current_df.index)

    weekday_dict[int(weekday_file_paths[index][-13:-12])] = current_df

In [11]:
weekend_dict = {}  

for index, file in enumerate(weekend_file_paths):
    current_df = pd.read_csv(file, index_col=[0])
    current_df['Date'] = pd.to_datetime(current_df['Date'])
    current_df.index = pd.to_datetime(current_df.index)

    weekend_dict[int(weekend_file_paths[index][-13:-12])] = current_df

In [12]:
endog_week_dict = {}

for label in weekday_dict:
    current_df = weekday_dict[label]
    current_df = current_df.reset_index()
    current_df = current_df.drop(columns=['index', 'Hour', 'Weekday', 'Date'])
    endog_week_dict[label] = current_df

In [13]:
endog_weekend_dict = {}

for label in weekend_dict:
    current_df = weekend_dict[label]
    current_df = current_df.reset_index()
    current_df = current_df.drop(columns=['index', 'Hour', 'Weekday', 'Date'])
    endog_weekend_dict[label] = current_df

In [14]:
for label in endog_week_dict:
    for diff in range(1,25,1):
        endog_week_dict[label][f"Count{-diff}"] = endog_week_dict[label].Count.diff(diff)
    endog_week_dict[label].dropna(inplace=True)   
    endog_week_dict[label].reset_index(inplace=True)     
    endog_week_dict[label].drop(columns='index', inplace=True)


In [14]:
notebook_path = os.path.abspath("5. Testing.ipynb")
notebook_directory = os.path.dirname(notebook_path)

replacement_string = "models\\Tracking"
index = notebook_directory.find("\\notebooks")
modified_path = notebook_directory[:index]

modified_path += "\\" + replacement_string
trackin_uri = "file:///" + modified_path 

In [15]:
mlflow.set_tracking_uri(trackin_uri)

# Specify the experiment name
experiment_name = "Random_Forest_All_Data"

# Set the active experiment context (creates the experiment if it doesn't exist)
experiment = mlflow.set_experiment(experiment_name)


# Defining the functions

In [47]:
def random_forest(data, label, testing_lenght, max_features, max_depth, min_samples_leaf): 

    # Create an MLflow experiment and start a run
    with mlflow.start_run(experiment_id=experiment.experiment_id, run_name=f"Random_Forest_{label}") as run:
        train_data = data[:-testing_lenght]
        test_data = data[-testing_lenght:]

    
        X_train = train_data.drop(['Count'], axis=1)
        y_train = train_data['Count']

        X_test = test_data.drop(['Count'], axis=1)
        y_test = test_data['Count']
        
        # Initialize and train the Random Forest model
        rf_model = RandomForestRegressor(
            n_estimators=100,  
            max_features=max_features, # 11
            max_depth=max_depth, # 20,
            min_samples_leaf= min_samples_leaf, # 1,
            random_state=42,  # Set a random seed for reproducibility
            n_jobs=-1  # Use all available CPU cores for parallel processing
        )
        
        rf_model.fit(X_train, y_train)

        # Make predictions
        y_pred_train = rf_model.predict(X_train)
        y_pred_test = rf_model.predict(X_test)

        # Calculate AIC and R-squared
        
        rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
        rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
        mae_train = mean_absolute_error(y_train, y_pred_train)
        mae_test = mean_absolute_error(y_test, y_pred_test)
        train_size = data[:-testing_lenght].Count.sum()/len(data[:-testing_lenght])
        test_size = data[-testing_lenght:].Count.sum()/len(data[-testing_lenght:])

        # Log AIC and R-squared
        mlflow.log_metrics({
            "RMSE_Train" : rmse_train,
            "RMSE_Test" : rmse_test,
            "MAE_Train" : mae_train,
            "MAE_Test" : mae_test,  
            "MAE_Train_Percentage" : mae_train/train_size,
            "MAE_Test_Percentage" : mae_test/test_size
        })

        # Log model parameters
        mlflow.log_params({
            'label' : label,
            'test_size' : test_size,
            'train_size' : train_size,
            'min_samples_leaf' : min_samples_leaf,
            'max_features' : max_features,
            'max_depth' : max_depth
        })

        # Log the trained model
        # mlflow.sklearn.log_model(m_sarimax_24, "SARIMAX")

In [66]:
def random_forest_all_data(data, label, testing_lenght, max_features, max_depth, min_samples_leaf): 

    # Create an MLflow experiment and start a run
    with mlflow.start_run(experiment_id=experiment.experiment_id, run_name=f"Random_Forest_{label}") as run:
        train_data = data[:-testing_lenght]
        test_data = data[-testing_lenght:]

    
        X_train = train_data.drop([f'Count_{label}'], axis=1)
        y_train = train_data[f'Count_{label}']

        X_test = test_data.drop([f'Count_{label}'], axis=1)
        y_test = test_data[f'Count_{label}']
        
        # Initialize and train the Random Forest model
        rf_model = RandomForestRegressor(
            n_estimators=100,  
            max_features=max_features, # 11
            max_depth=max_depth, # 20,
            min_samples_leaf= min_samples_leaf, # 1,
            random_state=42,  # Set a random seed for reproducibility
            n_jobs=-1  # Use all available CPU cores for parallel processing
        )
        
        rf_model.fit(X_train, y_train)

        # Make predictions
        y_pred_train = rf_model.predict(X_train)
        y_pred_test = rf_model.predict(X_test)

        # Calculate AIC and R-squared
        
        rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
        rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
        mae_train = mean_absolute_error(y_train, y_pred_train)
        mae_test = mean_absolute_error(y_test, y_pred_test)
        train_size = data[:-testing_lenght][f'Count_{label}'].sum()/len(data[:-testing_lenght])
        test_size = data[-testing_lenght:][f'Count_{label}'].sum()/len(data[-testing_lenght:])

        # Log AIC and R-squared
        mlflow.log_metrics({
            "RMSE_Train" : rmse_train,
            "RMSE_Test" : rmse_test,
            "MAE_Train" : mae_train,
            "MAE_Test" : mae_test,  
            "MAE_Train_Percentage" : mae_train/train_size,
            "MAE_Test_Percentage" : mae_test/test_size
        })

        # Log model parameters
        mlflow.log_params({
            'label' : label,
            'test_size' : test_size,
            'train_size' : train_size,
            'min_samples_leaf' : min_samples_leaf,
            'max_features' : max_features,
            'max_depth' : max_depth
        })

        # Log the trained model
        # mlflow.sklearn.log_model(m_sarimax_24, "SARIMAX")

In [48]:
def sarima(data, label, order, seasonal_order, testing_lenght):
    
    # Fit the SARIMA model with harmonic regression (exogenous variables)
    with mlflow.start_run(experiment_id=experiment.experiment_id, run_name=f"SARIMA{order}, {seasonal_order}, {label}") as run:
        data = data[['Count']]
        
        # Fit the SARIMA model with exogenous variables
        model = SARIMAX(data[:-testing_lenght], order=order, seasonal_order=seasonal_order)
        results = model.fit()
        
        # Calculate AIC, RMSE for training, and RMSE for testing
        aic = results.aic
        y_pred_train = results.fittedvalues
        y_pred_test = results.forecast(steps=testing_lenght)
        rmse_train = np.sqrt(mean_squared_error(data[:-testing_lenght], y_pred_train))
        rmse_test = np.sqrt(mean_squared_error(data[-testing_lenght:], y_pred_test))
        mae_train = mean_absolute_error(data['Count'][:-testing_lenght], y_pred_train)
        mae_test = mean_absolute_error(data['Count'][-testing_lenght:], y_pred_test)
        train_size = data[:-testing_lenght].Count.sum()/len(data[:-testing_lenght])
        test_size = data[-testing_lenght:].Count.sum()/len(data[-testing_lenght:])


        # Log AIC and R-squared
        mlflow.log_metrics({
            "AIC": aic,
            "RMSE_Train" : rmse_train,
            "RMSE_Test" : rmse_test,
            "MAE_Train" : mae_train,
            "MAE_Test" : mae_test,
            "MAE_Train" : mae_train,
            "MAE_Test" : mae_test,
            "MAE_Train_Percentage" : mae_train/train_size,
            "MAE_Test_Percentage" : mae_test/test_size

        })

        mlflow.log_params({
        "order": order,
        "seasonal_order": seasonal_order,
        'label' : label,
        'test_size' : test_size,
        'train_size' : train_size
        })
        # Log the trained model
        # mlflow.sklearn.log_model(results, "SARIMA")

In [49]:
def getMaxPeriodogram(freq, Pxx_spec, num_top=5):
    # Find the indices of the top 'num_top' maximum PSDs
    top_indices = np.argsort(Pxx_spec)[-num_top:][::-1]

    # Extract the frequencies and PSDs for the top 'num_top' maximum PSDs
    freqMax = freq[top_indices]
    maxPsd = Pxx_spec[top_indices]
    # Convert to periods
    periodMax = 1 / freqMax
    return maxPsd, freqMax, periodMax

In [50]:
def return_periodicities(data):
    freq, Pxx_spec = signal.periodogram(data.Count, scaling="spectrum")
    maxPsd, freqMax, periodMax = getMaxPeriodogram(freq, Pxx_spec)
        
    # Define periods
    period01, period02, period03, period04, period05 = list(1 / freqMax)

    # Extract the number hourly samples
    T = len(data)
    time = np.arange(1, T + 1)

    # Create regressors for the daily period
    omega01 = 2 * np.pi / period01
    s1 = np.cos(omega01 * time)
    c1 = np.sin(omega01 * time)

    # Create regressors for the weekly period
    omega02 = 2 * np.pi / period02
    s2 = np.cos(omega02 * time)
    c2 = np.sin(omega02 * time)

    omega03 = 2 * np.pi / period03
    s3 = np.cos(omega03 * time)
    c3 = np.sin(omega03 * time)

    omega04 = 2 * np.pi / period04
    s4 = np.cos(omega04 * time)
    c4 = np.sin(omega04 * time)

    omega05 = 2 * np.pi / period05
    s5 = np.cos(omega05 * time)
    c5 = np.sin(omega05 * time)

    # Stack regressors and add intercept
    regressors = np.column_stack((s1, c1, s2, c2, s3, c3, s4, c4, s5, c5))
    regressors_one = sm.add_constant(regressors)

    exog_var_week = regressors    

    return regressors_one, exog_var_week

In [51]:
def pure_harmonic_regression(data, label, testing_lenght):  

    regressors_one, exog_var_week = return_periodicities(data) 
    
# Create an MLflow experiment and start a run
    with mlflow.start_run(experiment_id=experiment.experiment_id, run_name=f"april_harmonic_week_{label}") as run:
        data = data[['Count']]
        # Create and fit the regression model
        model = sm.OLS(data[:-testing_lenght], regressors_one[:-testing_lenght])
        results_harmonic_week = model.fit()

        # Make predictions
        y_pred_train = results_harmonic_week.predict(regressors_one[:-testing_lenght])
        y_pred_test = results_harmonic_week.predict(regressors_one[-testing_lenght:])

        # Calculate AIC and R-squared and other stuff
        aic = results_harmonic_week.aic
        r_squared = results_harmonic_week.rsquared
        rmse_train = np.sqrt(mean_squared_error(data[:-testing_lenght], y_pred_train))
        rmse_test = np.sqrt(mean_squared_error(data[-testing_lenght:], y_pred_test))
        mae_train = mean_absolute_error(data['Count'][:-testing_lenght], y_pred_train)
        mae_test = mean_absolute_error(data['Count'][-testing_lenght:], y_pred_test)
        train_size = data[:-testing_lenght].Count.sum()/len(data[:-testing_lenght])
        test_size = data[-testing_lenght:].Count.sum()/len(data[-testing_lenght:])



        # Log model parameters
        mlflow.log_params({
            'label' : label,
            'test_size' : test_size,
            'train_size' : train_size
        })

        # Log AIC and R-squared
        mlflow.log_metrics({
            "AIC": aic,
            "R-squared": r_squared,
            "RMSE_Train" : rmse_train,
            "RMSE_Test" : rmse_test,
            "MAE_Train" : mae_train,
            "MAE_Test" : mae_test,
            "MAE_Train_Percentage" : mae_train/train_size,
            "MAE_Test_Percentage" : mae_test/test_size

        })

        # Log the trained model
        # mlflow.sklearn.log_model(results_harmonic_week, "harmonic_model")

        
        

In [155]:
def sarimax(data, label, testing_lenght): 
    
    data = data[['Count']]
    regressors_one, exog_var_week = return_periodicities(data) 

    # Create an MLflow experiment 
    with mlflow.start_run(experiment_id=experiment.experiment_id, run_name=f"SARIMAX {label}") as run:
        order = (24, 0, 0)
        seasonal_order = (0, 0, 0, 0)
        model_sarimax = SARIMAX(
            endog= data[:-testing_lenght],
            exog=exog_var_week[:-testing_lenght],
            trend="c",
            order=order,
            seasonal_order=seasonal_order,
            enforce_stationarity=True,
            enforce_invertibility=True,
        )
        m_sarimax_24 = model_sarimax.fit()

        # Make predictions
        y_pred_train = m_sarimax_24.predict()
        forecast = m_sarimax_24.get_forecast(steps=testing_lenght, exog=exog_var_week[-testing_lenght:])
        y_pred_test = forecast.predicted_mean

        # Calculate AIC and R-squared and more
        aic = m_sarimax_24.aic
        
        rmse_train = np.sqrt(mean_squared_error(data['Count'][:-testing_lenght], y_pred_train))
        rmse_test = np.sqrt(mean_squared_error(data['Count'][-testing_lenght:], y_pred_test))
        mae_train = mean_absolute_error(data['Count'][:-testing_lenght], y_pred_train)
        mae_test = mean_absolute_error(data['Count'][-testing_lenght:], y_pred_test)
        train_size = data[:-testing_lenght].Count.sum()/len(data[:-testing_lenght])
        test_size = data[-testing_lenght:].Count.sum()/len(data[-testing_lenght:])


        # Log AIC and R-squared
        mlflow.log_metrics({
            "AIC": aic,
            "RMSE_Train" : rmse_train,
            "RMSE_Test" : rmse_test,
            "MAE_Train" : mae_train,
            "MAE_Test" : mae_test,  
            "MAE_Train_Percentage" : mae_train/train_size,
            "MAE_Test_Percentage" : mae_test/test_size
        })

        # Log model parameters
        mlflow.log_params({
            'label' : label,
            'test_size' : test_size,
            'train_size' : train_size
        })

        # Log the trained model
        # mlflow.sklearn.log_model(m_sarimax_24, "SARIMAX")

# Defining the testers

In [156]:
def random_forest_tester(data, label, testing_lenght):
    random_forest(data, label, testing_lenght, 11, 20, 1)
    random_forest(data, label, testing_lenght, 11, 30, 1)
    random_forest(data, label, testing_lenght, 12, 30, 1)
    random_forest(data, label, testing_lenght, 14, 30, 1)
  

In [157]:
def function_wrapper(data, label, testing_lenght):
    print('started random forest')
    random_forest(data, label, testing_lenght, 12, 30, 1)
    print('started sarima', label)
    sarima(data, label, (0, 1, 6), (1, 1, 1, 24), testing_lenght)
    print('started sarima', label)
    sarima(data, label, (1, 0, 0), (1, 1, 0, 24), testing_lenght)
    print('started sarima', label)
    sarima(data, label, (1, 1, 0), (1, 1, 0, 24), testing_lenght)
    print('started sarima', label)
    sarima(data, label, (5, 1, 1), (1, 1, 1, 24), testing_lenght)
    print('started sarima', label)
    sarima(data, label, (2, 1, 1), (1, 1, 1, 24), testing_lenght)
    print('started sarimax', label)
    sarimax(data, label, testing_lenght)
    print('started pure harmonic', label)
    pure_harmonic_regression(data, label, testing_lenght)
    

# Cross Validation, which model is the best model on all data

In [158]:
testing_lenght = 48
tscv = TimeSeriesSplit(n_splits=5, test_size=testing_lenght)      

In [159]:
for label in endog_week_dict:
    for train_index, test_index in tscv.split(endog_week_dict[label]):
        data = endog_week_dict[label].iloc[np.hstack((train_index, test_index))]
        function_wrapper(data, label, testing_lenght)

started random forest


started sarima 0
started sarima 0
started sarima 0
started sarima 0
started sarima 0
started sarimax 0
started pure harmonic 0
started random forest
started sarima 0
started sarima 0
started sarima 0
started sarima 0
started sarima 0
started sarimax 0
started pure harmonic 0
started random forest
started sarima 0
started sarima 0
started sarima 0
started sarima 0
started sarima 0
started sarimax 0
started pure harmonic 0
started random forest
started sarima 0
started sarima 0
started sarima 0
started sarima 0
started sarima 0
started sarimax 0
started pure harmonic 0
started random forest
started sarima 0
started sarima 0
started sarima 0
started sarima 0
started sarima 0
started sarimax 0
started pure harmonic 0
started random forest
started sarima 1
started sarima 1
started sarima 1
started sarima 1




started sarima 1
started sarimax 1




started pure harmonic 1
started random forest
started sarima 1
started sarima 1
started sarima 1
started sarima 1




started sarima 1
started sarimax 1




started pure harmonic 1
started random forest
started sarima 1
started sarima 1
started sarima 1
started sarima 1




started sarima 1
started sarimax 1




started pure harmonic 1
started random forest
started sarima 1
started sarima 1
started sarima 1
started sarima 1




started sarima 1
started sarimax 1




started pure harmonic 1
started random forest
started sarima 1
started sarima 1
started sarima 1
started sarima 1




started sarima 1
started sarimax 1
started pure harmonic 1
started random forest
started sarima 2
started sarima 2
started sarima 2
started sarima 2
started sarima 2
started sarimax 2




started pure harmonic 2
started random forest
started sarima 2
started sarima 2
started sarima 2
started sarima 2
started sarima 2
started sarimax 2




started pure harmonic 2
started random forest
started sarima 2
started sarima 2
started sarima 2
started sarima 2
started sarima 2
started sarimax 2
started pure harmonic 2
started random forest
started sarima 2
started sarima 2
started sarima 2
started sarima 2




started sarima 2
started sarimax 2




started pure harmonic 2
started random forest
started sarima 2
started sarima 2
started sarima 2
started sarima 2




started sarima 2
started sarimax 2




started pure harmonic 2
started random forest
started sarima 3
started sarima 3
started sarima 3
started sarima 3
started sarima 3
started sarimax 3




started pure harmonic 3
started random forest
started sarima 3
started sarima 3
started sarima 3
started sarima 3
started sarima 3
started sarimax 3




started pure harmonic 3
started random forest
started sarima 3
started sarima 3
started sarima 3
started sarima 3
started sarima 3
started sarimax 3




started pure harmonic 3
started random forest
started sarima 3
started sarima 3
started sarima 3
started sarima 3
started sarima 3
started sarimax 3




started pure harmonic 3
started random forest
started sarima 3
started sarima 3
started sarima 3
started sarima 3
started sarima 3
started sarimax 3




started pure harmonic 3
started random forest
started sarima 4
started sarima 4
started sarima 4
started sarima 4
started sarima 4
started sarimax 4
started pure harmonic 4
started random forest
started sarima 4
started sarima 4
started sarima 4
started sarima 4
started sarima 4
started sarimax 4
started pure harmonic 4
started random forest
started sarima 4
started sarima 4
started sarima 4
started sarima 4
started sarima 4
started sarimax 4
started pure harmonic 4
started random forest
started sarima 4
started sarima 4
started sarima 4
started sarima 4
started sarima 4
started sarimax 4
started pure harmonic 4
started random forest
started sarima 4
started sarima 4
started sarima 4
started sarima 4
started sarima 4
started sarimax 4
started pure harmonic 4
started random forest
started sarima 5




started sarima 5
started sarima 5
started sarima 5




started sarima 5
started sarimax 5




started pure harmonic 5
started random forest
started sarima 5




started sarima 5
started sarima 5
started sarima 5
started sarima 5
started sarimax 5




started pure harmonic 5
started random forest
started sarima 5




started sarima 5
started sarima 5
started sarima 5
started sarima 5
started sarimax 5




started pure harmonic 5
started random forest
started sarima 5
started sarima 5
started sarima 5
started sarima 5




started sarima 5
started sarimax 5




started pure harmonic 5
started random forest
started sarima 5
started sarima 5
started sarima 5
started sarima 5
started sarima 5
started sarimax 5




started pure harmonic 5


In [167]:
metric_df_all_data = mlflow.search_runs(experiment_ids=experiment.experiment_id)

In [168]:
result = (
    metric_df_all_data.groupby(["tags.mlflow.runName", "params.label"], as_index=False)
    .agg({"metrics.RMSE_Test": "mean", "metrics.RMSE_Train" : "mean", "metrics.MAE_Train" : "mean", "metrics.MAE_Test" : "mean", "metrics.AIC": "mean", "metrics.MAE_Test_Percentage": "mean"})
)

In [169]:
result.sort_values("metrics.MAE_Test_Percentage")

Unnamed: 0,tags.mlflow.runName,params.label,metrics.RMSE_Test,metrics.RMSE_Train,metrics.MAE_Train,metrics.MAE_Test,metrics.AIC,metrics.MAE_Test_Percentage
0,Random_Forest_0,0,22.63341,7.349098,5.364426,15.614667,,0.024377
4,Random_Forest_4,4,28.951335,8.561758,6.184565,21.185583,,0.040566
1,Random_Forest_1,1,6.879836,1.834547,1.296058,5.270458,,0.055205
3,Random_Forest_3,3,3.601422,1.224552,0.794644,2.832913,,0.06096
5,Random_Forest_5,5,13.947903,3.415611,2.473884,11.249042,,0.075785
2,Random_Forest_2,2,1.672348,0.482572,0.358078,1.317105,,0.087805
35,"SARIMA(2, 1, 1), (1, 1, 1, 24), 5",5,31.973397,18.034402,12.478741,23.80963,25483.167575,0.164716
47,"SARIMA(5, 1, 1), (1, 1, 1, 24), 5",5,31.992314,18.020586,12.495845,23.903243,25484.396115,0.165432
30,"SARIMA(2, 1, 1), (1, 1, 1, 24), 0",0,143.672613,76.834859,50.802822,110.860077,33958.30942,0.172433
42,"SARIMA(5, 1, 1), (1, 1, 1, 24), 0",0,147.787325,76.633652,50.986447,114.136124,33948.022153,0.17737


In [163]:
for label in endog_week_dict:
    number = endog_week_dict[label].Count.sum()
    result.loc[result['params.label'] == str(label), 'rides'] = number
    result.loc[result['params.label'] == str(label), 'average_rides'] = number/len(endog_week_dict[label])


In [164]:
result['metrics.MAE_Train_Percentage'] = result['metrics.MAE_Train'] / result['average_rides']

In [None]:
result.sort_values("metrics.MAE_Train_Percentage")

Unnamed: 0,tags.mlflow.runName,params.label,metrics.RMSE_Test,metrics.RMSE_Train,metrics.MAE_Train,metrics.MAE_Test,metrics.AIC,metrics.MAE_Test_Percentage,rides,average_rides,metrics.MAE_Train_Percentage
0,Random_Forest_0,0,31.492213,7.431819,5.414677,21.231175,,0.032615,1537742.0,493.024046,0.010983
4,Random_Forest_4,4,37.767497,8.489476,6.113823,28.333514,,0.052899,1229333.0,394.143315,0.015512
1,Random_Forest_1,1,6.98929,1.88839,1.331575,5.156255,,0.054907,192105.0,61.591856,0.021619
3,Random_Forest_3,3,4.020363,1.256901,0.814637,2.919586,,0.063734,104311.0,33.443732,0.024358
5,Random_Forest_5,5,14.604697,3.436245,2.493103,11.751192,,0.082122,285859.0,91.65085,0.027202
2,Random_Forest_2,2,1.863601,0.49439,0.363957,1.432376,,0.097757,34137.0,10.944854,0.033254
6,"SARIMA(0, 1, 6), (1, 1, 1, 24), 0",0,184.911663,76.677382,52.239753,153.283233,32571.897971,0.23712,1537742.0,493.024046,0.105958
24,SARIMAX 0,0,218.796924,78.137498,55.926935,170.121188,33062.700602,0.261647,1537742.0,493.024046,0.113437
12,"SARIMA(1, 0, 0), (1, 1, 0, 24), 0",0,179.277816,97.802003,63.519043,142.050259,33618.466313,0.220358,1537742.0,493.024046,0.128836
18,"SARIMA(1, 1, 0), (1, 1, 0, 24), 0",0,403.913758,95.440295,63.607117,364.465373,33827.549825,0.574776,1537742.0,493.024046,0.129014


# Testing which parameters for random forest week are the best for all clusters

In [3]:
notebook_path = os.path.abspath("5. Testing.ipynb")
notebook_directory = os.path.dirname(notebook_path)

replacement_string = "models\\Tracking"
index = notebook_directory.find("\\notebooks")
modified_path = notebook_directory[:index]

modified_path += "\\" + replacement_string
tracking_uri = "file:///" + modified_path 

In [4]:
mlflow.set_tracking_uri(tracking_uri)
# Create a named experiment
experiment = mlflow.set_experiment("extensive_testing_forest")

In [21]:
# for label in endog_week_dict:
#     for train_index, test_index in tscv.split(endog_week_dict[label]):
#         data = endog_week_dict[label].iloc[np.hstack((train_index, test_index))]
#         random_forest_tester(data, label, testing_lenght)

In [16]:
metric_df_forest_data = mlflow.search_runs(experiment_ids=experiment.experiment_id)

In [17]:
result_forest = (
    metric_df_forest_data.groupby(["tags.mlflow.runName", "params.label"], as_index=False)
    .agg({"metrics.RMSE_Test": "mean", "metrics.RMSE_Train" : "mean", "metrics.MAE_Train" : "mean", "metrics.MAE_Test" : "mean", "metrics.MAE_Test_Percentage": "mean"})
)

In [18]:
grouped = metric_df_forest_data.groupby(["params.max_depth", "params.max_features", "params.label"], as_index=False).agg({"metrics.RMSE_Test": "mean", "metrics.RMSE_Train" : "mean", "metrics.MAE_Train" : "mean", "metrics.MAE_Test" : "mean", "metrics.MAE_Test_Percentage": "mean"})

In [19]:
for label in endog_week_dict:
    number = endog_week_dict[label].Count.sum()
    grouped.loc[grouped['params.label'] == str(label), 'rides'] = number

In [20]:
# Calculationg the weight means of the metrics per rides

def weighted_mean(series):
    return np.average(series, weights=grouped.loc[series.index, 'rides'])

weighted = grouped.groupby(["params.max_depth", "params.max_features"]).agg({
    "metrics.RMSE_Test": weighted_mean,
    "metrics.RMSE_Train": weighted_mean,
    "metrics.MAE_Train": weighted_mean,
    "metrics.MAE_Test": weighted_mean,
    "metrics.MAE_Test_Percentage": weighted_mean
}).reset_index()

weighted.sort_values('metrics.MAE_Test_Percentage')

Unnamed: 0,params.max_depth,params.max_features,metrics.RMSE_Test,metrics.RMSE_Train,metrics.MAE_Train,metrics.MAE_Test,metrics.MAE_Test_Percentage
2,30,12,29.655223,6.882202,4.986682,21.23623,0.046895
0,20,11,29.707348,6.918967,5.021707,21.291256,0.046988
1,30,11,29.791959,6.885468,4.986722,21.339203,0.047073
3,30,14,30.095425,6.945021,5.004489,21.463001,0.047219


# Appyling the cross validation for the models trained on each cluster and the whole data, inculsive week and weekend

In [21]:
notebook_path = os.path.abspath("5. Testing.ipynb")
notebook_directory = os.path.dirname(notebook_path)

replacement_string = "models\\Tracking"
index = notebook_directory.find("\\notebooks")
modified_path = notebook_directory[:index]

modified_path += "\\" + replacement_string
tracking_uri = "file:///" + modified_path 

In [22]:
mlflow.set_tracking_uri(tracking_uri)

# Specify the experiment name
experiment_name = "Random_Forest_All_Data"

# Set the active experiment context (creates the experiment if it doesn't exist)
experiment = mlflow.set_experiment(experiment_name)

In [26]:
notebook_path = os.path.abspath("4. Cross Validation.ipynb")
notebook_directory = os.path.dirname(notebook_path)

replacement_string = "data\\interim\\Cluster_Data_All_Month\\"
index = notebook_directory.find("\\notebooks")
modified_path = notebook_directory[:index]

modified_path += "\\" + replacement_string + "df_all_counts_grouped.csv"

In [27]:
df_all_data = pd.read_csv(modified_path )

In [28]:
df_all_data.Date = pd.to_datetime(df_all_data['Date'])
df_all_data.drop(columns='Unnamed: 0', inplace=True)

In [9]:
columns_to_drop = [col for col in df_all_data.columns if col.startswith('Count_')]

current_df_dict = {}

for label in range(0,6,1):
    # Drop all columns named 'count_'
    current_df = df_all_data.drop(columns=columns_to_drop)
    current_df[f'Count_{label}'] = df_all_data[f'Count_{label}']
    # Differencing the first 24 hours
    for diff in range(1,25,1):
        current_df[f'Count_{label}{-diff}'] = current_df[f'Count_{label}'].diff(diff)
    # differencing up to a week
    for diff in range(48,169,24):
        current_df[f'Count_{label}{-diff}'] = current_df[f'Count_{label}'].diff(diff)
    current_df.dropna(inplace=True)   
    current_df.reset_index(inplace=True)  
    current_df.drop(columns=['Date', 'index'], inplace=True)             
    current_df_dict[label] = current_df

In [53]:
testing_lenght = 48
tscv = TimeSeriesSplit(n_splits=5, test_size=testing_lenght)      

In [67]:
for label in current_df_dict:
    for train_index, test_index in tscv.split(current_df_dict[label]):
        data = current_df_dict[label].iloc[np.hstack((train_index, test_index))]
        if label == 0:
            random_forest_all_data(data, label, testing_lenght, 15, 30, 1)
        elif label == 1:
            random_forest_all_data(data, label, testing_lenght, 15, 20, 2)
        elif label == 2:
            random_forest_all_data(data, label, testing_lenght, 11, 20, 1)
        elif label == 3:
            random_forest_all_data(data, label, testing_lenght, 14, 35, 1)
        elif label == 4:
            random_forest_all_data(data, label, testing_lenght, 15, 35, 1)
        elif label == 5:
            random_forest_all_data(data, label, testing_lenght, 15, 35, 2)

## Evaluation

In [23]:
metric_df_forest_data = mlflow.search_runs(experiment_ids=experiment.experiment_id)

In [24]:
result_forest = (
    metric_df_forest_data.groupby(["tags.mlflow.runName", "params.label"], as_index=False)
    .agg({"metrics.RMSE_Test": "mean", "metrics.RMSE_Train" : "mean", "metrics.MAE_Train" : "mean", "metrics.MAE_Test" : "mean", "metrics.MAE_Test_Percentage": "mean"})
)

In [25]:
result_forest 

Unnamed: 0,tags.mlflow.runName,params.label,metrics.RMSE_Test,metrics.RMSE_Train,metrics.MAE_Train,metrics.MAE_Test,metrics.MAE_Test_Percentage
0,Random_Forest_0,0,28.582933,7.514833,5.265419,21.259667,0.039553
1,Random_Forest_1,1,9.298239,2.025233,1.343381,7.701599,0.074656
2,Random_Forest_2,2,1.507372,0.41589,0.305857,1.092303,0.073015
3,Random_Forest_3,3,3.871991,1.0937,0.740751,3.144917,0.062975
4,Random_Forest_4,4,28.061904,8.607394,5.778372,20.514708,0.04146
5,Random_Forest_5,5,16.988839,3.539898,2.419618,14.095725,0.079969


### Calculating the weighted Mean

In [10]:
number_all_rides = 0
for label in range(0,6,1):
    number_all_rides += df_all_data[f'Count_{label}'].sum()

In [11]:
label_sums = []

for label in range(0,6,1):
    number = df_all_data[f'Count_{label}'].sum()
    result_forest.loc[result_forest['params.label'] == str(label), 'rides'] = number/number_all_rides

In [12]:
result_forest['weighted_MAE_Test_Percentage'] = result_forest['metrics.MAE_Test_Percentage'] * result_forest.rides

### Weigted mean for all over New York

In [13]:
result_forest.weighted_MAE_Test_Percentage.sum()

0.04753064065463