In [1]:
import os
import sys
# Make project-local modules importable from this notebook. All locations are
# derived from the current working directory.
parent_dir = os.path.dirname(os.getcwd())
grandparent_dir = os.path.dirname(parent_dir)

# grandparent_dir is absolute (it comes from os.getcwd()), and os.path.join
# discards every component that precedes an absolute one, so the path is built
# from grandparent_dir alone instead of passing a silently ignored prefix.
utils_dir = os.path.join(grandparent_dir, "src", "utils")

# Append only when missing so re-running this cell does not pile up duplicate
# sys.path entries.
for extra_path in (parent_dir, utils_dir):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

from weighted_accuracy_and_tools import decompose_y, reconstruct_y, weighted_accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor

import numpy as np
import pandas as pd

In [2]:
# Read X_train.csv and y_train.csv from ../../data/enriched_input (resolved
# relative to the notebook's working directory).
enriched_dir = os.path.join("..", "..", "data", "enriched_input")

X_path = os.path.join(enriched_dir, "X_train.csv")
y_path = os.path.join(enriched_dir, "y_train.csv")

X = pd.read_csv(X_path, sep=",")
y = pd.read_csv(y_path, sep=",")

In [3]:
# Index both frames by DELIVERY_START, parsed as timezone-aware UTC timestamps.
#
# The index-name guard makes this cell safe to re-run: once DELIVERY_START has
# been moved into the index it is no longer a column, so an unconditional
# set_index would raise KeyError on a second execution. On a first run the
# index name differs, so a missing column still fails loudly.
INDEX_COL = "DELIVERY_START"

if X.index.name != INDEX_COL:
    X = X.set_index(INDEX_COL)
if y.index.name != INDEX_COL:
    y = y.set_index(INDEX_COL)

# Converting an index that is already a UTC DatetimeIndex is a no-op, so these
# two lines are safe to repeat.
X.index = pd.to_datetime(X.index, utc=True)
y.index = pd.to_datetime(y.index, utc=True)

X.shape

(10605, 16)

In [4]:
# decompose_y is a project helper imported from weighted_accuracy_and_tools;
# only its second return value is kept and used as the regression target.
# NOTE(review): presumably it returns (sign, magnitude) of the delta — confirm
# against the helper's definition.
target_column = y["spot_id_delta"]
_, y_magintude = decompose_y(target_column)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): train_test_split shuffles by default and X is indexed by
# timestamp, so test rows are interleaved in time with training rows. Confirm
# that a random (rather than chronological) split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_magintude,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Base learners. The seeded ones share random_state=42 so results are
# repeatable.
linear_reg = LinearRegression()
knn_reg = KNeighborsRegressor()
decision_tree_reg = DecisionTreeRegressor(random_state=42)
random_forest_reg = RandomForestRegressor(random_state=42)
gradient_boosting_reg = GradientBoostingRegressor(random_state=42)

# (name, estimator) pairs handed to the ensemble, in the order they are combined.
base_estimators = [
    ('linear_reg', linear_reg),
    ('decision_tree_reg', decision_tree_reg),
    ('random_forest_reg', random_forest_reg),
    ('gradient_boosting_reg', gradient_boosting_reg),
    ('knn_reg', knn_reg),
]

# Build the voting ensemble over all base learners and fit it on the training split.
voting_reg = VotingRegressor(estimators=base_estimators)
voting_reg.fit(X_train, y_train)

# Score the ensemble on the held-out split: RMSE is the square root of the MSE.
y_pred_voting = voting_reg.predict(X_test)
mse_voting = mean_squared_error(y_test, y_pred_voting)
rmse_voting = np.sqrt(mse_voting)

In [7]:
# Display the voting ensemble's RMSE on the held-out test split.
rmse_voting

17.25741154116155

In [8]:
# Display name -> base learner, in the order the results should be reported.
named_regressors = {
    'Linear Regression': linear_reg,
    'Decision Tree Regressor': decision_tree_reg,
    'Random Forest Regressor': random_forest_reg,
    'Gradient Boosting Regressor': gradient_boosting_reg,
    'KNN Regressor': knn_reg,
}
regressor_names = list(named_regressors.keys())
regressors = list(named_regressors.values())

# Fit each base learner on its own and record its test-split RMSE, so the
# individual models can be compared against the voting ensemble.
rmse_values = {}
for name, reg in named_regressors.items():
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse_values[name] = np.sqrt(mse)

In [9]:
# Display each individual model's test-split RMSE, keyed by its display name.
rmse_values

{'Linear Regression': 23.741257273868058,
 'Decision Tree Regressor': 22.372004710839715,
 'Random Forest Regressor': 15.3864156190305,
 'Gradient Boosting Regressor': 19.553636956303297,
 'KNN Regressor': 20.580274326210798}