# Decision Tree

## Max Depth Tuning Function

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

def tune_max_depth(X_train, y_train, cv=5):
    """Grid-search ``max_depth`` for a ``DecisionTreeRegressor``.

    Every integer depth from 3 to 50 (inclusive) is scored by cross-validation
    using negated mean squared error. The winning depth is printed and
    returned.

    Parameters
    ----------
    X_train : training features, handed unchanged to ``GridSearchCV.fit``.
    y_train : training targets, handed unchanged to ``GridSearchCV.fit``.
    cv : forwarded as the ``cv`` argument of ``GridSearchCV`` (default 5).

    Returns
    -------
    The ``'max_depth'`` entry of the search's ``best_params_``.
    """
    candidate_depths = {'max_depth': [*range(3, 51)]}
    base_tree = DecisionTreeRegressor(random_state=42)

    searcher = GridSearchCV(
        base_tree,
        candidate_depths,
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    searcher.fit(X_train, y_train)

    best_depth = searcher.best_params_['max_depth']
    print("Best max_depth:", best_depth)
    return best_depth

## Min Samples Split Tuning Function

In [14]:
def tune_min_samples_split(X_train, y_train, max_depth, cv=5):
    """Grid-search ``min_samples_split`` with ``max_depth`` held fixed.

    Candidate values are the integers 2 through 20 (inclusive). Each one is
    scored by cross-validation using negated mean squared error. The best
    value is printed and returned.

    Parameters
    ----------
    X_train : training features, handed unchanged to ``GridSearchCV.fit``.
    y_train : training targets, handed unchanged to ``GridSearchCV.fit``.
    max_depth : depth limit applied to every candidate tree.
    cv : forwarded as the ``cv`` argument of ``GridSearchCV`` (default 5).

    Returns
    -------
    The ``'min_samples_split'`` entry of the search's ``best_params_``.
    """
    depth_limited_tree = DecisionTreeRegressor(max_depth=max_depth, random_state=42)
    search_settings = dict(cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)

    tuner = GridSearchCV(
        depth_limited_tree,
        {'min_samples_split': list(range(2, 21))},
        **search_settings,
    )
    tuner.fit(X_train, y_train)

    winner = tuner.best_params_['min_samples_split']
    print("Best min_samples_split:", winner)
    return winner

## Min Samples Leaf Tuning Function

In [15]:
def tune_min_samples_leaf(X_train, y_train, max_depth, min_samples_split, cv=5):
    """Grid-search ``min_samples_leaf`` with depth and split size held fixed.

    Candidate values are the integers 1 through 10 (inclusive). Each one is
    scored by cross-validation using negated mean squared error. The best
    value is printed and returned.

    Parameters
    ----------
    X_train : training features, handed unchanged to ``GridSearchCV.fit``.
    y_train : training targets, handed unchanged to ``GridSearchCV.fit``.
    max_depth : depth limit applied to every candidate tree.
    min_samples_split : split threshold applied to every candidate tree.
    cv : forwarded as the ``cv`` argument of ``GridSearchCV`` (default 5).

    Returns
    -------
    The ``'min_samples_leaf'`` entry of the search's ``best_params_``.
    """
    fixed_params = {
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'random_state': 42,
    }
    leaf_grid = {'min_samples_leaf': list(range(1, 11))}

    leaf_search = GridSearchCV(
        DecisionTreeRegressor(**fixed_params),
        leaf_grid,
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    leaf_search.fit(X_train, y_train)

    chosen_leaf = leaf_search.best_params_['min_samples_leaf']
    print("Best min_samples_leaf:", chosen_leaf)
    return chosen_leaf

## Decision Tree Function

In [38]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

def evaluate_fixed_decision_tree(X_train, X_test, y_train, y_test,
                                 max_depth, min_samples_split, min_samples_leaf, random_state=42):
    """Fit a decision tree with fixed hyperparameters and print train/test MSE.

    Targets and predictions are passed through ``np.exp`` before scoring, so
    the reported errors are on the back-transformed scale.
    NOTE(review): this assumes ``y_train``/``y_test`` are natural-log values;
    if the preprocessing used ``log1p`` the inverse should be ``expm1`` —
    confirm against the notebook that produced the pickles.

    Parameters
    ----------
    X_train, X_test : feature sets used for fitting and for held-out scoring.
    y_train, y_test : matching targets (assumed log-scale, see note above).
    max_depth, min_samples_split, min_samples_leaf : forwarded unchanged to
        ``DecisionTreeRegressor``.
    random_state : seed forwarded to ``DecisionTreeRegressor`` (default 42).

    Returns
    -------
    None. Three lines are printed: train MSE, test MSE and the test/train
    ratio. When the training MSE is exactly zero the ratio is reported as
    ``inf`` (or ``nan`` if the test MSE is zero too) instead of dividing by
    zero.
    """
    # Train the model with fixed hyperparameters
    model = DecisionTreeRegressor(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state
    )
    model.fit(X_train, y_train)

    # Predict (in log scale)
    y_train_pred_log = model.predict(X_train)
    y_test_pred_log = model.predict(X_test)

    # Exponentiate to get back to original scale
    y_train_true = np.exp(y_train)
    y_test_true = np.exp(y_test)
    y_train_pred = np.exp(y_train_pred_log)
    y_test_pred = np.exp(y_test_pred_log)

    # Compute MSEs and ratio
    mse_train = mean_squared_error(y_train_true, y_train_pred)
    mse_test = mean_squared_error(y_test_true, y_test_pred)

    # A tree that memorises the training set yields mse_train == 0; dividing by
    # it would raise ZeroDivisionError (plain floats) or emit a RuntimeWarning
    # (numpy scalars), so that case is handled explicitly.
    if mse_train > 0:
        overfit_ratio = mse_test / mse_train
    elif mse_test > 0:
        overfit_ratio = float("inf")
    else:
        overfit_ratio = float("nan")

    # Print results
    print(f"Train MSE: {mse_train:.6f}")
    print(f"Test MSE: {mse_test:.6f}")
    print(f"Test/Train MSE ratio: {overfit_ratio:.2f}")

## Randomized Gradient Boosting Function

In [50]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.metrics import mean_squared_error
import numpy as np

def gradient_boosting_tuning(X_train, X_test, y_train, y_test, n_iter=50, cv=5, random_state=42):
    """Randomized hyperparameter search for gradient boosting, then report MSE.

    ``RandomizedSearchCV`` samples ``n_iter`` configurations, scores them with
    ``cv``-fold cross-validation on negated mean squared error, and the best
    estimator is evaluated on the train and test sets. Targets and
    predictions are passed through ``np.exp`` before the final scoring.
    NOTE(review): this assumes the targets are natural-log values — confirm
    against the preprocessing that produced them.

    Parameters
    ----------
    X_train, X_test : feature sets used for the search and for held-out scoring.
    y_train, y_test : matching targets (assumed log-scale, see note above).
    n_iter : number of sampled configurations (default 50).
    cv : forwarded as the ``cv`` argument of ``RandomizedSearchCV`` (default 5).
    random_state : seed used for both the estimator and the search (default 42).

    Returns
    -------
    None. The best hyperparameters, train MSE, test MSE and the test/train
    ratio are printed. When the training MSE is exactly zero the ratio is
    reported as ``inf`` (or ``nan`` if the test MSE is zero too) instead of
    dividing by zero.
    """
    # Define hyperparameter search space.
    # scipy conventions: randint(low, high) excludes `high`, and
    # uniform(loc, scale) spans [loc, loc + scale] — so learning_rate is
    # drawn from [0.01, 0.31], not [0.01, 0.3].
    param_dist = {
        'n_estimators': randint(50, 300),
        'learning_rate': uniform(0.01, 0.3),
        'max_depth': randint(2, 10),
        'min_samples_split': randint(2, 10),
        'min_samples_leaf': randint(1, 10),
    }

    # Set up randomized search
    search = RandomizedSearchCV(
        GradientBoostingRegressor(random_state=random_state),
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring='neg_mean_squared_error',
        random_state=random_state,
        n_jobs=-1
    )

    # Fit the model
    search.fit(X_train, y_train)

    # Best model
    best_model = search.best_estimator_

    # Predict
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    # Exponentiate predictions and true values to return to original scale
    y_train_exp = np.exp(y_train)
    y_test_exp = np.exp(y_test)
    y_train_pred_exp = np.exp(y_train_pred)
    y_test_pred_exp = np.exp(y_test_pred)

    # Evaluate
    mse_train = mean_squared_error(y_train_exp, y_train_pred_exp)
    mse_test = mean_squared_error(y_test_exp, y_test_pred_exp)

    # Guard the ratio: a zero training error would otherwise raise
    # ZeroDivisionError (plain floats) or emit a RuntimeWarning (numpy scalars).
    if mse_train > 0:
        ratio = mse_test / mse_train
    elif mse_test > 0:
        ratio = float("inf")
    else:
        ratio = float("nan")

    print("Best hyperparameters:", search.best_params_)
    print(f"Train MSE: {mse_train:.6f}")
    print(f"Test MSE: {mse_test:.6f}")
    print(f"Test/Train MSE Ratio: {ratio:.2f}")

## df_base

In [3]:
import pandas as pd

# Read the "base" feature matrices and targets from local pickle files.
# NOTE(review): unpickling can execute arbitrary code — only load files
# produced by a trusted step of this project.
X_base_train, X_base_test = (
    pd.read_pickle("X_base_train.pkl"),
    pd.read_pickle("X_base_test.pkl"),
)
y_base_train, y_base_test = (
    pd.read_pickle("y_base_train.pkl"),
    pd.read_pickle("y_base_test.pkl"),
)

### Hyper Parameter Tuning

In [27]:
# Tune one hyperparameter at a time; each step reuses the winners found so far.
best_base_depth = tune_max_depth(X_base_train, y_base_train)
best_base_split = tune_min_samples_split(
    X_base_train,
    y_base_train,
    max_depth=best_base_depth,
)
best_base_leaf = tune_min_samples_leaf(
    X_base_train,
    y_base_train,
    max_depth=best_base_depth,
    min_samples_split=best_base_split,
)

Best max_depth: 29
Best min_samples_split: 13
Best min_samples_leaf: 3


### Decision Tree

In [40]:
# Score a tree built from the tuned base hyperparameters.
evaluate_fixed_decision_tree(
    X_base_train, X_base_test, y_base_train, y_base_test,
    max_depth=best_base_depth,
    min_samples_split=best_base_split,
    min_samples_leaf=best_base_leaf,
    random_state=42,
)

Train MSE: 956986723.985752
Test MSE: 7257474587.897000
Test/Train MSE ratio: 7.58


### Randomized Grad Boost

In [51]:
# Randomized search plus evaluation on the base data set.
gradient_boosting_tuning(
    X_base_train, X_base_test, y_base_train, y_base_test,
    n_iter=50,
    cv=5,
    random_state=42,
)

Best hyperparameters: {'learning_rate': 0.1624596330222156, 'max_depth': 9, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 236}
Train MSE: 858998748.284177
Test MSE: 5639625770.303349
Test/Train MSE Ratio: 6.57


## df_post_covid

In [4]:
import pandas as pd

# Read the "post" (post-covid section) feature matrices and targets from
# local pickle files.
# NOTE(review): unpickling can execute arbitrary code — only load files
# produced by a trusted step of this project.
X_post_train, X_post_test = (
    pd.read_pickle("X_post_train.pkl"),
    pd.read_pickle("X_post_test.pkl"),
)
y_post_train, y_post_test = (
    pd.read_pickle("y_post_train.pkl"),
    pd.read_pickle("y_post_test.pkl"),
)

### Hyper Parameter Tuning

In [29]:
# Tune one hyperparameter at a time; each step reuses the winners found so far.
best_post_depth = tune_max_depth(X_post_train, y_post_train)
best_post_split = tune_min_samples_split(
    X_post_train,
    y_post_train,
    max_depth=best_post_depth,
)
best_post_leaf = tune_min_samples_leaf(
    X_post_train,
    y_post_train,
    max_depth=best_post_depth,
    min_samples_split=best_post_split,
)

Best max_depth: 25
Best min_samples_split: 17
Best min_samples_leaf: 3


### Decision Tree

In [39]:
# Score a tree built from the tuned post-covid hyperparameters.
evaluate_fixed_decision_tree(
    X_post_train, X_post_test, y_post_train, y_post_test,
    max_depth=best_post_depth,
    min_samples_split=best_post_split,
    min_samples_leaf=best_post_leaf,
    random_state=42,
)

Train MSE: 1437356662.178113
Test MSE: 5354710257.969723
Test/Train MSE ratio: 3.73


### Randomized Grad Boost

In [52]:
# Randomized search plus evaluation on the post-covid data set.
gradient_boosting_tuning(
    X_post_train, X_post_test, y_post_train, y_post_test,
    n_iter=50,
    cv=5,
    random_state=42,
)

Best hyperparameters: {'learning_rate': 0.07370173320348283, 'max_depth': 9, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 253}
Train MSE: 1059707867.696193
Test MSE: 3397103048.776862
Test/Train MSE Ratio: 3.21
