<a href="https://colab.research.google.com/github/MCn21thCntry/Practical-Machine-Learning---from-the-rooter-to-the-tooter/blob/main/Module_7_Comprehensive_Advanced_Ensemble_Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticate with Kaggle before any download is attempted
# (presumably prompts for credentials -- confirm against the kagglehub docs).
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the two data sources used in this notebook: the house-prices
# competition data (sections A and C) and the CarDekho vehicle data (section B).
# NOTE(review): the returned values are stored as "*_path" but never used below;
# later cells read from hard-coded "/kaggle/input/..." paths instead.
house_prices_advanced_regression_techniques_path = kagglehub.competition_download('house-prices-advanced-regression-techniques')
nehalbirla_vehicle_dataset_from_cardekho_path = kagglehub.dataset_download('nehalbirla/vehicle-dataset-from-cardekho')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Deep Dive into Random Forests and Bagging Versatility - Regression and Classification

## A: Hyperparameter Tuning Challenge (Regression or Classification)

* ### Select either the House Price Regression task OR the Digits Classification task from this module.
* I chose both of them, as follows.

In [None]:
# House-price training data for the regression task.
house_df = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")

# Digits data for the classification task.
# NOTE(review): despite the "_df" suffix, the next cell reads this object through
# its `.data` / `.target` attributes rather than treating it as a DataFrame.
from sklearn.datasets import load_digits
digits_df = load_digits()
digits_df

In [None]:
# Feature matrix and label vector of the digits data set.
X_digits, y_digits = digits_df.data, digits_df.target

In [None]:
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Digits (classification): 80/20 split with a fixed seed for reproducibility.
X_train_digits, X_val_digits, y_train_digits, y_val_digits = train_test_split(
    X_digits, y_digits, test_size=0.2, random_state=21
)

# House prices (regression): keep only the numeric predictors and separate the target.
y = house_df["SalePrice"]
numeric_features = house_df.select_dtypes(include=np.number)
X = numeric_features.drop("SalePrice", axis=1)

# Split BEFORE imputing: the column means must be learned from the training rows
# only, otherwise information from the validation rows leaks into the features.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=21)
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)  # learn the means on train, then fill
X_val = imputer.transform(X_val)          # reuse the training means


* **Systematic Hyperparameter Tuning:** Instead of manual exploration, use a systematic approach to hyperparameter tuning. Choose either **Grid Search** (GridSearchCV) or **Randomized Search** (RandomizedSearchCV) from scikit-learn to tune hyperparameters for RandomForestRegressor (or RandomForestClassifier). Tune at least **three** hyperparameters (e.g., n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features, criterion, bootstrap).

### RandomForest

In [None]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
import numpy as np

# Baseline forests with default hyperparameters; only the seed is pinned.
rf_cls = RandomForestClassifier(random_state=21)
rf_cls.fit(X_train_digits, y_train_digits)

rf_reg = RandomForestRegressor(random_state=21)
rf_reg.fit(X_train, y_train)

# Validation metrics: accuracy for the classifier, MAE and MSE for the regressor.
rf_cls_val_predictions = rf_cls.predict(X_val_digits)
rf_reg_val_predictions = rf_reg.predict(X_val)

rf_cls_accuracy = accuracy_score(y_val_digits, rf_cls_val_predictions)
rf_reg_mae = mean_absolute_error(y_val, rf_reg_val_predictions)
rf_reg_mse = mean_squared_error(y_val, rf_reg_val_predictions)

print("rf_cls_accuracy: ", rf_cls_accuracy)
print("rf_reg_mae: ", rf_reg_mae)
# RMSE is reported as the square root of the MSE.
print("rf_reg_rmse: ", np.sqrt(rf_reg_mse))

In [None]:
# Show the hyperparameters the baseline forests were built with: the full
# parameter dict of each model, plus three classifier settings read directly.
print("Clf parameters: ", rf_cls.get_params())
print("Clf parameters: ", rf_cls.n_estimators, rf_cls.max_depth, rf_cls.max_features)
print("Reg parameters: ", rf_reg.get_params())

* **Evaluate Best Model:** After tuning, evaluate the performance of your "best" Random Forest model (found by Grid Search or Randomized Search) on the validation set (or a separate test set if you create one). Report the best hyperparameters found and the corresponding performance (MAE for regression, Accuracy for classification).
    * **Compare to Baseline:** Compare the performance of your tuned Random Forest model to the baseline Random Forest model (with default hyperparameters) from this module and to the best performing Bagging model you identified in your performance comparison analysis. Did hyperparameter tuning significantly improve performance beyond Bagging or the baseline Random Forest?

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Search space for the classifier (exhaustive grid: 3 * 4 * 2 = 24 candidates).
clf_param_grid = {"n_estimators": [50, 100, 250],
                 "max_depth": [None, 10, 20, 30],
                 "max_features": ["sqrt", "log2"]}

# Search space for the regressor (sampled, see n_iter below).
# `max_features="auto"` was deprecated in scikit-learn 1.1 and removed in 1.3.
# For a forest regressor it meant "use all features", which is spelled 1.0 and
# is accepted by every scikit-learn version.
reg_param_dist = {"n_estimators": [50, 100, 150, 200],
                 "max_depth": [None, 10, 20, 30, 40],
                 "min_samples_split": [2, 5, 10],
                 "min_samples_leaf": [1, 2, 4],
                 "max_features": [1.0, "sqrt", "log2"]}

# --- Classification: grid search, 5-fold CV, scored by accuracy ---
grid_search = GridSearchCV(RandomForestClassifier(random_state=21),
                          clf_param_grid,
                          cv=5,
                          scoring="accuracy",
                          n_jobs=-1)
grid_search.fit(X_train_digits, y_train_digits)
print("RF Classification best params: ",    grid_search.best_params_)
# predict() on the search object uses the estimator refitted with the best parameters.
tuned_clf_accuracy = accuracy_score(y_val_digits, grid_search.predict(X_val_digits))
print("Tuned Classification accuracy: ",    tuned_clf_accuracy)

# --- Regression: randomized search, 5-fold CV, scored by (negated) MAE ---
random_search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=21),
                              param_distributions=reg_param_dist,
                              n_iter=20,  # number of parameter settings that are sampled
                              cv=5,
                              scoring="neg_mean_absolute_error",
                              random_state=21,
                              n_jobs=-1)
random_search.fit(X_train, y_train)
print("RF Regression best params: ",    random_search.best_params_)
tuned_reg_mae = mean_absolute_error(y_val, random_search.predict(X_val))
print("Tuned Regression mae: ",    tuned_reg_mae)

### It looks like hyperparameter tuning had an unintended negative effect! However, don't forget that we did not dwell on the preprocessing stage, which is the most important part of ML modelling.

### Bagging

In [None]:
# Quick benchmark of many off-the-shelf models with LazyPredict, used below to
# shortlist candidate base estimators for Bagging.
!pip install lazypredict
from lazypredict.Supervised import LazyRegressor
from lazypredict.Supervised import LazyClassifier
from lightgbm import LGBMClassifier, LGBMRegressor

# Classification benchmark on the digits split.
clf = LazyClassifier(predictions=True)
clf_models, clf_predictions = clf.fit(X_train_digits, X_val_digits, y_train_digits, y_val_digits)
print(f"Classification Models Performance:\n {clf_models}")

# Regression benchmark on the house-price split.
reg = LazyRegressor(predictions=True)
reg_models, reg_predicitions = reg.fit(X_train, X_val, y_train, y_val)
print(f"Regression Models Performance:\n {reg_models}")

In [None]:
# The models below ranked best in the LazyClassifier / LazyRegressor comparison.
# Fit each of them with default settings and score them on the validation set
# to decide which ones to use as base estimators for the Bagging models.
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.tree import ExtraTreeRegressor, ExtraTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC

KN_clf_model = KNeighborsClassifier()
KN_clf_model.fit(X_train_digits, y_train_digits)
KN_reg_model = KNeighborsRegressor()
KN_reg_model.fit(X_train, y_train)

Lgbm_clf_model = LGBMClassifier(verbosity=0)
Lgbm_clf_model.fit(X_train_digits, y_train_digits)
Lgbm_reg_model = LGBMRegressor(verbosity=0)
Lgbm_reg_model.fit(X_train, y_train)

Xgboost_clf_model = XGBClassifier()
Xgboost_clf_model.fit(X_train_digits, y_train_digits)
Xgboost_reg_model = XGBRegressor()
Xgboost_reg_model.fit(X_train, y_train)

Extra_clf_model = ExtraTreeClassifier()
Extra_clf_model.fit(X_train_digits, y_train_digits)
Extra_reg_model = ExtraTreeRegressor()
Extra_reg_model.fit(X_train, y_train)

RForest_clf_model = RandomForestClassifier()
RForest_clf_model.fit(X_train_digits, y_train_digits)
RForest_reg_model = RandomForestRegressor()
RForest_reg_model.fit(X_train, y_train)

SVC_model = SVC()
SVC_model.fit(X_train_digits, y_train_digits)
SVR_model = SVR()
SVR_model.fit(X_train, y_train)

classifiers = [KN_clf_model, Lgbm_clf_model, Xgboost_clf_model, Extra_clf_model, RForest_clf_model, SVC_model]
regressors = [KN_reg_model, Lgbm_reg_model, Xgboost_reg_model, Extra_reg_model, RForest_reg_model, SVR_model]

# Validation accuracy per classifier, keyed by class name.
# The loop variable is called `model` so that it does not overwrite the
# LazyClassifier / LazyRegressor objects named `clf` / `reg`.
classifiers_accuracies = {}
for model in classifiers:
    model_name = model.__class__.__name__
    accuracy = accuracy_score(y_val_digits, model.predict(X_val_digits))
    classifiers_accuracies[model_name] = accuracy
    print(f"{model_name} Accuracy: {accuracy}")

# Validation MAE / RMSE per regressor, keyed by class name.
regressors_errors = {}
for model in regressors:
    model_name = model.__class__.__name__
    predictions = model.predict(X_val)
    mae = mean_absolute_error(y_val, predictions)
    # RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error was
    # deprecated in scikit-learn 1.4 and later removed.
    rmse = np.sqrt(mean_squared_error(y_val, predictions))
    regressors_errors[model_name] = {"MAE": mae, "RMSE": rmse}
    print(f"{model_name} MAE: {mae}")
    print(f"{model_name} RMSE: {rmse}")

In [None]:
## KNeighborsClassifier is the best for Classification for digits dataset
## RandomForestRegressor is the best for Regression for Home dataset
## Then Let's use:
from sklearn.ensemble import BaggingRegressor, BaggingClassifier

# The base model is passed positionally: its keyword was `base_estimator` up to
# scikit-learn 1.1 and is `estimator` from 1.2 on (the old name was removed in
# 1.4), while the first positional slot works in every version.
bagg_cls = BaggingClassifier(KNeighborsClassifier(), n_estimators=10, random_state=21)
bagg_cls.fit(X_train_digits, y_train_digits)
bagg_cls_predictions = bagg_cls.predict(X_val_digits)
bagg_cls_accuracy = accuracy_score(y_val_digits, bagg_cls_predictions)
print(f"Accuracy: {bagg_cls_accuracy}")

bagg_reg = BaggingRegressor(RandomForestRegressor(), n_estimators=10, random_state=21)
bagg_reg.fit(X_train, y_train)
bagg_reg_predictions = bagg_reg.predict(X_val)
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and later removed.
bagg_reg_rmse = np.sqrt(mean_squared_error(y_val, bagg_reg_predictions))
print(f"RMSE: {bagg_reg_rmse}")

* ### KNeighborsClassifier and RandomForestRegressor have given the best results so far, even better than BaggingClassifier and BaggingRegressor.

In [None]:
# Search space for the KNN classifier (exhaustive grid).
KNeighborsclf_param_grid = {"n_neighbors": [5, 10, 20, 30],
                 "weights": ["uniform", "distance"],
                 "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
                 "leaf_size": [10, 20, 30, 40],
                 "p": [1, 2]} # Manhattan Distance or Euclidean Distance

# Search space for the forest regressor (sampled, see n_iter below).
# `max_features="auto"` was deprecated in scikit-learn 1.1 and removed in 1.3.
# For a forest regressor it meant "use all features", which is spelled 1.0.
RForestreg_param_dist = {"n_estimators": [50, 100, 150, 200],
                 "max_depth": [None, 10, 20, 30, 40],
                 "min_samples_split": [2, 5, 10],
                 "min_samples_leaf": [1, 2, 4],
                 "max_features": [1.0, "sqrt", "log2"]}

# --- Classification: grid search over KNN, 5-fold CV, scored by accuracy ---
grid_search = GridSearchCV(KNeighborsClassifier(),
                          KNeighborsclf_param_grid,
                          cv=5,
                          scoring="accuracy",
                          n_jobs=-1)
grid_search.fit(X_train_digits, y_train_digits)
# The tuned model here is KNN, so the label says so (it previously said "RF").
print("KNN Classification best params: ",    grid_search.best_params_)
tuned_clf_accuracy = accuracy_score(y_val_digits, grid_search.predict(X_val_digits))
print("Tuned Classification accuracy: ",    tuned_clf_accuracy)

# --- Regression: randomized search over the forest, 5-fold CV, scored by (negated) MAE ---
random_search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=21),
                              param_distributions=RForestreg_param_dist,
                              n_iter=20,  # number of parameter settings that are sampled
                              cv=5,
                              scoring="neg_mean_absolute_error",
                              random_state=21,
                              n_jobs=-1)
random_search.fit(X_train, y_train)
print("RF Regression best params: ",    random_search.best_params_)
tuned_reg_mae = mean_absolute_error(y_val, random_search.predict(X_val))
print("Tuned Regression mae: ",    tuned_reg_mae)

* ## As a result, the best KNeighborsClassifier score is unchanged by GridSearchCV, so tuning brought no improvement. RandomForestRegressor's best result was 17699.195, and RandomizedSearchCV made it worse.

## B: Explore a New Dataset with Random Forests (Regression or Classification)
* ### Choose a new dataset: Find a publicly available dataset suitable for either regression or classification (Kaggle, UCI Machine Learning Repository, etc.). Ensure it's manageable in size and complexity.
* ### Data Preparation: Load and prepare your chosen dataset: data cleaning, feature selection, handling missing values, train/validation split.

In [None]:
import pandas as pd
# Load the CarDekho vehicle data set used for section B and preview the first rows.
df_vehicle = pd.read_csv("/kaggle/input/vehicle-dataset-from-cardekho/car data.csv")
df_vehicle.head()

In [None]:
# Size and per-column summary of the vehicle data.
# NOTE(review): the bare `.shape` expression is not the last statement of the
# cell, so a notebook does not display it; only the info() output is shown.
df_vehicle.shape # (301, 9)
df_vehicle.info() # there is no null

In [None]:
# Summary statistics of the numeric columns.
df_vehicle.describe()

## Data Preprocessing

* ### Univariate Analysis

In [None]:
## There are no missing values.
# Replace the model year with the car's age, measured against 2025.
# pop() removes "Year" from the frame and returns it, so the subtraction and
# the column removal happen in a single statement.
df_vehicle["Age"] = 2025 - df_vehicle.pop("Year")
# Show which columns are still text-typed after the change.
df_vehicle.select_dtypes(include="object").columns

In [None]:
import matplotlib.pyplot as plt
import seaborn as sbn

# Categorical columns to inspect: every text column except the free-text
# "Car_Name", plus "Owner", which is numeric but has only a few distinct levels.
cat_cols = df_vehicle.select_dtypes(include="object").columns.tolist()
cat_cols.append("Owner")
cat_cols = [col for col in cat_cols if col != "Car_Name"]

# Draw the count plots two per figure. Stepping through the list in slices of
# two works for any number of columns (an odd count simply leaves the last
# figure with a single panel) instead of assuming there are exactly four.
for start in range(0, len(cat_cols), 2):
    plt.figure(figsize=[10, 4])
    for position, col in enumerate(cat_cols[start:start + 2], start=1):
        plt.subplot(1, 2, position)
        sbn.countplot(x=col, data=df_vehicle)
    plt.show()

In [None]:
# Numeric columns to inspect; "Owner" is excluded because it is handled as a
# categorical column in the count plots.
num_cols = df_vehicle.select_dtypes(exclude = "object").columns.tolist()
num_cols = [col for col in num_cols if col != "Owner"]

# Draw the box plots two per figure. Each axes is created once and handed to
# seaborn explicitly, and the slicing works for any number of columns (an odd
# count leaves the last figure with a single panel).
for start in range(0, len(num_cols), 2):
    fig = plt.figure(figsize=[13, 3])
    for position, col in enumerate(num_cols[start:start + 2], start=1):
        ax = fig.add_subplot(1, 2, position)
        ax.set_title(col)
        sbn.boxplot(x=col, data=df_vehicle, ax=ax)
    plt.show()

In [None]:
# Summary statistics again, now that "Year" has been replaced by "Age".
df_vehicle.describe()

In [None]:
# For every numeric column, list the rows above its 99th percentile and the
# rows below its 1st percentile.
for col in num_cols:
    values = df_vehicle[col]
    rows_above_p99 = df_vehicle[values > values.quantile(0.99)]
    rows_below_p01 = df_vehicle[values < values.quantile(0.01)]
    print(f"{col} 0.99:\n {rows_above_p99} \n")
    print(f"{col} 0.01:\n {rows_below_p01} \n")

* ### Bivariate/Multi-Variate Analysis

In [None]:
# Annotated correlation matrix of the numeric columns.
heatmap = sbn.heatmap(df_vehicle[num_cols].corr(), annot=True, cmap="RdBu")
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=90) # Rotate x-axis to 90 for vertical
heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0) # Rotate y-axis to 0 for horizontal
plt.show()

In [None]:
# Display the list of categorical columns used in the pivot tables below.
cat_cols

In [None]:
## Let's look at cat_cols' relationships with Selling_Price
# One pivot table per unordered pair of categorical columns: the earlier column
# of the pair forms the rows and the later one the columns.
for idx, row_col in enumerate(cat_cols[:-1]):
    for column_col in cat_cols[idx + 1:]:
        pair_table = df_vehicle.pivot_table(values="Selling_Price", index=row_col, columns=column_col)
        print(pair_table)
        print()

## Data Preparation

In [None]:
# Drop Car_Name as it is not useful
df_vehicle.drop("Car_Name", axis=1, inplace=True)
# One hot encoding to convert cats to nums
# (drop_first=True omits the first level of each encoded column)
df_vehicle = pd.get_dummies(data = df_vehicle, drop_first=True)
# Convert any boolean columns to 0/1 integers so that every feature is numeric.
bool_cols = df_vehicle.select_dtypes(include=['bool']).columns
df_vehicle[bool_cols] = df_vehicle[bool_cols].astype(int)
df_vehicle.head()

### Handle the outliers

In [None]:
def outlier_thresholds(df, col_name, q1=0.25, q3=0.75):
    """Return the lower and upper outlier fences for one column.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where ``Q1`` and
    ``Q3`` are the ``q1`` and ``q3`` quantiles of ``df[col_name]`` and ``IQR``
    is their difference.

    Args:
        df: DataFrame holding the column.
        col_name: Name of the numeric column to analyse.
        q1: Lower quantile, as a fraction in [0, 1].
        q3: Upper quantile, as a fraction in [0, 1].

    Returns:
        A ``(low_limit, up_limit)`` tuple.
    """
    quartile1 = df[col_name].quantile(q1)
    quartile3 = df[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(df, col_name):
    """Return True if any value of ``df[col_name]`` lies outside its fences.

    The fences come from ``outlier_thresholds`` with its default quantiles.
    """
    low_limit, up_limit = outlier_thresholds(df, col_name)
    column = df[col_name]
    # Both comparisons must be parenthesised: `|` binds tighter than `<` / `>`,
    # so `a < low | b > up` would be parsed as `(a < low | b) > up`.
    is_outlier = (column < low_limit) | (column > up_limit)
    return bool(is_outlier.any())

# Report, for each numeric column, whether check_outlier flags any value.
for col in num_cols:
    print(f"{col}: {check_outlier(df_vehicle, col)}")

* ### There is no outliers so no need to touch data and no need to apply Local Outlier Factor(LOF).

### Train-Test Split

In [None]:
# Target is the selling price; every remaining column is a predictor.
target_column = "Selling_Price"
y = df_vehicle[target_column]
X = df_vehicle.drop(columns=target_column)

from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)

# Report the shape of each part of the split.
for label, part in (
    ("x train: ", X_train),
    ("x test: ", X_test),
    ("y train: ", y_train),
    ("y test: ", y_test),
):
    print(label, part.shape)


* **Random Forest Modeling and Bagging Comparison:** Choose either RandomForestRegressor or RandomForestClassifier and train a baseline model (default hyperparameters) on your new dataset. Train a Bagging ensemble using Decision Tree as the base estimator on your new dataset. Evaluate and compare the performance of the Random Forest and Bagging Decision Tree models using appropriate metrics. Perform basic hyperparameter tuning (at least for n_estimators and max_depth) for the Random Forest on your new dataset to try to optimize performance.

In [None]:
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Baseline Random Forest with default hyperparameters.
Rforest_model = RandomForestRegressor(random_state=21)
Rforest_model.fit(X_train, y_train)
RF_predictions = Rforest_model.predict(X_test)
# RMSE as MSE ** 0.5: the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and later removed.
Rforest_Rmse = mean_squared_error(y_test, RF_predictions) ** 0.5
print(f"Random Forest RMSE: {Rforest_Rmse}")

# Bagging ensemble of decision trees. The base model is passed positionally:
# its keyword was `base_estimator` up to scikit-learn 1.1 and is `estimator`
# from 1.2 on (the old name was removed in 1.4).
DTree_model = BaggingRegressor(DecisionTreeRegressor(), random_state=21)
DTree_model.fit(X_train, y_train)
DT_predictions = DTree_model.predict(X_test)
DTree_Rmse = mean_squared_error(y_test, DT_predictions) ** 0.5
print(f"Decision Tree RMSE: {DTree_Rmse}")

* **Feature Importance Analysis:** Perform feature importance analysis for your Random Forest model on the new dataset, visualize importances, and interpret results in the context of your dataset.

In [None]:
import matplotlib.pyplot as plt

# Pair each training column with the importance the fitted forest assigned to it.
feature_importances = pd.DataFrame({"Value":Rforest_model.feature_importances_, "Feature": X_train.columns})
# Most important feature first, so it is drawn at the top of the chart.
ranked_importances = feature_importances.sort_values(by="Value", ascending=False)

plt.figure(figsize=(10, 5))
sbn.set(font_scale=1)
sbn.barplot(data=ranked_importances, x="Value", y="Feature")
plt.title("Features")
plt.tight_layout()
plt.show()

* ### Present_Price dominates the Selling_Price of the cars. Age and Kms_Driven are the next most influential features for the price of the cars.

## C: (Optional) Explore Bagging with Other Base Estimators (Expanded)


* Select either the House Price Regression task OR the Digits Classification task from this module.
* **Experiment with Bagging and Different Base Estimators (Expanded):**
    
    * Choose at least three different base estimators from scikit-learn other than Decision Trees, Linear Regression, Logistic Regression, KNN.
    * For each chosen base estimator, create a Bagging ensemble using BaggingRegressor (or BaggingClassifier). Train and evaluate these Bagging ensembles on your chosen dataset (House Prices or Digits).
    * Create at least two different Bagging ensembles with mixed base estimators. Experiment with different combinations of base estimators in your mixed ensembles (e.g., combine Decision Trees with two different new base estimators, or create a mix of three or four different types). Train and evaluate these mixed ensembles.
    * Compare the performance of all your Bagging ensembles (with different single base estimators and mixed estimators) to the baseline Bagging Decision Tree (or Bagging Logistic Regression) from Module 7 and to Random Forest.
    * Analyze and explain your findings: Which new base estimators worked well within Bagging? Did any of your mixed ensembles show improved performance? What are your overall conclusions about the versatility of Bagging and the impact of base estimator choices, especially when exploring less common base estimator types within Bagging?


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Rebuild the house-price split: X_train / y_train were overwritten by the
# vehicle data set in section B.
house_df = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
y = house_df["SalePrice"]
numeric_features = house_df.select_dtypes(include=np.number)
X = numeric_features.drop("SalePrice", axis=1)

# Split BEFORE imputing: the column means must be learned from the training rows
# only, otherwise information from the validation rows leaks into the features.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=21)
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

# Base Estimators for Bagging
# The base model is passed positionally everywhere below: its keyword was
# `base_estimator` up to scikit-learn 1.1 and is `estimator` from 1.2 on (the
# old name was removed in 1.4), while the positional slot works in every version.
RFbagg_reg = BaggingRegressor(RandomForestRegressor(), n_estimators=10, random_state=21)
RFbagg_reg.fit(X_train, y_train)

KNbagg_reg = BaggingRegressor(KNeighborsRegressor(n_neighbors=20), n_estimators=10, random_state=21)
KNbagg_reg.fit(X_train, y_train)

LGBMbagg_reg = BaggingRegressor(LGBMRegressor(verbosity=0), n_estimators=10, random_state=21)
LGBMbagg_reg.fit(X_train, y_train)

XGBbagg_reg = BaggingRegressor(XGBRegressor(), n_estimators=10, random_state=21)
XGBbagg_reg.fit(X_train, y_train)

ExtraTbagg_reg = BaggingRegressor(ExtraTreeRegressor(), n_estimators=10, random_state=21)
ExtraTbagg_reg.fit(X_train, y_train)

SVRbagg_reg = BaggingRegressor(SVR(), n_estimators=10, random_state=21)
SVRbagg_reg.fit(X_train, y_train)

# Mixed Ensembles
# NOTE(review): these are nested ensembles (a Bagging of Bagging models); each
# one still contains a single estimator type (SVR or GradientBoostingRegressor).
mixed_reg1 = BaggingRegressor(SVR(), n_estimators=5, random_state=21)
mixed_reg2 = BaggingRegressor(GradientBoostingRegressor(), n_estimators=5, random_state=21)

mixed_bagging1 = BaggingRegressor(mixed_reg1, n_estimators=5, random_state=21)
mixed_bagging1.fit(X_train, y_train)

mixed_bagging2 = BaggingRegressor(mixed_reg2, n_estimators=5, random_state=21)
mixed_bagging2.fit(X_train, y_train)

# Evaluate every ensemble on the validation set. The models are keyed by their
# variable name because they all share the class name "BaggingRegressor", which
# would make the printed lines indistinguishable.
bagging_models = {
    "RFbagg_reg": RFbagg_reg,
    "KNbagg_reg": KNbagg_reg,
    "LGBMbagg_reg": LGBMbagg_reg,
    "XGBbagg_reg": XGBbagg_reg,
    "ExtraTbagg_reg": ExtraTbagg_reg,
    "SVRbagg_reg": SVRbagg_reg,
    "mixed_bagging1": mixed_bagging1,
    "mixed_bagging2": mixed_bagging2,
}
regressors = list(bagging_models.values())
regressors_errors = {}
for model_name, reg in bagging_models.items():
    predictions = reg.predict(X_val)
    mae = mean_absolute_error(y_val, predictions)
    # RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and later removed.
    rmse = np.sqrt(mean_squared_error(y_val, predictions))
    regressors_errors[model_name] = {"MAE": mae, "RMSE": rmse}
    print(f"{model_name} MAE: {mae}")
    print(f"{model_name} RMSE: {rmse}")

## Finally, XGBbagg_reg, LGBMbagg_reg and mixed_reg2 (with GradientBoostingRegressor) outperformed RandomForestRegressor, whose best result of 17699.195 had been the best among the regressors so far.