In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Generate a synthetic regression problem (1000 rows, 5 features, noise=20)
# with a fixed seed so the notebook is reproducible.
X, y = make_regression(
    n_samples=1000,
    n_features=5,
    noise=20,
    random_state=42,
)

# Wrap the feature matrix in a DataFrame with Feature_<i> column names and
# append the target as the last column.
feature_names = [f"Feature_{idx}" for idx in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names).assign(Target=y)

In [4]:
# Separate the predictors from the "Target" column.
y = df["Target"]
X = df.drop(columns=["Target"])

# Two-stage split: first hold out 20% of the rows as the test set, then
# carve 30% of the remainder off as the validation set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=40
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.3, random_state=36
)

# Report the shape of each partition.
for label, part in (
    ("Train size:", X_train),
    ("Validation size:", X_val),
    ("Test size:", X_test),
):
    print(label, part.shape)

Train size: (560, 5)
Validation size: (240, 5)
Test size: (200, 5)


In [22]:
# Hyperparameter grid explored for the decision tree.
hyperparams = {
    "max_depth": [3, 5, 7, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# --- Linear regression baseline ---
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# --- Decision tree: default hyperparameters and a grid-searched variant ---
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg_gs = GridSearchCV(
    estimator=tree_reg,
    param_grid=hyperparams,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
tree_reg_gs.fit(X_train, y_train)
tree_reg.fit(X_train, y_train)

# --- Random forest with default hyperparameters ---
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train, y_train)

# Predictions of every fitted model on the validation split.
lin_reg_pred = lin_reg.predict(X_val)
tree_reg_pred = tree_reg.predict(X_val)
tree_reg_gs_pred = tree_reg_gs.predict(X_val)
rf_reg_pred = rf_reg.predict(X_val)

In [25]:
# Validation RMSE for each model.
# scikit-learn metrics are documented as metric(y_true, y_pred); passing the
# ground truth first keeps these calls correct if sample_weight or
# multioutput options are added later.
lr_rmse = root_mean_squared_error(y_val, lin_reg_pred)
tree_rmse_gs = root_mean_squared_error(y_val, tree_reg_gs_pred)
tree_rmse = root_mean_squared_error(y_val, tree_reg_pred)
rf_rmse = root_mean_squared_error(y_val, rf_reg_pred)

print("RMSE LR:", lr_rmse, "\nRMSE Tree gs:", tree_rmse_gs, "\nRMSE Tree:", tree_rmse, "\nRMSE RandomForesst:", rf_rmse)
# Importances are printed in column order (Feature_0 ... Feature_4).
print("DecisionTree Feature Importances", tree_reg.feature_importances_)
print("RandomForest Feature Improtances", rf_reg.feature_importances_)

RMSE LR: 20.725243967021466 
RMSE Tree gs: 41.18026776055873 
RMSE Tree: 43.944898665502464 
RMSE RandomForesst: 28.093936710524464
DecisionTree Feature Importances [0.19002398 0.47910195 0.07443651 0.13214341 0.12429415]
RandomForest Feature Improtances [0.18471483 0.49886951 0.07603945 0.13168255 0.10869366]


In [42]:
# Positional indices of the columns kept for the reduced models
# (presumably the three largest feature importances printed above).
best_features = [0, 1, 3]
X_best = X.iloc[:, best_features]

# Reuse the existing train/validation/test partitions and only narrow the
# columns. This guarantees the same rows as the full-feature models and
# avoids re-running train_test_split with copied seeds, which would also
# rebind y_train_val / y_test / y_train / y_val as a hidden side effect.
X_train_val_best = X_train_val.iloc[:, best_features]
X_test_best = X_test.iloc[:, best_features]
X_train_best = X_train.iloc[:, best_features]
X_val_best = X_val.iloc[:, best_features]

# Refit a linear model and a random forest on the reduced feature set.
lr = LinearRegression()
ran_reg = RandomForestRegressor(random_state=42)
lr.fit(X_train_best, y_train)
ran_reg.fit(X_train_best, y_train)

y_pred_ran = ran_reg.predict(X_val_best)
y_pred_lr = lr.predict(X_val_best)

# Metrics take (y_true, y_pred).
# NOTE(review): lr_rmse rebinds the all-features value computed in the
# earlier metrics cell; kept for compatibility with any later cells.
lr_rmse = root_mean_squared_error(y_val, y_pred_lr)
ran_reg_rmse = root_mean_squared_error(y_val, y_pred_ran)

print("RMSE Linear Regression, best_features:", lr_rmse)
print("RMSE RandomForestRegressor, best feature:", ran_reg_rmse)

RMSE Linear Regression, best_features: 33.2699935647293
RMSE RandomForestRegressor, best feature: 35.796247754565975
