## Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [3]:
from sklearn.datasets import load_diabetes

In [4]:
import warnings

# Blanket-suppress every warning for the rest of the session. Note that this
# also hides deprecation and numerical warnings raised by the libraries used below.
warnings.simplefilter("ignore")

In [5]:
# Load the diabetes dataset; the following cells read its `.data`,
# `.feature_names` and `.target` attributes.
data_dict = load_diabetes()

In [6]:
# Feature matrix as a DataFrame so that every column is labelled with its feature name.
x = pd.DataFrame(data_dict.data, columns=data_dict.feature_names)

In [7]:
# Regression target that the models below are fitted to predict.
y = data_dict.target

### Train test split

In [9]:
# Hold out part of the data for evaluation (default split size is kept).
# The seed is fixed so the split, and every metric computed from it, is
# identical after Restart Kernel -> Run All; 11 matches the seed used in the
# hyperparameter grid further down.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=11)

### Model Training

In [11]:
# Baseline forest with default hyperparameters. The forest is randomised, so
# it is seeded to make the fitted model and its scores repeatable across runs.
rf_model = RandomForestRegressor(random_state=11)
rf_model.fit(x_train, y_train)

### Model Evaluation

#### On test dataset

In [12]:
# Predictions of the baseline model on the held-out split; scored in the metrics cell below.
y_pred = rf_model.predict(x_test)

In [13]:
# Display the predicted values for the test split.
y_pred

array([107.54, 148.32, 117.5 , 137.54,  73.62, 128.05, 139.23, 205.71,
        70.03, 231.53, 136.  , 179.21, 111.29, 195.81, 158.76, 210.7 ,
       226.07, 183.29,  64.04, 124.95,  82.06, 125.14,  89.99, 130.63,
        70.85, 218.85,  89.17, 191.39, 211.17,  73.73, 106.84, 143.19,
       176.87,  93.64, 128.15, 202.8 ,  94.82, 206.71, 208.34, 169.78,
        80.17,  65.65, 168.51,  98.09, 237.27, 143.95,  96.93, 126.08,
       222.38, 162.02,  81.7 , 169.61, 167.06,  78.47,  92.37, 221.4 ,
       134.21,  81.66, 139.1 , 168.52, 172.63, 234.48,  71.27, 204.18,
       238.51, 182.23, 228.81,  88.16, 133.88, 167.91, 115.15, 106.24,
       244.79, 206.85, 167.9 , 139.14, 112.89, 177.13,  84.04, 215.59,
       144.01, 247.76, 121.57,  71.18,  95.71, 183.81, 114.82,  93.92,
        66.75, 291.55, 167.18, 118.88, 128.23, 124.74,  60.87, 172.65,
       197.32, 145.7 ,  74.97, 210.25, 190.51, 138.39, 103.16,  58.14,
       208.06,  83.61, 147.94, 220.71, 163.66, 229.78, 127.84])

In [14]:
# Score the baseline model on the held-out split: compute every metric first,
# then report them in the same order as before.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target
r2_score_ = r2_score(y_true=y_test, y_pred=y_pred)

print("Mean Squared Error:", mse)

print("Mean Absolute Error:", mae)

print("Root Mean Squared Error", rmse)

print("R2 Score", r2_score_)

Mean Squared Error: 3436.646999099099
Mean Absolute Error: 49.129999999999995
Root Mean Squared Error 58.62292213033311
R2 Score 0.3743864871756769


In [15]:
# The model does not perform well on the test data (R2 is only about 0.37).

#### On train dataset

In [16]:
# Predictions on the same data the model was fitted on, for comparison with the test-set scores.
y_pred_train = rf_model.predict(x_train)

In [17]:
# Score the baseline model on its own training data: compute every metric
# first, then report them in the same order as before.
mse = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
mae = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target
r2_score_ = r2_score(y_true=y_train, y_pred=y_pred_train)

print("Mean Squared Error :", mse)

print("Mean Absolute Error :", mae)

print("Root Mean Squared Error :", rmse)

print("R2 Score :", r2_score_)

Mean Squared Error : 458.1193347432024
Mean Absolute Error : 16.989123867069484
Root Mean Squared Error : 21.40372245062065
R2 Score : 0.9240287103082425


In [18]:
# The model performs well on the training data but poorly on the test data, which indicates overfitting.

### Hyperparameter Tuning 

In [19]:
# Fresh, unfitted estimator to hand to the hyperparameter search. This rebinds
# `rf_model`, so the fitted baseline forest is no longer reachable by that name.
rf_model = RandomForestRegressor()

In [22]:
# Search space for tuning the random forest.
# Candidate counts per key: 120 * 2 * 7 * 5 * 5 * 1 = 42,000 combinations in total.
param_grid = dict(
    # Number of trees in the forest: 80, 81, ..., 199.
    n_estimators=np.arange(80, 200),
    # Function used to measure the quality of a split.
    criterion=["squared_error", "absolute_error"],
    # Maximum depth of each tree: 3, 4, ..., 9.
    max_depth=np.arange(3, 10),
    # Minimum number of samples required to split an internal node: 10..14.
    min_samples_split=np.arange(10, 15),
    # Minimum number of samples required at a leaf node: 3..7.
    min_samples_leaf=np.arange(3, 8),
    # Single fixed seed so every candidate is built reproducibly.
    random_state=[11],
)

In [None]:
# `param_grid` spans 120 * 2 * 7 * 5 * 5 = 42,000 combinations; an exhaustive
# grid search with 5-fold CV would need 210,000 forest fits, which is not
# feasible to re-run. Sample a fixed number of candidates from the same space
# instead. The variable name is kept so the cells below work unchanged.
gscv_rf_model = RandomizedSearchCV(
    rf_model,
    param_grid,
    n_iter=100,       # number of sampled hyperparameter combinations
    cv=5,
    random_state=11,  # makes the sampled candidates repeatable
    n_jobs=-1,        # run the cross-validation fits on all available cores
)
gscv_rf_model.fit(x_train, y_train)
gscv_rf_model.best_estimator_

In [None]:
# Display the fitted search object itself (the previous cell already showed its best estimator).
gscv_rf_model

In [None]:
# Keep the best estimator found by the search; the cells below call `predict` on it directly.
new_rf_model = gscv_rf_model.best_estimator_

### Evaluation of New Model

#### On test dataset

In [None]:
# Test-set predictions from the tuned model. This overwrites the baseline model's `y_pred`.
y_pred = new_rf_model.predict(x_test)

In [None]:
# Test-set metrics for the tuned model. `y_pred` was computed in the previous
# cell, so it is not recomputed here (a garbled duplicate prediction line that
# also created a stray `meany_pred` variable has been removed).
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse, "\n")
rmse = np.sqrt(mse)  # RMSE is in the same units as the target
print("Root Mean Squared Error: ", rmse, "\n")
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error: ", mae, "\n")
r2 = r2_score(y_test, y_pred)
print("R2 Score: ", r2)

In [None]:
# Training-set predictions from the tuned model, for comparison with its test-set scores.
# This overwrites the baseline model's `y_pred_train`.
y_pred_train = new_rf_model.predict(x_train)

In [None]:

# Training-set metrics for the tuned model: compute everything first, then
# report in the same order as before.
mse = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target
mae = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)
r2 = r2_score(y_true=y_train, y_pred=y_pred_train)

print("Mean Squared Error: ", mse, "\n")
print("Root Mean Squared Error: ", rmse, "\n")
print("Mean Absolute Error: ", mae, "\n")
print("R2 Score: ", r2)