In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import randint



Reading data, Extracting Features and Target

In [6]:
# Load the California Housing dataset
california = fetch_california_housing()
X = california.data
Y = california.target

Train Test Split

In [7]:
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Linear Regression Model

In [15]:
regressor  = LinearRegression()
regressor.fit(X_train, Y_train)

In [16]:
Y_pred = regressor.predict(X_test)
mse_LR = mean_squared_error(Y_test, Y_pred)
rmse_LR = mean_squared_error(Y_test, Y_pred, squared=False)
print("", mse_LR, "\n",rmse_LR)

 0.555891598695242 
 0.7455813830127748


### DesisionTree  Regressor Model

In [18]:
regressor  = DecisionTreeRegressor()
regressor.fit(X_train, Y_train)

In [19]:
Y_pred = regressor.predict(X_test)
mse_DT = mean_squared_error(Y_test, Y_pred)
rmse_DT = mean_squared_error(Y_test, Y_pred, squared=False)
print("", mse_DT, "\n",rmse_DT)

 0.5002938180999758 
 0.7073145114445029


### Random Forest Regresser Model

In [20]:
model = RandomForestRegressor(n_estimators=50, random_state=42)
model.fit(X_train, Y_train)

In [21]:
Y_pred = model.predict(X_test)
mse_RF = mean_squared_error(Y_test, Y_pred)
rmse_RF = mean_squared_error(Y_test, Y_pred, squared=False)

print("", mse_RF, "\n",rmse_RF)

 0.2572979293772426 
 0.5072454330767726


### HyperParaMeter Tunning

In [35]:
param_grid = {
    'max_depth': [2, 5, 6],
    'min_samples_split': [3, 2, 5],
    'min_samples_leaf': [3, 1, 2, 4],
    'max_features': [1.0, 'sqrt']
}

### Decision Tree

In [29]:
regressor = DecisionTreeRegressor()

In [30]:
grid_search = GridSearchCV(regressor, param_grid, cv=5)
grid_search.fit(X_train, Y_train)

In [31]:
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'max_depth': 10, 'max_features': 1.0, 'min_samples_leaf': 4, 'min_samples_split': 5}


In [32]:
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
mse_HPT_DT = mean_squared_error(Y_test,Y_pred)
mse_HPT_DT

0.40899869913992737

Random Search

In [78]:
param_dist_RS = {
    'max_depth': [2, 8, 10, 12],
    'min_samples_split': randint(1, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': [1.0, 'sqrt']
}

In [79]:
model = DecisionTreeRegressor()
random_search = RandomizedSearchCV(model, param_distributions=param_dist_RS, n_iter=10, cv=5)

random_search.fit(X_train, Y_train)

In [80]:
print("Best Hyperparameters:", random_search.best_params_)

Best Hyperparameters: {'max_depth': 12, 'max_features': 1.0, 'min_samples_leaf': 10, 'min_samples_split': 10}


In [81]:
best_model = random_search.best_estimator_
Y_pred = best_model.predict(X_test)
mse_HPT_DT_RS = mean_squared_error(Y_test,Y_pred)
mse_HPT_DT_RS

0.3625873425317261

### Random Forest

In [36]:
regressor = RandomForestRegressor(n_estimators = 10)
grid_search = GridSearchCV(regressor, param_grid, cv=5)
grid_search.fit(X_train, Y_train)

In [38]:
print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'max_depth': 6, 'max_features': 1.0, 'min_samples_leaf': 2, 'min_samples_split': 3}


In [40]:
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
mse_HPT_RF = mean_squared_error(Y_test,Y_pred)
mse_HPT_RF

0.42274634926103777

random Search

In [67]:
model = RandomForestRegressor(n_estimators = 20)
random_search = RandomizedSearchCV(model, param_distributions=param_dist_RS, n_iter=10, cv=5)

random_search.fit(X_train, Y_train)

In [68]:
print("Best Hyperparameters:", random_search.best_params_)

Best Hyperparameters: {'max_depth': 12, 'max_features': 1.0, 'min_samples_leaf': 1, 'min_samples_split': 17}


In [70]:
best_model = random_search.best_estimator_
Y_pred = best_model.predict(X_test)
mse_HPT_RF_RS = mean_squared_error(Y_test,Y_pred)
mse_HPT_RF_RS

0.2843673232358438

### Table for showing evaluation Results for Model

In [82]:
data = {"Mean_Squared_Error": ["Linear Regression","DecisionTreeRegressor", "DecsionTreeRegressor_RMSE", "RandomForestRegressor", "DecisionTreeRegressor_HPT_GS", "DecisionTreeRegressor_HPT_RS", "RandomForestRegressor_HPT_GS", "RandomForestRegressor_HPT_RS" ],
        "Values": [mse_LR, mse_DT, rmse_DT, mse_RF, mse_HPT_DT, mse_HPT_DT_RS, mse_HPT_RF, mse_HPT_RF_RS]}


result = pd.DataFrame(data)
result

Unnamed: 0,Mean_Squared_Error,Values
0,Linear Regression,0.555892
1,DecisionTreeRegressor,0.500294
2,DecsionTreeRegressor_RMSE,0.707315
3,RandomForestRegressor,0.257298
4,DecisionTreeRegressor_HPT_GS,0.408999
5,DecisionTreeRegressor_HPT_RS,0.362587
6,RandomForestRegressor_HPT_GS,0.422746
7,RandomForestRegressor_HPT_RS,0.284367


### Conclusion

As, best performer is Random Forest without HPT. Less the MeanSquareError, more the model will be efficient