In [None]:
from sklearn import datasets
import pandas as pd
# Fetch features and target as pandas objects, then place the target next to
# the features in a single table for exploration.
Housing_X, Housing_y = datasets.fetch_california_housing(
    as_frame=True,
    return_X_y=True,
)
data = pd.concat((Housing_X, Housing_y), axis="columns")
data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [None]:
# Grand total of missing cells: per-column counts first, then summed across columns.
data.isna().sum().sum()

0

In [None]:
# Order the rows by HouseAge (highest first) and show age next to the
# target value for the first ten of them.
oldest_houses = data.sort_values("HouseAge", ascending=False)
oldest_houses.loc[:, ["HouseAge", "MedHouseVal"]].head(10)

Unnamed: 0,HouseAge,MedHouseVal
18881,52.0,1.094
1671,52.0,1.557
19116,52.0,2.008
19516,52.0,0.75
16200,52.0,0.546
7848,52.0,3.5
19110,52.0,2.309
10030,52.0,1.097
16204,52.0,0.425
5670,52.0,3.215


In [None]:
# Order the rows by Population (highest first) and show population next to
# the target value for the first ten of them.
populated_houses = data.sort_values("Population", ascending=False)
populated_houses.loc[:, ["Population", "MedHouseVal"]].head(10)

Unnamed: 0,Population,MedHouseVal
15360,35682.0,1.344
9880,28566.0,1.188
13139,16305.0,1.537
10309,16122.0,3.663
6057,15507.0,2.539
6066,15037.0,3.397
12215,13251.0,2.123
9019,12873.0,3.992
17413,12427.0,0.283
922,12203.0,4.511


In [None]:
# Pairwise correlation of every column with every other column
# (pandas uses the Pearson coefficient when no method is given).
data.corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
MedHouseVal,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0


# **TRY WITH SIMPLE MODELS**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Rebind the features/target to the non-DataFrame form of the dataset for the
# modelling cells (as_frame=False is the loader's default, spelled out here).
Housing_X, Housing_y = datasets.fetch_california_housing(
    return_X_y=True,
    as_frame=False,
)

In [None]:
# Baseline comparison: score each untuned regressor on the unscaled features
# using the same shuffled 10-fold split, so the R^2 values are comparable.
models = [
    ('LR', LinearRegression()),
    ('RIDGE', Ridge()),
    ('LASSO', Lasso()),
    ('EN', ElasticNet()),
    ('SVR', SVR()),
    ('KNN', KNeighborsRegressor()),
    # Seeded so the tree (and therefore its score) is the same on every run.
    ('CART', DecisionTreeRegressor(random_state=7)),
]

# The splitter and the metric do not depend on the model: build them once
# instead of once per loop iteration.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
scoring = 'r2'

results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, Housing_X, Housing_y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # (label, mean R^2 across folds, standard deviation across folds)
    msg = (name, cv_results.mean(), cv_results.std())
    print(msg)

('LR', 0.604493934779471, 0.01247893342338136)
('RIDGE', 0.6044982748455532, 0.012478053264676991)
('LASSO', 0.2848390552386991, 0.00819659490141511)
('EN', 0.4228401393464143, 0.013791394466229673)
('SVR', -0.021019330589308938, 0.010024080702801634)
('KNN', 0.16355925522898246, 0.02069085112470392)
('CART', 0.6156515749238767, 0.026590020388625693)


In [None]:


# Same baseline regressors, each preceded by a StandardScaler inside a
# Pipeline so the scaler is refitted on the training part of every fold.
# Each entry: (report label, pipeline step name, estimator).
scaled_specs = [
    ("ScaledLR", "LR", LinearRegression()),
    ("ScaledLASSO", "Lasso", Lasso()),
    ("ScaledEN", "EN", ElasticNet()),
    ("ScaledRIDGE", "RIDGE", Ridge()),
    ("ScaledKNN", "KNN", KNeighborsRegressor()),
    # Seeded so the tree (and therefore its score) is the same on every run.
    ("ScaledD3", "D3", DecisionTreeRegressor(random_state=7)),
    ("ScaledSVR", "SVR", SVR()),
]
pipelines = [
    (label, Pipeline([("Scaler", StandardScaler()), (step, estimator)]))
    for label, step, estimator in scaled_specs
]

# The splitter and the metric do not depend on the model: build them once
# instead of once per loop iteration.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
scoring = 'r2'

results = []
names = []
for name, model in pipelines:
    cv_results = cross_val_score(model, Housing_X, Housing_y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # (label, mean R^2 across folds, standard deviation across folds)
    msg = (name, cv_results.mean(), cv_results.std())
    print(msg)

('ScaledLR', 0.6044939347794716, 0.012478933423379449)
('ScaledLASSO', -0.00047136241715406424, 0.0005976217114028723)
('ScaledEN', 0.20457815431769816, 0.007115910732044893)
('ScaledRIDGE', 0.6044952164060867, 0.012480933095583454)
('ScaledKNN', 0.6927181476148452, 0.010938325042862879)
('ScaledD3', 0.6173267448557348, 0.02605221387966025)
('ScaledSVR', 0.7415094435764565, 0.01438830452460412)


**SVR TUNING**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Grid-search the RBF-kernel SVR.
# The scaler is a pipeline step rather than a one-off transform of the whole
# matrix: inside GridSearchCV the pipeline is refitted per fold, so the
# scaling statistics come from that fold's training rows only and the
# validation rows cannot leak into them.
model = Pipeline([("Scaler", StandardScaler()), ("SVR", SVR())])
# Parameter names carry the "SVR__" prefix so they are routed to the
# pipeline step named "SVR".
grid_space = {
    'SVR__kernel': ['rbf'],  # Reduced search space
    'SVR__C': [0.1, 1, 10],  # Increased step size
    'SVR__gamma': [1, 0.1, 0.01]
}
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
grid = GridSearchCV(model, param_grid=grid_space, cv=kfold, scoring='r2')
# NOTE(review): the search runs on the full dataset, which includes the rows
# that are later held out as the test split; consider splitting first and
# searching on the training part only.
model_grid = grid.fit(Housing_X, Housing_y)

print('Best hyperparameters are: ' + str(model_grid.best_params_))
print('Best score is: ' + str(model_grid.best_score_))

Best hyperparameters are: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Best score is: 0.7668273507594414


In [None]:
from sklearn.model_selection import train_test_split
# Hold out a fifth of the rows for the final evaluations; the fixed seed
# makes the split repeatable.
seed = 7
test_size = 0.2
X_train, X_test, Y_train, Y_test = train_test_split(
    Housing_X,
    Housing_y,
    random_state=seed,
    test_size=test_size,
)

In [None]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression
# Fit the scaled SVR with the selected hyperparameters on the training split
# and report error metrics on the held-out split.
model = Pipeline([("Scaler", StandardScaler()), ("SVR", SVR(kernel='rbf', gamma=1, C=10))])
model.fit(X_train, Y_train)

y_pred = model.predict(X_test)

# RMSE is derived from the MSE instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
mse = metrics.mean_squared_error(Y_test, y_pred)
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, y_pred))
print('Root Mean Squared Error:', mse ** 0.5)
print("R2: ", metrics.r2_score(Y_test, y_pred))

Mean Squared Error: 0.3182325213114278
Mean Absolute Error: 0.3749300180262569
Root Mean Squared Error: 0.5641210165482472
R2:  0.7633987961557187


# **ENSEMBLE MODELS**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Ensemble regressors, each behind a StandardScaler in a Pipeline.
# Every estimator is seeded: these models involve randomness (bootstrapping,
# random splits, random validation subsets, ...), so unseeded scores change
# from run to run.
# Each entry: (report label, pipeline step name, estimator).
ensemble_specs = [
    ("ScaledRF", "RF", RandomForestRegressor(random_state=7)),
    ("ScaleGBM", "GBM", GradientBoostingRegressor(random_state=7)),
    ("ScaledET", "ET", ExtraTreesRegressor(random_state=7)),
    ("ScaledAB", "AB", AdaBoostRegressor(random_state=7)),
    # Step name used to be "AB" (copy-paste from the AdaBoost line).
    ("ScaledHGBM", "HGBM", HistGradientBoostingRegressor(random_state=7)),
    ("ScaledXGB", "XGB", XGBRegressor(random_state=7)),
]
ensembles = [
    (label, Pipeline([("Scaler", StandardScaler()), (step, estimator)]))
    for label, step, estimator in ensemble_specs
]

# The splitter and the metric do not depend on the model: build them once
# instead of once per loop iteration.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
scoring = 'r2'

# Evaluate each model
results = []
names = []
for name, model in ensembles:
    cv_results = cross_val_score(model, Housing_X, Housing_y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # (label, mean R^2 across folds, standard deviation across folds)
    msg = (name, cv_results.mean(), cv_results.std())
    print(msg)

('ScaledRF', 0.8133755810601653, 0.009904611083109967)
('ScaleGBM', 0.7885013171970441, 0.009820426838787734)
('ScaledET', 0.8175807205128175, 0.008907654202449234)
('ScaledAB', 0.4203692864302794, 0.06479126420939123)
('ScaledHGBM', 0.8377788660617942, 0.008379352808907796)
('ScaledXGB', 0.8405427523322564, 0.0073277694222337614)


**TUNING XGBREGRESSOR**

In [None]:
# Fetch features and target again in their non-DataFrame form before tuning
# (as_frame=False is the loader's default, spelled out here).
Housing_X, Housing_y = datasets.fetch_california_housing(
    return_X_y=True,
    as_frame=False,
)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
# Grid-search the XGBoost regressor.
# The scaler is a pipeline step rather than a one-off transform of the whole
# matrix, so it is refitted on the training rows of every CV fold. This is
# the same Pipeline layout the evaluation cells use.
model = Pipeline([("Scaler", StandardScaler()), ("XGB", XGBRegressor())])
# Parameter names carry the "XGB__" prefix so they are routed to the
# pipeline step named "XGB".
grid_space = {
    "XGB__learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "XGB__max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "XGB__gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
}

kfold = KFold(n_splits=10, random_state=7, shuffle=True)
grid = GridSearchCV(model, param_grid=grid_space, cv=kfold, scoring='r2')
# NOTE(review): the search runs on the full dataset, which includes the rows
# held out as X_test; consider searching on X_train / Y_train only so the
# hold-out metrics stay unbiased.
model_grid = grid.fit(Housing_X, Housing_y)

print('Best hyperparameters are: ' + str(model_grid.best_params_))
print('Best score is: ' + str(model_grid.best_score_))

Best hyperparameters are: {'gamma': 0.0, 'learning_rate': 0.15, 'max_depth': 8}
Best score is: 0.8420707221650566


In [None]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression
# Fit the scaled XGBoost regressor with the selected hyperparameters on the
# training split and report error metrics on the held-out split.
# The pipeline step is labelled "XGB" (it used to be "SVR", a leftover from
# the SVR cell).
model = Pipeline([("Scaler", StandardScaler()), ("XGB", XGBRegressor(max_depth=8, learning_rate=0.15, gamma=0.0))])
model.fit(X_train, Y_train)

y_pred = model.predict(X_test)

# RMSE is derived from the MSE instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
mse = metrics.mean_squared_error(Y_test, y_pred)
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, y_pred))
print('Root Mean Squared Error:', mse ** 0.5)
print("R2: ", metrics.r2_score(Y_test, y_pred))

Mean Squared Error: 0.22351470488572092
Mean Absolute Error: 0.3071936672304143
Root Mean Squared Error: 0.472773418125132
R2:  0.8338201009911624


**HISTGRADIENTBOOSTING TUNING**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
# Grid-search the histogram-based gradient boosting regressor.
# The scaler is a pipeline step rather than a one-off transform of the whole
# matrix, so it is refitted on the training rows of every CV fold.
# The estimator is seeded: when early stopping is active it holds out a
# random validation subset, which would otherwise make scores vary per run.
model = Pipeline([
    ("Scaler", StandardScaler()),
    ("HistGradient", HistGradientBoostingRegressor(random_state=7)),
])
# Parameter names carry the "HistGradient__" prefix so they are routed to the
# pipeline step of that name.
grid_space = {
    'HistGradient__loss': ['squared_error', 'absolute_error'],
    'HistGradient__learning_rate': [0.001, 0.01, 0.1, 0.5],
    'HistGradient__max_depth': [None, 3, 5, 7, 10],
    'HistGradient__min_samples_leaf': [20, 15, 25]
}

kfold = KFold(n_splits=10, random_state=7, shuffle=True)
grid = GridSearchCV(model, param_grid=grid_space, cv=kfold, scoring='r2')
# NOTE(review): the search runs on the full dataset, which includes the rows
# held out as X_test; consider searching on X_train / Y_train only so the
# hold-out metrics stay unbiased.
model_grid = grid.fit(Housing_X, Housing_y)

print('Best hyperparameters are: ' + str(model_grid.best_params_))
print('Best score is: ' + str(model_grid.best_score_))

Best hyperparameters are: {'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 10, 'min_samples_leaf': 20}
Best score is: 0.8385283583842578


In [None]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression
# Fit the scaled histogram gradient boosting regressor with the selected
# hyperparameters on the training split and report hold-out error metrics.
# random_state pins the random validation subset used when early stopping is
# active, so the reported numbers are repeatable.
model = Pipeline([
    ("Scaler", StandardScaler()),
    ("HistGradient", HistGradientBoostingRegressor(
        learning_rate=0.1,
        loss="squared_error",
        max_depth=10,
        min_samples_leaf=20,
        random_state=7,
    )),
])
model.fit(X_train, Y_train)

y_pred = model.predict(X_test)

# RMSE is derived from the MSE instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
mse = metrics.mean_squared_error(Y_test, y_pred)
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, y_pred))
print('Root Mean Squared Error:', mse ** 0.5)
print("R2: ", metrics.r2_score(Y_test, y_pred))

Mean Squared Error: 0.231109029215324
Mean Absolute Error: 0.32050769377774085
Root Mean Squared Error: 0.4807380047544858
R2:  0.8281738324345632


# **CONCLUSION**

In [None]:
from sklearn import metrics
from sklearn.ensemble import VotingRegressor

# Final model: average the predictions of the two tuned boosters.
XGB = XGBRegressor(max_depth=8, learning_rate=0.15, gamma=0.0)
# random_state pins the random validation subset used when early stopping is
# active, so the reported numbers are repeatable.
GB = HistGradientBoostingRegressor(
    learning_rate=0.1,
    loss="squared_error",
    max_depth=10,
    min_samples_leaf=20,
    random_state=7,
)
# Fit the models
clfvt = VotingRegressor(estimators=[('XGB', XGB), ('gb', GB)])
clfvt.fit(X_train, Y_train)
y_pred = clfvt.predict(X_test)

# RMSE is derived from the MSE instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
mse = metrics.mean_squared_error(Y_test, y_pred)
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, y_pred))
print('Root Mean Squared Error:', mse ** 0.5)
print("R2: ", metrics.r2_score(Y_test, y_pred))

Mean Squared Error: 0.21804938015636394
Mean Absolute Error: 0.30670914000045024
Root Mean Squared Error: 0.46695757854045367
R2:  0.8378834896261044
