##Problem Statement: Predicting the House prices in California using various socioeconomic features

In [2]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

#Here we are fetching the data from the california housing 
#If we won't use as_frame = True 
#Then we will get the Bunch object with 
#.data: a numpy array of feature, .target: a array of targets, .feature_names: list of feature names
#If we use the as_frame = True
#Then also we will get the Bunch object, but
#.data: a pandas Dataframe
#.target: a pandas series 
#.frame: Combined Dataframe of features + target

In [3]:
data = fetch_california_housing(as_frame = True)
data_woaf = fetch_california_housing()


In [9]:
    df = data.frame
    df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


#Splitting the dependent and independet Features in X and y
#Here we dropped the the target column 
#Where axis = 1 means we are drop the MedHouseVal in column manner 
#axis = 1 for row wise

In [None]:

X = df.drop("MedHouseVal", axis = 1)
y = df["MedHouseVal"]

#Here we are normalizing the numerical features into mean =0 and Standard Deviation = 1 by using the Standard Scaler
#fit is used to calculate the meand and Standard Deviation of each feature(cloumn) in X
#transform is used to scale the data by using values we got from fit, formulae if X_scaled = (X - mean)/(StandardDeviation)
#Finally, we got the X_scaled with the mean = 0 and Standard Deviation =1 for each feature

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

##We are splitting the data into train and test split

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#We are training the Model with the training data

In [12]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

#Model Evaluation

In [13]:
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
y_pred = model.predict(X_test)
#average error in original units like inputs, all errors are equally important, that means small and large errors are equally considered
print("Root Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))
#average error in original units and large errors are heavily penalized
print("Root Mean Squared Error: ", mean_squared_error(y_test, y_pred, squared=False))
#large errors are heavily penalized but not in the same units as the original data
print(" Mean Squared Error: ", mean_squared_error(y_test, y_pred))
#It measures the variance, that means how model in prefroming on the test dataset
print("R2 score: ", r2_score(y_test, y_pred))

Root Mean Absolute Error:  0.5332001304956563
Root Mean Squared Error:  0.7455813830127763
 Mean Squared Error:  0.5558915986952442
R2 score:  0.575787706032451


In [14]:
from sklearn.linear_model import RidgeCV, LassoCV
ridge_model = RidgeCV(alphas = [0.001, 0.01, 0.1, 1.0, 10.0])
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
print("Best Alpha (Ridge):", ridge_model.alpha_)
#average error in original units like inputs, all errors are equally important, that means small and large errors are equally considered
print("Root Mean Absolute Error: ", mean_absolute_error(y_test, y_pred_ridge))
#average error in original units and large errors are heavily penalized
print("Root Mean Squared Error: ", mean_squared_error(y_test, y_pred_ridge, squared=False))
#large errors are heavily penalized but not in the same units as the original data
print(" Mean Squared Error: ", mean_squared_error(y_test, y_pred_ridge))
#It measures the variance, that means how model in prefroming on the test dataset
print("R2 score: ", r2_score(y_test, y_pred_ridge))

lasso_model = LassoCV(alphas = [0.001, 0.01, 0.1, 1.0, 10.0])
lasso_model.fit(X_train, y_train)
print("Best Aplha (Lasso): ", lasso_model.alpha_)
y_pred_lasso = lasso_model.predict(X_test)
#average error in original units like inputs, all errors are equally important, that means small and large errors are equally considered
print("Root Mean Absolute Error: ", mean_absolute_error(y_test, y_pred_lasso))
#average error in original units and large errors are heavily penalized
print("Root Mean Squared Error: ", mean_squared_error(y_test, y_pred_lasso, squared=False))
#large errors are heavily penalized but not in the same units as the original data
print(" Mean Squared Error: ", mean_squared_error(y_test, y_pred_lasso))
#It measures the variance, that means how model in prefroming on the test dataset
print("R2 score: ", r2_score(y_test, y_pred_lasso))

Best Alpha (Ridge): 10.0
Root Mean Absolute Error:  0.5332432567214163
Root Mean Squared Error:  0.7450104388391259
 Mean Squared Error:  0.5550405539792669
R2 score:  0.5764371557310631
Best Aplha (Lasso):  0.001
Root Mean Absolute Error:  0.5332857888017251
Root Mean Squared Error:  0.7442405630689863
 Mean Squared Error:  0.5538940157172418
R2 score:  0.5773121026225017


##Adding the polynomial features
#include_bias = false means donot add a column of 1s to my features
#why a columns of 1's are added at first?
if X = [[2], [3], [4]] then it changes to X_poly = [[1, 2, 4], [1, 3, 9], [1, 4, 16]]
#1 is used by the model to learn the intercept(bias). The intercept is the value of y when x=0 it helps the model shift the predict lineup or down
#Linear Regressor already adds that automatically 

In [26]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [27]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 2, include_bias = False)
X_poly = poly.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size = 0.2, random_state = 42)
ridge_poly_model = RidgeCV(alphas = [0.01, 0.1, 1, 10, 20, 30, 50, 70, 100])
ridge_poly_model.fit(X_train, y_train)
y_pred_ridge_poly = ridge_poly_model.predict(X_test)
print("Best Alpha (Ridge):", ridge_poly_model.alpha_)
#average error in original units like inputs, all errors are equally important, that means small and large errors are equally considered
print("Root Mean Absolute Error: ", mean_absolute_error(y_test, y_pred_ridge_poly))
#average error in original units and large errors are heavily penalized
print("Root Mean Squared Error: ", mean_squared_error(y_test, y_pred_ridge_poly, squared=False))
#large errors are heavily penalized but not in the same units as the original data
print(" Mean Squared Error: ", mean_squared_error(y_test, y_pred_ridge_poly))
#It measures the variance, that means how model in prefroming on the test dataset
print("R2 score: ", r2_score(y_test, y_pred_ridge_poly))

Best Alpha (Ridge): 100.0
Root Mean Absolute Error:  0.4765235086008149
Root Mean Squared Error:  0.6690192335852048
 Mean Squared Error:  0.4475867349069348
R2 score:  0.6584373715847264


In [28]:
lasso_poly_model = LassoCV(alphas = [0.001,0.01, 0.1, 1, 10, 20, 30, 50, 70, 100], max_iter = 1000)
lasso_poly_model.fit(X_train, y_train)
lasso_poly_predict = lasso_poly_model.predict(X_test)
print("R2 score: ", r2_score(y_test, lasso_poly_predict))

R2 score:  0.4825038620323169


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

param_grid_knn = {
    'n_neighbors': list(range(1, 31)),
    'weights': ['uniform', 'distance']
}

knn_model = KNeighborsRegressor()
grid_knn = GridSearchCV(knn_model, param_grid_knn, cv =5, scoring = 'r2')
grid_knn.fit(X_train, y_train)

print('Best Param (KNN): ', grid_knn.best_params_)
print('Best R2 Score: ', grid_knn.best_score_)

[WinError 2] The system cannot find the file specified
  File "c:\Users\jaswe\OneDrive\Desktop\opt\Projects\mlproject\venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\jaswe\OneDrive\Desktop\opt\Projects\mlproject\venv\lib\subprocess.py", line 493, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\jaswe\OneDrive\Desktop\opt\Projects\mlproject\venv\lib\subprocess.py", line 858, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\jaswe\OneDrive\Desktop\opt\Projects\mlproject\venv\lib\subprocess.py", line 1327, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


Best Param (KNN):  {'n_neighbors': 10, 'weights': 'distance'}
Best R2 Score:  0.697594984371358


In [34]:
from sklearn.tree import DecisionTreeRegressor
param_grid_tree = {
    'max_depth':[None, 5, 10, 20],
    #only these many samples are there, then only split
    'min_samples_split':[2, 5, 10],
    #leaf node have atleast these many samples
    'min_samples_leaf': [1, 2, 4]
}
tree = DecisionTreeRegressor(random_state =42)
grid_tree = GridSearchCV(tree, param_grid_tree, cv =5, scoring = 'r2')
grid_tree.fit(X_train, y_train)

print("Best Params: ", grid_tree.best_params_)
print("Best R2 score: ", grid_tree.best_score_)

Best Params:  {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best R2 score:  0.7001265228313104


In [36]:
best_knn = grid_knn.best_estimator_
best_tree = grid_tree.best_estimator_
y_pred_knn = best_knn.predict(X_test)
y_pred_tree = best_tree.predict(X_test)
#average error in original units like inputs, all errors are equally important, that means small and large errors are equally considered
print("Root Mean Absolute Error: ", mean_absolute_error(y_test, y_pred_knn))
#average error in original units and large errors are heavily penalized
print("Root Mean Squared Error: ", mean_squared_error(y_test, y_pred_knn, squared=False))
#large errors are heavily penalized but not in the same units as the original data
print(" Mean Squared Error: ", mean_squared_error(y_test, y_pred_knn))
#It measures the variance, that means how model in prefroming on the test dataset
print("R2 score: ", r2_score(y_test, y_pred_knn))
#average error in original units like inputs, all errors are equally important, that means small and large errors are equally considered
print("Root Mean Absolute Error: ", mean_absolute_error(y_test, y_pred_tree))
#average error in original units and large errors are heavily penalized
print("Root Mean Squared Error: ", mean_squared_error(y_test, y_pred_tree, squared=False))
#large errors are heavily penalized but not in the same units as the original data
print(" Mean Squared Error: ", mean_squared_error(y_test, y_pred_tree))
#It measures the variance, that means how model in prefroming on the test dataset
print("R2 score: ", r2_score(y_test, y_pred_tree))

Root Mean Absolute Error:  0.43377592257949354
Root Mean Squared Error:  0.6432874980544813
 Mean Squared Error:  0.4138188051531942
R2 score:  0.684206372190225
Root Mean Absolute Error:  0.4201477174764044
Root Mean Squared Error:  0.6316246180077832
 Mean Squared Error:  0.3989496580734781
R2 score:  0.695553323658519
