In [34]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
import math

In [2]:
df = pd.read_csv("housing.csv")

In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [8]:
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

## imputing the null values

In [13]:
imputer = SimpleImputer()
imputer.set_output(transform="pandas")
df2 =  imputer.fit_transform(df.drop(columns='ocean_proximity'))

In [15]:
df2.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [16]:
df2.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

In [18]:
data_x = df2.drop(columns="median_house_value")
data_y = df2["median_house_value"]

In [19]:
scaler = StandardScaler()
scaler.set_output(transform="pandas")
data_x = scaler.fit_transform(data_x)

In [24]:
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.2)

In [23]:
sgd_reg = SGDRegressor()

In [25]:
sgd_reg.fit(train_x, train_y)

In [26]:
def regSummary(y_true, y_pred):
    return{
        "r2_score": r2_score(y_true, y_pred),
        "mae": mean_absolute_error(y_true, y_pred),
        "mse": mean_squared_error(y_true, y_pred),
        "rmse": math.sqrt(mean_squared_error(y_true, y_pred))
    }

In [27]:
y_pred = sgd_reg.predict(test_x)
regSummary(test_y, y_pred)

{'r2_score': 0.6167365394721278,
 'mae': 50931.88537292954,
 'mse': 4916563779.218932,
 'rmse': 70118.21289236436}

In [33]:
# tuning the max_iter, n_iter_no_change, eta0, power_t
sgd_reg = SGDRegressor(early_stopping=True, eta0= 0.1, max_iter=5000, n_iter_no_change=30)
sgd_reg.fit(train_x, train_y)
y_pred = sgd_reg.predict(test_x)
regSummary(test_y, y_pred)

{'r2_score': 0.6123094233695723,
 'mae': 51100.437282019695,
 'mse': 4973355518.891278,
 'rmse': 70522.0215173337}

In [36]:
test_result =  cross_val_score(sgd_reg, train_x, train_y, scoring="neg_mean_squared_error", 
                cv=10, verbose=3)

[CV] END ...................... score: (test=-5123830284.719) total time=   0.0s
[CV] END ...................... score: (test=-4319810465.030) total time=   0.0s
[CV] END ...................... score: (test=-5116085955.611) total time=   0.0s
[CV] END ...................... score: (test=-5329909332.540) total time=   0.0s
[CV] END ...................... score: (test=-4969279201.619) total time=   0.0s
[CV] END ...................... score: (test=-4840245567.049) total time=   0.0s
[CV] END ...................... score: (test=-4888346836.895) total time=   0.0s
[CV] END ...................... score: (test=-5458693646.608) total time=   0.0s
[CV] END ...................... score: (test=-8629926224.822) total time=   0.0s
[CV] END ...................... score: (test=-5345703231.694) total time=   0.0s


In [41]:
math.sqrt((test_result*-1).mean())

73499.54472415855

In [43]:

param_grid = {
    "eta0": [0.1, 0.3, 0.5, 0.8], 
    "max_iter":[1000,2000,3000,4000,5000], 
    "n_iter_no_change":[10,20,30]
}

grid_search = GridSearchCV(sgd_reg, param_grid, cv=10, n_jobs=None)

In [46]:
%%time
grid_search = GridSearchCV(sgd_reg, param_grid, cv=5, n_jobs=None, verbose=3)
grid_search.fit(train_x, train_y)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV 1/5] END eta0=0.1, max_iter=1000, n_iter_no_change=10;, score=0.589 total time=   0.0s
[CV 2/5] END eta0=0.1, max_iter=1000, n_iter_no_change=10;, score=0.611 total time=   0.0s
[CV 3/5] END eta0=0.1, max_iter=1000, n_iter_no_change=10;, score=0.644 total time=   0.0s
[CV 4/5] END eta0=0.1, max_iter=1000, n_iter_no_change=10;, score=0.625 total time=   0.0s
[CV 5/5] END eta0=0.1, max_iter=1000, n_iter_no_change=10;, score=0.613 total time=   0.0s
[CV 1/5] END eta0=0.1, max_iter=1000, n_iter_no_change=20;, score=0.645 total time=   0.0s
[CV 2/5] END eta0=0.1, max_iter=1000, n_iter_no_change=20;, score=0.607 total time=   0.0s
[CV 3/5] END eta0=0.1, max_iter=1000, n_iter_no_change=20;, score=0.646 total time=   0.0s
[CV 4/5] END eta0=0.1, max_iter=1000, n_iter_no_change=20;, score=0.621 total time=   0.0s
[CV 5/5] END eta0=0.1, max_iter=1000, n_iter_no_change=20;, score=0.622 total time=   0.0s
[CV 1/5] END eta0=0.1, max_i

In [47]:
grid_search.best_estimator_

In [48]:
grid_search.best_score_

0.6336372905610401

In [49]:
y_pred = grid_search.best_estimator_.predict(test_x)
regSummary(test_y, y_pred)

{'r2_score': 0.6191120050019236,
 'mae': 50701.03485919641,
 'mse': 4886090934.856228,
 'rmse': 69900.57893076586}

In [51]:
np.linspace(0.01, 0.1, 50)

array([0.01      , 0.01183673, 0.01367347, 0.0155102 , 0.01734694,
       0.01918367, 0.02102041, 0.02285714, 0.02469388, 0.02653061,
       0.02836735, 0.03020408, 0.03204082, 0.03387755, 0.03571429,
       0.03755102, 0.03938776, 0.04122449, 0.04306122, 0.04489796,
       0.04673469, 0.04857143, 0.05040816, 0.0522449 , 0.05408163,
       0.05591837, 0.0577551 , 0.05959184, 0.06142857, 0.06326531,
       0.06510204, 0.06693878, 0.06877551, 0.07061224, 0.07244898,
       0.07428571, 0.07612245, 0.07795918, 0.07979592, 0.08163265,
       0.08346939, 0.08530612, 0.08714286, 0.08897959, 0.09081633,
       0.09265306, 0.0944898 , 0.09632653, 0.09816327, 0.1       ])