In [229]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import r2_score
import random

In [230]:
df = pd.read_csv("housing.csv")
df.shape

(20640, 10)

In [231]:
df.drop(columns=['ocean_proximity'], axis=1, inplace=True)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [232]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [233]:
df = df.dropna()

In [234]:
X = df.drop(columns=['median_house_value'])
y = df['median_house_value']

In [235]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Using Linear Regression for computing parameters

In [236]:
lr = LinearRegression()

lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

print(lr.intercept_)
print(lr.coef_)

r2_score(y_test, y_pred)

-3569674.526696154
[-4.25467060e+04 -4.23360430e+04  1.15509711e+03 -8.44041848e+00
  1.15830375e+02 -3.67587673e+01  4.16404344e+01  4.02932784e+04]


0.6400865688993729

## Using SDG Regression for computing parameters

In [237]:
mbgd = SGDRegressor(learning_rate='constant', eta0=0.01)

In [238]:
epochs = 100
batch_size = 128

for i in range(epochs):
    ran_batch = random.sample(range(X_train.shape[0]),batch_size)
    mbgd.partial_fit(X_train.iloc[ran_batch],y_train.iloc[ran_batch])

In [239]:
mbgd.intercept_

array([1.39847052e+11])

In [240]:
mbgd.coef_

array([-1.79708873e+13,  5.99886097e+12,  6.72693519e+12, -8.34301504e+12,
        4.52345053e+12,  3.23686418e+13,  1.34624989e+13,  1.45733440e+12])

In [241]:
r2_score(y_test, y_pred)

0.6400865688993729