# Linear Regression

In [1]:
import os
import tarfile
import requests

# Remote gzipped tarball that holds the housing data set.
URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz"
# Local directory the archive is downloaded to and extracted into.
DATASET_PATH = "datasets/housing"
# Local file name for the downloaded archive (extension fixed: it is a .tar.gz, not ".tag.gz").
OUTPUT_FILE = os.path.join(DATASET_PATH, "housing.tar.gz")

In [2]:
def fetch_dataset(url, dataset_path, output_file):
    """Download a gzipped tar archive and extract it into ``dataset_path``.

    Args:
        url: HTTP(S) location of the ``.tar.gz`` / ``.tgz`` archive.
        dataset_path: Directory to extract into; created if it does not exist.
        output_file: Local path the downloaded archive is written to.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            (Previously a failed download was silently ignored, which only
            surfaced later as a missing CSV file.)
        tarfile.TarError: If the downloaded file is not a valid archive.
    """
    os.makedirs(dataset_path, exist_ok=True)

    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(output_file, "wb") as f:
            # iter_content honours Content-Encoding (unlike r.raw.read()) and
            # streams the body in chunks instead of buffering it in memory.
            for chunk in r.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

    # The archive comes from the network, so use the "data" extraction filter
    # when this Python version provides it (blocks path traversal, absolute
    # paths, device files, ...). Older interpreters fall back to plain extractall.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    with tarfile.open(output_file, "r:gz") as archive:
        archive.extractall(path=dataset_path, **extract_kwargs)

In [3]:
# Download the archive and unpack it into DATASET_PATH.
fetch_dataset(url=URL, dataset_path=DATASET_PATH, output_file=OUTPUT_FILE)

## Load data

In [4]:
import pandas as pd

def load_data(path):
    """Read ``housing.csv`` from the directory ``path`` into a DataFrame.

    Args:
        path: Directory that contains ``housing.csv``.

    Returns:
        pandas.DataFrame with the contents of the CSV file.
    """
    # Use the argument rather than the global DATASET_PATH so the function
    # actually loads from the directory the caller asked for.
    return pd.read_csv(os.path.join(path, "housing.csv"))

# Read the extracted CSV from the dataset directory and preview the first rows.
data = load_data(path=DATASET_PATH)
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


### Data cleaning

In [6]:
# Impute missing total_bedrooms values with the column median.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: that chained in-place form is deprecated and does not
# modify `data` when pandas Copy-on-Write is active.
data['total_bedrooms'] = data['total_bedrooms'].fillna(data['total_bedrooms'].median())

# One-hot encode the categorical ocean_proximity column. dtype=int keeps the
# indicators numeric 0/1 (pandas >= 2.0 would otherwise produce booleans).
data = pd.get_dummies(data, columns=['ocean_proximity'], dtype=int)
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


In [39]:
import numpy as np
# Seed NumPy's global RNG; LinearRegression.fit draws its initial weights from it via np.random.randn.
np.random.seed(39)

def split_data(data, test_radio=0.2):
    """Min-max scale the features and split rows into test and train parts.

    The rows are NOT shuffled: the first ``int(len(data) * test_radio)`` rows
    become the test set and the remaining rows the training set. Scaling
    statistics (min/max) are computed over all rows before splitting.

    Args:
        data: DataFrame with numeric (or boolean) feature columns plus a
            ``median_house_value`` label column. It is not modified.
        test_radio: Fraction of rows placed in the test set. (The name is a
            misspelling of "ratio"; it is kept for backward compatibility.)

    Returns:
        Tuple ``(test_x, train_x, test_y, train_y)`` where the ``*_x`` items
        are DataFrames of features scaled to [0, 1] and the ``*_y`` items are
        NumPy arrays of unscaled labels.
    """
    test_len = int(data.shape[0] * test_radio)
    labels = np.array(data['median_house_value'])

    # Scale only the features; the label column is returned unscaled, so there
    # is no point normalising it first. Casting to float lets boolean dummy
    # columns take part in the arithmetic.
    features = data.drop(columns='median_house_value').astype(float)
    col_min = features.min()
    col_range = features.max() - col_min
    # A constant column has zero range; divide by 1 so it maps to 0 instead of NaN.
    col_range = col_range.where(col_range != 0, 1.0)
    features = (features - col_min) / col_range

    return features.iloc[:test_len], features.iloc[test_len:], labels[:test_len], labels[test_len:]

# Hold out the leading 20% of rows (the split_data default) as the test set,
# then report the shapes of the training and test feature frames.
test_x, train_x, test_y, train_y = split_data(data)
print(train_x.shape, test_x.shape, sep="\n")

(16512, 13)
(4128, 13)


In [91]:
class LinearRegression:
    """Linear regression trained with full-batch gradient descent.

    After ``fit`` the learned parameters are stored in ``theta``; ``theta[0]``
    multiplies the constant bias column and ``theta[1:]`` the input features.
    """

    def __init__(self):
        # Was misspelled ``__init``, so it was never run as the constructor.
        self.theta = None          # parameter vector, set by fit()
        self.num_features = None   # number of parameters including the bias
        self.m = None              # number of training samples seen by fit()

    def fit(self, X, Y, eta=0.5, iterations=5001):
        """Fit the parameters by gradient descent on the squared error.

        Args:
            X: 2-D array-like of shape (samples, features).
            Y: Array-like of targets, one per sample; a column vector is
                flattened so it cannot broadcast into a square error matrix.
            eta: Learning rate (step size).
            iterations: Number of gradient steps.

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or its row count does
                not match the number of targets.

        Prints the iteration index and training RMSE every 500 iterations.
        Initial weights come from NumPy's global RNG (``np.random.randn``).
        """
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (samples, features)")
        if X.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if X.shape[0] != Y.shape[0]:
            raise ValueError("X and Y must have the same number of samples")

        self.num_features = X.shape[1] + 1 # Add bias
        self.m = X.shape[0]
        self.theta = np.random.randn(self.num_features)
        # Prepend a column of ones so the bias is learned as theta[0].
        X_bias = np.c_[np.ones((self.m, 1)), X]

        for i in range(iterations):
            y_hat = X_bias.dot(self.theta)
            error = y_hat - Y
            # Gradient of half the mean squared error with respect to theta.
            gradients = 1/self.m * X_bias.T.dot(error)
            self.theta = self.theta - eta * gradients

            if i % 500 == 0:
                # RMSE of the predictions made before this step's update.
                loss = np.sqrt(1/self.m * np.sum(np.square(error)))
                print(i, loss)

    def predict(self, X):
        """Return predictions for the rows of ``X``.

        Args:
            X: Array-like with the same feature columns used in ``fit``.

        Returns:
            1-D NumPy array with one prediction per row.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.theta is None:
            raise RuntimeError("LinearRegression.predict called before fit")
        X = np.asarray(X, dtype=float)
        X_bias = np.c_[np.ones((X.shape[0], 1)), X]
        return X_bias.dot(self.theta)
        

In [92]:
# Train on the training split with the default learning rate and iteration
# count; fit() prints the training RMSE every 500 iterations.
model = LinearRegression()
model.fit(X=train_x, Y=train_y)

0 245689.93417702935
500 73521.77115159672
1000 73005.56324541023
1500 72608.0493864795
2000 72279.71711975755
2500 72003.4234613913
3000 71767.30767120501
3500 71562.98528177974
4000 71384.41503816293
4500 71227.14677896966
5000 71087.82184660193


In [98]:
# Root-mean-squared error of the fitted model on the held-out test rows.
pred = model.predict(test_x)
residuals = test_y - pred
test_mse = 1/len(pred) * np.sum(np.square(residuals))
print(np.sqrt(test_mse))


74988.77512610343
