In [None]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("housing.csv")

In [4]:
columns = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value']

In [5]:
df = df[columns]

# Question 1

In [30]:

df.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

# Question 2

In [29]:
df.population.median()

1166.0

In [8]:
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

y_train = np.log1p(df_train.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [11]:
def prepare_X_with_0(df):
    df = df.copy()
    df['total_bedrooms'] = df['total_bedrooms'].fillna(0)

    X = df.values
    return X

In [12]:
def prepare_X_with_mean(df):
    df = df.copy()
    mean_train = df_train['total_bedrooms'].mean()
    df['total_bedrooms'] = df['total_bedrooms'].fillna(mean_train)

    X = df.values
    return X

In [13]:
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    if r != 0 :
        XTX = XTX + r * np.eye(XTX.shape[0])
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [14]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

# Question 3 
Both are equally good

In [15]:
r = 0
X_train = prepare_X_with_0(df_train)
w0, w = train_linear_regression_reg(X_train, y_train, r=r)

X_val = prepare_X_with_0(df_val)
y_pred = w0 + X_val.dot(w)
score = rmse(y_val, y_pred)
score

0.3295330365224802

In [16]:
r = 0
X_train = prepare_X_with_mean(df_train)
w0, w = train_linear_regression_reg(X_train, y_train, r=r)

X_val = prepare_X_with_0(df_val)
y_pred = w0 + X_val.dot(w)
score = rmse(y_val, y_pred)
score

0.32901954390063287

# Question 4
0

In [44]:
r_list = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

In [45]:
for r in r_list:
    X_train = prepare_X_with_0(df_train)
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    X_val = prepare_X_with_0(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)

    print(r ,w0 ,round(score,2))

0 -11.459046830955401 0.34
1e-06 -11.459031378049055 0.34
0.0001 -11.457501750752172 0.34
0.001 -11.443614750434485 0.34
0.01 -11.306573601817458 0.34
0.1 -10.097341975344136 0.34
1 -4.878283028050828 0.34
5 -1.477789775446845 0.34
10 -0.7885673331528591 0.35


# Question 5

In [51]:
seed_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
score_list = []

for s in seed_list:
    n = len(df)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test

    idx = np.arange(n)
    np.random.seed(s)
    np.random.shuffle(idx)

    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]

    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)
   

    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']

    X_train = prepare_X_with_0(df_train)
    w0, w = train_linear_regression_reg(X_train, y_train, r=0)

    X_val = prepare_X_with_0(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)

    score_list.append(score)

std = round(np.std(score_list),3)
std

0.004

# Question 6

In [52]:
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

idx = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

df_full_train = pd.concat([df_train , df_val]).reset_index(drop = True)
y_full_train = np.concatenate([y_train,y_val])

In [54]:
X_full_train = prepare_X_with_0(df_full_train)
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

X_test = prepare_X_with_0(df_test)
y_pred = w0 + X_test.dot(w)
score = rmse(y_test, y_pred)
score

0.3453168914374154