In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

--2023-09-25 07:30:19--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv’


2023-09-25 07:30:19 (31.0 MB/s) - ‘housing.csv’ saved [1423529/1423529]



In [36]:
import pandas as pd
import numpy as np

In [15]:
data = pd.read_csv('housing.csv')

In [25]:
data.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [45]:
data = data[(data['ocean_proximity'] == '<1H OCEAN') | (data['ocean_proximity'] == 'INLAND')]

In [73]:
data = data[['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value']]

In [83]:
data['housing_median_age'] = np.log1p(data['housing_median_age'])

Question 1: Missing values

In [74]:
data.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

**Question 2: Median**

In [47]:
data.population.median()

1195.0

In [115]:
np.random.seed(42)

n = len(data)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

data_shuffled = data.iloc[idx]

data_train = data_shuffled.iloc[:n_train].copy()
data_val = data_shuffled.iloc[n_train:n_train+n_val].copy()
data_test = data_shuffled.iloc[n_train+n_val:].copy()

In [116]:
y_train_orig = data_train.median_house_value.values
y_val_orig = data_val.median_house_value.values
y_test_orig = data_test.median_house_value.values

y_train = np.log1p(data_train.median_house_value.values)
y_val = np.log1p(data_val.median_house_value.values)
y_test = np.log1p(data_test.median_house_value.values)

del data_train['median_house_value']
del data_val['median_house_value']
del data_test['median_house_value']

In [117]:
# filling with 0
def prepare_with_zero(df):
  df = df.copy()
  df = df.fillna(0)
  return df

# filling with mean

def prepare_with_mean(df):
  df = df.copy()
  df = df.fillna(df['total_bedrooms'].mean())
  return df


In [118]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)

    return w[0], w[1:]

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [119]:
# with encoding the 0
X_train = prepare_with_zero(data_train)
w_0, w = train_linear_regression(X_train, y_train)

y_pred = w_0 + X_train.dot(w)
print('train', rmse(y_train, y_pred))

X_val = prepare_with_zero(data_val)
y_pred = w_0 + X_val.dot(w)
print('validation', rmse(y_val, y_pred))

train 0.34039358550007653
validation 0.34080750034509827


In [120]:
# with encoding the mean
X_train = prepare_with_mean(data_train)
w_0, w = train_linear_regression(X_train, y_train)

y_pred = w_0 + X_train.dot(w)
print('train', rmse(y_train, y_pred))

X_val = prepare_with_mean(data_val)
y_pred = w_0 + X_val.dot(w)
print('validation', rmse(y_val, y_pred))

train 0.3401537146321478
validation 0.3405288152710008


**Question 4: Regularization**

In [121]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)

    return w[0], w[1:]

In [122]:
def reg_train(df_train, df_val, r):
      X_train = prepare_with_zero(df_train)
      w_0, w = train_linear_regression_reg(X_train, y_train, r=r)

      y_pred = w_0 + X_train.dot(w)
      #print('r: ',r, '|','train rmse: ', rmse(y_train, y_pred))

      X_val = prepare_with_zero(df_val)
      y_pred = w_0 + X_val.dot(w)
      print('r: ', r , '|', 'val rmse:', rmse(y_val, y_pred))

In [123]:
for r in [0, 0.001, 0.01, 0.1, 1, 10]:
    reg_train(data_train, data_val, r)


r:  0 | val rmse: 0.34080750034509827
r:  0.001 | val rmse: 0.3408101397353384
r:  0.01 | val rmse: 0.3408361113739001
r:  0.1 | val rmse: 0.34123792045209755
r:  1 | val rmse: 0.34490211909366214
r:  10 | val rmse: 0.3484205919160604


**Question 5. Shuffling**

In [124]:
val_rmse = []

for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
      np.random.seed(seed)

      n = len(data)

      n_val = int(0.2 * n)
      n_test = int(0.2 * n)
      n_train = n - (n_val + n_test)

      idx = np.arange(n)
      np.random.shuffle(idx)

      data_shuffled = data.iloc[idx]

      data_train = data_shuffled.iloc[:n_train].copy()
      data_val = data_shuffled.iloc[n_train:n_train+n_val].copy()
      data_test = data_shuffled.iloc[n_train+n_val:].copy()

      y_train_orig = data_train.median_house_value.values
      y_val_orig = data_val.median_house_value.values
      y_test_orig = data_test.median_house_value.values

      y_train = np.log1p(data_train.median_house_value.values)
      y_val = np.log1p(data_val.median_house_value.values)
      y_test = np.log1p(data_test.median_house_value.values)

      del data_train['median_house_value']
      del data_val['median_house_value']
      del data_test['median_house_value']


      # with encoding the 0
      X_train = prepare_with_zero(data_train)
      w_0, w = train_linear_regression(X_train, y_train)

      y_pred = w_0 + X_train.dot(w)
      #print('train', rmse(y_train, y_pred))

      X_val = prepare_with_zero(data_val)
      y_pred = w_0 + X_val.dot(w)
      val_rmse.append(rmse(y_val, y_pred))

print(round(np.std(val_rmse),3))

0.005


**Question 6**

In [128]:
np.random.seed(9)

n = len(data)

n_val = int(0.4 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

data_shuffled = data.iloc[idx]

data_train = data_shuffled.iloc[:n_train].copy()
data_val = data_shuffled.iloc[n_train:n_train+n_val].copy()

y_train_orig = data_train.median_house_value.values
y_val_orig = data_val.median_house_value.values

y_train = np.log1p(data_train.median_house_value.values)
y_val = np.log1p(data_val.median_house_value.values)

del data_train['median_house_value']
del data_val['median_house_value']


# with encoding the 0
X_train = prepare_with_zero(data_train)
w_0, w = train_linear_regression_reg(X_train, y_train, r=0.001)

y_pred = w_0 + X_train.dot(w)
#print('train', rmse(y_train, y_pred))

X_val = prepare_with_zero(data_val)
y_pred = w_0 + X_val.dot(w)
print(rmse(y_val, y_pred))

0.3377589445470025
