In [1]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

'96e68d1a45a087e00ca4f7ee2ae27fff3cedfd0a'

In [2]:
import pandas as pd

url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv"
data = pd.read_csv(url)

data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
data = data[['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value']]

In [4]:
#Find a feature with missing values. How many missing values does it have?

data.isna().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [5]:
#What's the median (50% percentile) for variable 'population'?

data.population.median()

1166.0

In [6]:
import numpy as np

np.random.seed(42)

n = len(data)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

data_shuffled = data.iloc[idx]

data_train = data_shuffled.iloc[:n_train].copy()
data_val = data_shuffled.iloc[n_train:n_train+n_val].copy()
data_test = data_shuffled.iloc[n_train+n_val:].copy()

In [7]:
y_train_orig = data_train.median_house_value.values
y_val_orig = data_val.median_house_value.values
y_test_orig = data_test.median_house_value.values

y_train = np.log1p(y_train_orig)
y_val = np.log1p(y_val_orig)
y_test = np.log1p(y_test_orig)

del data_train['median_house_value']
del data_val['median_house_value']
del data_test['median_house_value']

## Lin reg

In [8]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [9]:
prediction_base = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income']


def prepare_X(df, filling_value=0):
    df_num = df[prediction_base]
    df_num = df_num.fillna(filling_value)
    X = df_num.values
    return X

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [12]:
X_train = prepare_X(data_train)
w_0, w = train_linear_regression(X_train, y_train)

y_pred = w_0 + X_train.dot(w)

score = rmse(y_train, y_pred)
round(score, 2)

0.34

In [13]:
#data["Engine Cylinders"].fillna(value = m, inplace=True)

In [14]:
X_train = prepare_X(data_train, data_train.mean())
w_0, w = train_linear_regression(X_train, y_train)

y_pred = w_0 + X_train.dot(w)

score = rmse(y_train, y_pred)
round(score, 2)

0.34

## Regularization

In [16]:
rs = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

In [17]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [30]:
scores = []
for r in rs:
    
    X_train = prepare_X(data_train)
    
    w_0_r, w_r = train_linear_regression_reg(X_train, y_train, r)
    
    y_pred = w_0_r + X_train.dot(w_r)
    score = rmse(y_train, y_pred)
    
    scores.append(score)
    
    if np.min(scores) == score:
        print('this is the best at the monent', r, score)
    else:
        print('not so good', r, score)

this is the best at the monent 0 0.3413135910156676
not so good 1e-06 0.3413135910156898
not so good 0.0001 0.3413135912378203
not so good 0.001 0.3413136131760776
not so good 0.01 0.3413157533022863
not so good 0.1 0.34148530628638035
not so good 1 0.3452327944054202
not so good 5 0.3501642987137403
not so good 10 0.3513878391352053


In [31]:
X_train = prepare_X(data_train)
w_0, w = train_linear_regression_reg(X_train, y_train, r=0)

X_val = prepare_X(data_val)
y_pred = w_0_r + X_val.dot(w_r)
print('validation:', rmse(y_val, y_pred))

X_test = prepare_X(data_test)
y_pred = w_0_r + X_test.dot(w_r)
print('test:', rmse(y_test, y_pred))

validation: 0.3406063807811878
test: 0.3611724433619629


In [37]:
scores_dif_seeds = []

for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    
    np.random.seed(seed)

    n = len(data)

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.shuffle(idx)

    data_shuffled = data.iloc[idx]

    data_train = data_shuffled.iloc[:n_train].copy()
    data_val = data_shuffled.iloc[n_train:n_train+n_val].copy()
    data_test = data_shuffled.iloc[n_train+n_val:].copy()
    
    
    y_train_orig = data_train.median_house_value.values
    y_val_orig = data_val.median_house_value.values
    y_test_orig = data_test.median_house_value.values

    y_train = np.log1p(y_train_orig)
    y_val = np.log1p(y_val_orig)
    y_test = np.log1p(y_test_orig)

    del data_train['median_house_value']
    del data_val['median_house_value']
    del data_test['median_house_value']
    
    X_train = prepare_X(data_train, data_train.mean())
    w_0, w = train_linear_regression(X_train, y_train)

    X_val = prepare_X(data_val)
    y_pred = w_0 + X_val.dot(w)
    
    score = rmse(y_val, y_pred)
    rounded_score = round(score, 3) 
    
    scores_dif_seeds.append(score)
   
    print('validation:', rounded_score, score)

validation: 0.339 0.3387885504296964
validation: 0.337 0.3367046046282629
validation: 0.332 0.3324800478067977
validation: 0.341 0.34073392827221105
validation: 0.339 0.33856060293799856
validation: 0.343 0.3434475638536071
validation: 0.345 0.34513614856323444
validation: 0.34 0.3400037541192244
validation: 0.346 0.3464727334254422
validation: 0.337 0.33676169579158943


In [38]:
scores_dif_seeds

[0.3387885504296964,
 0.3367046046282629,
 0.3324800478067977,
 0.34073392827221105,
 0.33856060293799856,
 0.3434475638536071,
 0.34513614856323444,
 0.3400037541192244,
 0.3464727334254422,
 0.33676169579158943]

In [39]:
round(np.std(scores_dif_seeds), 3) 

0.004