In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw housing data; the relative path is resolved against the
# notebook's working directory.
raw_df = pd.read_csv('housing.csv')

In [3]:
# Keep only rows whose ocean_proximity is '<1H OCEAN' or 'INLAND'.
# .copy() gives df its own data, independent of raw_df.
cond = raw_df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])
df = raw_df.loc[cond].copy()

In [4]:
# Sanity check: (rows, columns) of the raw frame vs. the filtered frame.
print(raw_df.shape, df.shape)

(20640, 10) (15687, 10)


In [5]:
# Columns kept for modelling: eight numeric features followed by the target
# (median_house_value). ocean_proximity is left out.
req_cols = ['latitude', 'longitude', 'housing_median_age', 'total_rooms',
            'total_bedrooms', 'population', 'households', 'median_income',
            'median_house_value']
# .copy() makes df an independent frame, so adding a column to it later does
# not trigger pandas' SettingWithCopyWarning.
df = df[req_cols].copy()

Question 1
There's one feature with missing values. What is it?
- total_rooms
- total_bedrooms
- population
- households

In [6]:
# Names of all columns that contain at least one missing value.
# (Testing for "more than one" would skip a column with exactly one NaN.)
df.columns[df.isna().any()].tolist()

['total_bedrooms']

Question 2: What's the median (50th percentile) of the variable 'population'?
- 995
- 1095
- 1195
- 1295

In [7]:
# The median is the 50th percentile of the population column.
population_median = np.percentile(df['population'], 50)
print(f'50th percentile of variable population: {population_median}')

50th percentile of variable population: 1195.0


Prepare and split the dataset
- Shuffle the dataset (the filtered one you created above), use seed 42.
- Split your data in train/val/test sets, with 60%/20%/20% distribution.
- Apply the log transformation to the median_house_value variable using the np.log1p() function.

In [9]:
# Log-transform the target with log1p, i.e. log(1 + x).
# The new column is appended at the end of df, right after median_house_value.
df['log_median_house_value'] = np.log1p(df['median_house_value'])

### df.sample()

In [None]:
# Option 1: shuffle with pandas. frac=1 returns every row, in random order.
# Note: the np.random.seed() cell below assigns df_shuffled again, so when both
# cells are run the result of this one is replaced.
df_shuffled = df.sample(frac=1, random_state=42)

### np.random.seed()

In [10]:
# Option 2: shuffle the row positions with NumPy's global RNG, seeded so the
# order is reproducible.
np.random.seed(42)
n = df.shape[0]
idx = np.arange(n)
np.random.shuffle(idx)  # permutes 0..n-1 in place
df_shuffled = df.iloc[idx, :]

### train-val-test split

In [None]:
# my code
# Cut points at 60% and 80% of the shuffled rows. int() truncates, so any
# leftover rows end up in the last (test) slice.
index1 = int(len(df_shuffled) * 0.6)
index2 = index1 + int(len(df_shuffled) * 0.2)
print(index1, index2)
train_df = df_shuffled.iloc[:index1].copy()
val_df = df_shuffled.iloc[index1:index2].copy()
test_df = df_shuffled.iloc[index2:].copy()
print(train_df.shape, val_df.shape, test_df.shape)

In [20]:
# 60/20/20 split: fix the validation and test sizes first, then give all
# remaining rows to the training set.
n = len(df_shuffled)
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test
val_end = n_train + n_val
train_df, val_df, test_df = (
    df_shuffled.iloc[start:stop].copy()
    for start, stop in [(0, n_train), (n_train, val_end), (val_end, n)]
)
print(train_df.shape, val_df.shape, test_df.shape)

(9413, 10) (3137, 10) (3137, 10)


In [21]:
# Sanity check: stacking the three splits back together, in order, must
# reproduce df_shuffled exactly.
pd.concat([train_df, val_df, test_df], axis=0).equals(df_shuffled)

True

### Linear regression model 
- X has shape [n, k+1], where k = number of features (columns) and n = number of instances (rows); the extra column is the bias term.
- y = [n, 1]
- w = [k+1, 1]

In [14]:
def train_linear_regression(X, y, r=0.0):
    """Fit a linear regression with the normal equation.

    A column of ones is prepended to ``X`` for the bias term, then the system
    ``(X^T X + r * I) w = X^T y`` is solved for ``w``.

    Parameters
    ----------
    X : array-like of shape (n, k)
        Feature matrix without the bias column.
    y : array-like of shape (n,)
        Target values.
    r : float, default 0.0
        Regularization strength added to the diagonal of ``X^T X``.
        The default of 0.0 gives plain, unregularized least squares.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the bias and ``w`` is the array of the
        ``k`` feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the (regularized) Gram matrix is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])  # add bias column
    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])
    # Solve the linear system directly instead of forming the explicit
    # inverse: same solution, better numerical behavior.
    w = np.linalg.solve(XTX, X.T.dot(y))
    return w[0], w[1:]

In [15]:
def RMSE(y, y_pred):
    """Root mean squared error between targets and predictions.

    Both inputs are converted to NumPy float arrays first, so plain lists are
    accepted and pandas Series are compared element by element by position
    (not aligned on their index labels).

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Predicted values, same length as ``y``.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference.
    """
    error = np.asarray(y, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(error ** 2))

Question 3
- We need to deal with missing values for the column from Q1.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lessons.
- For computing the mean, use the training set only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)
- Which option gives better RMSE?
    - With 0
    - With mean
    - Both are equally good

- https://www.youtube.com/watch?v=vM3SqPNlStE&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=11&ab_channel=DataTalksClub%E2%AC%9B
- https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb

### impute missing value with 0 

In [None]:
# df2 = df_shuffled.copy()

In [13]:
# Zero-imputation: replace missing total_bedrooms values with 0 in every split.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# acts on an intermediate object, is deprecated in recent pandas, and does not
# update the DataFrame when Copy-on-Write is enabled.
train_df['total_bedrooms'] = train_df['total_bedrooms'].fillna(0)
val_df['total_bedrooms'] = val_df['total_bedrooms'].fillna(0)
test_df['total_bedrooms'] = test_df['total_bedrooms'].fillna(0)

In [None]:
# index1 = int((df2.shape[0]*0.6))
# index2 = index1 + int((df2.shape[0]*0.2))
# print(index1, index2)
# train_df = df2.iloc[:index1, :].copy()
# val_df = df2.iloc[index1:index2, :].copy()
# test_df = df2.iloc[index2:, :].copy()
# print(train_df.shape, val_df.shape, test_df.shape)

In [16]:
# Features: every column except the last two (median_house_value and its log).
# Target: the log-transformed house value.
X = np.array(train_df.iloc[:, :-2])
Y = np.array(train_df['log_median_house_value'])
b_v1, w_v1 = train_linear_regression(X, Y)

In [17]:
# Same feature/target layout for the validation split, then predict with the
# fitted bias and weights.
X_val = np.array(val_df.iloc[:, :-2])
y_val = np.array(val_df['log_median_house_value'])
y_pred_val = X_val.dot(w_v1) + b_v1

In [18]:
# Predicted log_median_house_value for the validation rows.
y_pred_val

array([11.71830182, 12.47794996, 11.84203817, ..., 12.40213723,
       12.06030557, 12.06854096])

In [19]:
# Validation RMSE (on the log-transformed target), rounded to 2 decimals.
np.round(RMSE(y_val, y_pred_val), 2)

0.34

### [TBN] IMPORTANT: when X (with the bias column added) is a square, invertible matrix, the normal equation $w = (X^T X)^{-1} X^T y$ simplifies to $w = X^{-1} y$

In [36]:
# Renumber the training rows 0..n-1 (dropping the shuffled labels) so that the
# label-based .loc[:8] selection used below picks the first nine rows.
train_df = train_df.reset_index(drop=True)

In [None]:
# Build a square system: rows labelled 0..8 (.loc slicing includes the end
# label, so nine rows once the index has been reset) by eight features plus
# the bias column.
x = np.array(train_df.loc[:8, train_df.columns[:-2]])
ones = np.ones(len(x))
x = np.column_stack((ones, x))
y = np.array(train_df.loc[:8, 'log_median_house_value'])

print(x.shape, y.shape)

In [54]:
# With a square x (bias column included) the weights come straight from
# x^{-1} y; no need for the (X^T X)^{-1} X^T form.
w_test = np.linalg.inv(x).dot(y)

In [55]:
# Weights from the direct inverse; the first entry belongs to the ones (bias)
# column, which was stacked first.
w_test

array([-8.25116023e+01, -9.89073723e-01, -1.10759844e+00, -4.44129418e-02,
        7.02629116e-04,  1.10884602e-02, -3.87045556e-04, -1.64633731e-02,
        5.23502027e-02])

In [60]:
# Same nine rows through the normal-equation helper, for comparison with w_test.
# x is passed without the ones column because the helper adds the bias itself.
# NOTE(review): train_linear_regression returns the tuple (w[0], w[1:]); the
# saved output below is a single array, presumably from a run that still used
# the commented-out `return w` — re-run to refresh it.
x = np.array(train_df.loc[:8, train_df.columns[:-2]].copy())
y = np.array(train_df.loc[:8, 'log_median_house_value'])
train_linear_regression(x, y)

array([-8.25116023e+01, -9.89073723e-01, -1.10759844e+00, -4.44129418e-02,
        7.02629116e-04,  1.10884602e-02, -3.87045556e-04, -1.64633731e-02,
        5.23502027e-02])

### impute missing value with mean 

In [23]:
# Mean-imputation must start from splits that still contain the missing values.
# The zero-imputation section above has already filled them, so rebuild the
# splits from df_shuffled with the same boundaries as the train/val/test split.
train_df = df_shuffled.iloc[:n_train].copy()
val_df = df_shuffled.iloc[n_train:n_train + n_val].copy()
test_df = df_shuffled.iloc[n_train + n_val:].copy()

# The mean comes from the training split only and is applied to all three
# splits. Results are assigned back rather than using a chained
# fillna(..., inplace=True), which is deprecated in recent pandas.
avg_value = train_df['total_bedrooms'].mean()
train_df['total_bedrooms'] = train_df['total_bedrooms'].fillna(avg_value)
val_df['total_bedrooms'] = val_df['total_bedrooms'].fillna(avg_value)
test_df['total_bedrooms'] = test_df['total_bedrooms'].fillna(avg_value)

In [25]:
# Mean of total_bedrooms in the training split, used as the fill value.
avg_value

542.552956325786

In [26]:
# Refit on the mean-imputed training split. Features are all columns except
# the last two; b_v1 / w_v1 are reused and now hold this model's parameters.
X = np.array(train_df.iloc[:, :-2])
Y = np.array(train_df['log_median_house_value'])
b_v1, w_v1 = train_linear_regression(X, Y)

In [27]:
# Validation features/target for the mean-imputed data, then predictions from
# the refitted bias and weights.
X_val = np.array(val_df.iloc[:, :-2])
y_val = np.array(val_df['log_median_house_value'])
y_pred_val = X_val.dot(w_v1) + b_v1

In [28]:
# Predicted log_median_house_value for the validation rows (mean-imputed model).
y_pred_val

array([11.71462282, 12.47428049, 11.84127314, ..., 12.39294053,
       12.05541417, 12.06224727])

In [29]:
# Validation RMSE for the mean-imputed model, rounded to 2 decimals.
np.round(RMSE(y_val, y_pred_val), 2)

0.34

Question 4
- Now let's train a regularized linear regression.
- For this question, fill the NAs with 0.
- Try different values of r from this list: [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.
- Which r gives the best RMSE?
    - If there are multiple options, select the smallest r.
    - 0
    - 0.000001
    - 0.001
    - 0.0001

Question 5
- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))
- What's the value of std?
    - 0.5
    - 0.05
    - 0.005
    - 0.0005
- Note: Standard deviation shows how different the values are. If it's low, then all values are approximately the same. If it's high, the values are different. If standard deviation of scores is low, then our model is stable.

Question 6
- Split the dataset like previously, use seed 9.
- Combine train and validation datasets.
- Fill the missing values with 0 and train a model with r=0.001.
- What's the RMSE on the test dataset?
    - 0.13
    - 0.23
    - 0.33
    - 0.43

- submission link : https://forms.gle/nNUTLzz3F9KiFHNp9