In [1]:
# Importing the required libraries
import numpy as np
import pandas as pd

In [2]:
# Reading the csv file
df = pd.read_csv('housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# Checking the feature names
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

**Observation**: Column names are consistent. [lowercase + underscore_seperated]

In [4]:
mask = (df['ocean_proximity'] == '<1H OCEAN') | (df['ocean_proximity'] == 'INLAND')
df_main = df[mask]

In [5]:
base = ['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income','median_house_value']
df_main = df[base]

#### Question 1:
There's one feature with missing values. What is it?

In [6]:
df_main.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

**Observation:** `total_bedrooms` have missing values. 

#### Question 2:
What's the median (50% percentile) for variable 'population'?

In [7]:
median = df['population'].median()
print(median)

1166.0


In [8]:
percentile_50 = df['population'].quantile(0.5)
print(percentile_50)

1166.0


### Question 1:
There's one feature with missing values. What is it?

In [9]:
# Checking the missing values
df_main.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

### Question 2:
What's the median (50% percentile) for variable 'population'?

In [10]:
# Calculating the median
df_main['population'].median()

1166.0

In [11]:
# Calculating the 50% percentile
df_main['population'].quantile(0.5)

1166.0

### Some Processing before Question 3

In [12]:
# Total number of datapoints
n = len(df_main)
n

20640

In [13]:
n_val = int(n*0.2)
n_test = int(n*0.2)
n_train = n - n_val - n_test

In [14]:
n_train, n_val, n_test

(12384, 4128, 4128)

In [15]:
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

In [16]:
# Dividing the dataframe in three part after shuffling the index
df_train = df_main.iloc[idx[: n_train]]
df_val = df_main.iloc[idx[n_train: n_train+n_val]]
df_test = df_main.iloc[idx[n_train+n_val:]]

In [17]:
# Resetting the index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [18]:
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

In [19]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

####

### Question 3:
- We need to deal with missing values for the column from Q1.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lessons.
- For computing the mean, use the training only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)
- Which option gives better RMSE? r RMSE?

#### Option 1: Filling missing value with 0

In [20]:
def prepare_X_zero(df):
    df_num = df.copy()
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

In [21]:
# Function to train linear regression model without regaularization
def linear_regression_train(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]

In [22]:
# Function definition to calculate RMSE
def rmse(y, y_pred):
    error = y - y_pred
    squared_error = error ** 2
    mse = squared_error.mean()
    rmse = np.sqrt(mse)

    return rmse

In [23]:
# Training
X_train = prepare_X_zero(df_train)
w_0, w = linear_regression_train(X_train, y_train)

# Validation
X_val = prepare_X_zero(df_val)
y_pred = w_0 + X_val.dot(w)

# Calculating RMSE
rmse(y_val, y_pred)

0.32953303652243554

#### Option 2: Filling missing value with mean

In [24]:
def prepare_X_mean(df):
    df_num = df.copy()
    df_num = df_num.fillna(df_num['total_bedrooms'].mean())
    X = df_num.values
    return X

In [25]:
# Training
X_train = prepare_X_mean(df_train)
w_0, w = linear_regression_train(X_train, y_train)

# Validation
X_val = prepare_X_mean(df_val)
y_pred = w_0 + X_val.dot(w)

# Calculating RMSE
rmse(y_val, y_pred)

0.3290195439005846

### Question 4:

- Now let's train a regularized linear regression.
- For this question, fill the NAs with 0.
- Try different values of r from this list: [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.
- Which r gives the best RMSE?

If there are multiple options, select the smallest r.

In [26]:
# Function to train linear regression model with regaularization
def linear_regression_train_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + (r * np.eye(XTX.shape[0]))
    
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]

In [27]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    # Training
    X_train = prepare_X_zero(df_train)
    w_0, w = linear_regression_train_reg(X_train, y_train, r)
    
    # Validation
    X_val = prepare_X_zero(df_val)
    y_pred = w_0 + X_val.dot(w)
    
    # Calculating RMSE
    score = rmse(y_val, y_pred)

    print(f'r={r}, rmse={round(score, 2)}')

r=0, rmse=0.33
r=1e-06, rmse=0.33
r=0.0001, rmse=0.33
r=0.001, rmse=0.33
r=0.01, rmse=0.33
r=0.1, rmse=0.33
r=1, rmse=0.33
r=5, rmse=0.34
r=10, rmse=0.34


### Question 5:

- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))

In [28]:
def split_dataset(df, seed):
    df_main = df.copy()
    n = len(df_main)

    # Splitting into 3 part
    n_val = int(n*0.2)
    n_test = int(n*0.2)
    n_train = n - n_val - n_test

    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    # Dividing the dataframe in three part after shuffling the index
    df_train = df_main.iloc[idx[: n_train]]
    df_val = df_main.iloc[idx[n_train: n_train+n_val]]
    df_test = df_main.iloc[idx[n_train+n_val:]]

    # Resetting the index
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)

    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']

    return df_train, y_train, df_val, y_val, df_test, y_test

In [29]:
scores = []

for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    # Spliting the dataset
    df_train, y_train, df_val, y_val, df_test, y_test = split_dataset(df_main, seed)

    # Training
    X_train = prepare_X_zero(df_train)
    w_0, w = linear_regression_train(X_train, y_train)
    
    # Validation
    X_val = prepare_X_zero(df_val)
    y_pred = w_0 + X_val.dot(w)
    
    # Calculating RMSE
    score = rmse(y_val, y_pred)
    scores.append(score)

In [30]:
round(np.std(scores), 3)

0.004

### Question 6:

- Split the dataset like previously, use seed 9.
- Combine train and validation datasets.
- Fill the missing values with 0 and train a model with r=0.001.
- What's the RMSE on the test dataset?

In [31]:
# Spliting the dataset
df_train, y_train, df_val, y_val, df_test, y_test = split_dataset(df_main, 9)

In [32]:
# Combine train and validation dataset
df_full_train = pd.concat([df_train, df_val]) 
df_full_train = df_full_train.reset_index(drop=True)
X_full_train = prepare_X_zero(df_full_train)

In [33]:
y_full_train = np.concatenate([y_train, y_val])

In [34]:
# Training the model
w_0, w = linear_regression_train_reg(X_full_train, y_full_train, r=0.001)

In [36]:
# Validation
X_test = prepare_X_zero(df_test)
y_pred = w_0 + X_test.dot(w)

# Calculating RMSE
rmse(y_test, y_pred)

0.34531689143600774