### Linear Regression

#### i) Data Loading and pre-processing

In [566]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Set the random seed
import numpy as np
np.random.seed(2303562)

# Import housing resale data
training_data = pd.read_csv("C:/Users/Admin/Desktop/ML_Project/full_hdb_perSqm_train_f34.csv")
test_data = pd.read_csv("C:/Users/Admin/Desktop/ML_Project/full_hdb_perSqm_test_f34.csv")

# Separate target variable, Y (resale_price_per_sqm)
train_y = training_data['resale_price_per_sqm']
train_x = training_data.drop(columns=['resale_price_per_sqm'])

test_y = test_data['resale_price_per_sqm']
test_x = test_data.drop(columns=['resale_price_per_sqm'])

# Identify numerical and categorical columns
cat_cols = list(range(5, 34))

# Convert True/False to 1/0 for categorical columns
for i in cat_cols:
    train_x.iloc[:, i] = train_x.iloc[:, i].astype(int)
    test_x.iloc[:, i] = test_x.iloc[:, i].astype(int)

#### ii) Build a OLS model

In [567]:
# Numerical columns
num_cols = list(range(1, 5))

# Calculate mean and sdev for each numerical cols [lat, minPrimary_transitTime, min_dis, remaining_lease]
train_mean = train_x.iloc[:,num_cols].mean()
train_sdev = train_x.iloc[:,num_cols].std()

# Standardize training and testing datasets
scaled_train_x = (train_x.iloc[:, num_cols] - train_mean) / train_sdev
scaled_test_x = (test_x.iloc[:, num_cols] - train_mean) / train_sdev

# Recombine scaled train, test data with categorical data
scaled_train_x = pd.concat([scaled_train_x, train_x.iloc[:,cat_cols]], axis=1)
scaled_test_x = pd.concat([scaled_test_x, test_x.iloc[:,cat_cols]], axis=1)

scaled_train_x = pd.DataFrame(scaled_train_x)
scaled_test_x = pd.DataFrame(scaled_test_x)

# Train the Linear Regression model
LR_model = LinearRegression()
LR_model.fit(scaled_train_x, train_y)
LR_pred = LR_model.predict(scaled_test_x)

# Calculate prediction error
LR_mae = mean_absolute_error(test_y, LR_pred)
LR_rmse = np.sqrt(mean_squared_error(test_y, LR_pred))
print('Test MAE of Ordinary linear regression model:', LR_mae)
print('Test prediction error of Ordinary linear regression model:', LR_rmse)

Test MAE of Ordinary linear regression model: 506.4963999637628
Test prediction error of Ordinary linear regression model: 670.4898085819206


### PCA + Linear Regression

In [568]:
from sklearn.decomposition import PCA

pca = PCA()
pca.fit(scaled_train_x.iloc[:, 0:4])
print(pca.explained_variance_ratio_)

# To select the number of PCs based on variance coverage (95%)
var = np.cumsum(pca.explained_variance_ratio_)
pc_num = np.argmax(var >= 0.95) + 1
print(pc_num)

# Selected PCs = 4
selected_PCs = pca.transform(scaled_train_x.iloc[:, 0:4])[:, :pc_num]
selected_PCs = pd.DataFrame(selected_PCs)

selected_PCs_test = pca.transform(scaled_test_x.iloc[:, 0:4])[:, :pc_num]
selected_PCs_test = pd.DataFrame(selected_PCs_test)

# Transform Training and testing data by number of PCs (3)
PC_train_data = pd.concat([selected_PCs, scaled_train_x.iloc[:, 4:33]], axis=1)
PC_test_data = pd.concat([selected_PCs_test, scaled_test_x.iloc[:, 4:33]], axis=1)

# Convert col name to str
PC_train_data.columns = PC_train_data.columns.astype(str)
PC_test_data.columns = PC_test_data.columns.astype(str)

# Train a PCA Linear Regression model
PCA_LR_model = LinearRegression()
PCA_LR_model.fit(PC_train_data, train_y)

# Make predictions on the test data
PCA_LR_pred = PCA_LR_model.predict(PC_test_data)

# Calculate prediction error
PCA_LR_mae = mean_absolute_error(test_y, PCA_LR_pred)
PCA_LR_rmse = np.sqrt(mean_squared_error(test_y, PCA_LR_pred))

print('Test MAE of Linear regression with PCA:', PCA_LR_mae)
print('Test RMSE of Linear regression with PCA:', PCA_LR_rmse)

[0.35259718 0.25444489 0.23506246 0.15789547]
4
Test MAE of Linear regression with PCA: 506.49639996376266
Test RMSE of Linear regression with PCA: 670.4898085819206


### Linear Regression + Ridge regularization

In [569]:
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge

# Create a new ln(y) column to minimize Lambda
ln_train_y = np.log(train_y)
XtY = np.dot(scaled_train_x.iloc[:, 0:4].T, ln_train_y)
abs_XtY = np.abs(XtY)
max_lambda = np.max(abs_XtY)
print(max_lambda)

# Setup a grid of lambda
grid = np.linspace(1e-6, max_lambda, num=5000)

# Ridge regression
ridge_model = RidgeCV(alphas=grid, store_cv_values=True)
ridge_model.fit(scaled_train_x.iloc[:, 0:4], ln_train_y)
best_ridge_lambda = ridge_model.alpha_
print(best_ridge_lambda)

# Re-build Ridge regression based on best lambda
from sklearn.linear_model import Ridge
best_ridge_model = Ridge(alpha=best_ridge_lambda)
best_ridge_model.fit(scaled_train_x, ln_train_y)

# Calculate Prediction Error
ridge_pred = best_ridge_model.predict(scaled_test_x)
ridge_pred = np.exp(ridge_pred)
ridge_rmse = np.sqrt(mean_squared_error(ridge_pred, test_y))
print('Test prediction error of Ridge regression model:', ridge_rmse)

1396.0190541980212
1.6755589754327118
Test prediction error of Ridge regression model: 650.442412238654


### Linear Regression + Lasso regularization

In [570]:
# By using the same Lambda grid as above, we can build a Lasso Regression model for comparison

from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso

lasso_model = LassoCV(alphas=grid, cv=10)
lasso_model.fit(scaled_train_x.iloc[:, 0:4], ln_train_y)
best_lasso_lambda = lasso_model.alpha_
print(best_lasso_lambda)

# Re-build Lasso regression based on best lambda
best_lasso_model = Lasso(alpha=best_lasso_lambda)
best_lasso_model.fit(scaled_train_x, ln_train_y)

# Calculate Prediction Error
lasso_pred = best_lasso_model.predict(scaled_test_x)
lasso_pred = np.exp(lasso_pred)
lasso_rmse = np.sqrt(mean_squared_error(lasso_pred, test_y))
print('Test prediction error of Lasso regression model:', lasso_rmse)


1e-06
Test prediction error of Lasso regression model: 650.9404450816197
