In [8]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.ensemble import BaggingRegressor 
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

## Cross-Validation for models with best parameters

### Linear models

In [15]:
ridge_model = Ridge(alpha=1.6755589754327118)
lasso_model = Lasso(alpha=1e-06)

### Tree-based models

In [16]:
tree = DecisionTreeRegressor()

param_grid_tree = {
    'max_depth': [15],  
    'min_samples_split': [17], 
    'min_samples_leaf': [2],
    'ccp_alpha':[0.1],
    'random_state':[817]
}


bagging = BaggingRegressor()

param_grid_bagging = {
    "n_estimators":[285],
    "max_features":[28],
    'random_state': [817],
    'oob_score': [True]
}

forest = RandomForestRegressor()

param_grid_forest = {
    "n_estimators":[290],
    "max_features":[21],
    'ccp_alpha': [0.2],
    'max_depth': [21],
    'random_state': [817],
    'oob_score': [True]
}

In [17]:
train34 = pd.read_csv('full_hdb_perSqm_train_f34.csv')
train34 = train34.drop(['Unnamed: 0'], axis = 1)
val_sets = np.array_split(train34, 5)
val_sets[0]

Unnamed: 0,lat,minPrimary_transitTime,min_dis,remaining_lease,DBSS,Improved,Model A,New Generation,Type S1,Type S2,...,BUKIT MERAH,CENTRAL AREA,CHOA CHU KANG,CLEMENTI,JURONG WEST,KALLANG/WHAMPOA,QUEENSTOWN,WOODLANDS,YISHUN,resale_price_per_sqm
0,1.377567,759.0,0.001040,93.166667,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,6410.714286
1,1.371036,368.0,0.018341,60.833333,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,5186.813187
2,1.430421,964.0,0.005845,71.083333,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,5335.365854
3,1.352865,448.0,0.009913,94.833333,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,6691.176471
4,1.371233,454.0,0.005618,61.000000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5476.190476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2758,1.339195,275.0,0.007477,44.750000,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5220.588235
2759,1.449125,1000.0,0.005346,77.750000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5000.000000
2760,1.337461,706.0,0.007455,73.416667,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,4466.019417
2761,1.318470,545.0,0.003546,55.416667,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,6593.406593


In [18]:
trace=[0,1,2,3,4]
k = 0
mae=[[],[],[],[],[]]
for i in range(5):
    temp = [x for x in trace if x != i]
    train_val = pd.DataFrame()
    for j in temp:
        train_val = pd.concat([train_val, val_sets[j]])

    temp_train_y = train_val['resale_price_per_sqm']
    temp_train_x = train_val.drop(['resale_price_per_sqm'], axis = 1) 

    temp_test_y = val_sets[i]['resale_price_per_sqm']
    temp_test_x = val_sets[i].drop(['resale_price_per_sqm'], axis = 1) 

    # num_train = temp_train_x.loc[:,:'remaining_lease']
    # dum_train = temp_train_x.loc[:,'remaining_lease':]

    # num_test = temp_test_x.loc[:,:'remaining_lease']
    # dum_test = temp_test_x.loc[:,'remaining_lease':]
    
    # column_ = ['lat','minPrimary_transitTime','min_dis','remaining_lease']
    
    scaled_train_x = StandardScaler().fit_transform(temp_train_x)
    scaled_test_x = StandardScaler().fit_transform(temp_test_x)

    # scaled_train_x = pd.concat([scaled_train_x,dum_train], axis=1)
    # scaled_test_x = pd.concat([scaled_test_x,dum_test], axis=1)
    
    # scaled_train_x = scaled_train_x.dropna(inplace=True)
    # scaled_test_x = scaled_test_x.dropna(inplace=True)

    ridge_model = Ridge(alpha=1.6755589754327118)
    ridge_model.fit(scaled_train_x,temp_train_y)
    ridge_pred = ridge_model.predict(scaled_test_x)

    min_length = min(len(ridge_pred), len(temp_test_y))
    ridge_pred = ridge_pred[:min_length]
    temp_ridge = temp_test_y[:min_length]
    ridge_mae = mean_absolute_error(ridge_pred, temp_ridge)

    mae[k].append(ridge_mae)

    lasso_model = Lasso(alpha=1e-06)
    lasso_model.fit(scaled_train_x,temp_train_y)
    lasso_pred = lasso_model.predict(scaled_test_x)

    min_length = min(len(lasso_pred), len(temp_test_y))
    lasso_pred = lasso_pred[:min_length]
    temp_lasso = temp_test_y[:min_length]
    lasso_mae = mean_absolute_error(lasso_pred, temp_lasso)

    mae[k+1].append(lasso_mae)


    tree_model = DecisionTreeRegressor(criterion = 'absolute_error',
                                   max_depth = 15,
                                   min_samples_split = 17,
                                   min_samples_leaf = 2,
                                   ccp_alpha = 0.1,
                                   random_state = 817
                                   )
    tree_model.fit(temp_train_x,temp_train_y)
    tree_pred = tree_model.predict(temp_test_x)

    min_length = min(len(tree_pred), len(temp_test_y))
    tree_pred = tree_pred[:min_length]
    temp_tree = temp_test_y[:min_length]
    tree_mae = mean_absolute_error(tree_pred, temp_tree)

    mae[k+2].append(tree_mae)


    bagging_model = BaggingRegressor(
                                   n_estimators = 285,
                                   max_features = 28,
                                   random_state = 817
                                   )
    bagging_model.fit(temp_train_x,temp_train_y)
    bagging_pred = bagging_model.predict(temp_test_x)

    min_length = min(len(bagging_pred), len(temp_test_y))
    bagging_pred = bagging_pred[:min_length]
    temp_bagging = temp_test_y[:min_length]
    bagging_mae = mean_absolute_error(bagging_pred, temp_bagging)

    mae[k+3].append(bagging_mae)



    forest_model = RandomForestRegressor(
                                   n_estimators = 290,
                                   max_features = 21,
                                   ccp_alpha = 0.2,
                                   max_depth = 21,
                                   random_state = 817                                   
                                   )
    forest_model.fit(temp_train_x,temp_train_y)
    forest_pred = forest_model.predict(temp_test_x)

    min_length = min(len(forest_pred), len(temp_test_y))
    forest_pred = forest_pred[:min_length]
    temp_forest = temp_test_y[:min_length]
    forest_mae = mean_absolute_error(forest_pred, temp_forest)

    mae[k+4].append(forest_mae)

for i in range(5):
    print(sum(mae[i])/len(mae[i]))

508.59357300832187
508.6235367408199
351.5927849766899
288.2168810049192
284.33288543438226


These five numbers are the MAE of Ridge Regression, Lasso Regression, Decision Tree, Bagging and Random Forest respectively.

### Calculate R^2 (without doing cross-validation)

In [31]:
from sklearn.model_selection import train_test_split

# read data with 33 features
train = pd.read_csv('full_hdb_perSqm_train_f34.csv').drop(['Unnamed: 0'], axis = 1)
test = pd.read_csv('full_hdb_perSqm_test_f34.csv').drop(['Unnamed: 0'], axis = 1)

# train set
y_train = train['resale_price_per_sqm']
x_train = train.drop(['resale_price_per_sqm'], axis = 1)

# test set
y_test = test['resale_price_per_sqm']
x_test = test.drop(['resale_price_per_sqm'], axis = 1).astype(float)

# standardization
scaler = StandardScaler()

x_columns = list(train.columns)
x_columns.remove('resale_price_per_sqm')

x_trains = scaler.fit_transform(x_train)

x_trains = pd.DataFrame(x_trains, columns = x_columns)

x_tests = scaler.fit_transform(x_test)

x_tests = pd.DataFrame(x_tests, columns = x_columns)

x_tests

Unnamed: 0,lat,minPrimary_transitTime,min_dis,remaining_lease,DBSS,Improved,Model A,New Generation,Type S1,Type S2,...,BISHAN,BUKIT MERAH,CENTRAL AREA,CHOA CHU KANG,CLEMENTI,JURONG WEST,KALLANG/WHAMPOA,QUEENSTOWN,WOODLANDS,YISHUN
0,-0.060351,0.318355,-0.867728,-1.366380,-0.101196,1.817030,-0.822372,-0.36087,-0.04315,-0.012999,...,-0.117791,-0.193207,-0.094144,-0.221576,-0.153929,-0.249551,-0.183099,-0.148672,-0.300092,-0.280591
1,0.078639,-0.475963,1.311708,-0.796253,-0.101196,1.817030,-0.822372,-0.36087,-0.04315,-0.012999,...,-0.117791,-0.193207,-0.094144,-0.221576,-0.153929,-0.249551,-0.183099,-0.148672,-0.300092,-0.280591
2,-0.135560,0.204881,0.228632,0.975014,-0.101196,-0.550349,1.215995,-0.36087,-0.04315,-0.012999,...,-0.117791,-0.193207,-0.094144,-0.221576,-0.153929,-0.249551,-0.183099,-0.148672,-0.300092,-0.280591
3,-0.113284,0.873457,0.957676,-1.360845,-0.101196,-0.550349,-0.822372,2.77108,-0.04315,-0.012999,...,-0.117791,-0.193207,-0.094144,-0.221576,-0.153929,-0.249551,-0.183099,-0.148672,-0.300092,-0.280591
4,-0.110791,1.778183,1.010474,-1.194788,-0.101196,-0.550349,-0.822372,2.77108,-0.04315,-0.012999,...,-0.117791,-0.193207,-0.094144,-0.221576,-0.153929,-0.249551,-0.183099,-0.148672,-0.300092,-0.280591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5914,1.187451,0.342890,1.335777,0.914127,-0.101196,-0.550349,1.215995,-0.36087,-0.04315,-0.012999,...,-0.117791,-0.193207,-0.094144,-0.221576,-0.153929,-0.249551,-0.183099,-0.148672,-0.300092,3.563901
5915,1.126262,-0.488230,1.556378,0.986085,-0.101196,-0.550349,1.215995,-0.36087,-0.04315,-0.012999,...,-0.117791,-0.193207,-0.094144,-0.221576,-0.153929,-0.249551,-0.183099,-0.148672,-0.300092,3.563901
5916,1.293202,0.299954,-0.918701,-0.962309,-0.101196,-0.550349,-0.822372,2.77108,-0.04315,-0.012999,...,-0.117791,-0.193207,-0.094144,-0.221576,-0.153929,-0.249551,-0.183099,-0.148672,-0.300092,3.563901
5917,0.997835,0.511567,0.312637,1.296057,-0.101196,1.817030,-0.822372,-0.36087,-0.04315,-0.012999,...,-0.117791,-0.193207,-0.094144,-0.221576,-0.153929,-0.249551,-0.183099,-0.148672,-0.300092,3.563901


In [32]:
ordinary_linear = LinearRegression()

ridge_model = Ridge(alpha=1.6755589754327118)

lasso_model = Lasso(alpha=1e-06)

tree = DecisionTreeRegressor(
    criterion = 'absolute_error',
    max_depth = 15,
    min_samples_split = 17,
    min_samples_leaf = 2,
    ccp_alpha = 0.1,
    random_state = 817)

bagging = BaggingRegressor(
    n_estimators = 285,
    max_features = 28,
    random_state = 817
)

forest = RandomForestRegressor(
    n_estimators = 290,
    max_features = 21,
    ccp_alpha = 0.2,
    max_depth = 21,
    random_state = 817
)


In [34]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

# non standardizatioin
count = 0
model_list = ['OLR', 'Decision Tree', 'Bagging', 'Random Forest']
for model in [ordinary_linear, tree, bagging, forest]:

    model.fit(x_train, y_train)
    
    y_pred = model.predict(x_test)

    r2 = r2_score(y_test, y_pred)

    print(model_list[count] + ' R2: ' + str(r2) + '  MAE: ' + str(mean_absolute_error(y_test, y_pred)))

    count += 1

# standardization
count = 0
model_list = ['Ridge', 'Lasso']
for model in [ridge_model, lasso_model]:

    model.fit(x_trains, y_train)
    
    y_pred = model.predict(x_tests)

    r2 = r2_score(y_test, y_pred)

    print(model_list[count] + ' R2: ' + str(r2) + '  MAE: ' + str(mean_absolute_error(y_test, y_pred)))

    count += 1
    

OLR R2: 0.7573262875507546  MAE: 506.4963999637789
Decision Tree R2: 0.8710513491093698  MAE: 349.38685314738126
Bagging R2: 0.9218813828472134  MAE: 280.46301611204416
Random Forest R2: 0.9211734230221325  MAE: 280.72960971306856
Ridge R2: 0.7541337699378984  MAE: 509.6115307018445
Lasso R2: 0.7541350522910986  MAE: 509.6326313177599
