In [5]:
import pandas as pd
import numpy as np
from numpy.linalg import LinAlgError
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import logging

logger = logging.getLogger(__name__)

def hedonic(transactions, y_col, m_or_qtr, index_len):
    """Fit a log-linear hedonic regression and derive a price index from it.

    ``log(transactions[y_col])`` is regressed with ``LinearRegression`` on
    min-max scaled numeric attributes and one-hot encoded categorical
    attributes, one of which is the period column ``m_or_qtr``. The
    coefficients of the period dummies are shifted so that the first period
    (in sorted label order) is zero and are then exponentiated, so the
    resulting index equals 1.0 for that first period.

    The caller's DataFrame is never modified; all work is done on a copy.
    The R2 and adjusted R2 of the fit are printed to stdout.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Input rows. Must contain ``y_col``; any of the attribute columns
        listed in ``cat_val`` / ``num_val`` below that are missing, or that
        have at most one distinct value, are ignored.
    y_col : str
        Name of the target column. Rows where it is NaN are dropped and its
        natural logarithm is used as the regression target.
    m_or_qtr : str
        Name of the period column. It is one-hot encoded and its dummy
        coefficients form the index.
    index_len : int
        Minimum number of period coefficients required for a usable index.

    Returns
    -------
    tuple
        ``(hedonic_index, mse)``. ``hedonic_index`` is a DataFrame with the
        columns ``"hi"`` (index level) and ``m_or_qtr`` (period label as
        text), sorted by period label. ``mse`` is the in-sample mean squared
        error on the log target multiplied by ``HEDONIC_PENALTY``. When the
        design matrix cannot be built, the fit fails, or fewer than
        ``index_len`` period coefficients are available, ``(None, 1)`` is
        returned instead.
    """
    HEDONIC_PENALTY = 1.5  # multiplier to evaluation metrics (MSE), to favour more for repeated-sales method
    FALLBACK_MSE = 1  # score reported whenever no usable index is produced

    cat_val = [
        m_or_qtr,
        "devt_class",
        "source",
        "zone",
        "region"
    ]
    num_val = [
        "site_area_sqm",
        "gpr",
        "lease_term",
        "num_bidders",
        "timediff_launch_to_close",
        "timediff_launch_to_award",
    ]

    # Scaling below overwrites columns, so operate on a private copy instead
    # of silently rewriting the frame the caller passed in.
    transactions = transactions.copy()

    # Keep only regressors that are present and take more than one value.
    col = transactions.columns[transactions.nunique() > 1]
    cat_val = list(np.intersect1d(col, cat_val))
    num_val = list(np.intersect1d(col, num_val))

    if num_val:
        scaler = MinMaxScaler()
        transactions[num_val] = scaler.fit_transform(transactions[num_val])
    transactions = transactions.dropna(how="any", subset=[y_col])
    y = np.log(transactions[y_col])

    try:
        x = pd.get_dummies(
            data=transactions[num_val + cat_val], columns=cat_val, drop_first=False
        )
        # Any regressor that still contains a missing or infinite value is
        # dropped as a whole column.
        x = x.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
    except ValueError:
        logger.warning("ValueError when converting x to dummies")
        return None, FALLBACK_MSE

    try:
        model = LinearRegression()
        fit = model.fit(x, y)
        r2 = model.score(x, y)
        # Adjusted R2 is undefined without residual degrees of freedom;
        # report NaN rather than dividing by zero.
        residual_dof = len(y) - x.shape[1] - 1
        if residual_dof > 0:
            r2_adj = 1 - (1 - r2) * (len(y) - 1) / residual_dof
        else:
            r2_adj = np.nan
        print(f"R2 score {r2} \nAdjusted R2 score {r2_adj}")
    except LinAlgError:
        logger.warning("LinAlgError encountered, skip")
        return None, FALLBACK_MSE
    except ValueError:
        logger.warning("ValueError encountered, skip")
        return None, FALLBACK_MSE

    # get_dummies names each dummy "<column>_<value>". Matching on that exact
    # prefix (not a substring) avoids picking up unrelated columns, and
    # stripping the prefix keeps period values that themselves contain "_".
    period_prefix = f"{m_or_qtr}_"
    hedonic_index = pd.DataFrame(
        [
            [coef, name[len(period_prefix):]]
            for coef, name in zip(fit.coef_, x.columns)
            if name.startswith(period_prefix)
        ],
        columns=["hi", m_or_qtr],
    )

    if len(hedonic_index) < index_len:
        return None, FALLBACK_MSE

    hedonic_index = hedonic_index.sort_values(by=m_or_qtr)
    mse = mean_squared_error(y, fit.predict(x)) * HEDONIC_PENALTY

    # Express every period relative to the first one: exp(coef - first coef).
    base_coef = hedonic_index["hi"].iloc[0]
    hedonic_index["hi"] = np.exp(hedonic_index["hi"] - base_coef)
    logger.info(f"categorical cols: {cat_val}")
    logger.info(f"numerical cols: {num_val}")

    return hedonic_index, mse


def rebase(series, base=0):
    """Rescale a series so that the element at position ``base`` becomes 1.

    Parameters
    ----------
    series : pandas.Series
        Values to rescale.
    base : int, default 0
        Zero-based position of the element used as the divisor. The lookup
        is positional, so it does not depend on the series' index labels.

    Returns
    -------
    pandas.Series
        ``series`` divided element-wise by its ``base``-th value, with the
        original index preserved.
    """
    # .iloc makes the lookup unambiguous: plain series[base] is a label lookup
    # on integer indexes and a positional one on everything else.
    base_value = series.iloc[base]
    return series / base_value


In [14]:
# Load the land-sales table and keep only the rows whose development class is
# "residential" or "rc", renumbering the rows from zero afterwards.
gls = (
    pd.read_csv(r"G:\REA\Working files\land-bidding\land_sales_full_data\ready for uploading\gls_no_detail.csv")
    .loc[lambda frame: frame["devt_class"].isin(["residential", "rc"])]
    .reset_index(drop=True)
)
gls

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,street,zone,region,join_by,error_check,...,month_launch,day_launch,close_month_index,year_close,month_close,day_close,award_month_index,year_award,month_award,day_award
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,4,29,201506,2015,6,23,201506,2015,6,30
1,f2e43515a8e783bc2314727cb58587c8ee761ab7a4a016...,29/4/2015,18/6/2015,23/6/2015,Toa Payoh S4,Lorong 6 Toa Payoh/Lorong 4 Toa Payoh,toa payoh,central region,project name,0,...,4,29,201506,2015,6,18,201506,2015,6,23
2,384815dd4cafcbf88b2f11099d8bced7a584736ec36742...,30/12/2013,29/4/2014,30/4/2014,Geylang S6,Sims Drive,geylang,central region,project name,0,...,12,30,201404,2014,4,29,201404,2014,4,30
3,05c11060bf1cbcb2db7aa3ed898c19a09fd298c3b1c3a4...,15/4/2013,13/6/2013,14/6/2013,Sengkang S12,Fernvale Close,sengkang,north-east region,project name,0,...,4,15,201306,2013,6,13,201306,2013,6,14
4,d9fcb7d323ca5b77f6a22635200afafdd99e67f6feb109...,28/2/2013,11/4/2013,12/4/2013,Sengkang S11,Sengkang West Way,sengkang,north-east region,project name,0,...,2,28,201304,2013,4,11,201304,2013,4,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,d4813559d3164173d084094221d2bce33dfebd51ab4e28...,13/7/1994,14/9/1994,15/11/1994,Lorong 42 Geylang,Lorong 42 Geylang,geylang,central region,street name,poor matched,...,7,13,199409,1994,9,14,199411,1994,11,15
412,33f01859b6459d8187211166006aaebc715b49feadbdea...,27/5/1993,19/8/1993,6/11/1993,Robertson Quay/Nanson Road,Robertson Quay/Nanson Road,downtown core,central region,street name,poor matched,...,5,27,199308,1993,8,19,199311,1993,11,6
413,8eb15f52eae64386c44bb23f974d85aed3b2d97d77deb9...,12/11/1992,7/1/1993,2/3/1993,Bayshore Road,Bayshore Road,bedok,east region,street name,poor matched,...,11,12,199301,1993,1,7,199303,1993,3,2
414,073c0804e1dfae10c4f7167eed3261597b4e9b6d1c51fc...,12/11/1992,7/1/1993,2/3/1993,Robertson Quay,Robertson Quay,singapore river,central region,street name,poor matched,...,11,12,199301,1993,1,7,199303,1993,3,2


In [15]:
# Target columns for the two hedonic fits.
y_col, y_col2 = "price_psm_gfa", "tender_price"

# hedonic() returns (index, mse); only the index table is kept for each target.
hi = hedonic(transactions=gls, y_col=y_col, m_or_qtr="year_launch", index_len=5)[0]
hi2 = hedonic(transactions=gls, y_col=y_col2, m_or_qtr="year_launch", index_len=5)[0]

R2 score 0.9231427318193016 
Adjusted R2 score 0.9017520358207901
R2 score 0.7900527098712921 
Adjusted R2 score 0.7359753775654128


In [18]:
# Mean price per launch year, keyed by the year rendered as text so it can be
# matched against the period labels carried in hi["year_launch"].
mean_psm_by_year = gls.groupby("year_launch")[y_col].mean()
mean_psm_by_year.index = mean_psm_by_year.index.astype(str)

# Both series are rebased so that row 10 equals 1.
hi_rebase = pd.DataFrame({"hi_price_psm_gfa": rebase(hi.hi, base=10), "year_launch": hi.year_launch})

# Attach the means by year label rather than by row position, so a year that
# is missing from either side yields NaN instead of a length error or a
# silently shifted column.
hi_rebase.insert(
    loc=0,
    column="mean_price_psm_gfa",
    value=rebase(hi_rebase["year_launch"].astype(str).map(mean_psm_by_year), 10),
)


In [19]:
# Period labels of the comparison table, as text, used as the join key below.
period_labels = hi_rebase["year_launch"].astype(str)

# Tender-price hedonic index and mean tender price, both keyed by year label.
hi_tender_by_year = hi2.set_index(hi2["year_launch"].astype(str))["hi"]
mean_tender_by_year = gls.groupby("year_launch")[y_col2].mean()
mean_tender_by_year.index = mean_tender_by_year.index.astype(str)

# Align on the year label (not on row position) and rebase so row 10 equals 1.
# Plain assignment instead of insert() keeps the cell safe to re-run.
hi_rebase["hi_tender_price"] = rebase(period_labels.map(hi_tender_by_year), 10)
hi_rebase["mean_tender_price"] = rebase(period_labels.map(mean_tender_by_year), 10)

# Put the tender-price columns first, followed by the existing columns.
tender_cols = ["mean_tender_price", "hi_tender_price"]
hi_rebase = hi_rebase[tender_cols + [c for c in hi_rebase.columns if c not in tender_cols]]
hi_rebase

Unnamed: 0,mean_tender_price,hi_tender_price,mean_price_psm_gfa,hi_price_psm_gfa,year_launch
0,0.305481,0.312368,0.349006,0.307811,1989
1,0.270916,0.236696,0.19314,0.205263,1990
2,0.698279,0.450955,0.471616,0.395002,1991
3,0.438343,0.218948,0.568608,0.510641,1992
4,0.901816,0.71336,0.804007,0.736724,1993
5,1.098434,0.827765,1.200729,1.11783,1994
6,1.241663,1.12289,1.402526,1.284535,1995
7,1.325935,0.825495,1.416102,1.250612,1996
8,1.399009,1.377642,1.347482,1.02181,1997
9,1.930472,1.012344,0.655243,0.58522,1998


In [20]:
# Write the comparison table to disk with a header row and without the row index.
hi_rebase.to_csv("hi_2000_res.csv", index=False)