# Zone & Region

In [1]:
import pandas as pd
import numpy as np
import re
import string
import os
import pdfplumber
from typing import List

## Try using project name

In [376]:
geo = pd.read_csv("reference_table.csv")
gls = pd.read_csv("gls_full.csv")
header_gls = list(gls.columns)

### Fill in project names

In [377]:
gls.proj_name_raw[gls.proj_name_raw.isna()]

71     NaN
121    NaN
122    NaN
123    NaN
143    NaN
      ... 
591    NaN
592    NaN
593    NaN
594    NaN
595    NaN
Name: proj_name_raw, Length: 346, dtype: object

In [378]:
# once-for-all
def parse_text(text_containing_land_parcel: str)-> str:
    pattern1 = r'(.*?)\('
    pattern2 = r'\((.*?)\)'

    try:
        found_proj = '-'.join(re.findall(pattern1, text_containing_land_parcel)[0].strip().lower().split(' '))
        found_land = re.findall(pattern2, text_containing_land_parcel)[0].strip()
    except:
        found_proj = np.nan
        found_land = np.nan

    return found_proj, found_land


file_list = os.listdir(r'G:\REA\Working files\land-bidding\ura_scrape\sg land bidding\tender details')
parsed_names = list(map(parse_text, file_list))
more_proj = pd.DataFrame({'proj_name_raw_1': [item[0] for item in parsed_names],
                          'land_parcel': [item[1] for item in parsed_names]})
# more_proj.to_csv("more_proj.csv")

### Merge by project name

In [379]:
gls = pd.merge(gls, geo, how='left', left_on='proj_name_res', right_on='project_name')
gls_merged1 = gls[["land_parcel", "street", "zone", "region", "proj_name_res", "project_name"]]
gls.rename(columns={'zone': 'zone_by_proj', 'region': 'region_by_proj'}, inplace=True)
gls_merged1.isna().sum()

land_parcel        0
street             0
zone             406
region           406
proj_name_res    349
project_name     406
dtype: int64

## Try using land parcel name

In [380]:
zone_region = geo[["zone", "region"]].drop_duplicates()

land_zone = gls.land_parcel.apply(lambda x: ' '.join(x.lower().split(' ')[:-1]))
gls.insert(loc=list(gls.columns).index("land_parcel")+1, column="zone_formatted", value=land_zone.values)
land_zone_distinct = pd.DataFrame(land_zone.unique(), columns=["zone"])

In [15]:
# funcs to identify zone names
def match_score(dictionary: str, lookup_value: str):
    if lookup_value:
        a = list("".join(dictionary.split(" ")))
        b = list("".join(lookup_value.split(" ")))
        score_a = 0
        score_b = 0
        penalty_a = 0
        penalty_b = 0
        total_len_a = len(a)
        total_len_b = len(b)
        if total_len_a * total_len_b > 0:
            for char in a:
                if char in b:
                    score_a += 1
                else:
                    penalty_a += 1
            for char in b:
                if char in a:
                    score_b += 1
                else:
                    penalty_b += 1
            return (score_a / total_len_a) * (score_b / total_len_b) - (penalty_a / total_len_a) * (penalty_b / total_len_b)
        else:
            return np.nan

    else:
        return np.nan


def order_score(dictionary: str, lookup_value: str):
    char_list_dict = list(dictionary)
    charset_dict = list(set(char_list_dict))
    order_dict_a = dict()
    for char in charset_dict:
        order_dict_a[char] = char_list_dict.index(char)

    char_list_value = list(lookup_value)
    charset_value = list(set(char_list_value))
    order_dict_b = dict()
    for char in charset_value:
        order_dict_b[char] = char_list_value.index(char)

    # get the index of first char of dict in value dict
    first_char = dictionary[0]

    try:
        first_char_idx = order_dict_b[first_char]
        # print(first_char_idx)
        # recalculate index for value dict
        for char in order_dict_b.keys():
            order_dict_b[char] -= first_char_idx

    except KeyError:
        pass

    char_order_dict = [order_dict_a.get(key) for key in char_list_dict]
    char_order_dict_in_value = [order_dict_b.get(key) for key in char_list_dict]

    # calculate variation score
    sum_ = 0
    len_ = 0
    for k in range(len(char_order_dict)):
        if char_order_dict_in_value[k] is None:
            sum_ += 1
        else:
            sum_ += (char_order_dict[k] - char_order_dict_in_value[k]) ** 2
        len_ += 1

    try:
        variation = (sum_ / len_) ** 0.5
    except ZeroDivisionError:
        variation = np.nan

    return variation


def name_match(name: str, dictionary, limit=None):
    # zone = region_na.zone.reset_index(drop=True)[7]
    # dictionary = pd.DataFrame(dictionary)
    match_score_list = dictionary.apply(match_score, lookup_value=name)
    order_score_list = dictionary.apply(order_score, lookup_value=name)
    max_order_var = order_score_list.max()
    min_order_var = order_score_list.min()
    order_var_range = max_order_var - min_order_var
    if order_var_range:
        order_score_list = order_score_list.apply(lambda x: (max_order_var - x)/order_var_range)
    else:
        order_score_list = order_score_list.apply(lambda x: np.nan)
    matched_df = pd.DataFrame({"std_name": dictionary.values, f"{name}_match": match_score_list, f"{name}_order": order_score_list})
    matched_df["compound_score"] = matched_df[f"{name}_match"] * matched_df[f"{name}_order"]
    sorted_matched_df = matched_df.sort_values(by="compound_score", ascending=False)
    sorted_matched_df.reset_index(drop=True, inplace=True)
    best_matched = sorted_matched_df.std_name[0]
    best_matched_score = sorted_matched_df.compound_score[0]
    if limit:
        return (best_matched, best_matched_score) if best_matched_score >= limit else (np.nan, best_matched_score)
    return best_matched, best_matched_score
    # return sorted_matched_df


def clean_street_name(street: str)-> str:
    suffix = ['street', 'st', 'road', 'rd', 'avenue', 'ave', 'crescent', 'drive', 'dr', 'boulevard', 'blvd', 'rise', 'way', 'lane', 'alley', 'link', 'walk', 'vista', 'track', 'vale']
    prefix = ['upper', 'lower']
    news = ['north', 'east', 'west', 'north']
    street = street.lower()
    street = re.sub(r'/', ' ', street)
    street = ''.join([char for char in list(street) if char not in string.punctuation])

    street = re.sub(r'\d+', '', street)
    street = re.sub(r' +', ' ', street).strip()
    st_list = street.split(' ')

    if ' '.join(st_list) != 'bukit drive':
        # check prefix
        if st_list[0] in prefix:
            st_list = st_list[1:]

        # check suffix & direction: 'direction+suffix' format
        if st_list[-1] in suffix:
            st_list = st_list[:-1]
            if st_list[-1] in news and len(st_list) > 1 and st_list[-2] != 'jurong':
                    st_list = st_list[:-1]
        elif st_list[-1] in news and len(st_list) > 1 and st_list[-2] != 'jurong':
            st_list = st_list[:-1]
            if st_list[-1] in suffix:
                st_list = st_list[:-1]
        return ' '.join(st_list)
    else:
        return 'bukit drive'


def st_name_match(name: str, ref_df, key):
    key_list = list(ref_df[key])
    matched = np.nan
    for i in range(len(key_list)):
        check = key_list[i]
        if key_list[i] == name:
            matched = key_list[i]
            break
    if pd.isna(matched):
        for i in range(len(key_list)):
            if key_list[i] in name:
                matched = key_list[i]
                break
    return matched


In [382]:
gls.insert(loc=list(gls.columns).index("zone_formatted")+1, column="zone_guess", value=gls.zone_formatted.apply(name_match, dictionary=zone_region.zone, limit=0.6).apply(lambda x: x[0]))
gls

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,zone_formatted,zone_guess,street,site_area_sqm,devt_type,...,month_close,day_close,award_month_index,year_award,month_award,day_award,project_name,region_by_proj,zone_by_proj,address_street
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,queenstown,queenstown,Dundee Road,10516.1,*CO,...,6,23,201506,2015,6,30,queens-peak,central region,queenstown,dundee road
1,f2e43515a8e783bc2314727cb58587c8ee761ab7a4a016...,29/4/2015,18/6/2015,23/6/2015,Toa Payoh S4,toa payoh,toa payoh,Lorong 6 Toa Payoh/Lorong 4 Toa Payoh,12154.6,*CO,...,6,18,201506,2015,6,23,gem-residences,central region,toa payoh,lorong 5 toa payoh
2,384815dd4cafcbf88b2f11099d8bced7a584736ec36742...,30/12/2013,29/4/2014,30/4/2014,Geylang S6,geylang,geylang,Sims Drive,23900.1,*CO,...,4,29,201404,2014,4,30,sims-urban-oasis,central region,geylang,sims drive
3,05c11060bf1cbcb2db7aa3ed898c19a09fd298c3b1c3a4...,15/4/2013,13/6/2013,14/6/2013,Sengkang S12,sengkang,sengkang,Fernvale Close,14930.5,*CO,...,6,13,201306,2013,6,14,rivertrees-residences,north-east region,sengkang,fernvale close
4,d9fcb7d323ca5b77f6a22635200afafdd99e67f6feb109...,28/2/2013,11/4/2013,12/4/2013,Sengkang S11,sengkang,sengkang,Sengkang West Way,16603.9,*CO,...,4,11,201304,2013,4,12,riverbank-at-fernvale,north-east region,sengkang,fernvale close
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,2b49a7c56443c8359c7d541c2f8c44889120a0b497d9c5...,19/10/1992,9/2/1993,21/5/1993,Kaki Bukit,kaki,,Kaki Bukit,60000.0,Industrial,...,2,9,199305,1993,5,21,,,,
592,40e8902a48c5f3b717a76c37e270d2360da9e1172da322...,19/10/1992,9/2/1993,21/5/1993,Tuas South Avenue 3/Tuas South Avenue 4,tuas south avenue 3/tuas south avenue,,Tuas South Avenue 3/Tuas South Avenue 4,102264.0,Industrial,...,2,9,199305,1993,5,21,,,,
593,43e84a0ed4744cddede5fb08c7162e61eb86cdf7b7823d...,19/10/1992,9/2/1993,21/5/1993,Tuas South Avenue 4/Tuas South Avenue 5,tuas south avenue 4/tuas south avenue,,Tuas South Avenue 4/Tuas South Avenue 5,102264.0,Industrial,...,2,9,199305,1993,5,21,,,,
594,473d6c80a232fc34e07f2a9ef544b66270d175537468eb...,31/8/1992,24/11/1992,15/2/1993,Merchant Road/Angus Street,merchant road/angus,,Merchant Road/Angus Street,2609.0,Hotel,...,11,24,199302,1993,2,15,,,,


In [383]:
land_zone = gls[["land_parcel", "zone_guess"]]
gls = pd.merge(gls, zone_region, how="left", left_on="zone_guess", right_on="zone")
# region_na = merged_region[merged_region.region.isna()]
# region_na
gls.rename(columns={'zone': 'zone_by_land_parcel', 'region': 'region_by_land_parcel'}, inplace=True)
gls

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,zone_formatted,zone_guess,street,site_area_sqm,devt_type,...,award_month_index,year_award,month_award,day_award,project_name,region_by_proj,zone_by_proj,address_street,zone_by_land_parcel,region_by_land_parcel
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,queenstown,queenstown,Dundee Road,10516.1,*CO,...,201506,2015,6,30,queens-peak,central region,queenstown,dundee road,queenstown,central region
1,f2e43515a8e783bc2314727cb58587c8ee761ab7a4a016...,29/4/2015,18/6/2015,23/6/2015,Toa Payoh S4,toa payoh,toa payoh,Lorong 6 Toa Payoh/Lorong 4 Toa Payoh,12154.6,*CO,...,201506,2015,6,23,gem-residences,central region,toa payoh,lorong 5 toa payoh,toa payoh,central region
2,384815dd4cafcbf88b2f11099d8bced7a584736ec36742...,30/12/2013,29/4/2014,30/4/2014,Geylang S6,geylang,geylang,Sims Drive,23900.1,*CO,...,201404,2014,4,30,sims-urban-oasis,central region,geylang,sims drive,geylang,central region
3,05c11060bf1cbcb2db7aa3ed898c19a09fd298c3b1c3a4...,15/4/2013,13/6/2013,14/6/2013,Sengkang S12,sengkang,sengkang,Fernvale Close,14930.5,*CO,...,201306,2013,6,14,rivertrees-residences,north-east region,sengkang,fernvale close,sengkang,north-east region
4,d9fcb7d323ca5b77f6a22635200afafdd99e67f6feb109...,28/2/2013,11/4/2013,12/4/2013,Sengkang S11,sengkang,sengkang,Sengkang West Way,16603.9,*CO,...,201304,2013,4,12,riverbank-at-fernvale,north-east region,sengkang,fernvale close,sengkang,north-east region
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,2b49a7c56443c8359c7d541c2f8c44889120a0b497d9c5...,19/10/1992,9/2/1993,21/5/1993,Kaki Bukit,kaki,,Kaki Bukit,60000.0,Industrial,...,199305,1993,5,21,,,,,,
592,40e8902a48c5f3b717a76c37e270d2360da9e1172da322...,19/10/1992,9/2/1993,21/5/1993,Tuas South Avenue 3/Tuas South Avenue 4,tuas south avenue 3/tuas south avenue,,Tuas South Avenue 3/Tuas South Avenue 4,102264.0,Industrial,...,199305,1993,5,21,,,,,,
593,43e84a0ed4744cddede5fb08c7162e61eb86cdf7b7823d...,19/10/1992,9/2/1993,21/5/1993,Tuas South Avenue 4/Tuas South Avenue 5,tuas south avenue 4/tuas south avenue,,Tuas South Avenue 4/Tuas South Avenue 5,102264.0,Industrial,...,199305,1993,5,21,,,,,,
594,473d6c80a232fc34e07f2a9ef544b66270d175537468eb...,31/8/1992,24/11/1992,15/2/1993,Merchant Road/Angus Street,merchant road/angus,,Merchant Road/Angus Street,2609.0,Hotel,...,199302,1993,2,15,,,,,,


## Try using street name

In [384]:
gls[(gls.zone_by_land_parcel.isna()) & (gls.zone_by_land_parcel.isna())]

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,zone_formatted,zone_guess,street,site_area_sqm,devt_type,...,award_month_index,year_award,month_award,day_award,project_name,region_by_proj,zone_by_proj,address_street,zone_by_land_parcel,region_by_land_parcel
90,28a4504a5ccbb82851214a73f5654bb8b27dddde3b01ff...,26/8/1994,27/10/1994,9/12/1994,Bt Batok S2,bt batok,,Bt Batok East Ave 3,50000.0,LP,...,199412,1994,12,9,,,,,,
95,46f27a2ed3f4b06d2e7c555c2f9866dba9929bc73a2f22...,25/2/1994,28/4/1994,19/8/1994,Bt Panjang S1,bt panjang,,Petir Rd,25000.0,AP,...,199408,1994,8,19,maysprings,west region,bukit panjang,petir road,,
96,778c2cc437d91d18db23819250da67a91744843be70d6c...,13/8/1993,14/10/1993,21/1/1994,Bt Merah S4,bt merah,,Tanglin Rd/Alexandra Rd,10542.1,CO,...,199401,1994,1,21,tanglin-regency,central region,bukit merah,tanglin road,,
97,c1ee0b83a5ca26200826d9f25a6824abff2214cb0cf4b5...,13/8/1993,14/10/1993,21/1/1994,Bt Panjang S2,bt panjang,,Petir Rd,58838.3,BH/LP,...,199401,1994,1,21,,,,,,
98,631f75552be42de32da7caa2235c23a82a02e29a5cf175...,13/8/1993,14/10/1993,21/1/1994,Bt Batok S3,bt batok,,Bt Batok East Ave 3,25000.0,AP,...,199401,1994,1,21,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,2b49a7c56443c8359c7d541c2f8c44889120a0b497d9c5...,19/10/1992,9/2/1993,21/5/1993,Kaki Bukit,kaki,,Kaki Bukit,60000.0,Industrial,...,199305,1993,5,21,,,,,,
592,40e8902a48c5f3b717a76c37e270d2360da9e1172da322...,19/10/1992,9/2/1993,21/5/1993,Tuas South Avenue 3/Tuas South Avenue 4,tuas south avenue 3/tuas south avenue,,Tuas South Avenue 3/Tuas South Avenue 4,102264.0,Industrial,...,199305,1993,5,21,,,,,,
593,43e84a0ed4744cddede5fb08c7162e61eb86cdf7b7823d...,19/10/1992,9/2/1993,21/5/1993,Tuas South Avenue 4/Tuas South Avenue 5,tuas south avenue 4/tuas south avenue,,Tuas South Avenue 4/Tuas South Avenue 5,102264.0,Industrial,...,199305,1993,5,21,,,,,,
594,473d6c80a232fc34e07f2a9ef544b66270d175537468eb...,31/8/1992,24/11/1992,15/2/1993,Merchant Road/Angus Street,merchant road/angus,,Merchant Road/Angus Street,2609.0,Hotel,...,199302,1993,2,15,,,,,,


In [385]:
# clean street names, remove numbers, punctuations; trans to lower case, remove suffix

# pattern = r'\d+'
# text = "clementi ave .22"
# suffix = ['street', 'st', 'road', 'rd', 'avenue', 'ave', 'crescent', 'drive', 'dr', 'boulevard', 'blvd', 'rise', 'way', 'lane', 'alley', 'link', 'walk', 'vista', 'track', 'vale']
# prefix = ['upper', 'lower']
# news = ['north', 'east', 'west', 'north']
# rm_num = re.sub(pattern, '', text)
# rm_punc = ''.join([char for char in list(rm_num) if char not in string.punctuation])
# rm_punc

geo["cleaned_st"] = geo.address_street.apply(clean_street_name)
gls.insert(loc=list(gls.columns).index("street")+1, column="cleaned_st", value=gls.street.apply(clean_street_name))
gls[["street", "cleaned_st"]]

Unnamed: 0,street,cleaned_st
0,Dundee Road,dundee
1,Lorong 6 Toa Payoh/Lorong 4 Toa Payoh,lorong toa payoh lorong toa payoh
2,Sims Drive,sims
3,Fernvale Close,fernvale close
4,Sengkang West Way,sengkang
...,...,...
591,Kaki Bukit,kaki bukit
592,Tuas South Avenue 3/Tuas South Avenue 4,tuas south avenue tuas south
593,Tuas South Avenue 4/Tuas South Avenue 5,tuas south avenue tuas south
594,Merchant Road/Angus Street,merchant road angus


In [386]:
street_ref = geo[["cleaned_st", "zone", "region"]]
street_ref = street_ref.drop_duplicates(subset='cleaned_st')
# gls = pd.merge(gls, street_ref, how='left', left_on='cleaned_st', right_on='cleaned_st')
st_name_list = list(gls.cleaned_st)
st_name_ref = list(street_ref.cleaned_st)
# zone_by_st = [street_ref.zone[street_ref.cleaned_st.str.contains(name)] for name in st_name_list
gls["st_guess"] = gls.cleaned_st.apply(st_name_match, ref_df=street_ref, key="cleaned_st")
gls = pd.merge(gls, street_ref, how='left', left_on='st_guess', right_on='cleaned_st').drop(["cleaned_st_x", "cleaned_st_y"], axis=1)

In [387]:
gls.rename(columns={"zone": "zone_by_st", "region": "region_by_st"}, inplace=True)
gls[["land_parcel", "street", "st_guess", "zone_by_st", "region_by_st"]]

Unnamed: 0,land_parcel,street,st_guess,zone_by_st,region_by_st
0,Queenstown S9b,Dundee Road,dundee,queenstown,central region
1,Toa Payoh S4,Lorong 6 Toa Payoh/Lorong 4 Toa Payoh,lorong toa payoh,toa payoh,central region
2,Geylang S6,Sims Drive,sims,geylang,central region
3,Sengkang S12,Fernvale Close,fernvale close,sengkang,north-east region
4,Sengkang S11,Sengkang West Way,sengkang,sengkang,north-east region
...,...,...,...,...,...
591,Kaki Bukit,Kaki Bukit,kaki bukit,bedok,east region
592,Tuas South Avenue 3/Tuas South Avenue 4,Tuas South Avenue 3/Tuas South Avenue 4,tuas south,tuas,west region
593,Tuas South Avenue 4/Tuas South Avenue 5,Tuas South Avenue 4/Tuas South Avenue 5,tuas south,tuas,west region
594,Merchant Road/Angus Street,Merchant Road/Angus Street,merchant,singapore river,central region


In [388]:
gls[gls.sg_gls_id=='2b49a7c56443c8359c7d541c2f8c44889120a0b497d9c556b20932d6962e3586']

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,zone_formatted,zone_guess,street,site_area_sqm,devt_type,...,day_award,project_name,region_by_proj,zone_by_proj,address_street,zone_by_land_parcel,region_by_land_parcel,st_guess,zone_by_st,region_by_st
591,2b49a7c56443c8359c7d541c2f8c44889120a0b497d9c5...,19/10/1992,9/2/1993,21/5/1993,Kaki Bukit,kaki,,Kaki Bukit,60000.0,Industrial,...,21,,,,,,,kaki bukit,bedok,east region


## Combine results to one column

In [389]:
zone_region_cols = gls[["zone_by_proj", "zone_by_land_parcel", "zone_by_st", "region_by_proj", "region_by_land_parcel", "region_by_st"]]
zone_region_cols

Unnamed: 0,zone_by_proj,zone_by_land_parcel,zone_by_st,region_by_proj,region_by_land_parcel,region_by_st
0,queenstown,queenstown,queenstown,central region,central region,central region
1,toa payoh,toa payoh,toa payoh,central region,central region,central region
2,geylang,geylang,geylang,central region,central region,central region
3,sengkang,sengkang,sengkang,north-east region,north-east region,north-east region
4,sengkang,sengkang,sengkang,north-east region,north-east region,north-east region
...,...,...,...,...,...,...
591,,,bedok,,,east region
592,,,tuas,,,west region
593,,,tuas,,,west region
594,,,singapore river,,,central region


In [390]:
zone = []
region = []
join_by = []
check = []
discrepancy = []
for i in range(zone_region_cols.shape[0]):
    if pd.notna(zone_region_cols.zone_by_proj[i]):
        zone.append(zone_region_cols.zone_by_proj[i])
        join_by.append('project name')
    elif pd.notna(zone_region_cols.zone_by_st[i]):
        zone.append(zone_region_cols.zone_by_st[i])
        join_by.append('street name')
    elif pd.notna(zone_region_cols.zone_by_land_parcel[i]):
        zone.append(zone_region_cols.zone_by_land_parcel[i])
        join_by.append('land parcel/zone')
    else:
        zone.append(np.nan)
        join_by.append(np.nan)
    check = [item for item in [zone_region_cols.zone_by_proj[i], zone_region_cols.zone_by_st[i], zone_region_cols.zone_by_land_parcel[i]] if pd.notna(item)]
    if len(check)<=1:
        discrepancy.append('poor matched')
    elif len(set(check))>1:
        discrepancy.append('multiple matched')
    else:
        discrepancy.append(0)

    if pd.notna(zone_region_cols.region_by_proj[i]):
        region.append(zone_region_cols.region_by_proj[i])
    elif pd.notna(zone_region_cols.region_by_st[i]):
        region.append(zone_region_cols.region_by_st[i])
    elif pd.notna(zone_region_cols.region_by_land_parcel[i]):
        region.append(zone_region_cols.region_by_land_parcel[i])
    else:
        region.append(np.nan)

In [391]:
gls.insert(loc=list(header_gls).index("street")+1, column='error_check', value=discrepancy)
gls.insert(loc=list(header_gls).index("street")+1, column='join_by', value=join_by)
gls.insert(loc=list(header_gls).index("street")+1, column='region', value=region)
gls.insert(loc=list(header_gls).index("street")+1, column='zone', value=zone)

In [392]:
gls_check = gls[["sg_gls_id", "proj_name_res", "land_parcel",
                 "street", "zone_by_proj", "zone_by_land_parcel",
                 "zone_by_st", "region_by_proj", "region_by_land_parcel",
                 "region_by_st", "error_check", "join_by", "region", "zone"]]
gls_check[gls_check.sg_gls_id=='2b49a7c56443c8359c7d541c2f8c44889120a0b497d9c556b20932d6962e3586']

Unnamed: 0,sg_gls_id,proj_name_res,land_parcel,street,zone_by_proj,zone_by_land_parcel,zone_by_st,region_by_proj,region_by_land_parcel,region_by_st,error_check,join_by,region,zone
591,2b49a7c56443c8359c7d541c2f8c44889120a0b497d9c5...,,Kaki Bukit,Kaki Bukit,,,bedok,,,east region,poor matched,street name,east region,bedok


## Export matched table

In [393]:
new_header = header_gls[:6]+["zone","region","join_by","error_check"]+header_gls[6:]
gls_output = gls[new_header]
# gls_output.to_csv("gls_full_loc_filled.csv", header=True, index=False)

# Fill in tenderer details HDB

In [2]:
gls = pd.read_csv("gls_full_loc_filled.csv")
gls.rename(columns={"tenderer_name": "successful_tenderer_name", "tender_price": "successful_tender_price", "price_psm_gfa": "successful_price_psm_gfa"}, inplace=True)
gls.drop('tenderer_rank', axis=1, inplace=True)
gls

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,street,zone,region,join_by,error_check,...,month_launch,day_launch,close_month_index,year_close,month_close,day_close,award_month_index,year_award,month_award,day_award
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,4,29,201506,2015,6,23,201506,2015,6,30
1,f2e43515a8e783bc2314727cb58587c8ee761ab7a4a016...,29/4/2015,18/6/2015,23/6/2015,Toa Payoh S4,Lorong 6 Toa Payoh/Lorong 4 Toa Payoh,toa payoh,central region,project name,0,...,4,29,201506,2015,6,18,201506,2015,6,23
2,384815dd4cafcbf88b2f11099d8bced7a584736ec36742...,30/12/2013,29/4/2014,30/4/2014,Geylang S6,Sims Drive,geylang,central region,project name,0,...,12,30,201404,2014,4,29,201404,2014,4,30
3,05c11060bf1cbcb2db7aa3ed898c19a09fd298c3b1c3a4...,15/4/2013,13/6/2013,14/6/2013,Sengkang S12,Fernvale Close,sengkang,north-east region,project name,0,...,4,15,201306,2013,6,13,201306,2013,6,14
4,d9fcb7d323ca5b77f6a22635200afafdd99e67f6feb109...,28/2/2013,11/4/2013,12/4/2013,Sengkang S11,Sengkang West Way,sengkang,north-east region,project name,0,...,2,28,201304,2013,4,11,201304,2013,4,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,2b49a7c56443c8359c7d541c2f8c44889120a0b497d9c5...,19/10/1992,9/2/1993,21/5/1993,Kaki Bukit,Kaki Bukit,bedok,east region,street name,poor matched,...,10,19,199302,1993,2,9,199305,1993,5,21
592,40e8902a48c5f3b717a76c37e270d2360da9e1172da322...,19/10/1992,9/2/1993,21/5/1993,Tuas South Avenue 3/Tuas South Avenue 4,Tuas South Avenue 3/Tuas South Avenue 4,tuas,west region,street name,poor matched,...,10,19,199302,1993,2,9,199305,1993,5,21
593,43e84a0ed4744cddede5fb08c7162e61eb86cdf7b7823d...,19/10/1992,9/2/1993,21/5/1993,Tuas South Avenue 4/Tuas South Avenue 5,Tuas South Avenue 4/Tuas South Avenue 5,tuas,west region,street name,poor matched,...,10,19,199302,1993,2,9,199305,1993,5,21
594,473d6c80a232fc34e07f2a9ef544b66270d175537468eb...,31/8/1992,24/11/1992,15/2/1993,Merchant Road/Angus Street,Merchant Road/Angus Street,singapore river,central region,street name,poor matched,...,8,31,199211,1992,11,24,199302,1993,2,15


In [64]:
tender_details = pd.read_csv("hdb_tenderer_details.csv")
list(tender_details.columns)

['land_parcel_std',
 'tenderer_rank',
 'tenderer_name',
 'tender_price',
 'tender_price_text']

### Align land parcel names

In [13]:
unique_land_details = pd.Series(tender_details.land_parcel.unique()).reset_index(drop=True)

In [14]:
unique_land_gls = pd.Series(gls[gls.source=='hdb'].land_parcel.unique()).reset_index(drop=True)

In [25]:
name_match('Ang Mok Kio S6', unique_land_details)
align_names = pd.DataFrame({'original': unique_land_gls, 'aligned': unique_land_gls.apply(name_match, dictionary=unique_land_details)})

In [28]:
align_names['ori_name'] = align_names.original.apply(lambda x: ' '.join(x.split(' ')[:-1]))
align_names['parcel_code'] = align_names.original.apply(lambda x: x.split(' ')[-1])
align_names['aligned'] = align_names.ori_name.apply(name_match, dictionary = unique_land_details.apply(lambda x: ' '.join(x.split(' ')[:-1])))
align_names = align_names[['ori_name', 'parcel_code', 'aligned', 'original']]

In [37]:
bt_idx = align_names[align_names.ori_name.str.contains('Bt')].index
align_names.iloc[bt_idx, 0] = align_names.iloc[bt_idx, 0].apply(lambda x: x.replace('Bt', 'Bukit'))
align_names['aligned'] = align_names.ori_name.apply(name_match, dictionary=unique_land_details.apply(lambda x: ' '.join(x.split(' ')[:-1])))
align_names['name_repl'] = align_names.aligned.apply(lambda x: x[0]) + ' ' + align_names.parcel_code

In [41]:
name_replace = align_names[['original', 'name_repl']]
name_replace.rename(columns={'original': 'land_parcel'}, inplace=True)
name_replace

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,land_parcel,name_repl
0,Queenstown S9b,Queenstown S9b
1,Toa Payoh S4,Toa Payoh S4
2,Geylang S6,Geylang S6
3,Sengkang S12,Sengkang S12
4,Sengkang S11,Sengkang S11
...,...,...
200,Sengkang N1NC,Sengkang N1NC
201,Sembawang P1,Sembawang P1
202,Woodlands N4NC,Woodlands N4NC
203,Hougang N9NC,Hougang N9NC


In [None]:
# replace names
gls.insert(loc=5, column='land_parcel_std', value=pd.merge(gls, name_replace, on='land_parcel').name_repl)


In [52]:
# gls.to_csv(r'G:\REA\Working files\land-bidding\land_sales_full_data\ready for uploading\gls_no_detail_v2.csv', index=False)

### Merge tenderer details

In [65]:
gls_details = pd.merge(gls[gls.source=='hdb'], tender_details, how='left', on='land_parcel_std')
gls_details[gls_details.tenderer_rank.isna()]

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,land_parcel_std,street,zone,region,join_by,...,month_close,day_close,award_month_index,year_award,month_award,day_award,tenderer_rank,tenderer_name,tender_price,tender_price_text


In [67]:
gls_details['price_psm_gfa'] = gls_details.tender_price / gls_details.gfa_sqm
gls_details['price_psm_gfa'] = gls_details.price_psm_gfa.apply(lambda x: '%.2f' %x)
gls_details

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,land_parcel_std,street,zone,region,join_by,...,day_close,award_month_index,year_award,month_award,day_award,tenderer_rank,tenderer_name,tender_price,tender_price_text,price_psm_gfa
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Queenstown S9b,Dundee Road,queenstown,central region,project name,...,23,201506,2015,6,30,1,HY Realty Pte Ltd,483178000.0,483178000.00,9376.84
1,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Queenstown S9b,Dundee Road,queenstown,central region,project name,...,23,201506,2015,6,30,2,Allgreen Properties Limited,445910000.0,445910000.00,8653.59
2,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Queenstown S9b,Dundee Road,queenstown,central region,project name,...,23,201506,2015,6,30,3,"Intrepid Investments Pte Ltd, Verwood \nHoldin...",432774525.0,432774525.00,8398.68
3,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Queenstown S9b,Dundee Road,queenstown,central region,project name,...,23,201506,2015,6,30,4,FCL Tampines Court Pte. Ltd. and KH Capital \n...,421500000.0,421500000.00,8179.88
4,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Queenstown S9b,Dundee Road,queenstown,central region,project name,...,23,201506,2015,6,30,5,"UOL Venture Investments Pte. Ltd., Singland \n...",417280000.0,417280000.00,8097.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1442,99b11f1948b4fc3e79bba927f654f433a5a55d2dff7c1e...,28/7/1995,28/9/1995,30/11/1995,Hougang N9NC,Hougang N9NC,Hougang Avenue 9,hougang,north-east region,street name,...,28,199511,1995,11,30,7,Silverdale Investment Pte Ltd,21000000.0,21000000.00,1129.03
1443,7f55fed1fc42dd52d24742b2c93ed9b9d36bc65deeedbb...,25/2/1994,28/4/1994,19/8/1994,Hougang N5NC,Hougang N5NC,Hougang St 51/Buangkok Green,hougang,north-east region,project name,...,28,199408,1994,8,19,1,Hiap Hoe Holdings Pte ltd,38800000.0,38800000.00,1949.75
1444,7f55fed1fc42dd52d24742b2c93ed9b9d36bc65deeedbb...,25/2/1994,28/4/1994,19/8/1994,Hougang N5NC,Hougang N5NC,Hougang St 51/Buangkok Green,hougang,north-east region,project name,...,28,199408,1994,8,19,2,Far East Organization Centre Pte Ltd,34390000.0,34390000.00,1728.14
1445,7f55fed1fc42dd52d24742b2c93ed9b9d36bc65deeedbb...,25/2/1994,28/4/1994,19/8/1994,Hougang N5NC,Hougang N5NC,Hougang St 51/Buangkok Green,hougang,north-east region,project name,...,28,199408,1994,8,19,3,United Industrial Corporation \nLtd/Shenton Ho...,26888000.0,26888000.00,1351.16


In [97]:
gls_details = pd.read_csv(r'G:\REA\Working files\land-bidding\land_sales_full_data\ready for uploading\gls_hdb_details_filled.csv')
groupby = pd.DataFrame(gls_details.groupby('land_parcel_std').count()['land_parcel']).reset_index()
# gls_details.to_csv("gls_hdb_details_filled.csv", index=False)
check = pd.merge(gls_details[['land_parcel_std', 'num_bidders']], groupby, on='land_parcel_std')
check['diff'] = check.num_bidders - check.land_parcel
check


Unnamed: 0,land_parcel_std,num_bidders,land_parcel,diff
0,Queenstown S9b,9,9,0
1,Queenstown S9b,9,9,0
2,Queenstown S9b,9,9,0
3,Queenstown S9b,9,9,0
4,Queenstown S9b,9,9,0
...,...,...,...,...
1442,Hougang N9NC,7,7,0
1443,Hougang N5NC,4,4,0
1444,Hougang N5NC,4,4,0
1445,Hougang N5NC,4,4,0


In [99]:
gls_details_only = gls_details[['sg_gls_id', 'land_parcel_std', 'tenderer_rank', 'tenderer_name', 'tender_price', 'tender_price_text', 'price_psm_gfa']]
gls_details_only.to_csv(r'G:\REA\Working files\land-bidding\land_sales_full_data\ready for uploading\gls_details_only.csv', index=False)