## Run after extract tables

1. fill gfa/gpr - done
2. create psm price - done
3. create year/month/day cols - done
4. clean devt_type - done - done
5. format all names with slash - done
6. fill street name with land parcel - done
7. format proj_name (create 2 cols) - done
8. create uuid - done

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import re
import hashlib


# func to fill gfa, gpr and site area
def balance(df, gfa_col, gpr_col, site_area_col):
    idx_gfa_na = df[gfa_col][df[gfa_col].isna()].index
    idx_gpr_na = df[gpr_col][df[gpr_col].isna()].index
    idx_site_na = df[site_area_col][df[site_area_col].isna()].index
    if len(idx_gfa_na) > 0:
        df[gfa_col][idx_gfa_na] = df[gpr_col][idx_gfa_na] * df[site_area_col][idx_gfa_na]
    if len(idx_gpr_na) > 0:
        df[gpr_col][idx_gpr_na] = df[gfa_col][idx_gpr_na] / df[site_area_col][idx_gpr_na]
    if len(idx_site_na) > 0:
        df[site_area_col][idx_site_na] = df[gfa_col][idx_site_na] / df[gpr_col][idx_site_na]
    return df


# func to clean devt_type col
def devt_classify(df, devt_type_col):
    devt_type_col_idx = list(gls.columns).index(devt_type_col)
    classes = pd.read_csv("devt_type_classification.csv")
    devt_type = list(classes.devt_type)
    devt_class = list(classes.devt_class2)
    class_dict = dict(zip(devt_type, devt_class))
    df.insert(loc=devt_type_col_idx+1, column="devt_class", value=df[devt_type_col].replace(class_dict))
    return df


# func to format names with slash '/'
def format_name(text, pattern, repl, trim=True):
    try:
        if trim:
            text = text.strip()
        text = re.sub(' +', ' ', text)
        return re.sub(pattern, repl, text)
    except (TypeError, AttributeError):
        return text

In [2]:
text = ' asf dasfa     s      '
re.sub(' +', ' ', text.strip())

'asf dasfa s'

In [3]:
gls = pd.read_excel('gls_hdb_ura.xlsx')
gls

Unnamed: 0,date_launch,date_close,date_award,land_parcel,street,site_area_sqm,devt_type,lease_term,gpr,gfa_sqm,num_bidders,tenderer_rank,tenderer_name,tender_price,proj_name,source
0,2015-04-29,2015-06-23,2015-06-30,Queenstown S9b,Dundee Road,10516.1,*CO,99,4.9,51528.89,9,1,HY Realty Pte Ltd,483178000.0,Queens Peak,hdb
1,2015-04-29,2015-06-18,2015-06-23,Toa Payoh S4,Lorong 6 Toa Payoh / Lorong 4 Toa Payoh,12154.6,*CO,99,3.5,42541.10,14,1,"Evia Real Estate (7) Pte Ltd, Maxdin Pte Ltd a...",345860000.0,Gem Residences,hdb
2,2013-12-30,2014-04-29,2014-04-30,Geylang S6,Sims Drive,23900.1,*CO,99,3.0,71700.30,4,1,First Changi Development Pte Ltd,530891000.0,Sims Urban Oasis,hdb
3,2013-04-15,2013-06-13,2013-06-14,Sengkang S12,Fernvale Close,14930.5,*CO,99,3.0,44791.50,9,1,"FCL Topaz Pte. Ltd., Far East Orchard Limited ...",256980000.0,Rivertrees Residences,hdb
4,2013-02-28,2013-04-11,2013-04-12,Sengkang S11,Sengkang West Way,16603.9,*CO,99,3.0,49811.70,8,1,Secure Development Pte. Ltd.,262100000.0,Riverbank @Fernvale,hdb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,1992-10-19,1993-02-09,1993-05-21,Kaki Bukit,,60000.0,Industrial,60,2.0,120000.00,2,1,Technology Parks Pte Ltd,44880000.0,,ura
592,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 3 / Tuas South Avenue 4,,102264.0,Industrial,60,1.0,102264.00,1,1,Technology Parks Pte Ltd,24338800.0,,ura
593,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 4 / Tuas South Avenue 5,,102264.0,Industrial,60,1.0,102264.00,1,1,Technology Parks Pte Ltd,24032000.0,,ura
594,1992-08-31,1992-11-24,1993-02-15,Merchant Road / Angus Street,,2609.0,Hotel,99,2.4,6167.00,2,1,Food Alley Pte Ltd,6890000.0,,ura


In [4]:
gls = balance(gls, "gfa_sqm", "gpr", "site_area_sqm")
gls.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[gfa_col][idx_gfa_na] = df[gpr_col][idx_gfa_na] * df[site_area_col][idx_gfa_na]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[gpr_col][idx_gpr_na] = df[gfa_col][idx_gpr_na] / df[site_area_col][idx_gpr_na]


date_launch        0
date_close         0
date_award         0
land_parcel        0
street           391
site_area_sqm      0
devt_type          0
lease_term         0
gpr               20
gfa_sqm           20
num_bidders        0
tenderer_rank      0
tenderer_name      0
tender_price       0
proj_name        361
source             0
dtype: int64

In [5]:
# create psm price column
gls.insert(loc=14, column="price_psm_gfa", value=(gls.tender_price / gls.gfa_sqm).round(2))
gls

Unnamed: 0,date_launch,date_close,date_award,land_parcel,street,site_area_sqm,devt_type,lease_term,gpr,gfa_sqm,num_bidders,tenderer_rank,tenderer_name,tender_price,price_psm_gfa,proj_name,source
0,2015-04-29,2015-06-23,2015-06-30,Queenstown S9b,Dundee Road,10516.1,*CO,99,4.9,51528.89,9,1,HY Realty Pte Ltd,483178000.0,9376.84,Queens Peak,hdb
1,2015-04-29,2015-06-18,2015-06-23,Toa Payoh S4,Lorong 6 Toa Payoh / Lorong 4 Toa Payoh,12154.6,*CO,99,3.5,42541.10,14,1,"Evia Real Estate (7) Pte Ltd, Maxdin Pte Ltd a...",345860000.0,8130.02,Gem Residences,hdb
2,2013-12-30,2014-04-29,2014-04-30,Geylang S6,Sims Drive,23900.1,*CO,99,3.0,71700.30,4,1,First Changi Development Pte Ltd,530891000.0,7404.31,Sims Urban Oasis,hdb
3,2013-04-15,2013-06-13,2013-06-14,Sengkang S12,Fernvale Close,14930.5,*CO,99,3.0,44791.50,9,1,"FCL Topaz Pte. Ltd., Far East Orchard Limited ...",256980000.0,5737.25,Rivertrees Residences,hdb
4,2013-02-28,2013-04-11,2013-04-12,Sengkang S11,Sengkang West Way,16603.9,*CO,99,3.0,49811.70,8,1,Secure Development Pte. Ltd.,262100000.0,5261.82,Riverbank @Fernvale,hdb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,1992-10-19,1993-02-09,1993-05-21,Kaki Bukit,,60000.0,Industrial,60,2.0,120000.00,2,1,Technology Parks Pte Ltd,44880000.0,374.00,,ura
592,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 3 / Tuas South Avenue 4,,102264.0,Industrial,60,1.0,102264.00,1,1,Technology Parks Pte Ltd,24338800.0,238.00,,ura
593,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 4 / Tuas South Avenue 5,,102264.0,Industrial,60,1.0,102264.00,1,1,Technology Parks Pte Ltd,24032000.0,235.00,,ura
594,1992-08-31,1992-11-24,1993-02-15,Merchant Road / Angus Street,,2609.0,Hotel,99,2.4,6167.00,2,1,Food Alley Pte Ltd,6890000.0,1117.24,,ura


In [6]:
gls.devt_type.unique()

array(['*CO', 'LP / CO / FT', 'CO', 'LP', 'LP / CO', 'CO/LP', 'LP/CO',
       'AP', 'BH', 'TH', 'BH / LP', 'EC', 'Mixed',
       'Residential (Non-Landed)', 'White Site',
       'Residential with Commercial at 1st Sty',
       'Commercial and Residential', 'Hotel',
       'Residential & Residential with Commercial at 1st Sty',
       'Commercial', 'Industrial', 'Transitional Office',
       'Industrial - White', 'Hospital', 'Recreation', 'Driving Centre',
       'Entertainment', 'Others'], dtype=object)

In [7]:
gls = devt_classify(gls, "devt_type")
gls

Unnamed: 0,date_launch,date_close,date_award,land_parcel,street,site_area_sqm,devt_type,devt_class,lease_term,gpr,gfa_sqm,num_bidders,tenderer_rank,tenderer_name,tender_price,price_psm_gfa,proj_name,source
0,2015-04-29,2015-06-23,2015-06-30,Queenstown S9b,Dundee Road,10516.1,*CO,residential,99,4.9,51528.89,9,1,HY Realty Pte Ltd,483178000.0,9376.84,Queens Peak,hdb
1,2015-04-29,2015-06-18,2015-06-23,Toa Payoh S4,Lorong 6 Toa Payoh / Lorong 4 Toa Payoh,12154.6,*CO,residential,99,3.5,42541.10,14,1,"Evia Real Estate (7) Pte Ltd, Maxdin Pte Ltd a...",345860000.0,8130.02,Gem Residences,hdb
2,2013-12-30,2014-04-29,2014-04-30,Geylang S6,Sims Drive,23900.1,*CO,residential,99,3.0,71700.30,4,1,First Changi Development Pte Ltd,530891000.0,7404.31,Sims Urban Oasis,hdb
3,2013-04-15,2013-06-13,2013-06-14,Sengkang S12,Fernvale Close,14930.5,*CO,residential,99,3.0,44791.50,9,1,"FCL Topaz Pte. Ltd., Far East Orchard Limited ...",256980000.0,5737.25,Rivertrees Residences,hdb
4,2013-02-28,2013-04-11,2013-04-12,Sengkang S11,Sengkang West Way,16603.9,*CO,residential,99,3.0,49811.70,8,1,Secure Development Pte. Ltd.,262100000.0,5261.82,Riverbank @Fernvale,hdb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,1992-10-19,1993-02-09,1993-05-21,Kaki Bukit,,60000.0,Industrial,others,60,2.0,120000.00,2,1,Technology Parks Pte Ltd,44880000.0,374.00,,ura
592,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 3 / Tuas South Avenue 4,,102264.0,Industrial,others,60,1.0,102264.00,1,1,Technology Parks Pte Ltd,24338800.0,238.00,,ura
593,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 4 / Tuas South Avenue 5,,102264.0,Industrial,others,60,1.0,102264.00,1,1,Technology Parks Pte Ltd,24032000.0,235.00,,ura
594,1992-08-31,1992-11-24,1993-02-15,Merchant Road / Angus Street,,2609.0,Hotel,others,99,2.4,6167.00,2,1,Food Alley Pte Ltd,6890000.0,1117.24,,ura


In [8]:
gls["timediff_launch_to_close"] = (gls.date_close - gls.date_launch).apply(lambda x: x.days)
gls["timediff_close_to_award"] = (gls.date_award - gls.date_close).apply(lambda x: x.days)
gls["timediff_launch_to_award"] = (gls.date_award - gls.date_launch).apply(lambda x: x.days)
gls

Unnamed: 0,date_launch,date_close,date_award,land_parcel,street,site_area_sqm,devt_type,devt_class,lease_term,gpr,...,num_bidders,tenderer_rank,tenderer_name,tender_price,price_psm_gfa,proj_name,source,timediff_launch_to_close,timediff_close_to_award,timediff_launch_to_award
0,2015-04-29,2015-06-23,2015-06-30,Queenstown S9b,Dundee Road,10516.1,*CO,residential,99,4.9,...,9,1,HY Realty Pte Ltd,483178000.0,9376.84,Queens Peak,hdb,55,7,62
1,2015-04-29,2015-06-18,2015-06-23,Toa Payoh S4,Lorong 6 Toa Payoh / Lorong 4 Toa Payoh,12154.6,*CO,residential,99,3.5,...,14,1,"Evia Real Estate (7) Pte Ltd, Maxdin Pte Ltd a...",345860000.0,8130.02,Gem Residences,hdb,50,5,55
2,2013-12-30,2014-04-29,2014-04-30,Geylang S6,Sims Drive,23900.1,*CO,residential,99,3.0,...,4,1,First Changi Development Pte Ltd,530891000.0,7404.31,Sims Urban Oasis,hdb,120,1,121
3,2013-04-15,2013-06-13,2013-06-14,Sengkang S12,Fernvale Close,14930.5,*CO,residential,99,3.0,...,9,1,"FCL Topaz Pte. Ltd., Far East Orchard Limited ...",256980000.0,5737.25,Rivertrees Residences,hdb,59,1,60
4,2013-02-28,2013-04-11,2013-04-12,Sengkang S11,Sengkang West Way,16603.9,*CO,residential,99,3.0,...,8,1,Secure Development Pte. Ltd.,262100000.0,5261.82,Riverbank @Fernvale,hdb,42,1,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,1992-10-19,1993-02-09,1993-05-21,Kaki Bukit,,60000.0,Industrial,others,60,2.0,...,2,1,Technology Parks Pte Ltd,44880000.0,374.00,,ura,113,101,214
592,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 3 / Tuas South Avenue 4,,102264.0,Industrial,others,60,1.0,...,1,1,Technology Parks Pte Ltd,24338800.0,238.00,,ura,113,101,214
593,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 4 / Tuas South Avenue 5,,102264.0,Industrial,others,60,1.0,...,1,1,Technology Parks Pte Ltd,24032000.0,235.00,,ura,113,101,214
594,1992-08-31,1992-11-24,1993-02-15,Merchant Road / Angus Street,,2609.0,Hotel,others,99,2.4,...,2,1,Food Alley Pte Ltd,6890000.0,1117.24,,ura,85,83,168


In [9]:
timetype = ["year", "month", "day"]
activity = ["launch", "close", "award"]
for action in activity:
    colname = f"date_{action}"
    timeidx = f"{action}_month_index"
    gls[timeidx] = gls[colname].apply(lambda x: "".join([str(x.year), str(x.month).zfill(2)]))
    for time in timetype:
        new_colname = f"{time}_{action}"
        if time == "year":
            gls[new_colname] = gls[colname].apply(lambda x: x.year)
        if time == "month":
            gls[new_colname] = gls[colname].apply(lambda x: x.month)
        if time == "day":
            gls[new_colname] = gls[colname].apply(lambda x: x.day)
gls

Unnamed: 0,date_launch,date_close,date_award,land_parcel,street,site_area_sqm,devt_type,devt_class,lease_term,gpr,...,month_launch,day_launch,close_month_index,year_close,month_close,day_close,award_month_index,year_award,month_award,day_award
0,2015-04-29,2015-06-23,2015-06-30,Queenstown S9b,Dundee Road,10516.1,*CO,residential,99,4.9,...,4,29,201506,2015,6,23,201506,2015,6,30
1,2015-04-29,2015-06-18,2015-06-23,Toa Payoh S4,Lorong 6 Toa Payoh / Lorong 4 Toa Payoh,12154.6,*CO,residential,99,3.5,...,4,29,201506,2015,6,18,201506,2015,6,23
2,2013-12-30,2014-04-29,2014-04-30,Geylang S6,Sims Drive,23900.1,*CO,residential,99,3.0,...,12,30,201404,2014,4,29,201404,2014,4,30
3,2013-04-15,2013-06-13,2013-06-14,Sengkang S12,Fernvale Close,14930.5,*CO,residential,99,3.0,...,4,15,201306,2013,6,13,201306,2013,6,14
4,2013-02-28,2013-04-11,2013-04-12,Sengkang S11,Sengkang West Way,16603.9,*CO,residential,99,3.0,...,2,28,201304,2013,4,11,201304,2013,4,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,1992-10-19,1993-02-09,1993-05-21,Kaki Bukit,,60000.0,Industrial,others,60,2.0,...,10,19,199302,1993,2,9,199305,1993,5,21
592,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 3 / Tuas South Avenue 4,,102264.0,Industrial,others,60,1.0,...,10,19,199302,1993,2,9,199305,1993,5,21
593,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 4 / Tuas South Avenue 5,,102264.0,Industrial,others,60,1.0,...,10,19,199302,1993,2,9,199305,1993,5,21
594,1992-08-31,1992-11-24,1993-02-15,Merchant Road / Angus Street,,2609.0,Hotel,others,99,2.4,...,8,31,199211,1992,11,24,199302,1993,2,15


In [10]:
text_name = ["land_parcel", "street", "devt_type", "tenderer_name"]
for col in text_name:
    gls[col] = gls[col].apply(format_name, pattern=' */ *', repl='/')
gls.proj_name = gls.proj_name.apply(format_name, pattern=' *@ *', repl='-at-')
gls.proj_name = gls.proj_name.apply(format_name, pattern=' +', repl='-')
gls.proj_name = gls.proj_name.apply(lambda x: x.lower() if pd.notna(x) else x)

In [11]:
# fill missing street values with land parcel name
street_na_idx = gls.street[gls.street.isna()].index
gls.street[street_na_idx] = gls.land_parcel[street_na_idx]
gls.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gls.street[street_na_idx] = gls.land_parcel[street_na_idx]


date_launch                   0
date_close                    0
date_award                    0
land_parcel                   0
street                        0
site_area_sqm                 0
devt_type                     0
devt_class                    0
lease_term                    0
gpr                          20
gfa_sqm                      20
num_bidders                   0
tenderer_rank                 0
tenderer_name                 0
tender_price                  0
price_psm_gfa                20
proj_name                   361
source                        0
timediff_launch_to_close      0
timediff_close_to_award       0
timediff_launch_to_award      0
launch_month_index            0
year_launch                   0
month_launch                  0
day_launch                    0
close_month_index             0
year_close                    0
month_close                   0
day_close                     0
award_month_index             0
year_award                    0
month_aw

In [12]:
# separate residential and non-residential proj name
gls.rename(columns={"proj_name": "proj_name_raw"}, inplace=True)
gls

Unnamed: 0,date_launch,date_close,date_award,land_parcel,street,site_area_sqm,devt_type,devt_class,lease_term,gpr,...,month_launch,day_launch,close_month_index,year_close,month_close,day_close,award_month_index,year_award,month_award,day_award
0,2015-04-29,2015-06-23,2015-06-30,Queenstown S9b,Dundee Road,10516.1,*CO,residential,99,4.9,...,4,29,201506,2015,6,23,201506,2015,6,30
1,2015-04-29,2015-06-18,2015-06-23,Toa Payoh S4,Lorong 6 Toa Payoh/Lorong 4 Toa Payoh,12154.6,*CO,residential,99,3.5,...,4,29,201506,2015,6,18,201506,2015,6,23
2,2013-12-30,2014-04-29,2014-04-30,Geylang S6,Sims Drive,23900.1,*CO,residential,99,3.0,...,12,30,201404,2014,4,29,201404,2014,4,30
3,2013-04-15,2013-06-13,2013-06-14,Sengkang S12,Fernvale Close,14930.5,*CO,residential,99,3.0,...,4,15,201306,2013,6,13,201306,2013,6,14
4,2013-02-28,2013-04-11,2013-04-12,Sengkang S11,Sengkang West Way,16603.9,*CO,residential,99,3.0,...,2,28,201304,2013,4,11,201304,2013,4,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,1992-10-19,1993-02-09,1993-05-21,Kaki Bukit,Kaki Bukit,60000.0,Industrial,others,60,2.0,...,10,19,199302,1993,2,9,199305,1993,5,21
592,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 3/Tuas South Avenue 4,Tuas South Avenue 3/Tuas South Avenue 4,102264.0,Industrial,others,60,1.0,...,10,19,199302,1993,2,9,199305,1993,5,21
593,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 4/Tuas South Avenue 5,Tuas South Avenue 4/Tuas South Avenue 5,102264.0,Industrial,others,60,1.0,...,10,19,199302,1993,2,9,199305,1993,5,21
594,1992-08-31,1992-11-24,1993-02-15,Merchant Road/Angus Street,Merchant Road/Angus Street,2609.0,Hotel,others,99,2.4,...,8,31,199211,1992,11,24,199302,1993,2,15


In [13]:
# for res, just proj-name-res = proj name
# for commercial, others, just proj-name-non-res = proj name
# for mixed devt with 2 names, separate; with 1 name only, both = proj name
gls.insert(loc=17, column="proj_name_res", value=np.nan)
gls.insert(loc=18, column="proj_name_non_res", value=np.nan)

In [14]:
res_idx = gls.devt_class[gls.devt_class=='residential'].index
non_res_idx = gls.devt_class[(gls.devt_class=='others') | (gls.devt_class=='others')].index
mixed_idx_double_name = gls.devt_class[(gls.devt_class=='rc') & (gls.proj_name_raw.str.contains('&'))].index
mixed_idx_single_name = gls.devt_class[(gls.devt_class=='rc') & (~gls.proj_name_raw.str.contains('&', na=False))].index

In [15]:
gls.proj_name_res[res_idx] = gls.proj_name_raw[res_idx]
gls.proj_name_res[mixed_idx_single_name] = gls.proj_name_raw[mixed_idx_single_name]
gls.proj_name_non_res[non_res_idx] = gls.proj_name_raw[non_res_idx]
gls.proj_name_non_res[mixed_idx_single_name] = gls.proj_name_raw[mixed_idx_single_name]
gls[["devt_class", "proj_name_raw", "proj_name_res", "proj_name_non_res"]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gls.proj_name_res[res_idx] = gls.proj_name_raw[res_idx]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gls.proj_name_res[mixed_idx_single_name] = gls.proj_name_raw[mixed_idx_single_name]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gls.proj_name_non_res[non_res_idx] = gls.proj_name_raw[non_res_idx]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexin

Unnamed: 0,devt_class,proj_name_raw,proj_name_res,proj_name_non_res
0,residential,queens-peak,queens-peak,
1,residential,gem-residences,gem-residences,
2,residential,sims-urban-oasis,sims-urban-oasis,
3,residential,rivertrees-residences,rivertrees-residences,
4,residential,riverbank-at-fernvale,riverbank-at-fernvale,
...,...,...,...,...
591,others,,,
592,others,,,
593,others,,,
594,others,,,


In [16]:
gls.proj_name_res[mixed_idx_double_name] = gls.proj_name_raw[mixed_idx_double_name].apply(lambda x: x.split('-&-')[1])
gls.proj_name_non_res[mixed_idx_double_name] = gls.proj_name_raw[mixed_idx_double_name].apply(lambda x: x.split('-&-')[0])
gls[["devt_class", "proj_name_raw", "proj_name_res", "proj_name_non_res"]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gls.proj_name_res[mixed_idx_double_name] = gls.proj_name_raw[mixed_idx_double_name].apply(lambda x: x.split('-&-')[1])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gls.proj_name_non_res[mixed_idx_double_name] = gls.proj_name_raw[mixed_idx_double_name].apply(lambda x: x.split('-&-')[0])


Unnamed: 0,devt_class,proj_name_raw,proj_name_res,proj_name_non_res
0,residential,queens-peak,queens-peak,
1,residential,gem-residences,gem-residences,
2,residential,sims-urban-oasis,sims-urban-oasis,
3,residential,rivertrees-residences,rivertrees-residences,
4,residential,riverbank-at-fernvale,riverbank-at-fernvale,
...,...,...,...,...
591,others,,,
592,others,,,
593,others,,,
594,others,,,


### create uuid

In [17]:
# create uuid
gls["gls_text_id"] = gls.date_launch.astype(str) + gls.land_parcel + gls.site_area_sqm.astype(str) + gls.source
gls.gls_text_id

0                     2015-04-29Queenstown S9b10516.1hdb
1                       2015-04-29Toa Payoh S412154.6hdb
2                         2013-12-30Geylang S623900.1hdb
3                       2013-04-15Sengkang S1214930.5hdb
4                       2013-02-28Sengkang S1116603.9hdb
                             ...                        
591                       1992-10-19Kaki Bukit60000.0ura
592    1992-10-19Tuas South Avenue 3/Tuas South Avenu...
593    1992-10-19Tuas South Avenue 4/Tuas South Avenu...
594        1992-08-31Merchant Road/Angus Street2609.0ura
595       1992-08-31Merchant Road/Magazine Road4806.0ura
Name: gls_text_id, Length: 596, dtype: object

In [18]:
gls.insert(loc=0, column="sg_gls_id", value=gls.gls_text_id.apply(lambda x: hashlib.sha256(x.encode('utf-8')).hexdigest()))

In [19]:
gls.drop("gls_text_id", axis=1, inplace=True)
gls

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,street,site_area_sqm,devt_type,devt_class,lease_term,...,month_launch,day_launch,close_month_index,year_close,month_close,day_close,award_month_index,year_award,month_award,day_award
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,2015-04-29,2015-06-23,2015-06-30,Queenstown S9b,Dundee Road,10516.1,*CO,residential,99,...,4,29,201506,2015,6,23,201506,2015,6,30
1,f2e43515a8e783bc2314727cb58587c8ee761ab7a4a016...,2015-04-29,2015-06-18,2015-06-23,Toa Payoh S4,Lorong 6 Toa Payoh/Lorong 4 Toa Payoh,12154.6,*CO,residential,99,...,4,29,201506,2015,6,18,201506,2015,6,23
2,384815dd4cafcbf88b2f11099d8bced7a584736ec36742...,2013-12-30,2014-04-29,2014-04-30,Geylang S6,Sims Drive,23900.1,*CO,residential,99,...,12,30,201404,2014,4,29,201404,2014,4,30
3,05c11060bf1cbcb2db7aa3ed898c19a09fd298c3b1c3a4...,2013-04-15,2013-06-13,2013-06-14,Sengkang S12,Fernvale Close,14930.5,*CO,residential,99,...,4,15,201306,2013,6,13,201306,2013,6,14
4,d9fcb7d323ca5b77f6a22635200afafdd99e67f6feb109...,2013-02-28,2013-04-11,2013-04-12,Sengkang S11,Sengkang West Way,16603.9,*CO,residential,99,...,2,28,201304,2013,4,11,201304,2013,4,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,2b49a7c56443c8359c7d541c2f8c44889120a0b497d9c5...,1992-10-19,1993-02-09,1993-05-21,Kaki Bukit,Kaki Bukit,60000.0,Industrial,others,60,...,10,19,199302,1993,2,9,199305,1993,5,21
592,40e8902a48c5f3b717a76c37e270d2360da9e1172da322...,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 3/Tuas South Avenue 4,Tuas South Avenue 3/Tuas South Avenue 4,102264.0,Industrial,others,60,...,10,19,199302,1993,2,9,199305,1993,5,21
593,43e84a0ed4744cddede5fb08c7162e61eb86cdf7b7823d...,1992-10-19,1993-02-09,1993-05-21,Tuas South Avenue 4/Tuas South Avenue 5,Tuas South Avenue 4/Tuas South Avenue 5,102264.0,Industrial,others,60,...,10,19,199302,1993,2,9,199305,1993,5,21
594,473d6c80a232fc34e07f2a9ef544b66270d175537468eb...,1992-08-31,1992-11-24,1993-02-15,Merchant Road/Angus Street,Merchant Road/Angus Street,2609.0,Hotel,others,99,...,8,31,199211,1992,11,24,199302,1993,2,15


In [20]:
gls.to_csv("gls_full.csv", index=False)

In [21]:
len(hashlib.sha256("2015-04-29Queenstown S9b10516.1hdb".encode('utf-8')).hexdigest())

64

In [4]:
import hashlib
import pandas as pd
import numpy as np
pred = pd.read_excel(r'G:\REA\Working files\land-bidding\prediction\parcels for prediction.xlsx')
pred['id_text'] = pred.land_parcel_std + pred.latitude.astype(str) + pred.longitude.astype(str)
pred['land_parcel_id'] = pred.id_text.apply(lambda x: hashlib.sha256(x.encode('utf-8')).hexdigest())