# Insights from tenderer names

In [1]:
import pandas as pd
import numpy as np
import re
from typing import List
# pd.set_option('display.float_format', lambda x: '%.0f' % x)
pd.reset_option('display.float_format')

In [2]:
def nameFormat(companyName: str)-> str:

    import re
    pte_suffix = ['[Pp]rivate', '[Pp][Tt][Ee]']
    ltd_suffix = ['[Ll]imited', '[Ll]imit', '[Ll][Tt][Dd]']

    try:
        # remove line breaks and slashes
        companyName = companyName.strip()
        companyName = re.sub(r' +', r' ', companyName)
        # companyName = re.sub(r'\\+', r'\\', companyName)
        companyName = re.sub(r'\\+n?', '', companyName)
        companyName = re.sub(r'\n', r'', companyName)

        # replace suffix with identical format
        for suffix in pte_suffix:
            pattern = f'\(?{suffix}\.?,?\)?'
            companyName = re.sub(pattern, 'Pte.', companyName)

        for suffix in ltd_suffix:
            pattern = f'\(?{suffix}\.?\)?'
            companyName = re.sub(pattern, 'Ltd.', companyName)

        companyName = re.sub('\(?[Pp][Ll][.]?\)?[\W]?$', 'Pte. Ltd.', companyName)
        companyName = re.sub('\(?[Pp][Ll][ ,./]\)?', 'Pte. Ltd.,', companyName)
        companyName = re.sub('\(?[Ii][Nn][Cc][.]?\)?[\W]?$', 'Inc.', companyName)
        companyName = re.sub('\(?[Ii]ncorporate[d]?\)?', 'Inc.', companyName)
        companyName = re.sub('\(?[Ii][Nn][Cc][.]? +\)?', 'Inc. ', companyName)
        companyName = re.sub('\(?[Jj][oint]*?[ -/&]?[Vv]e?n?t?u?r?e?[.]?\)?', 'J.V.', companyName)
        companyName = re.sub('\(?[Cc][Oo][Rr][Pp][oration]*\)?', 'Corp.', companyName)

        # identify separators and split multiple company names
        sep_id = ['[Ll][Tt][Dd]', '[Ii][Nn][Cc]', '[Cc][Oo][Rr][Pp]', '[Gg][Mm][Bb][Hh]', '[Jj].?[Vv].?', '[Ll][Ll][Cc]', '[Pp][Ll][Cc]', '[Ll][Ll][Pp]', '[Gg][Rr][Oo][Uu][Pp]']
        repl = ['Ltd', 'Inc', 'Corp', 'Gmbh', 'J.V', 'LLC', 'Plc', 'LLP', 'Group']
        repl_dict = dict(zip(sep_id, repl))
        for suffix in repl_dict.keys():
            sep_pattern_and = f'{suffix}[.]?[ ,;][\W]*?[Aa]nd? +'
            # sep_pattern_comma = 'ltd[.]?[ ]*[,;&][\W]?[,;]?[ ]?'
            sep_pattern_comma_ampersand = f'{suffix}[.]?[ ]*[,;&/][\W]?[ ]?'
            suffix_repl = repl_dict[suffix]
            companyName = re.sub(sep_pattern_and, f'{suffix_repl}. | ', companyName)
            companyName = re.sub(sep_pattern_comma_ampersand, f'{suffix_repl}. | ', companyName)

    except AttributeError:
        pass

    return companyName


In [3]:
def stripName(companyName: List[str]):
    strip_pattern = [f' +{string}[.]?$' for string in ['[Ll][Tt][Dd]', '[Pp][Tt][Ee]', '[Pp][Tt][Ee][.]? +[Ll][Tt][Dd]', '[Ii][Nn][Cc]', '[Cc][Oo][Rr][Pp]', '[Gg][Mm][Bb][Hh]', '[Jj].?[Vv]', '[Ll][Ll][Cc]', '[Pp][Ll][Cc]', '[Ll][Ll][Pp]', '[Cc][Oo][.]?[,]? +[Pp][Tt][Ee][.]?[,]? +[Ll][Tt][Dd]', '[Cc][Oo][.]?[,]? +[Ll][Tt][Dd]', '[Cc][Oo][Rr][Pp][.]?[,]? +[Pp][Tt][Ee][.]?[,]? +[Ll][Tt][Dd]']]
    pattern = '|'.join(strip_pattern)
    companyName = [companyName] if isinstance(companyName, str) else companyName
    stripped_name = []
    for name in companyName:
        try:
            stripped_name.append(re.sub(pattern, '', name))
        except AttributeError and TypeError:
            stripped_name.append(name)
    return stripped_name


texts = ['abc Pte. Ltd.', 'ccc inc', 'cbc gMbh', 'ddd J.V', 'abc Co. Pte. Ltd', np.nan, 7]
stripName(texts)

['abc', 'ccc', 'cbc', 'ddd', 'abc', nan, 7]

In [4]:
# test
company = 'REA pte., LTD, SOreal LLC and Cushman & Wakefield, Incorporated/ P&G PlC & JLL and Lasalle, consultant, IP inc. ; CDL intl pte., ltd.;,Capitaland and hdb joint-venture and cc group,and VW GMbH'
nameFormat(company)

'REA Pte. Ltd. | SOreal LLC. | Cushman & Wakefield, Inc. | P&G Plc. | JLL and Lasalle, consultant, IP Inc. | CDL intl Pte. Ltd. | Capitaland and hdb J.V. | cc Group. | VW GMbH'

In [5]:
gls = pd.read_csv(r"G:\REA\Working files\land-bidding\land_sales_full_data\ready for uploading\gls_hdb_details_filled.csv")
gls['price_psm_gfa'] = gls.tender_price/gls.gfa_sqm
gls.head()

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,street,zone,region,join_by,error_check,...,day_close,award_month_index,year_award,month_award,day_award,tenderer_rank,tenderer_name,tender_price,tender_price_text,price_psm_gfa
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,23,201506,2015,6,30,1,HY Realty Pte Ltd,483178000.0,483178000.0,9376.836955
1,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,23,201506,2015,6,30,2,Allgreen Properties Limited,445910000.0,445910000.0,8653.592189
2,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,23,201506,2015,6,30,3,"Intrepid Investments Pte Ltd, Verwood \nHoldin...",432774525.0,432774525.0,8398.677422
3,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,23,201506,2015,6,30,4,FCL Tampines Court Pte. Ltd. and KH Capital \n...,421500000.0,421500000.0,8179.877346
4,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,23,201506,2015,6,30,5,"UOL Venture Investments Pte. Ltd., Singland \n...",417280000.0,417280000.0,8097.98154


In [6]:
gls = gls.dropna(subset=['tenderer_name'], axis=0)
# gls_top3 = pd.DataFrame(gls.tenderer_name.dropna())
gls["separated_names"] = gls.tenderer_name.apply(nameFormat)
gls.head()

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,street,zone,region,join_by,error_check,...,award_month_index,year_award,month_award,day_award,tenderer_rank,tenderer_name,tender_price,tender_price_text,price_psm_gfa,separated_names
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,201506,2015,6,30,1,HY Realty Pte Ltd,483178000.0,483178000.0,9376.836955,HY Realty Pte. Ltd.
1,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,201506,2015,6,30,2,Allgreen Properties Limited,445910000.0,445910000.0,8653.592189,Allgreen Properties Ltd.
2,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,201506,2015,6,30,3,"Intrepid Investments Pte Ltd, Verwood \nHoldin...",432774525.0,432774525.0,8398.677422,Intrepid Investments Pte. Ltd. | Verwood Holdi...
3,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,201506,2015,6,30,4,FCL Tampines Court Pte. Ltd. and KH Capital \n...,421500000.0,421500000.0,8179.877346,FCL Tampines Court Pte. Ltd. | KH Capital Pte....
4,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,201506,2015,6,30,5,"UOL Venture Investments Pte. Ltd., Singland \n...",417280000.0,417280000.0,8097.98154,UOL Venture Investments Pte. Ltd. | Singland H...


In [7]:
gls['tenderer_rank'] = gls.tenderer_rank.apply(lambda x: int(re.findall('\d+', x)[0]))


In [8]:
gls_top3 = gls[gls.tenderer_rank<=3]
gls_top3["list_of_tenderers"] = gls_top3.separated_names.apply(lambda x: x.split(' | '))
gls_top3["num_tenderers_same_rank"] = gls_top3.list_of_tenderers.apply(lambda x: len(x))
gls_top3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gls_top3["list_of_tenderers"] = gls_top3.separated_names.apply(lambda x: x.split(' | '))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gls_top3["num_tenderers_same_rank"] = gls_top3.list_of_tenderers.apply(lambda x: len(x))


Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,street,zone,region,join_by,error_check,...,month_award,day_award,tenderer_rank,tenderer_name,tender_price,tender_price_text,price_psm_gfa,separated_names,list_of_tenderers,num_tenderers_same_rank
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,6,30,1,HY Realty Pte Ltd,483178000.0,483178000.00,9376.836955,HY Realty Pte. Ltd.,[HY Realty Pte. Ltd.],1
1,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,6,30,2,Allgreen Properties Limited,445910000.0,445910000.00,8653.592189,Allgreen Properties Ltd.,[Allgreen Properties Ltd.],1
2,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,6,30,3,"Intrepid Investments Pte Ltd, Verwood \nHoldin...",432774525.0,432774525.00,8398.677422,Intrepid Investments Pte. Ltd. | Verwood Holdi...,"[Intrepid Investments Pte. Ltd., Verwood Holdi...",3
9,f2e43515a8e783bc2314727cb58587c8ee761ab7a4a016...,29/4/2015,18/6/2015,23/6/2015,Toa Payoh S4,Lorong 6 Toa Payoh/Lorong 4 Toa Payoh,toa payoh,central region,project name,0,...,6,23,1,"Evia Real Estate (7) Pte Ltd, Maxdin Pte Ltd \...",345860000.0,345860000.00,8130.020145,Evia Real Estate (7) Pte. Ltd. | Maxdin Pte. L...,"[Evia Real Estate (7) Pte. Ltd., Maxdin Pte. L...",3
10,f2e43515a8e783bc2314727cb58587c8ee761ab7a4a016...,29/4/2015,18/6/2015,23/6/2015,Toa Payoh S4,Lorong 6 Toa Payoh/Lorong 4 Toa Payoh,toa payoh,central region,project name,0,...,6,23,2,Sing Holdings Limited,342100000.0,342100000.00,8041.635031,Sing Holdings Ltd.,[Sing Holdings Ltd.],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2345,99b11f1948b4fc3e79bba927f654f433a5a55d2dff7c1e...,28/7/1995,28/9/1995,30/11/1995,Hougang N9NC,Hougang Avenue 9,hougang,north-east region,street name,0,...,11,30,2,Ruby Holdings Pte Ltd,61135000.0,61135000.00,3286.827957,Ruby Holdings Pte. Ltd.,[Ruby Holdings Pte. Ltd.],1
2346,99b11f1948b4fc3e79bba927f654f433a5a55d2dff7c1e...,28/7/1995,28/9/1995,30/11/1995,Hougang N9NC,Hougang Avenue 9,hougang,north-east region,street name,0,...,11,30,3,"Gutherie GTS Ltd, Tang Eng Pte Ltd, Bellora \n...",55000000.0,55000000.00,2956.989247,Gutherie GTS Ltd. | Tang Eng Pte. Ltd. | Bello...,"[Gutherie GTS Ltd., Tang Eng Pte. Ltd., Bellor...",7
2351,7f55fed1fc42dd52d24742b2c93ed9b9d36bc65deeedbb...,25/2/1994,28/4/1994,19/8/1994,Hougang N5NC,Hougang St 51/Buangkok Green,hougang,north-east region,project name,0,...,8,19,1,Hiap Hoe Holdings Pte ltd,38800000.0,38800000.00,1949.748744,Hiap Hoe Holdings Pte. Ltd.,[Hiap Hoe Holdings Pte. Ltd.],1
2352,7f55fed1fc42dd52d24742b2c93ed9b9d36bc65deeedbb...,25/2/1994,28/4/1994,19/8/1994,Hougang N5NC,Hougang St 51/Buangkok Green,hougang,north-east region,project name,0,...,8,19,2,Far East Organization Centre Pte Ltd,34390000.0,34390000.00,1728.140704,Far East Organization Centre Pte. Ltd.,[Far East Organization Centre Pte. Ltd.],1


In [9]:
gls_top3.num_tenderers_same_rank.describe()

count    985.000000
mean       1.467005
std        0.696374
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        7.000000
Name: num_tenderers_same_rank, dtype: float64

In [10]:
# gls_top3['stripped_names'] = td_names.list_of_tenderers.apply(stripName)
# create additional cols for multiple tenderers
max_num_td = gls_top3.num_tenderers_same_rank.max()
gls_top3['tenderer_names_filled'] = gls_top3.list_of_tenderers.apply(lambda x: x + [np.nan] * (max_num_td - len(x)))
for col in range(max_num_td):
    gls_top3[f"tenderer_{col + 1}"] = gls_top3.tenderer_names_filled.apply(lambda x: x[col])
gls_top3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gls_top3['tenderer_names_filled'] = gls_top3.list_of_tenderers.apply(lambda x: x + [np.nan] * (max_num_td - len(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gls_top3[f"tenderer_{col + 1}"] = gls_top3.tenderer_names_filled.apply(lambda x: x[col])


Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,street,zone,region,join_by,error_check,...,list_of_tenderers,num_tenderers_same_rank,tenderer_names_filled,tenderer_1,tenderer_2,tenderer_3,tenderer_4,tenderer_5,tenderer_6,tenderer_7
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,[HY Realty Pte. Ltd.],1,"[HY Realty Pte. Ltd., nan, nan, nan, nan, nan,...",HY Realty Pte. Ltd.,,,,,,
1,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,[Allgreen Properties Ltd.],1,"[Allgreen Properties Ltd., nan, nan, nan, nan,...",Allgreen Properties Ltd.,,,,,,
2,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Dundee Road,queenstown,central region,project name,0,...,"[Intrepid Investments Pte. Ltd., Verwood Holdi...",3,"[Intrepid Investments Pte. Ltd., Verwood Holdi...",Intrepid Investments Pte. Ltd.,Verwood Holdings Pte. Ltd.,Hong Realty Pte. Ltd.,,,,
9,f2e43515a8e783bc2314727cb58587c8ee761ab7a4a016...,29/4/2015,18/6/2015,23/6/2015,Toa Payoh S4,Lorong 6 Toa Payoh/Lorong 4 Toa Payoh,toa payoh,central region,project name,0,...,"[Evia Real Estate (7) Pte. Ltd., Maxdin Pte. L...",3,"[Evia Real Estate (7) Pte. Ltd., Maxdin Pte. L...",Evia Real Estate (7) Pte. Ltd.,Maxdin Pte. Ltd.,Gamuda Berhad,,,,
10,f2e43515a8e783bc2314727cb58587c8ee761ab7a4a016...,29/4/2015,18/6/2015,23/6/2015,Toa Payoh S4,Lorong 6 Toa Payoh/Lorong 4 Toa Payoh,toa payoh,central region,project name,0,...,[Sing Holdings Ltd.],1,"[Sing Holdings Ltd., nan, nan, nan, nan, nan, ...",Sing Holdings Ltd.,,,,,,


In [11]:
name_dict = pd.read_csv(r'G:\REA\Working files\land-bidding\land_sales_full_data\tenderer_name_dict_1.csv')
# na_idx = name_dict[name_dict.unified_name.isna()].index
# name_dict.iloc[na_idx, 2] = name_dict.iloc[na_idx, 0]
# name_dict = name_dict.iloc[:, [0,2]]
name_dict

Unnamed: 0,tenderer_name,unified_name
0,Acresvale Investment Pte. Ltd.,Keppel Land
1,Act - Nobel Homes Pte. Ltd.,Act 1 Engineering
2,Actbilt Pte. Ltd.,Actbilt Pte. Ltd.
3,Allamanda Residential Development Pte. Ltd.,CapitaLand
4,Allgreen Properties Ltd.,Allgreen Properties
...,...,...
350,Yanlord Singapore Residential Pte. Ltd.,Yanlord Land Group Limited
351,Yeo Hiap Seng Ltd.,Yeo Hiap Seng Ltd.
352,ZACD Investments Pte. Ltd.,ZACD Investments
353,Zenlead Investments Pte. Ltd.,City Developments Limited (CDL)


In [12]:
# unpivot table
import re
td_df = gls_top3[['land_parcel', 'separated_names', 'tenderer_rank', 'num_tenderers_same_rank', 'tender_price', 'price_psm_gfa', 'tenderer_1', 'tenderer_2', 'tenderer_3', 'tenderer_4', 'tenderer_5', 'tenderer_6', 'tenderer_7',]]
td_df = td_df.melt(id_vars=['land_parcel', 'separated_names', 'tenderer_rank', 'num_tenderers_same_rank', 'tender_price', 'price_psm_gfa'], var_name='tenderer_id', value_name='tenderer_name').dropna(axis = 0, subset = ['tenderer_name'])
td_df['tenderer_name'] = td_df.tenderer_name.apply(lambda x: re.sub('\(.*?\)', '', x)).apply(lambda x: re.sub(' +', ' ', x)).apply(lambda x: x.strip())
td_df

Unnamed: 0,land_parcel,separated_names,tenderer_rank,num_tenderers_same_rank,tender_price,price_psm_gfa,tenderer_id,tenderer_name
0,Queenstown S9b,HY Realty Pte. Ltd.,1,1,483178000.0,9376.836955,tenderer_1,HY Realty Pte. Ltd.
1,Queenstown S9b,Allgreen Properties Ltd.,2,1,445910000.0,8653.592189,tenderer_1,Allgreen Properties Ltd.
2,Queenstown S9b,Intrepid Investments Pte. Ltd. | Verwood Holdi...,3,3,432774525.0,8398.677422,tenderer_1,Intrepid Investments Pte. Ltd.
3,Toa Payoh S4,Evia Real Estate (7) Pte. Ltd. | Maxdin Pte. L...,1,3,345860000.0,8130.020145,tenderer_1,Evia Real Estate Pte. Ltd.
4,Toa Payoh S4,Sing Holdings Ltd.,2,1,342100000.0,8041.635031,tenderer_1,Sing Holdings Ltd.
...,...,...,...,...,...,...,...,...
3923,Sengkang P1,Pidemco Realty Pte. Ltd. | Germiston Pte. Ltd....,2,4,149999990.0,1319.462352,tenderer_4,I. P. Property Fund Asia Ltd.
3936,Hougang N9NC,Gutherie GTS Ltd. | Tang Eng Pte. Ltd. | Bello...,3,7,55000000.0,2956.989247,tenderer_4,First Systems Holding Pte. Ltd.
4921,Hougang N9NC,Gutherie GTS Ltd. | Tang Eng Pte. Ltd. | Bello...,3,7,55000000.0,2956.989247,tenderer_5,Columbia Trading Pte. Ltd.
5906,Hougang N9NC,Gutherie GTS Ltd. | Tang Eng Pte. Ltd. | Bello...,3,7,55000000.0,2956.989247,tenderer_6,Mr Pang Pok / Proposed J. Y. Pte. Ltd.


In [13]:
gls_top3_uniname = pd.merge(td_df, name_dict, how='left', on='tenderer_name')
gls_top3_uniname.head()

Unnamed: 0,land_parcel,separated_names,tenderer_rank,num_tenderers_same_rank,tender_price,price_psm_gfa,tenderer_id,tenderer_name,unified_name
0,Queenstown S9b,HY Realty Pte. Ltd.,1,1,483178000.0,9376.836955,tenderer_1,HY Realty Pte. Ltd.,Hao Yuan Investment
1,Queenstown S9b,Allgreen Properties Ltd.,2,1,445910000.0,8653.592189,tenderer_1,Allgreen Properties Ltd.,Allgreen Properties
2,Queenstown S9b,Intrepid Investments Pte. Ltd. | Verwood Holdi...,3,3,432774525.0,8398.677422,tenderer_1,Intrepid Investments Pte. Ltd.,Hong Leong Investment Holdings
3,Toa Payoh S4,Evia Real Estate (7) Pte. Ltd. | Maxdin Pte. L...,1,3,345860000.0,8130.020145,tenderer_1,Evia Real Estate Pte. Ltd.,Evia Real Estate
4,Toa Payoh S4,Sing Holdings Ltd.,2,1,342100000.0,8041.635031,tenderer_1,Sing Holdings Ltd.,Sing Holdings Limited


In [14]:
count_td = gls_top3_uniname.groupby('unified_name').land_parcel.count()
# count_td.sort_values(by='tenderer_id', ascending=False).head()
count_td

unified_name
Act 1 Engineering              1
Actbilt Pte. Ltd.              1
Allgreen Properties           48
Ang Hock Beng Pte. Ltd.        1
Arama Holdings                 4
                              ..
Whye Wah Group                 4
Wing Tai Land                 18
Yanlord Land Group Limited     8
Yeo Hiap Seng Ltd.             4
ZACD Investments               1
Name: land_parcel, Length: 189, dtype: int64

In [15]:
avg_td = gls_top3_uniname.groupby('unified_name')['tender_price', 'price_psm_gfa'].mean()
avg_td = avg_td.transform({'tender_price': lambda x: '%.2f' %x, 'price_psm_gfa': lambda x: '%.2f' %x})
avg_td

  avg_td = gls_top3_uniname.groupby('unified_name')['tender_price', 'price_psm_gfa'].mean()


Unnamed: 0_level_0,tender_price,price_psm_gfa
unified_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Act 1 Engineering,35121000.00,
Actbilt Pte. Ltd.,60280000.00,1595.15
Allgreen Properties,134905833.33,2329.03
Ang Hock Beng Pte. Ltd.,55000000.00,2956.99
Arama Holdings,177669241.25,3305.92
...,...,...
Whye Wah Group,70900000.00,2026.56
Wing Tai Land,117592666.67,2315.41
Yanlord Land Group Limited,404498956.38,5543.09
Yeo Hiap Seng Ltd.,61313000.00,2758.43


In [16]:
td_count_price = pd.merge(count_td, avg_td, how='left', on='unified_name')

In [17]:
rank1_only = gls_top3_uniname[gls_top3_uniname.tenderer_rank==1]
rank1_count = rank1_only.groupby('unified_name').land_parcel.count().rename('num_of_top1')
rank1_count

unified_name
Allgreen Properties           7
Arama Holdings                3
Asiawide Holdings             1
Aspial Corp.                  3
BBR Property                  5
                             ..
Whye Wah Group                1
Wing Tai Land                 6
Yanlord Land Group Limited    1
Yeo Hiap Seng Ltd.            4
ZACD Investments              1
Name: num_of_top1, Length: 102, dtype: int64

In [21]:
td_overall = pd.merge(td_count_price, rank1_count, how='left', on='unified_name')
td_overall['top1%'] = td_overall.num_of_top1 / td_overall.land_parcel
td_overall['top1%'] = td_overall['top1%'].transform(lambda x: '%.1f' %(x*100))
td_overall.to_csv("tenderer_name_groupby.csv")

In [None]:
# funcs to calculate matching score of two words: complete match, return 1
def match_score(dictionary: str, lookup_value: str):
    if lookup_value:
        a = list("".join(dictionary.split(" ")))
        b = list("".join(lookup_value.split(" ")))
        score_a = 0
        score_b = 0
        penalty_a = 0
        penalty_b = 0
        total_len_a = len(a)
        total_len_b = len(b)
        if total_len_a * total_len_b > 0:
            for char in a:
                if char in b:
                    score_a += 1
                else:
                    penalty_a += 1
            for char in b:
                if char in a:
                    score_b += 1
                else:
                    penalty_b += 1
            return (score_a / total_len_a) * (score_b / total_len_b) - (penalty_a / total_len_a) * (penalty_b / total_len_b)
        else:
            return np.nan

    else:
        return np.nan

# funcs to calculate order score of two words: same order, return 0
def order_score(dictionary: str, lookup_value: str):
    char_list_dict = list(dictionary)
    charset_dict = list(set(char_list_dict))
    order_dict_a = dict()
    for char in charset_dict:
        order_dict_a[char] = char_list_dict.index(char)

    char_list_value = list(lookup_value)
    charset_value = list(set(char_list_value))
    order_dict_b = dict()
    for char in charset_value:
        order_dict_b[char] = char_list_value.index(char)

    # get the index of first char of dict in value dict
    first_char = dictionary[0]

    try:
        first_char_idx = order_dict_b[first_char]
        # print(first_char_idx)
        # recalculate index for value dict
        for char in order_dict_b.keys():
            order_dict_b[char] -= first_char_idx

    except KeyError:
        pass

    char_order_dict = [order_dict_a.get(key) for key in char_list_dict]
    char_order_dict_in_value = [order_dict_b.get(key) for key in char_list_dict]

    # calculate variation score
    sum_ = 0
    len_ = 0
    for k in range(len(char_order_dict)):
        if char_order_dict_in_value[k] is None:
            sum_ += 1
        else:
            sum_ += (char_order_dict[k] - char_order_dict_in_value[k]) ** 2
        len_ += 1

    try:
        variation = (sum_ / len_) ** 0.5
    except ZeroDivisionError:
        variation = np.nan

    return variation

def unpack_poly1d(poly1d):
    return list(poly1d.coef)

# decay func to calculate weight of words in different place: front words have higher weight
def decayFunc(start_x, start_y, end_x, end_y):
    from numpy import polyfit, poly1d
    df = pd.DataFrame({'x': [start_x, end_x], 'y': [start_y, end_y]})
    fit = polyfit(df.x, df.y, 3)
    equation = poly1d(fit)

    # import matplotlib.pyplot as plt

    # xmesh = np.linspace(min(df['x']), max(df['x']), 100)
    # plt.plot(df['x'], df['y'], 'bo', label='data')
    # plt.plot(xmesh, equation(xmesh), '-b', label='fit')
    # plt.legend(fontsize=20)
    # plt.xlabel('x', fontsize=20)
    # plt.ylabel('y', fontsize=20)
    # plt.show()

    return unpack_poly1d(equation)

# func to generate weight list
def weightList(coef: List[float], n_weight: List):
    n_var = len(coef) - 1
    w_list = []

    for x in n_weight:
        sum = 0
        for i in range(len(coef)):
            sum += coef[::-1][i] * x ** i
        w_list.append(sum)

    return w_list

def get_max_min_len(nameA: str, nameB: str)-> int:
    Alist = nameA.split(' ')
    Blist = nameB.split(' ')
    return max(len(Alist), len(Blist)), min(len(Alist), len(Blist))

# func to calculate similarity
def similarity(nameA: str, nameB: str, w_h = 0.9, w_l = 0.1)-> float:
    min_len = get_max_min_len(nameA, nameB)[1]
    max_len = get_max_min_len(nameA, nameB)[0]
    w_list1 = weightList(decayFunc(1, w_h, min_len, w_l), list(range(1, min_len+1)))
    w_list2 = weightList(decayFunc(1, w_h, max_len, w_l), list(range(1, max_len+1)))

    Alist_0 = nameA.split(' ')
    Blist_0 = nameB.split(' ')

    # Alist, Blist = Alist_0[:same_len], Blist_0[:same_len]
    # if len(w_list) < same_len:
    #     w_list += [w_list[-1]]*(same_len-len(w_list))
    # w_list = w_list[:same_len]
    if len(Alist_0) != min_len:
        Alist_0, Blist_0 = Blist_0, Alist_0

    score_a = 0
    score_b = 0
    for i in range(min_len):
        if Alist_0[i] in Blist_0:
            score_a += 1*w_list1[i]

    for j in range(max_len):
        if Blist_0[j] in Alist_0:
            score_b += 1*w_list2[j]

    incl_a = score_a/len(Alist_0)
    incl_b = score_b/len(Blist_0)

    return incl_a + incl_b

# func to calculate distance between two words
def distance(nameA: str, nameB: str, w_h = 0.9, w_l = 0.1)-> float:
    same_len = get_max_min_len(nameA, nameB)[1]
    weight_n = list(range(1, same_len+1))
    w_list = weightList(decayFunc(1, w_h, same_len, w_l), weight_n)

    Alist_0 = nameA.split(' ')
    Blist_0 = nameB.split(' ')

    Alist, Blist = Alist_0[:same_len], Blist_0[:same_len]
    if len(w_list) < same_len:
        w_list += [w_list[-1]]*(same_len-len(w_list))
    w_list = w_list[:same_len]

    euc_d = [(((1-match_score(Alist[i], Blist[i]))**2 + order_score(Alist[i], Blist[i])**2)**0.5) for i in range(same_len)]
    df = pd.DataFrame({'euc_index': euc_d, 'weight': w_list})
    distance = df.iloc[:, 0].dot(df.weight)

    len_mean = (len(Alist_0) + len(Blist_0)) / 2
    len_var = (((len(Alist_0)-len_mean)**2 + (len(Blist_0)-len_mean)**2)/2) ** 0.5

    return distance + len_var

# func to combine similarity and distance to get a comprehensive score
def overallSim(nameA: str, nameB: str, w_h = 0.9, w_l = 0.1):
    sim = similarity(nameA, nameB, w_h, w_l)
    dis = distance(nameA, nameB, w_h, w_l)
    return sim - dis*0.1


In [None]:
companyA = 'CDL Properties'
companyB = 'CDL Real Estate'
companyC = 'Hon Sui Sen Properties What what'
companyD = 'Hon Heung Investments'
companyE = 'Hon Tian Sen Properties'
companyF = 'Hon Sui Sen'

overallSim(companyA, companyB)


In [None]:
td_minor = count_td[count_td.successful_tenderer_id < 10]
unique_name = pd.Series(td_minor.successful_tenderer_name.unique())
unique_name

In [None]:
# create a dict to find full name for stripped names
stripped = unique_name.apply(lambda x: stripName(x)[0])

In [None]:
def find_most_likely_name(name, ref: List[str]):
    name_dict = {}
    for item in ref:
        if item != name:
            name_dict[item] = overallSim(name, item)
    return max(name_dict, key=name_dict.get), max(name_dict.values())

# td_minor['most_likely_name'] = td_minor.reset_index().successful_tenderer_name.apply(lambda x: find_most_likely_name(x, ref=unique_name))
likely_names = stripped.apply(lambda x: find_most_likely_name(x, ref=stripped))

In [None]:
name_dict = pd.DataFrame({"names": unique_name, "likely_names": likely_names.apply(lambda x: x[0]), "likelihood": likely_names.apply(lambda x: x[1])})
name_dict.to_csv("tenderer_name_dict.csv", index=False)

In [None]:
td_minor = td_minor.reset_index()
test.successful_tenderer_name.apply(lambda x: find_most_likely_name(x, unique_name)[0])