# Insights from tenderer names

In [1]:
import re
from typing import List, Tuple

import numpy as np
import pandas as pd
# pd.set_option('display.float_format', lambda x: '%.0f' % x)
pd.reset_option('display.float_format')

In [2]:
def nameFormat(companyName: str) -> str:
    """Standardise legal-form suffixes in a tenderer name and delimit joint bidders.

    The name is stripped, runs of spaces are collapsed, and backslashes and
    line breaks are removed.  Spelling variants of Private/Pte, Limited/Ltd,
    PL, Inc/Incorporated, Joint Venture/JV and Corp(oration) are rewritten to
    ``Pte.``, ``Ltd.``, ``Pte. Ltd.``, ``Inc.``, ``J.V.`` and ``Corp.``.
    Finally, wherever one of the suffixes Ltd, Inc, Corp, GmbH, J.V, LLC, Plc,
    LLP or Group is followed by an "and" connector (the pattern also accepts
    "an") or by one of ``, ; & /``, the suffix and connector are replaced by
    ``"<Suffix>. | "`` so that callers can split the result on ``" | "``.

    Args:
        companyName: Raw tenderer name.  Non-string values (for example NaN
            coming from pandas) are returned unchanged.

    Returns:
        The normalised name, or the input itself when it is not a string.
    """
    # Explicit guard instead of a blanket try/except AttributeError around
    # the whole body: missing values pass straight through.
    if not isinstance(companyName, str):
        return companyName

    pte_suffix = ['[Pp]rivate', '[Pp][Tt][Ee]']
    ltd_suffix = ['[Ll]imited', '[Ll]imit', '[Ll][Tt][Dd]']

    # remove surplus spaces, backslashes (optionally followed by a literal
    # "n") and real line breaks
    companyName = companyName.strip()
    companyName = re.sub(r' +', r' ', companyName)
    companyName = re.sub(r'\\+n?', '', companyName)
    companyName = re.sub(r'\n', r'', companyName)

    # replace suffix with identical format; all patterns are raw strings so
    # that "\(" and "\." reach the regex engine without relying on Python's
    # handling of unrecognised escape sequences
    for suffix in pte_suffix:
        companyName = re.sub(rf'\(?{suffix}\.?,?\)?', 'Pte.', companyName)

    for suffix in ltd_suffix:
        companyName = re.sub(rf'\(?{suffix}\.?\)?', 'Ltd.', companyName)

    # Order matters: each rewrite sees the output of the previous one.
    # NOTE(review): inside the J.V. pattern "[ -/&]" is a character *range*
    # from space to "/", not the three literal characters - confirm intent.
    fixed_rewrites = [
        (r'\(?[Pp][Ll][.]?\)?[\W]?$', 'Pte. Ltd.'),
        (r'\(?[Pp][Ll][ ,./]\)?', 'Pte. Ltd.,'),
        (r'\(?[Ii][Nn][Cc][.]?\)?[\W]?$', 'Inc.'),
        (r'\(?[Ii]ncorporate[d]?\)?', 'Inc.'),
        (r'\(?[Ii][Nn][Cc][.]? +\)?', 'Inc. '),
        (r'\(?[Jj][oint]*?[ -/&]?[Vv]e?n?t?u?r?e?[.]?\)?', 'J.V.'),
        (r'\(?[Cc][Oo][Rr][Pp][oration]*\)?', 'Corp.'),
    ]
    for pattern, replacement in fixed_rewrites:
        companyName = re.sub(pattern, replacement, companyName)

    # identify separators and split multiple company names
    sep_id = ['[Ll][Tt][Dd]', '[Ii][Nn][Cc]', '[Cc][Oo][Rr][Pp]', '[Gg][Mm][Bb][Hh]', '[Jj].?[Vv].?', '[Ll][Ll][Cc]', '[Pp][Ll][Cc]', '[Ll][Ll][Pp]', '[Gg][Rr][Oo][Uu][Pp]']
    repl = ['Ltd', 'Inc', 'Corp', 'Gmbh', 'J.V', 'LLC', 'Plc', 'LLP', 'Group']
    for suffix, suffix_repl in zip(sep_id, repl):
        sep_pattern_and = rf'{suffix}[.]?[ ,;]?[\W]*?[Aa][Nn][Dd]? +'
        sep_pattern_comma_ampersand = rf'{suffix}[.]?[ ]*[,;&/][\W]?[ ]?'
        companyName = re.sub(sep_pattern_and, f'{suffix_repl}. | ', companyName)
        companyName = re.sub(sep_pattern_comma_ampersand, f'{suffix_repl}. | ', companyName)

    return companyName


In [3]:
def stripName(companyName: List[str]) -> List[str]:
    """Remove a trailing legal-form suffix from each company name.

    A suffix is only removed when it is preceded by at least one space and
    sits at the very end of the name (optionally followed by a single dot).

    Args:
        companyName: Names to clean.  A single string is also accepted and is
            treated as a one-element list.

    Returns:
        A new list with one entry per input; entries that are not strings
        (NaN, numbers, ...) are kept as they are.
    """
    suffixes = ['[Ll][Tt][Dd]', '[Pp][Tt][Ee]', '[Pp][Tt][Ee][.]? +[Ll][Tt][Dd]', '[Ii][Nn][Cc]', '[Cc][Oo][Rr][Pp]', '[Gg][Mm][Bb][Hh]', '[Jj].?[Vv]', '[Ll][Ll][Cc]', '[Pp][Ll][Cc]', '[Ll][Ll][Pp]', '[Cc][Oo][.]?[,]? +[Pp][Tt][Ee][.]?[,]? +[Ll][Tt][Dd]', '[Cc][Oo][.]?[,]? +[Ll][Tt][Dd]', '[Cc][Oo][Rr][Pp][.]?[,]? +[Pp][Tt][Ee][.]?[,]? +[Ll][Tt][Dd]']
    pattern = '|'.join(f' +{suffix}[.]?$' for suffix in suffixes)

    names = [companyName] if isinstance(companyName, str) else companyName
    stripped_name = []
    for name in names:
        try:
            stripped_name.append(re.sub(pattern, '', name))
        # A tuple is required here: "AttributeError and TypeError" evaluates
        # to TypeError alone, silently dropping the first exception type.
        except (AttributeError, TypeError):
            stripped_name.append(name)
    return stripped_name


texts = ['abc Pte. Ltd.', 'ccc inc', 'cbc gMbh', 'ddd J.V', 'abc Co. Pte. Ltd', np.nan, 7]
stripName(texts)

['abc', 'ccc', 'cbc', 'ddd', 'abc', nan, 7]

In [44]:
def name_by_keyword(name: str, dictionary: dict):
    """Look up the unified name for ``name`` by keyword containment.

    The keys of ``dictionary`` are tested in iteration order against the
    lower-cased ``name``; the value of the first key that occurs as a
    substring is returned.  When no key matches, the integer ``0`` is
    returned as a "not found" marker.
    """
    return next(
        (unified for keyword, unified in dictionary.items() if keyword in name.lower()),
        0,
    )

In [4]:
# test
company = 'REA pte., LTD, SOreal LLC and Cushman & Wakefield, Incorporated/ P&G PlC & JLL and Lasalle, consultant, IP inc.AND CDL intl pte., ltd.;,Capitaland and hdb joint-venture and cc group,and VW GMbH'
nameFormat(company)

'REA Pte. Ltd. | SOreal LLC. | Cushman & Wakefield, Inc. | P&G Plc. | JLL and Lasalle, consultant, IP Inc. | CDL intl Pte. Ltd. | Capitaland and hdb J.V. | cc Group. | VW GMbH'

In [5]:
gls = pd.read_csv(r"G:\REA\Working files\land-bidding\land_sales_full_data\ready for uploading\gls_details_filled_full.csv")
# gls['price_psm_gfa'] = gls.tender_price/gls.gfa_sqm
gls.head()

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,land_parcel_std,street,zone,region,join_by,...,award_month_index,year_award,month_award,day_award,merge_key,tenderer_rank,tenderer_name,tender_price,price_psm_gfa,source_file
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Queenstown S9b,Dundee Road,queenstown,central region,project name,...,201506,2015,6,30,Queenstown S9b,1,HY Realty Pte Ltd,483178000.0,9376.82,
1,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Queenstown S9b,Dundee Road,queenstown,central region,project name,...,201506,2015,6,30,Queenstown S9b,2,Allgreen Properties Limited,445910000.0,8653.57,
2,f503abbef02408955cd92d63528651696f846da9faa95b...,12/12/2017,27/2/2018,7/3/2018,Punggol E13,Punggol E13,Sumang Walk,punggol,north-east region,project name,...,201803,2018,3,7,Punggol E13,2,Qingjian Realty (Residential) Pte. Ltd.,486000000.0,5987.51,
3,f503abbef02408955cd92d63528651696f846da9faa95b...,12/12/2017,27/2/2018,7/3/2018,Punggol E13,Punggol E13,Sumang Walk,punggol,north-east region,project name,...,201803,2018,3,7,Punggol E13,1,CDL Constellation Pte. Ltd. and TID Residentia...,509370000.0,6275.43,
4,c278da3f1270e62fd626cb02ebbc60f36d81f51ae33e3a...,29/6/2016,23/8/2016,5/9/2016,Sengkang E20,Sengkang E20,Anchorvale Lane,sengkang,north-east region,project name,...,201609,2016,9,5,Sengkang E20,2,Wee Hur Development Pte Ltd,235000000.0,3727.56,


In [6]:
# keep only the rows that actually carry a tenderer name
gls = gls.dropna(subset=['tenderer_name'], axis=0)
# remove "(as trustee ... trust)" qualifiers from the names; the pattern is a
# raw string so that "\(" / "\)" are not treated as (invalid) Python escapes
gls.tenderer_name = gls.tenderer_name.apply(lambda x: re.sub(r'\(? ?[Aa][Ss] ?[Tt][Rr][Uu][Ss][Tt][Ee][Ee].*?[Tt][Rr][Uu][Ss][Tt] ?\)?', '', x))

In [7]:
# gls_top3 = pd.DataFrame(gls.tenderer_name.dropna())
gls["separated_names"] = gls.tenderer_name.apply(nameFormat)
gls.head()

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,land_parcel_std,street,zone,region,join_by,...,year_award,month_award,day_award,merge_key,tenderer_rank,tenderer_name,tender_price,price_psm_gfa,source_file,separated_names
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Queenstown S9b,Dundee Road,queenstown,central region,project name,...,2015,6,30,Queenstown S9b,1,HY Realty Pte Ltd,483178000.0,9376.82,,HY Realty Pte. Ltd.
1,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Queenstown S9b,Dundee Road,queenstown,central region,project name,...,2015,6,30,Queenstown S9b,2,Allgreen Properties Limited,445910000.0,8653.57,,Allgreen Properties Ltd.
2,f503abbef02408955cd92d63528651696f846da9faa95b...,12/12/2017,27/2/2018,7/3/2018,Punggol E13,Punggol E13,Sumang Walk,punggol,north-east region,project name,...,2018,3,7,Punggol E13,2,Qingjian Realty (Residential) Pte. Ltd.,486000000.0,5987.51,,Qingjian Realty (Residential) Pte. Ltd.
3,f503abbef02408955cd92d63528651696f846da9faa95b...,12/12/2017,27/2/2018,7/3/2018,Punggol E13,Punggol E13,Sumang Walk,punggol,north-east region,project name,...,2018,3,7,Punggol E13,1,CDL Constellation Pte. Ltd. and TID Residentia...,509370000.0,6275.43,,CDL Constellation Pte. Ltd. | TID Residential ...
4,c278da3f1270e62fd626cb02ebbc60f36d81f51ae33e3a...,29/6/2016,23/8/2016,5/9/2016,Sengkang E20,Sengkang E20,Anchorvale Lane,sengkang,north-east region,project name,...,2016,9,5,Sengkang E20,2,Wee Hur Development Pte Ltd,235000000.0,3727.56,,Wee Hur Development Pte. Ltd.


In [8]:
# gls['tenderer_rank'] = gls.tenderer_rank.apply(lambda x: int(re.findall('\d+', x)[0]))

In [9]:
# gls_top3 = gls[gls.tenderer_rank<=3]
# split the ' | '-delimited string in separated_names into a list of names
gls["list_of_tenderers"] = gls.separated_names.apply(lambda x: x.split(' | '))
# how many names that list holds for each row
gls["num_tenderers_same_rank"] = gls.list_of_tenderers.apply(lambda x: len(x))
gls.head()

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,land_parcel_std,street,zone,region,join_by,...,day_award,merge_key,tenderer_rank,tenderer_name,tender_price,price_psm_gfa,source_file,separated_names,list_of_tenderers,num_tenderers_same_rank
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Queenstown S9b,Dundee Road,queenstown,central region,project name,...,30,Queenstown S9b,1,HY Realty Pte Ltd,483178000.0,9376.82,,HY Realty Pte. Ltd.,[HY Realty Pte. Ltd.],1
1,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Queenstown S9b,Dundee Road,queenstown,central region,project name,...,30,Queenstown S9b,2,Allgreen Properties Limited,445910000.0,8653.57,,Allgreen Properties Ltd.,[Allgreen Properties Ltd.],1
2,f503abbef02408955cd92d63528651696f846da9faa95b...,12/12/2017,27/2/2018,7/3/2018,Punggol E13,Punggol E13,Sumang Walk,punggol,north-east region,project name,...,7,Punggol E13,2,Qingjian Realty (Residential) Pte. Ltd.,486000000.0,5987.51,,Qingjian Realty (Residential) Pte. Ltd.,[Qingjian Realty (Residential) Pte. Ltd.],1
3,f503abbef02408955cd92d63528651696f846da9faa95b...,12/12/2017,27/2/2018,7/3/2018,Punggol E13,Punggol E13,Sumang Walk,punggol,north-east region,project name,...,7,Punggol E13,1,CDL Constellation Pte. Ltd. and TID Residentia...,509370000.0,6275.43,,CDL Constellation Pte. Ltd. | TID Residential ...,"[CDL Constellation Pte. Ltd., TID Residential ...",2
4,c278da3f1270e62fd626cb02ebbc60f36d81f51ae33e3a...,29/6/2016,23/8/2016,5/9/2016,Sengkang E20,Sengkang E20,Anchorvale Lane,sengkang,north-east region,project name,...,5,Sengkang E20,2,Wee Hur Development Pte Ltd,235000000.0,3727.56,,Wee Hur Development Pte. Ltd.,[Wee Hur Development Pte. Ltd.],1


In [10]:
gls.num_tenderers_same_rank.describe()

count    881.000000
mean       1.379115
std        0.668594
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        4.000000
Name: num_tenderers_same_rank, dtype: float64

In [11]:
# gls_top3['stripped_names'] = td_names.list_of_tenderers.apply(stripName)
# create additional cols for multiple tenderers
# the longest name list decides how many tenderer_<n> columns are created
max_num_td = gls.num_tenderers_same_rank.max()
# right-pad every list with NaN so that all lists have max_num_td entries
gls['tenderer_names_filled'] = gls.list_of_tenderers.apply(lambda x: x + [np.nan] * (max_num_td - len(x)))
# column tenderer_<k> takes the k-th entry (1-based) of the padded list
for col in range(max_num_td):
    gls[f"tenderer_{col + 1}"] = gls.tenderer_names_filled.apply(lambda x: x[col])
gls.head()

Unnamed: 0,sg_gls_id,date_launch,date_close,date_award,land_parcel,land_parcel_std,street,zone,region,join_by,...,price_psm_gfa,source_file,separated_names,list_of_tenderers,num_tenderers_same_rank,tenderer_names_filled,tenderer_1,tenderer_2,tenderer_3,tenderer_4
0,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Queenstown S9b,Dundee Road,queenstown,central region,project name,...,9376.82,,HY Realty Pte. Ltd.,[HY Realty Pte. Ltd.],1,"[HY Realty Pte. Ltd., nan, nan, nan]",HY Realty Pte. Ltd.,,,
1,8c101477584713a9310e02940aac6b059e37833ab6848d...,29/4/2015,23/6/2015,30/6/2015,Queenstown S9b,Queenstown S9b,Dundee Road,queenstown,central region,project name,...,8653.57,,Allgreen Properties Ltd.,[Allgreen Properties Ltd.],1,"[Allgreen Properties Ltd., nan, nan, nan]",Allgreen Properties Ltd.,,,
2,f503abbef02408955cd92d63528651696f846da9faa95b...,12/12/2017,27/2/2018,7/3/2018,Punggol E13,Punggol E13,Sumang Walk,punggol,north-east region,project name,...,5987.51,,Qingjian Realty (Residential) Pte. Ltd.,[Qingjian Realty (Residential) Pte. Ltd.],1,"[Qingjian Realty (Residential) Pte. Ltd., nan,...",Qingjian Realty (Residential) Pte. Ltd.,,,
3,f503abbef02408955cd92d63528651696f846da9faa95b...,12/12/2017,27/2/2018,7/3/2018,Punggol E13,Punggol E13,Sumang Walk,punggol,north-east region,project name,...,6275.43,,CDL Constellation Pte. Ltd. | TID Residential ...,"[CDL Constellation Pte. Ltd., TID Residential ...",2,"[CDL Constellation Pte. Ltd., TID Residential ...",CDL Constellation Pte. Ltd.,TID Residential Pte. Ltd.,,
4,c278da3f1270e62fd626cb02ebbc60f36d81f51ae33e3a...,29/6/2016,23/8/2016,5/9/2016,Sengkang E20,Sengkang E20,Anchorvale Lane,sengkang,north-east region,project name,...,3727.56,,Wee Hur Development Pte. Ltd.,[Wee Hur Development Pte. Ltd.],1,"[Wee Hur Development Pte. Ltd., nan, nan, nan]",Wee Hur Development Pte. Ltd.,,,


In [34]:
# load the keyword -> unified name table
name_dict = pd.read_csv(r'G:\REA\Working files\land-bidding\land_sales_full_data\ready for uploading\tenderer_name_dict_keywords.csv')
# na_idx = name_dict[name_dict.unified_name.isna()].index
# name_dict.iloc[na_idx, 2] = name_dict.iloc[na_idx, 0]
# name_dict = name_dict.iloc[:, [0,2]]
# only these two columns are used below
name_dict = name_dict[['key_word', 'unified_name']]
# plain dict: lower-cased key_word -> unified_name (name_by_keyword compares
# the keys against a lower-cased tenderer name)
name_dict_dic = dict(zip(list(name_dict.key_word.apply(lambda x: x.lower())), list(name_dict.unified_name)))

In [35]:
# unpivot table: one row per (parcel, rank, individual tenderer)
import re
td_df = gls[['land_parcel_std', 'separated_names', 'tenderer_rank', 'num_tenderers_same_rank', 'tender_price', 'price_psm_gfa', 'tenderer_1', 'tenderer_2', 'tenderer_3', 'tenderer_4',]]
# melt the tenderer_<n> columns into tenderer_id / tenderer_name and drop the
# NaN padding rows
td_df = td_df.melt(id_vars=['land_parcel_std', 'separated_names', 'tenderer_rank', 'num_tenderers_same_rank', 'tender_price', 'price_psm_gfa'], var_name='tenderer_id', value_name='tenderer_name').dropna(axis = 0, subset = ['tenderer_name'])
# drop parenthesised parts, collapse runs of spaces, trim; the first pattern
# is a raw string so that "\(" / "\)" are not (invalid) Python escapes
td_df['tenderer_name'] = (
    td_df.tenderer_name
    .apply(lambda x: re.sub(r'\(.*?\)', '', x))
    .apply(lambda x: re.sub(' +', ' ', x))
    .apply(lambda x: x.strip())
)
td_df

Unnamed: 0,land_parcel_std,separated_names,tenderer_rank,num_tenderers_same_rank,tender_price,price_psm_gfa,tenderer_id,tenderer_name
0,Queenstown S9b,HY Realty Pte. Ltd.,1,1,4.831780e+08,9376.82,tenderer_1,HY Realty Pte. Ltd.
1,Queenstown S9b,Allgreen Properties Ltd.,2,1,4.459100e+08,8653.57,tenderer_1,Allgreen Properties Ltd.
2,Punggol E13,Qingjian Realty (Residential) Pte. Ltd.,2,1,4.860000e+08,5987.51,tenderer_1,Qingjian Realty Pte. Ltd.
3,Punggol E13,CDL Constellation Pte. Ltd. | TID Residential ...,1,2,5.093700e+08,6275.43,tenderer_1,CDL Constellation Pte. Ltd.
4,Sengkang E20,Wee Hur Development Pte. Ltd.,2,1,2.350000e+08,3727.56,tenderer_1,Wee Hur Development Pte. Ltd.
...,...,...,...,...,...,...,...,...
2753,Jurong West E1,Lum Chang Bldg Contractors Pte. Ltd. | L.C. De...,1,4,1.390000e+08,2363.95,tenderer_4,Comfort Group Ltd.
2798,Tampines E11,Maxdin Pte. Ltd. | BPK Development Pte. Ltd. |...,2,4,2.307000e+08,3970.67,tenderer_4,LMG Realty Pte. Ltd.
3037,Jurong West S3a,The Dynasty Corp. (S) Pte. Ltd. | SEADC Invest...,1,4,4.300000e+07,0.00,tenderer_4,GEMT Trading Pte. Ltd.
3221,Martin Place,Intrepid Investments Pte. Ltd. | Verwood Holdi...,2,4,5.880000e+08,13177.36,tenderer_4,Garden Estates Pte. Ltd.


In [36]:
name_by_keyword('I. P.', name_dict_dic)

'I. P. Property Fund Asia Ltd.'

In [46]:
# map every tenderer name to its unified name via keyword lookup
# (name_by_keyword returns 0 when no keyword matches)
td_df['unified_name'] = td_df.tenderer_name.apply(name_by_keyword, dictionary=name_dict_dic)
# export the rows without any keyword match
td_df[td_df.unified_name==0].to_csv('name_no_keyword.csv', index=False)

In [None]:
count_td = gls_uniname.groupby('unified_name').land_parcel.count()
# count_td.sort_values(by='tenderer_id', ascending=False).head()
count_td

In [None]:
# NOTE(review): gls_top3_uniname is not defined in the visible part of this
# notebook - confirm where it is created before running this cell.
# Mean tender price and price per sqm GFA per unified tenderer name.  The
# columns are selected with a list: indexing a groupby with a bare tuple of
# column names is rejected by current pandas.
avg_td = gls_top3_uniname.groupby('unified_name')[['tender_price', 'price_psm_gfa']].mean()
# render both averages as strings with two decimals
avg_td = avg_td.transform({'tender_price': lambda x: '%.2f' %x, 'price_psm_gfa': lambda x: '%.2f' %x})
avg_td

In [None]:
td_count_price = pd.merge(count_td, avg_td, how='left', on='unified_name')

In [None]:
rank1_only = gls_top3_uniname[gls_top3_uniname.tenderer_rank==1]
rank1_count = rank1_only.groupby('unified_name').land_parcel.count().rename('num_of_top1')
rank1_count

In [None]:
# attach the rank-1 counts; left join keeps every row of td_count_price
td_overall = pd.merge(td_count_price, rank1_count, how='left', on='unified_name')
# ratio of rank-1 count to the land_parcel count column
td_overall['top1%'] = td_overall.num_of_top1 / td_overall.land_parcel
# store the ratio as a percentage string with one decimal
td_overall['top1%'] = td_overall['top1%'].transform(lambda x: '%.1f' %(x*100))
td_overall.to_csv("tenderer_name_groupby.csv")

In [None]:
# funcs to calculate matching score of two words: complete match, return 1
def match_score(dictionary: str, lookup_value: str):
    """Score how well the characters of two words cover each other.

    Spaces are ignored.  For each word the share of its characters that also
    occur in the other word (hit) and the share that do not (miss) are
    computed; the result is ``hit_a * hit_b - miss_a * miss_b``, which is
    1.0 when both words use exactly the same characters.

    Returns ``np.nan`` when ``lookup_value`` is falsy or when either word has
    no characters left after removing spaces.
    """
    if not lookup_value:
        return np.nan

    dict_chars = dictionary.replace(" ", "")
    lookup_chars = lookup_value.replace(" ", "")
    n_dict = len(dict_chars)
    n_lookup = len(lookup_chars)
    if not n_dict * n_lookup > 0:
        return np.nan

    dict_alphabet = set(dict_chars)
    lookup_alphabet = set(lookup_chars)

    # every occurrence counts, so repeated characters weigh more
    hits_dict = sum(1 for ch in dict_chars if ch in lookup_alphabet)
    hits_lookup = sum(1 for ch in lookup_chars if ch in dict_alphabet)
    misses_dict = n_dict - hits_dict
    misses_lookup = n_lookup - hits_lookup

    return (hits_dict / n_dict) * (hits_lookup / n_lookup) - (misses_dict / n_dict) * (misses_lookup / n_lookup)

# funcs to calculate order score of two words: same order, return 0
def order_score(dictionary: str, lookup_value: str):
    """Measure how differently the characters of two words are ordered.

    Every character of ``dictionary`` (repeats included) is compared by the
    position of its first occurrence in ``dictionary`` and in
    ``lookup_value``.  Positions in ``lookup_value`` are shifted so that the
    first character of ``dictionary``, when it occurs there, sits at 0.  A
    character missing from ``lookup_value`` contributes 1, any other
    character the squared position difference.  The result is the square
    root of the mean contribution, i.e. 0.0 for identically ordered words.

    Returns ``np.nan`` when ``dictionary`` is empty.
    """
    # An empty word has nothing to compare.  Previously this case raised
    # IndexError on ``dictionary[0]`` before the intended NaN fallback for a
    # zero-length word could be reached.
    if not dictionary:
        return np.nan

    # anchor: where the first character of ``dictionary`` first appears in
    # ``lookup_value``; no shift when it does not appear at all
    anchor = lookup_value.find(dictionary[0])
    if anchor < 0:
        anchor = 0

    squared_error = 0
    for char in dictionary:
        pos_in_lookup = lookup_value.find(char)
        if pos_in_lookup < 0:
            squared_error += 1
        else:
            squared_error += (dictionary.index(char) - (pos_in_lookup - anchor)) ** 2

    return (squared_error / len(dictionary)) ** 0.5

def unpack_poly1d(poly1d):
    """Return the coefficients stored in ``poly1d.coef`` as a list."""
    coefficients = poly1d.coef
    return list(coefficients)

# decay func to calculate weight of words in different place: front words have higher weight
def decayFunc(start_x, start_y, end_x, end_y):
    """Fit a degree-3 polynomial to two points and return its coefficients.

    The points are ``(start_x, start_y)`` and ``(end_x, end_y)``.  The
    coefficients come back as a list in ``numpy.poly1d`` order, via
    ``unpack_poly1d``.

    NOTE(review): two points do not determine a cubic, so numpy may warn that
    the fit is poorly conditioned - confirm that degree 3 is intended.
    """
    from numpy import polyfit, poly1d

    xs = [start_x, end_x]
    ys = [start_y, end_y]
    equation = poly1d(polyfit(xs, ys, 3))
    return unpack_poly1d(equation)

# func to generate weight list
def weightList(coef: List[float], n_weight: List):
    """Evaluate a polynomial at every point of ``n_weight``.

    Args:
        coef: Polynomial coefficients, highest power first (the last entry is
            the constant term).
        n_weight: Points at which the polynomial is evaluated.

    Returns:
        A list with one polynomial value per entry of ``n_weight``.
    """
    # reverse once so that the list index equals the power of x
    ascending = coef[::-1]
    w_list = []

    for x in n_weight:
        total = 0
        for power, c in enumerate(ascending):
            total += c * x ** power
        w_list.append(total)

    return w_list

def get_max_min_len(nameA: str, nameB: str) -> Tuple[int, int]:
    """Return ``(max, min)`` of the word counts of the two names.

    Words are obtained with ``split(' ')``, so an empty string counts as one
    (empty) word and consecutive spaces produce empty words.
    """
    n_words_a = len(nameA.split(' '))
    n_words_b = len(nameB.split(' '))
    return max(n_words_a, n_words_b), min(n_words_a, n_words_b)

# func to calculate similarity
def similarity(nameA: str, nameB: str, w_h = 0.9, w_l = 0.1)-> float:
    """Position-weighted share of words that the two names have in common.

    Both names are split on single spaces.  Each word of the shorter name
    that also occurs in the longer name adds its position weight to the
    shorter name's score, and vice versa.  The weights come from
    ``decayFunc``/``weightList``, fitted from ``w_h`` at position 1 to
    ``w_l`` at the last position of the respective name.  Each score is
    divided by the word count of its name and the two quotients are added.
    """
    max_len, min_len = get_max_min_len(nameA, nameB)
    short_weights = weightList(decayFunc(1, w_h, min_len, w_l), list(range(1, min_len+1)))
    long_weights = weightList(decayFunc(1, w_h, max_len, w_l), list(range(1, max_len+1)))

    short_words = nameA.split(' ')
    long_words = nameB.split(' ')
    # make sure short_words really is the name with fewer words
    if len(short_words) != min_len:
        short_words, long_words = long_words, short_words

    short_score = 0
    for word, weight in zip(short_words, short_weights):
        if word in long_words:
            short_score += weight

    long_score = 0
    for word, weight in zip(long_words, long_weights):
        if word in short_words:
            long_score += weight

    return short_score/len(short_words) + long_score/len(long_words)

# func to calculate distance between two words
def distance(nameA: str, nameB: str, w_h = 0.9, w_l = 0.1)-> float:
    """Weighted word-by-word distance between two names.

    Only the first ``n`` words of each name are compared, ``n`` being the
    word count of the shorter name.  For every word pair the distance is
    ``sqrt((1 - match_score)**2 + order_score**2)``.  The pair distances are
    combined as a dot product with the position weights from
    ``decayFunc``/``weightList``, and the spread of the two word counts
    around their mean is added.
    """
    same_len = get_max_min_len(nameA, nameB)[1]
    # weightList returns exactly one weight per compared position
    w_list = weightList(decayFunc(1, w_h, same_len, w_l), list(range(1, same_len+1)))

    words_a = nameA.split(' ')
    words_b = nameB.split(' ')

    euc_d = []
    for word_a, word_b in zip(words_a[:same_len], words_b[:same_len]):
        mismatch = 1-match_score(word_a, word_b)
        disorder = order_score(word_a, word_b)
        euc_d.append((mismatch**2 + disorder**2)**0.5)

    table = pd.DataFrame({'euc_index': euc_d, 'weight': w_list})
    weighted = table.iloc[:, 0].dot(table.weight)

    n_a = len(words_a)
    n_b = len(words_b)
    len_mean = (n_a + n_b) / 2
    len_var = (((n_a-len_mean)**2 + (n_b-len_mean)**2)/2) ** 0.5

    return weighted + len_var

# func to combine similarity and distance to get a comprehensive score
def overallSim(nameA: str, nameB: str, w_h = 0.9, w_l = 0.1):
    """Combine ``similarity`` and ``distance`` into one score.

    Returns ``similarity(...) - 0.1 * distance(...)``; both are called with
    the same ``w_h`` / ``w_l`` weights.
    """
    closeness = similarity(nameA, nameB, w_h, w_l)
    penalty = distance(nameA, nameB, w_h, w_l)*0.1
    return closeness - penalty


In [None]:
companyA = 'CDL Properties'
companyB = 'CDL Real Estate'
companyC = 'Hon Sui Sen Properties What what'
companyD = 'Hon Heung Investments'
companyE = 'Hon Tian Sen Properties'
companyF = 'Hon Sui Sen'

overallSim(companyA, companyB)


In [None]:
td_minor = count_td[count_td.successful_tenderer_id < 10]
unique_name = pd.Series(td_minor.successful_tenderer_name.unique())
unique_name

In [None]:
# create a dict to find full name for stripped names
stripped = unique_name.apply(lambda x: stripName(x)[0])

In [None]:
def find_most_likely_name(name, ref: List[str]):
    """Find the entry of ``ref`` that scores highest against ``name``.

    Args:
        name: The name to match.
        ref: Candidate names; entries equal to ``name`` are skipped.

    Returns:
        A tuple ``(best_candidate, score)`` where ``score`` is
        ``overallSim(name, best_candidate)``.  On ties the candidate seen
        first wins.

    Raises:
        ValueError: If ``ref`` holds no candidate different from ``name``.
    """
    scores = {}
    for candidate in ref:
        if candidate != name:
            scores[candidate] = overallSim(name, candidate)

    if not scores:
        # same exception type as before (max() on an empty dict), but with a
        # message that says what went wrong
        raise ValueError(f"no candidate other than {name!r} to compare against")

    best = max(scores, key=scores.get)
    return best, scores[best]

# td_minor['most_likely_name'] = td_minor.reset_index().successful_tenderer_name.apply(lambda x: find_most_likely_name(x, ref=unique_name))
likely_names = stripped.apply(lambda x: find_most_likely_name(x, ref=stripped))

In [None]:
name_dict = pd.DataFrame({"names": unique_name, "likely_names": likely_names.apply(lambda x: x[0]), "likelihood": likely_names.apply(lambda x: x[1])})
name_dict.to_csv("tenderer_name_dict.csv", index=False)

In [None]:
td_minor = td_minor.reset_index()
test.successful_tenderer_name.apply(lambda x: find_most_likely_name(x, unique_name)[0])

# Missing parcels in master table

In [11]:
missing = pd.read_csv(r'G:\REA\Working files\land-bidding\land_sales_full_data\missing_parcels.csv')

In [14]:
import re
# replace every "/" in the parcel name with " + "
# (presumably to match the naming of the scraped files - confirm)
missing['land_parcel'] = missing.land_parcel.apply(lambda x: re.sub('/', ' + ', x))
# plain list of the rewritten parcel names
miss_list = list(missing.land_parcel)

In [20]:
import os
all_scraped = os.listdir(r'G:\REA\Working files\land-bidding\Table extraction\tenderer_details_ura\temp')
# 1 when the parcel name occurs inside at least one directory entry, else 0
status = [int(any(parcel in entry for entry in all_scraped)) for parcel in miss_list]
pd.DataFrame({'missing_parcel': miss_list, 'scraped': status})

Unnamed: 0,missing_parcel,scraped
0,Lengkong Empat,1
1,Tuas Bay Drive,1
2,Raffles Quay + Marina Boulevard,0
3,Mount Sinai Drive,0
4,Joo Chiat Place + Everitt Road,0
...,...,...
145,Bukit Batok Street 23,1
146,Bedok South Avenue 3,1
147,Boon Lay Way,1
148,Soon Lee Street,1


In [27]:
gls.index = gls.sg_gls_id

In [5]:
from datetime import date
from dateutil.relativedelta import relativedelta
# record = gls.loc['1151a728199876797747429b92440bcdb05233bed522078cddb609700a0ea939']
# difference between the two dates (a timedelta)
# NOTE(review): the result is bound to the name `date`, which shadows the
# class imported above until the import line is executed again.
date = date(2022, 10, 19) - date(2022, 3, 29)
date

datetime.timedelta(days=204)