In [1]:
import numpy as np
import pandas as pd
import duckdb
# suppress scientific notation by setting float_format
pd.set_option('display.float_format', '{:f}'.format)

In [2]:
# Conect to Database
con = duckdb.connect(database='test.db', read_only=False)
con.execute('INSTALL spatial')
con.execute('LOAD spatial')
#TODO build training data
con.sql('SHOW TABLES').fetchdf()

Unnamed: 0,name
0,compustat
1,compustat_geocoded
2,coord_matches
3,fnam
4,fnpm
5,matched
6,nets_all
7,nets_pub
8,none_matches
9,training_matches


In [76]:
# Get one random row of nets
con.sql('SELECT * FROM nets_pub ORDER BY RANDOM() LIMIT 1').fetchdf()

Unnamed: 0,hqduns,hqcompany,latitude,longitude
0,77719045,CHAUSBERNARDINC,40.7533,-73.9869


In [77]:
prepared_statement_name_pub = """
    CREATE TABLE tmp AS
    SELECT
        hqduns, 
        hqcompany,
        latitude,
        longitude,
        jaro_winkler_similarity(n.hqcompany, ?) AS similarity_score
    FROM
        nets_pub n
        --we can use other similarity functions
        --we can also use other similarity scores
"""
prepared_statement_name_all = """
    CREATE TABLE tmp AS
    SELECT
        hqduns, 
        hqcompany,
        latitude,
        longitude,
        jaro_winkler_similarity(n.hqcompany, ?) AS similarity_score
    FROM
        nets_all n
        --we can use other similarity functions
        --we can also use other similarity scores
"""

prepared_statement_add_match = """
    INSERT INTO matched
    VALUES (?, --gvkey 
            (SELECT hqcompany FROM compustat WHERE gvkey = ?), --hqcompany
            ?, --hqduns_pub
            (SELECT hqcompany FROM nets_pub WHERE hqduns = ?),
            ?, --name similarity_pub
            ?, --distance pub
            ?, --hqduns_all
            (SELECT hqcompany FROM nets_all WHERE hqduns = ?),
            ?,  --name similarity_all
            ?); --distance all
"""

prepared_statement_geo_match_all = """
    SELECT 
        hqduns,
        similarity_score,
        ST_Distance(
            ST_Point(n.latitude, n.longitude)::GEOMETRY,
            ST_Point(?, ?)::GEOMETRY
            ) AS distance
    FROM fnam n
    ORDER BY distance ASC
    LIMIT 1;
"""

prepared_statement_geo_match_pub = """
    SELECT 
        hqduns,
        similarity_score,
        ST_Distance(
            ST_Point(n.latitude, n.longitude)::GEOMETRY,
            ST_Point(?, ?)::GEOMETRY
            ) AS distance
    FROM fnpm n
    ORDER BY distance ASC
    LIMIT 1;
"""

In [78]:
# Get the matches for a specific name
def name_similarity_pub(comp_name):
    con.execute('DROP TABLE IF EXISTS tmp;')
    con.execute(prepared_statement_name_pub, [comp_name])
    return con.sql('SELECT * FROM tmp WHERE similarity_score > 0.9 ORDER BY similarity_score DESC LIMIT 5').fetchdf() #TODO change threshold

def name_similarity_all(comp_name):
    con.execute('DROP TABLE IF EXISTS tmp;')
    con.execute(prepared_statement_name_all, [comp_name])
    return con.sql('SELECT * FROM tmp WHERE similarity_score > 0.9 ORDER BY similarity_score DESC LIMIT 10').fetchdf() #TODO change threshold


In [79]:
# # Matching process
# con.execute('DROP TABLE IF EXISTS matched;')
# 
# sql_query = """
# CREATE TABLE matched (
#     gvkey INTEGER,
#     hqcompany VARCHAR(255),
#     hqduns_pub INTEGER,
#     hqcompany_pub VARCHAR(255),
#     name_similarity_pub FLOAT,
#     distance_pub FLOAT,
#     hqduns_all INTEGER,
#     hqcompany_all VARCHAR(255),
#     name_similarity_all FLOAT,
#     distance_all FLOAT);
# """
#con.execute(sql_query)


In [3]:

compustat_df = con.sql('SELECT * FROM compustat').fetchdf()

# Fill missing values
compustat_df['lat'] = compustat_df['lat'].fillna(0)
compustat_df['lon'] = compustat_df['lon'].fillna(0)
# compustat_df['phone'] = compustat_df['phone'].fillna('')
compustat_df = compustat_df.sample(1) #compustat_df[8000:18000]

In [4]:
# Stats Counter
pub_match_all = 0

In [5]:
i = 0
for index, row in compustat_df.iterrows():
    i += 1
    print(f'Processing {i} of {compustat_df.shape[0]}')
    gvkey = row['gvkey']
    #check if gvkey is already in matched
    if con.sql(f'SELECT COUNT(*) FROM matched WHERE gvkey = {gvkey}').fetchdf()['count_star()'].iloc[0] > 0:
        print(f'Already processed {gvkey}, {row["hqcompany"]}')
        continue
    # Get candidate matches
    name_pub_match = name_similarity_pub(row['hqcompany'])
    
    try:
        filtered_name_pub_matches = name_pub_match[name_pub_match['similarity_score'] == name_pub_match['similarity_score'].iloc[0]]
        # Create a temporary table with the filtered results
        con.execute('DROP TABLE IF EXISTS fnpm;')
        con.execute('DROP TABLE IF EXISTS tmp;')
        con.execute('CREATE TABLE fnpm AS SELECT * FROM filtered_name_pub_matches;')
        pub_match = con.execute(prepared_statement_geo_match_pub, [row['lat'], row['lon']]).fetchdf().iloc[0]
    except IndexError:
        pub_match = pd.DataFrame(columns= ['hqduns', 'similarity_score', 'distance'])
        pub_match.loc[0] = [-1, -1, 9999999]  # Fill with default values
        
    name_all_matches = name_similarity_all(row['hqcompany'])
    
    try:
        # Get the top similarity score
        top_similarity_score = name_all_matches['similarity_score'].iloc[0]
        # Filter the DataFrame
        filtered_name_all_matches = name_all_matches[name_all_matches['similarity_score'] == top_similarity_score]
        # Create a temporary table with the filtered results
        con.execute('DROP TABLE IF EXISTS fnam;')
        con.execute('DROP TABLE IF EXISTS tmp;')
        con.execute('CREATE TABLE fnam AS SELECT * FROM filtered_name_all_matches;')
        name_all_match = con.execute(prepared_statement_geo_match_all, [row['lat'], row['lon']]).fetchdf().iloc[0]
    except IndexError:
        name_all_match = pd.DataFrame(columns= ['hqduns', 'similarity_score', 'distance'])
        name_all_match.loc[0] = [-1, -1, 9999999]  # Fill with default values

    if pub_match['hqduns'].item() in name_all_matches['hqduns'].values and pub_match['similarity_score'].item() >= name_all_match['similarity_score'].item():
        name_all_match = pub_match
        pub_match_all += 1
    
    try:    
        con.execute(prepared_statement_add_match, [gvkey, gvkey, pub_match['hqduns'].item(), pub_match['hqduns'].item(), pub_match['similarity_score'].item(), pub_match['distance'].item(), name_all_match['hqduns'].item(), name_all_match['hqduns'].item(), name_all_match['similarity_score'].item(), name_all_match['distance'].item()])
    except KeyError:
         con.execute(prepared_statement_add_match, [gvkey, gvkey, pub_match['hqduns'].item(), pub_match['hqduns'].item(), pub_match['similarity_score'].item(), pub_match['distance'].item(), name_all_match['hqduns'].item(), name_all_match['hqduns'].item(), name_all_match['similarity_score'].item(), -1])

    
    

Processing 1 of 1


NameError: name 'name_similarity_pub' is not defined

In [8]:
con.sql('SELECT * FROM matched').fetchdf().sample(100)

Unnamed: 0,gvkey,hqcompany,hqduns_pub,hqcompany_pub,name_similarity_pub,distance_pub,hqduns_all,hqcompany_all,name_similarity_all,distance_all
386,7523,MONITOR@INC,13135058,MONROINC,0.907197,35.323483,607420460,MONITOR@INC,1.000000,15.546094
58,10744,TRITONGRPLTD-OLD,50794817,TRITONGRPLTD,0.950000,7.189032,73678505,TRITONGRPLTD,0.950000,1.301611
49,5777,HUMPHREYINC,-1,,-1.000000,9999999.000000,31304116,HUMPHREYINC,1.000000,19.103863
430,7438,MINNETONKACORP,-1,,-1.000000,9999999.000000,194195397,MINNOKACORP,0.957143,15.369818
628,25019,PLAINSSPIRIT)CORP,796948693,PLAINSSPIRIT)CORP,1.000000,0.000085,796948693,PLAINSSPIRIT)CORP,1.000000,0.000085
...,...,...,...,...,...,...,...,...,...,...
22,264712,SHOPPING.COMLTD,-1,,-1.000000,9999999.000000,104579246,SHOPPINGCOMLTD,0.986667,106.415848
272,143605,MEDCATHCORP,21115162,MEDCATHCORP,1.000000,38.273224,21115162,MEDCATHCORP,1.000000,38.273224
659,176612,PROSHARESULTSH)S,-1,,-1.000000,9999999.000000,79731538,PROSHARESSHORT)S,0.935714,0.003086
316,141158,IRSAPROPIEDADESCOMERCIALES,-1,,-1.000000,9999999.000000,-1,,-1.000000,9999999.000000


In [10]:
matched = con.sql('SELECT COUNT(*) FROM matched WHERE name_similarity_pub >= 0.95').fetchdf()
matched_size = con.sql('SELECT COUNT(*) FROM matched').fetchdf()['count_star()'].iloc[0]
#print percentage of matched companies|
print(matched['count_star()'].iloc[0] / matched_size * 100)
print(matched['count_star()'].iloc[0])

35.56443556443556
356


In [11]:
# Show statistics
print(f'Number of companies: {matched_size}')
accuracy = con.sql('SELECT COUNT(*) FROM matched WHERE name_similarity_pub >= 0.95').fetchdf()['count_star()'].iloc[0] / matched_size * 100
accuray_with_all = con.sql('SELECT COUNT(*) FROM matched WHERE name_similarity_all >= 0.95').fetchdf()['count_star()'].iloc[0] / matched_size * 100
print(f'Accuracy with all companies: {accuray_with_all}%')
print(f'Accuracy only public companies: {accuracy}%')
same_match = pub_match_all / matched_size * 100
print(f'Entries with same match: {same_match}%')
no_pub_percentage = con.sql('SELECT COUNT(*) FROM matched WHERE hqduns_pub = -1').fetchdf()['count_star()'].iloc[0] / matched_size * 100
print(f'Entries with no pub match: {no_pub_percentage}%')
no_pub_but_all_percentage = con.sql('SELECT COUNT(*) FROM matched WHERE hqduns_pub = -1 AND hqduns_all != -1 AND name_similarity_all >= 0.95').fetchdf()['count_star()'].iloc[0] / matched_size * 100
print(f'Entries with no pub match but all match: {no_pub_but_all_percentage}%')
# All matches, where no pub match and distance < 1
no_pub_but_all_distance_percentage = con.sql('SELECT COUNT(*) FROM matched WHERE hqduns_pub = -1 AND hqduns_all != -1 AND distance_all < 1').fetchdf()['count_star()'].iloc[0] / matched_size * 100
print(f'Entries with no pub match but all match and distance < 1: {no_pub_but_all_distance_percentage}%')

Number of companies: 1001
Accuracy with all companies: 69.73026973026974%
Accuracy only public companies: 35.56443556443556%
Entries with same match: 0.0%
Entries with no pub match: 48.451548451548454%
Entries with no pub match but all match: 22.27772227772228%
Entries with no pub match but all match and distance < 1: 11.688311688311687%
