In [18]:
import numpy as np
import pandas as pd
import duckdb
# Show floats in plain fixed-point form (six decimals) instead of scientific notation
pd.options.display.float_format = '{:f}'.format

In [19]:
# Connect to the database file (opened read-write) and enable the spatial extension
con = duckdb.connect('database.db', read_only=False)
con.execute('INSTALL spatial').execute('LOAD spatial')
#TODO build training data
# List the tables currently present in the database
con.sql('SHOW TABLES').df()

Unnamed: 0,name
0,compustat
1,compustat_geocoded
2,fnpm
3,matched
4,nets_all
5,nets_pub
6,tmp


In [20]:
# Number of rows in the compustat table
con.sql('SELECT COUNT (*) FROM compustat').df()

Unnamed: 0,count_star()
0,39782


In [36]:
# Preview: 10 randomly drawn rows out of the first 1000 rows returned for compustat
(
    con.execute('SELECT * FROM compustat LIMIT 1000')
    .df()
    .sample(n=10)
)

Unnamed: 0,gvkey,hqcompany,hqzipcode,city,lat,lon
445,4402,ENTERTAINMENTPUBLISHINGCP,48083,TROY,42.59504,-83.171965
311,3333,ARTESYN@INC,33434-4105,BOCA RATON,26.365509,-80.167507
866,7853,BLAIRCORP,16366,WARREN,41.845637,-79.148695
541,5204,GOLDORELTDOFCRIPPLECR,,,3.901116,-73.075755
986,8764,PROSCANINC,80202,DENVER,39.74709,-104.997219
510,4976,STRATEGICVALUECORP,M5J 2W4,TORONTO,43.647447,-79.380118
63,1555,AMERICANSEATINGCO,49504,GRAND RAPIDS,42.978694,-85.682009
257,2903,CHAD^INC,91311,CHATSWORTH,34.255819,-118.57958
253,2877,CENTURYBUSINESSCREDIT,10018,NEW YORK,40.754029,-73.985899
944,8430,PENNCENTRALCO,45202,CINCINNATI,39.099723,-84.512324


In [22]:
# DuckDB configuration
# con.execute('SET memory_limit="10GB"')
# con.execute('SET threads TO 1')

In [23]:
# Peek at a single nets_pub row picked at random by the database
con.sql('SELECT * FROM nets_pub ORDER BY RANDOM() LIMIT 1').df()

Unnamed: 0,hqduns,hqcompany,latitude,longitude
0,943170548,NOVACORP,39.3937,-107.089203


In [24]:
# SQL (1 bound parameter: the company name to compare against).
# Materialises table `tmp` holding every nets_pub row (hqduns, hqcompany, latitude,
# longitude) plus `similarity_score`, the jaro_winkler_similarity between that row's
# hqcompany and the bound name. No threshold is applied here; callers filter `tmp`.
# This is a plain CREATE TABLE, so `tmp` has to be dropped before each execution.
prepared_statement_name_pub = """
    CREATE TABLE tmp AS
    SELECT
        hqduns, 
        hqcompany,
        latitude,
        longitude,
        jaro_winkler_similarity(n.hqcompany, ?) AS similarity_score
    FROM
        nets_pub n
        --we can use other similarity functions
        --we can also use other similarity scores
"""
# SQL (2 bound parameters, in order: company name, city).
# Same shape as the nets_pub variant, but reads nets_all and keeps only rows whose
# hqcity equals the bound city before scoring hqcompany with jaro_winkler_similarity.
# The result is materialised as table `tmp` (plain CREATE TABLE: drop `tmp` first).
# NOTE(review): the city comparison is an exact `=`; whether hqcity and the caller's
# city share the same casing/formatting is not visible here - confirm.
prepared_statement_name_all = """
    CREATE TABLE tmp AS
    SELECT
        hqduns, 
        hqcompany,
        latitude,
        longitude,
        jaro_winkler_similarity(n.hqcompany, ?) AS similarity_score
    FROM
        nets_all n
        WHERE hqcity = ?
        --we can use other similarity functions
        --we can also use other similarity scores
"""

# SQL (10 bound parameters) inserting one row into `matched`.
# Parameter order: gvkey, gvkey (again, for the compustat name lookup), hqduns_pub,
# hqduns_pub (again, for the nets_pub name lookup), similarity_pub, distance_pub,
# hqduns_all, hqduns_all (again, for the nets_all name lookup), similarity_all,
# distance_all.
# The three company names are resolved by scalar sub-selects rather than passed in.
# The INSERT has no column list, so the values rely on the column order of `matched`.
# NOTE(review): the sub-selects assume gvkey / hqduns identify a single row in their
# tables - confirm, otherwise the looked-up name is ambiguous.
prepared_statement_add_match = """
    INSERT INTO matched
    VALUES (?, --gvkey 
            (SELECT hqcompany FROM compustat WHERE gvkey = ?), --hqcompany
            ?, --hqduns_pub
            (SELECT hqcompany FROM nets_pub WHERE hqduns = ?),
            ?, --name similarity_pub
            ?, --distance pub
            ?, --hqduns_all
            (SELECT hqcompany FROM nets_all WHERE hqduns = ?),
            ?,  --name similarity_all
            ?); --distance all
"""

# SQL (2 bound parameters forming the reference point, passed to ST_Point in order).
# From table `fnam` (the top-scoring nets_all name candidates), returns the single row
# (hqduns, similarity_score, distance) with the smallest ST_Distance between
# ST_Point(latitude, longitude) and the reference point.
# The reference point must use the same (latitude, longitude) argument order as the
# candidate point so both are built consistently.
# NOTE(review): ST_Distance is presumably a planar distance, i.e. expressed in
# coordinate units (degrees) rather than metres - confirm against the spatial docs.
prepared_statement_geo_match_all = """
    SELECT 
        hqduns,
        similarity_score,
        ST_Distance(
            ST_Point(n.latitude, n.longitude)::GEOMETRY,
            ST_Point(?, ?)::GEOMETRY
            ) AS distance
    FROM fnam n
    ORDER BY distance ASC
    LIMIT 1;
"""

# SQL (2 bound parameters forming the reference point, passed to ST_Point in order).
# From table `fnpm` (the top-scoring nets_pub name candidates), returns the single row
# (hqduns, similarity_score, distance) with the smallest ST_Distance between
# ST_Point(latitude, longitude) and the reference point.
# The reference point must use the same (latitude, longitude) argument order as the
# candidate point so both are built consistently.
# NOTE(review): ST_Distance is presumably a planar distance, i.e. expressed in
# coordinate units (degrees) rather than metres - confirm against the spatial docs.
prepared_statement_geo_match_pub = """
    SELECT 
        hqduns,
        similarity_score,
        ST_Distance(
            ST_Point(n.latitude, n.longitude)::GEOMETRY,
            ST_Point(?, ?)::GEOMETRY
            ) AS distance
    FROM fnpm n
    ORDER BY distance ASC
    LIMIT 1;
"""

In [25]:
# Get the matches for a specific name
def name_similarity_pub(comp_name, threshold=0.9, limit=5):
    """Return the best-scoring nets_pub name candidates for a company name.

    Rebuilds the scratch table ``tmp`` by scoring every nets_pub row against
    ``comp_name`` (via ``prepared_statement_name_pub``) and then reads back the
    rows above the similarity threshold.

    Parameters
    ----------
    comp_name : str
        Company name to compare against nets_pub.hqcompany.
    threshold : float, default 0.9
        Only rows with ``similarity_score`` strictly greater than this are kept.
    limit : int, default 5
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Columns of ``tmp`` (hqduns, hqcompany, latitude, longitude,
        similarity_score), sorted by similarity_score descending. Empty when no
        row exceeds the threshold.

    Side effects
    ------------
    Drops and recreates table ``tmp`` on the shared connection ``con``.
    """
    con.execute('DROP TABLE IF EXISTS tmp;')
    con.execute(prepared_statement_name_pub, [comp_name])
    # The threshold is bound as a parameter; the limit is forced through int() so
    # only a plain integer can ever reach the SQL text.
    query = (
        'SELECT * FROM tmp WHERE similarity_score > ? '
        f'ORDER BY similarity_score DESC LIMIT {int(limit)}'
    )
    return con.execute(query, [threshold]).fetchdf()

def name_similarity_all(comp_name, city, threshold=0.9, limit=10):
    """Return the best-scoring nets_all name candidates located in a given city.

    Rebuilds the scratch table ``tmp`` by scoring the nets_all rows whose hqcity
    equals ``city`` against ``comp_name`` (via ``prepared_statement_name_all``)
    and then reads back the rows above the similarity threshold.

    Parameters
    ----------
    comp_name : str
        Company name to compare against nets_all.hqcompany.
    city : str or None
        City used for the exact ``hqcity = ?`` filter. A missing value (None/NaN)
        is bound as SQL NULL, which matches no row, so the result is empty.
    threshold : float, default 0.9
        Only rows with ``similarity_score`` strictly greater than this are kept.
    limit : int, default 10
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Columns of ``tmp`` (hqduns, hqcompany, latitude, longitude,
        similarity_score), sorted by similarity_score descending. Empty when no
        row exceeds the threshold.

    Side effects
    ------------
    Drops and recreates table ``tmp`` on the shared connection ``con``.
    """
    # A missing city coming out of pandas may be a float NaN; binding that against
    # a text column is at best meaningless, so normalise it to NULL explicitly.
    if pd.isna(city):
        city = None
    con.execute('DROP TABLE IF EXISTS tmp;')
    con.execute(prepared_statement_name_all, [comp_name, city])
    # The threshold is bound as a parameter; the limit is forced through int() so
    # only a plain integer can ever reach the SQL text.
    query = (
        'SELECT * FROM tmp WHERE similarity_score > ? '
        f'ORDER BY similarity_score DESC LIMIT {int(limit)}'
    )
    return con.execute(query, [threshold]).fetchdf()


In [26]:
# Result table for the matching run: one row per compustat company (gvkey) with its
# best nets_pub candidate (*_pub columns) and its best nets_all candidate (*_all columns).
# Created only if missing, so rows from earlier runs are kept; uncomment the DROP
# below to start from an empty table.
# con.execute('DROP TABLE IF EXISTS matched;')
# NOTE(review): the similarity/distance columns are FLOAT; if that is single precision,
# scores sitting exactly on a threshold such as 0.95 may compare unexpectedly - confirm.
sql_query = """
CREATE TABLE IF NOT EXISTS matched (
    gvkey INTEGER,
    hqcompany TEXT,
    hqduns_pub INTEGER,
    hqcompany_pub TEXT,
    similarity_pub FLOAT,
    distance_pub FLOAT,
    hqduns_all INTEGER,
    hqcompany_all TEXT,
    similarity_all FLOAT,
    distance_all FLOAT
);
"""
con.execute(sql_query)

<duckdb.duckdb.DuckDBPyConnection at 0x20c3c4fc330>

In [27]:

# Load the whole compustat table into pandas; missing coordinates become 0
compustat_df = (
    con.sql('SELECT * FROM compustat')
    .df()
    .fillna({'lat': 0, 'lon': 0})
)
# compustat_df['phone'] = compustat_df['phone'].fillna('')
#compustat_df = compustat_df[0:2000]

In [28]:
# Stats counter, reset to zero here. The statistics cell later divides it by the
# number of rows in `matched` to report the share of "entries with same match".
pub_match_all = 0

In [29]:
# Sentinel (hqduns, similarity_score, distance) stored when no candidate was found.
_NO_MATCH = (-1, -1, 9999999)


def _closest_top_candidate(candidates, table_name, geo_statement, lat, lon):
    """Pick, among the top-scoring name candidates, the one closest to (lat, lon).

    ``candidates`` is expected to be sorted by similarity_score descending, so the
    first row carries the top score. All rows sharing that top score are written
    to ``table_name`` and ``geo_statement`` (which reads that table) selects the
    geographically nearest one.

    Returns a ``(hqduns, similarity_score, distance)`` tuple, or ``_NO_MATCH``
    when there are no candidates.
    """
    if candidates.empty:
        return _NO_MATCH
    best_score = candidates['similarity_score'].iloc[0]
    top_candidates = candidates[candidates['similarity_score'] == best_score]
    # table_name is one of the fixed internal names ('fnpm' / 'fnam'), never user input.
    con.execute(f'DROP TABLE IF EXISTS {table_name};')
    con.execute('DROP TABLE IF EXISTS tmp;')
    # DuckDB resolves `top_candidates` to the local DataFrame of that name.
    con.execute(f'CREATE TABLE {table_name} AS SELECT * FROM top_candidates;')
    nearest = con.execute(geo_statement, [lat, lon]).fetchone()
    return tuple(nearest) if nearest is not None else _NO_MATCH


def matching(compustat_df):
    """Match each compustat company to nets_pub and nets_all and store the result.

    For every row of ``compustat_df`` whose gvkey is not yet present in table
    ``matched``:

    1. fetch name candidates from nets_pub and keep the nearest top-scoring one;
    2. fetch name candidates from nets_all (restricted to the row's city) and
       keep the nearest top-scoring one;
    3. if the nets_pub pick also appears among the nets_all candidates and scores
       at least as high as the nets_all pick, use it for the ``*_all`` columns
       too and count it as a "same match";
    4. insert one row into ``matched``.

    Parameters
    ----------
    compustat_df : pandas.DataFrame
        Must provide the columns gvkey, hqcompany, city, lat and lon.

    Returns
    -------
    int
        Number of rows processed in this call for which the nets_pub pick was
        reused as the nets_all pick. The same value is published in the
        module-level ``pub_match_all`` counter read by the statistics cell.

    Notes
    -----
    A company without candidates is stored with hqduns -1, similarity -1 and
    distance 9999999. Rows already present in ``matched`` are skipped and do not
    contribute to the counter.
    """
    # Without this declaration the assignments below would create a local variable
    # and the module-level counter used for the statistics would stay at zero.
    global pub_match_all
    pub_match_all = 0
    total = compustat_df.shape[0]
    for position, (_, row) in enumerate(compustat_df.iterrows(), start=1):
        print(f'Processing {position} of {total}')
        gvkey = row['gvkey']
        # Skip companies stored by an earlier run (bound parameter, no string-built SQL).
        already_stored = con.execute(
            'SELECT COUNT(*) FROM matched WHERE gvkey = ?', [gvkey]
        ).fetchone()[0]
        if already_stored > 0:
            print(f'Already processed {gvkey}, {row["hqcompany"]}')
            continue

        # Best candidate among public companies
        name_pub_matches = name_similarity_pub(row['hqcompany'])
        pub_duns, pub_score, pub_distance = _closest_top_candidate(
            name_pub_matches, 'fnpm', prepared_statement_geo_match_pub,
            row['lat'], row['lon'],
        )

        # Best candidate among all companies in the same city
        name_all_matches = name_similarity_all(row['hqcompany'], row['city'])
        all_duns, all_score, all_distance = _closest_top_candidate(
            name_all_matches, 'fnam', prepared_statement_geo_match_all,
            row['lat'], row['lon'],
        )

        # Prefer the public match when it is also a nets_all candidate and is not
        # scored lower than the nets_all pick.
        if pub_duns in name_all_matches['hqduns'].values and pub_score >= all_score:
            all_duns, all_score, all_distance = pub_duns, pub_score, pub_distance
            pub_match_all += 1

        # Each hqduns / gvkey is bound twice: once as the value, once for the name lookup.
        con.execute(
            prepared_statement_add_match,
            [
                gvkey, gvkey,
                pub_duns, pub_duns, pub_score, pub_distance,
                all_duns, all_duns, all_score, all_distance,
            ],
        )
    return pub_match_all

In [30]:
# Run the matcher on a random sample of 100 compustat companies
matching(compustat_df.sample(n=100))

Processing 1 of 100


Processing 2 of 100
Processing 3 of 100
Processing 4 of 100
Processing 5 of 100
Processing 6 of 100
Processing 7 of 100
Processing 8 of 100
Processing 9 of 100
Processing 10 of 100
Processing 11 of 100
Processing 12 of 100
Processing 13 of 100
Processing 14 of 100
Processing 15 of 100
Processing 16 of 100
Processing 17 of 100
Processing 18 of 100
Processing 19 of 100
Processing 20 of 100
Processing 21 of 100
Processing 22 of 100
Processing 23 of 100
Processing 24 of 100
Processing 25 of 100
Processing 26 of 100
Processing 27 of 100
Processing 28 of 100
Processing 29 of 100
Processing 30 of 100
Processing 31 of 100
Processing 32 of 100
Processing 33 of 100
Processing 34 of 100
Processing 35 of 100
Processing 36 of 100
Processing 37 of 100
Processing 38 of 100
Processing 39 of 100
Processing 40 of 100
Processing 41 of 100
Processing 42 of 100
Processing 43 of 100
Processing 44 of 100
Processing 45 of 100
Processing 46 of 100
Processing 47 of 100
Processing 48 of 100
Processing 49 of 100


In [35]:
# Stored rows that have a nets_all match (positive similarity_all)
con.sql('SELECT * FROM matched WHERE similarity_all > 0').df()

Unnamed: 0,gvkey,hqcompany,hqduns_pub,hqcompany_pub,similarity_pub,distance_pub,hqduns_all,hqcompany_all,similarity_all,distance_all


In [32]:
# Rows in matched whose public-name similarity is at least 0.94, and the table size
matched = con.sql('SELECT COUNT(*) FROM matched WHERE similarity_pub >= 0.94').df()
matched_size = con.sql('SELECT COUNT(*) FROM matched').df()['count_star()'].iloc[0]
# Print the percentage of matched companies, then the absolute count
print(
    matched['count_star()'].iloc[0] / matched_size * 100,
    matched['count_star()'].iloc[0],
    sep='\n',
)

40.0
40


In [33]:
# Show statistics
def _matched_pct(condition):
    """Percentage of rows in `matched` (relative to matched_size) satisfying `condition`."""
    hits = con.sql(f'SELECT COUNT(*) FROM matched WHERE {condition}').fetchdf()['count_star()'].iloc[0]
    return hits / matched_size * 100


print(f'Number of companies: {matched_size}')

# Share of entries whose name similarity reaches 0.95, per source table
accuracy = _matched_pct('similarity_pub >= 0.95')
accuray_with_all = _matched_pct('similarity_all >= 0.95')
print(f'Accuracy with all companies: {accuray_with_all}%')
print(f'Accuracy only public companies: {accuracy}%')

# Share of entries where the public match was reused as the nets_all match
same_match = pub_match_all / matched_size * 100
print(f'Entries with same match: {same_match}%')

# Entries without any public match (hqduns_pub = -1)
no_pub_percentage = _matched_pct('hqduns_pub = -1')
print(f'Entries with no pub match: {no_pub_percentage}%')

no_pub_but_all_percentage = _matched_pct('hqduns_pub = -1 AND hqduns_all != -1 AND similarity_all >= 0.95')
print(f'Entries with no pub match but all match: {no_pub_but_all_percentage}%')

# All matches, where no pub match and distance < 1
no_pub_but_all_distance_percentage = _matched_pct('hqduns_pub = -1 AND hqduns_all != -1 AND distance_all < 1')
print(f'Entries with no pub match but all match and distance < 1: {no_pub_but_all_distance_percentage}%')

Number of companies: 100
Accuracy with all companies: 0.0%
Accuracy only public companies: 39.0%
Entries with same match: 0.0%
Entries with no pub match: 46.0%
Entries with no pub match but all match: 0.0%
Entries with no pub match but all match and distance < 1: 0.0%


In [34]:
# con.commit()
# con.close()