In [25]:
import numpy as np
import pandas as pd
import duckdb
# Display floats in fixed-point form ('{:f}') instead of scientific notation
pd.set_option('display.float_format', '{:f}'.format)

In [26]:
# Connect to the DuckDB database file in read-write mode
con = duckdb.connect(database='test.db', read_only=False)
# Install and load the spatial extension; the ST_* functions used by the
# distance queries further down come from it
for extension_statement in ('INSTALL spatial', 'LOAD spatial'):
    con.execute(extension_statement)
# TODO: build training data
# List the tables available in this database
con.sql('SHOW TABLES').fetchdf()

Unnamed: 0,name
0,compustat
1,compustat_geocoded
2,nets_all
3,nets_pub


In [27]:
# DuckDB configuration: set the memory limit to 10GB and use a single thread
memory_limit_setting = 'SET memory_limit="10GB"'
thread_setting = 'SET threads TO 1'
con.execute(memory_limit_setting)
con.execute(thread_setting)

<duckdb.duckdb.DuckDBPyConnection at 0x12c0abeb0>

In [28]:
# Peek at one randomly chosen row of nets_pub to see what the table looks like
random_row_query = 'SELECT * FROM nets_pub ORDER BY RANDOM() LIMIT 1'
con.sql(random_row_query).fetchdf()

Unnamed: 0,hqduns,hqcompany,latitude,longitude
0,86530011,DESIGNSINC,32.781399,-79.927002


In [29]:
# --- Name similarity -------------------------------------------------------
# Scores every row of one NETS table against a single company name (the `?`
# parameter) with Jaro-Winkler similarity and materialises the result as the
# scratch table `tmp`. The pub/all variants differ only in the source table,
# so both are generated from one template to keep them from drifting apart.
_NAME_SIMILARITY_TEMPLATE = """
    CREATE TABLE tmp AS
    SELECT
        hqduns,
        hqcompany,
        latitude,
        longitude,
        jaro_winkler_similarity(n.hqcompany, ?) AS similarity_score
    FROM
        {table} n
        --we can use other similarity functions
        --we can also use other similarity scores
"""
prepared_statement_name_pub = _NAME_SIMILARITY_TEMPLATE.format(table='nets_pub')
prepared_statement_name_all = _NAME_SIMILARITY_TEMPLATE.format(table='nets_all')

# --- Persisting a match ----------------------------------------------------
# Inserts one row into `matched`. Takes 10 parameters, in this order:
#   gvkey, gvkey, hqduns_pub, hqduns_pub, similarity_pub, distance_pub,
#   hqduns_all, hqduns_all, similarity_all, distance_all
# (each id is passed twice: once as the stored value and once for the name lookup).
# Every name lookup is capped with LIMIT 1: a scalar subquery that yields more
# than one row is an error in recent DuckDB versions, and nothing visible here
# guarantees that gvkey / hqduns are unique in their tables.
# NOTE(review): if duplicates do exist, LIMIT 1 picks an arbitrary one of them.
prepared_statement_add_match = """
    INSERT INTO matched
    VALUES (?, --gvkey
            (SELECT hqcompany FROM compustat WHERE gvkey = ? LIMIT 1), --hqcompany
            ?, --hqduns_pub
            (SELECT hqcompany FROM nets_pub WHERE hqduns = ? LIMIT 1), --hqcompany of the pub match
            ?, --name similarity_pub
            ?, --distance pub
            ?, --hqduns_all
            (SELECT hqcompany FROM nets_all WHERE hqduns = ? LIMIT 1), --hqcompany of the all match
            ?,  --name similarity_all
            ?); --distance all
"""

# --- Geographic tie-break --------------------------------------------------
# Returns the single row of a scratch candidate table that is closest to the
# reference point given by the two `?` parameters (latitude, longitude).
# Both points are built as ST_Point(latitude, longitude), so the argument order
# is consistent between them.
# NOTE(review): ST_Distance on GEOMETRY is presumably planar, i.e. the result is
# in coordinate degrees rather than metres - confirm before interpreting
# thresholds applied to the stored distances.
_GEO_MATCH_TEMPLATE = """
    SELECT
        hqduns,
        similarity_score,
        ST_Distance(
            ST_Point(n.latitude, n.longitude)::GEOMETRY,
            ST_Point(?, ?)::GEOMETRY
            ) AS distance
    FROM {table} n
    ORDER BY distance ASC
    LIMIT 1;
"""
# `fnam` / `fnpm` are the scratch tables holding the top-scoring candidates
# from nets_all / nets_pub respectively (created by the matching loop).
prepared_statement_geo_match_all = _GEO_MATCH_TEMPLATE.format(table='fnam')
prepared_statement_geo_match_pub = _GEO_MATCH_TEMPLATE.format(table='fnpm')

In [30]:
# Get the matches for a specific name
def _top_name_matches(create_statement, comp_name, threshold, limit):
    """Score one NETS table against `comp_name` and return the best candidates.

    Rebuilds the scratch table `tmp` by running `create_statement` with
    `comp_name` as its only parameter, then returns the rows of `tmp` whose
    `similarity_score` is strictly greater than `threshold`, best first,
    capped at `limit` rows. `tmp` is left in place afterwards.
    """
    con.execute('DROP TABLE IF EXISTS tmp;')
    con.execute(create_statement, [comp_name])
    # The threshold is bound as a parameter; the limit is coerced to int before
    # being placed into the statement text.
    return con.execute(
        f'SELECT * FROM tmp WHERE similarity_score > ? ORDER BY similarity_score DESC LIMIT {int(limit)}',
        [threshold],
    ).fetchdf()


def name_similarity_pub(comp_name, threshold=0.9, limit=5):
    """Return the best `nets_pub` name matches for `comp_name` as a DataFrame.

    Parameters
    ----------
    comp_name : str
        Company name to compare against `nets_pub.hqcompany`.
    threshold : float, optional
        Only rows with a similarity score above this value are returned
        (default 0.9, the previously hard-coded value).
    limit : int, optional
        Maximum number of rows returned (default 5).
    """
    return _top_name_matches(prepared_statement_name_pub, comp_name, threshold, limit)


def name_similarity_all(comp_name, threshold=0.9, limit=10):
    """Return the best `nets_all` name matches for `comp_name` as a DataFrame.

    Same contract as `name_similarity_pub`, but scored against `nets_all` and
    with a default `limit` of 10.
    """
    return _top_name_matches(prepared_statement_name_all, comp_name, threshold, limit)


<duckdb.duckdb.DuckDBPyConnection at 0x12c0abeb0>

In [32]:

# Load compustat into pandas, replace missing coordinates with 0, and keep a
# single randomly sampled company for this run
# (swap .sample(1) for a slice such as [8000:18000] to process a batch instead).
compustat_df = (
    con.sql('SELECT * FROM compustat')
    .fetchdf()
    .assign(
        lat=lambda frame: frame['lat'].fillna(0),
        lon=lambda frame: frame['lon'].fillna(0),
    )
    .sample(1)
)

In [33]:
# Stats counter: incremented by the matching loop each time the public-table
# match is also adopted as the all-companies match for a company
pub_match_all = 0

In [34]:
# Placeholder stored when no candidate name clears the similarity threshold.
NO_MATCH = {'hqduns': -1, 'similarity_score': -1, 'distance': 9999999}


def _closest_top_match(candidates, table_name, geo_statement, lat, lon):
    """Pick the geographically closest candidate among those with the best name score.

    Parameters
    ----------
    candidates : pandas.DataFrame
        Result of `name_similarity_pub` / `name_similarity_all`; rows are
        expected to be ordered by `similarity_score`, best first.
    table_name : str
        Scratch table that `geo_statement` reads from (`fnpm` or `fnam`).
        It is dropped and recreated on every call.
    geo_statement : str
        Distance query taking the reference point as its two parameters.
    lat, lon : float
        Coordinates of the company being matched.

    Returns
    -------
    dict
        `hqduns`, `similarity_score` and `distance` of the chosen row, or a
        copy of `NO_MATCH` when there is nothing to choose from.
    """
    if candidates.empty:
        return dict(NO_MATCH)

    # Keep only the rows that tie for the best name score.
    top_score = candidates['similarity_score'].iloc[0]
    top_candidates = candidates[candidates['similarity_score'] == top_score]

    con.execute(f'DROP TABLE IF EXISTS {table_name};')
    con.execute('DROP TABLE IF EXISTS tmp;')
    # Register the frame under an explicit name instead of relying on DuckDB
    # resolving a Python variable name from the surrounding scope.
    con.register('top_candidates_view', top_candidates)
    try:
        con.execute(f'CREATE TABLE {table_name} AS SELECT * FROM top_candidates_view;')
    finally:
        con.unregister('top_candidates_view')

    # fetchone() keeps native column types; going through a DataFrame row would
    # upcast the integer hqduns to float alongside the float columns.
    closest = con.execute(geo_statement, [lat, lon]).fetchone()
    if closest is None:
        return dict(NO_MATCH)
    hqduns, similarity_score, distance = closest
    return {'hqduns': hqduns, 'similarity_score': similarity_score, 'distance': distance}


# Match every selected Compustat company against nets_pub and nets_all and
# store one row per company in `matched`.
# NOTE(review): `matched` is not created in any cell shown here - confirm the
# table exists before running this cell on a fresh database.
total_companies = compustat_df.shape[0]
for i, (index, row) in enumerate(compustat_df.iterrows(), start=1):
    print(f'Processing {i} of {total_companies}')
    gvkey = row['gvkey']

    # Skip companies that already have a row in `matched`. The key is bound as
    # a parameter so it is compared with its own type rather than spliced into
    # the SQL text.
    already_matched = con.execute(
        'SELECT COUNT(*) FROM matched WHERE gvkey = ?', [gvkey]
    ).fetchone()[0]
    if already_matched > 0:
        print(f'Already processed {gvkey}, {row["hqcompany"]}')
        continue

    # Missing coordinates were filled with 0 upstream, so for those companies
    # the distances below are measured from (0, 0).
    name_pub_match = name_similarity_pub(row['hqcompany'])
    pub_match = _closest_top_match(
        name_pub_match, 'fnpm', prepared_statement_geo_match_pub, row['lat'], row['lon']
    )

    name_all_matches = name_similarity_all(row['hqcompany'])
    name_all_match = _closest_top_match(
        name_all_matches, 'fnam', prepared_statement_geo_match_all, row['lat'], row['lon']
    )

    # Prefer the public match when it also appears among the all-companies
    # candidates and its name score is at least as good.
    pub_is_candidate = pub_match['hqduns'] in name_all_matches['hqduns'].values
    if pub_is_candidate and pub_match['similarity_score'] >= name_all_match['similarity_score']:
        name_all_match = pub_match
        pub_match_all += 1

    # Each id is passed twice: once as the stored value, once for the name lookup.
    con.execute(prepared_statement_add_match, [
        gvkey, gvkey,
        pub_match['hqduns'], pub_match['hqduns'],
        pub_match['similarity_score'], pub_match['distance'],
        name_all_match['hqduns'], name_all_match['hqduns'],
        name_all_match['similarity_score'], name_all_match['distance'],
    ])

    
    

Processing 1 of 1


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Error: Invalid Error: KeyboardInterrupt: <EMPTY MESSAGE>

At:
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/traitlets/traitlets.py(708): __set__
  /var/folders/x6/ky5l77zx4071ly980g1k5fjm0000gn/T/ipykernel_41310/1645377918.py(9): name_similarity_all
  /var/folders/x6/ky5l77zx4071ly980g1k5fjm0000gn/T/ipykernel_41310/1601111449.py(24): <module>
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/IPython/core/interactiveshell.py(3553): run_code
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/IPython/core/interactiveshell.py(3493): run_ast_nodes
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/IPython/core/interactiveshell.py(3311): run_cell_async
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/IPython/core/async_helpers.py(129): _pseudo_sync_runner
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/IPython/core/interactiveshell.py(3106): _run_cell
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/IPython/core/interactiveshell.py(3051): run_cell
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/ipykernel/zmqshell.py(549): run_cell
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/ipykernel/ipkernel.py(429): do_execute
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/ipykernel/kernelbase.py(767): execute_request
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/ipykernel/kernelbase.py(429): dispatch_shell
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/ipykernel/kernelbase.py(523): process_one
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/ipykernel/kernelbase.py(534): dispatch_queue
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/asyncio/events.py(84): _run
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/asyncio/base_events.py(1951): _run_once
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/asyncio/base_events.py(618): run_forever
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/tornado/platform/asyncio.py(205): start
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/ipykernel/kernelapp.py(701): start
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/traitlets/config/application.py(1075): launch_instance
  /Users/marisbuttmann/.pyenv/versions/3.12.0/lib/python3.12/site-packages/ipykernel_launcher.py(17): <module>
  <frozen runpy>(88): _run_code
  <frozen runpy>(198): _run_module_as_main


In [None]:
# Show up to 100 random rows of `matched`. DataFrame.sample raises a ValueError
# when asked for more rows than the frame holds, so cap the sample size.
matched_df = con.sql('SELECT * FROM matched').fetchdf()
matched_df.sample(min(100, len(matched_df)))

In [None]:
# Count the rows of `matched` whose public-name similarity is at least 0.95,
# and the total number of rows in `matched`
matched = con.sql('SELECT COUNT(*) FROM matched WHERE name_similarity_pub >= 0.95').fetchdf()
matched_size = con.sql('SELECT COUNT(*) FROM matched').fetchdf()['count_star()'].iloc[0]
high_similarity_count = matched['count_star()'].iloc[0]
# Print the percentage of matched companies, then the absolute count
print(high_similarity_count / matched_size * 100)
print(high_similarity_count)

In [None]:
# Show statistics
def _matched_percentage(condition):
    """Return the percentage of `matched` rows that satisfy the SQL `condition`.

    `condition` is placed verbatim after WHERE, so it must be a trusted SQL
    fragment. The denominator is the module-level `matched_size`.
    """
    hits = con.sql(f'SELECT COUNT(*) FROM matched WHERE {condition}').fetchdf()['count_star()'].iloc[0]
    return hits / matched_size * 100


print(f'Number of companies: {matched_size}')
# "Accuracy" here is the share of rows whose name similarity is at least 0.95.
accuracy = _matched_percentage('name_similarity_pub >= 0.95')
accuracy_with_all = _matched_percentage('name_similarity_all >= 0.95')
accuray_with_all = accuracy_with_all  # alias kept for the original (misspelled) name
print(f'Accuracy with all companies: {accuracy_with_all}%')
print(f'Accuracy only public companies: {accuracy}%')
# NOTE(review): pub_match_all only counts companies processed since the counter
# was last reset, while matched_size counts every row in `matched` - confirm
# this ratio is meant to mix the two.
same_match = pub_match_all / matched_size * 100
print(f'Entries with same match: {same_match}%')
no_pub_percentage = _matched_percentage('hqduns_pub = -1')
print(f'Entries with no pub match: {no_pub_percentage}%')
no_pub_but_all_percentage = _matched_percentage(
    'hqduns_pub = -1 AND hqduns_all != -1 AND name_similarity_all >= 0.95'
)
print(f'Entries with no pub match but all match: {no_pub_but_all_percentage}%')
# All matches, where no pub match and distance < 1
no_pub_but_all_distance_percentage = _matched_percentage(
    'hqduns_pub = -1 AND hqduns_all != -1 AND distance_all < 1'
)
print(f'Entries with no pub match but all match and distance < 1: {no_pub_but_all_distance_percentage}%')