In [13]:
import duckdb as db
import pandas as pd
from geopy.distance import geodesic
# Render floats in fixed-point notation (six decimals) instead of scientific notation
pd.options.display.float_format = '{:f}'.format

In [14]:
# Open the DuckDB file without write access; the cells below only query it
con = db.connect(database='database.db', read_only=True)
# List the tables reachable through this connection
con.sql("SHOW TABLES")

┌────────────────────┐
│        name        │
│      varchar       │
├────────────────────┤
│ compustat          │
│ compustat_geocoded │
│ nets_all           │
│ nets_pub           │
└────────────────────┘

In [22]:
# Show the column names and types of the compustat table
con.sql("DESCRIBE compustat")

┌─────────────┬──────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type  │  null   │   key   │ default │  extra  │
│   varchar   │   varchar    │ varchar │ varchar │ varchar │ varchar │
├─────────────┼──────────────┼─────────┼─────────┼─────────┼─────────┤
│ gvkey       │ INTEGER      │ YES     │ NULL    │ NULL    │ NULL    │
│ hqcompany   │ VARCHAR      │ YES     │ NULL    │ NULL    │ NULL    │
│ lat         │ DECIMAL(9,6) │ YES     │ NULL    │ NULL    │ NULL    │
│ lon         │ DECIMAL(9,6) │ YES     │ NULL    │ NULL    │ NULL    │
└─────────────┴──────────────┴─────────┴─────────┴─────────┴─────────┘

In [17]:
# Placeholder stored in the *_miles columns until a real distance is computed.
# It mirrors the 9999999 marker the existing distance_pub / distance_all columns
# show for unmatched rows. Kept as a float so the new columns are float64 from
# the start: later cells write fractional mile values into them, and writing a
# float into an int64 column is deprecated in recent pandas releases.
NO_DISTANCE = 9999999.0

matched = pd.read_csv('Data/matched.csv')

# Put each miles column directly after its sibling distance column. Looking the
# position up by name gives the same layout as the former hard-coded positions
# (6 and 11) but keeps working if the CSV gains or loses a leading column.
matched.insert(matched.columns.get_loc('distance_pub') + 1, 'distance_pub_miles', NO_DISTANCE)
matched.insert(matched.columns.get_loc('distance_all') + 1, 'distance_all_miles', NO_DISTANCE)
matched

Unnamed: 0,gvkey,hqcompany,hqduns_pub,hqcompany_pub,similarity_pub,distance_pub,distance_pub_miles,hqduns_all,hqcompany_all,similarity_all,distance_all,distance_all_miles
0,1000,A&EPLASTIKPAKINC,-1,,-1.000000,9999999.000000,9999999,-1,,-1.000000,9999999.000000,9999999
1,1004,AARCORP,5425814,AARCORP,1.000000,15.079962,9999999,5425814,AARCORP,1.000000,15.079962,9999999
2,1009,ABSINDINC,4159919,ABSINDINC,1.000000,0.016540,9999999,4159919,ABSINDINC,1.000000,0.016540,9999999
3,1015,ADIELECTRONICSINC,-1,,-1.000000,9999999.000000,9999999,849768978,APIELECTRONICSINC,0.964706,0.026698,9999999
4,1019,AFAPROTECTIVESYSTEMSINC,6980528,AFAPROTECTIVESYSTEMSINC,1.000000,2.912565,9999999,-1,,-1.000000,9999999.000000,9999999
...,...,...,...,...,...,...,...,...,...,...,...,...
39777,327451,GRINDRODSHIPPING,-1,,-1.000000,9999999.000000,9999999,-1,,-1.000000,9999999.000000,9999999
39778,332115,ARMATA$INC,784379745,ARMATA$INC,1.000000,0.000750,9999999,784379745,ARMATA$INC,1.000000,0.000750,9999999
39779,345764,TSTAMPINC,-1,,-1.000000,9999999.000000,9999999,80609521,TSTAMPINC,1.000000,0.000166,9999999
39780,349530,NEXTPLAY@INC,884775169,NEXTPLAY@INC,1.000000,0.067128,9999999,884775169,NEXTPLAY@INC,1.000000,0.067128,9999999


In [23]:
def _fetch_coords(query, key):
    """Look up one coordinate pair and return it as ``(lat, lon)`` floats.

    Parameters
    ----------
    query : str
        SQL selecting the latitude and longitude columns, in that order, with a
        single ``?`` placeholder for the key.
    key : int-like
        Identifier bound to the placeholder.

    Returns
    -------
    tuple[float, float] | None
        The coordinates, or ``None`` when no row matches or when either
        coordinate is NULL.
    """
    # Both coordinates are read from the same result row. Fetching them with
    # two separate queries (no ORDER BY) would not guarantee that the latitude
    # and the longitude belong to the same record if the key is not unique.
    found = con.execute(query, [int(key)]).fetchone()
    if found is None or found[0] is None or found[1] is None:
        return None
    return float(found[0]), float(found[1])


COMPUSTAT_QUERY = "SELECT lat, lon FROM compustat WHERE gvkey = ?"

# (target miles column, key column in `matched`, coordinate lookup query)
NETS_LOOKUPS = [
    ('distance_all_miles', 'hqduns_all', "SELECT latitude, longitude FROM nets_all WHERE hqduns = ?"),
    ('distance_pub_miles', 'hqduns_pub', "SELECT latitude, longitude FROM nets_pub WHERE hqduns = ?"),
]

# The computed distances are fractional; make sure the target columns can hold
# them even if they were created with an integer placeholder.
for miles_col, _, _ in NETS_LOOKUPS:
    matched[miles_col] = matched[miles_col].astype(float)

for index, row in matched.iterrows():
    if index % 100 == 0:
        print(f"Processing row {index}...")

    # Only positive hqduns values are looked up; rows carrying -1 keep the
    # placeholder already stored in the miles column.
    pending = [(miles_col, row[key_col], query)
               for miles_col, key_col, query in NETS_LOOKUPS
               if row[key_col] > 0]
    if not pending:
        # Nothing to measure for this row, so skip the compustat lookup too.
        continue

    comp_coords = _fetch_coords(COMPUSTAT_QUERY, row['gvkey'])
    if comp_coords is None:
        # No usable compustat coordinates: leave the placeholder in place.
        continue

    for miles_col, hqduns, query in pending:
        nets_coords = _fetch_coords(query, hqduns)
        if nets_coords is None:
            continue
        # Geodesic distance between the compustat and NETS locations, in miles.
        matched.at[index, miles_col] = geodesic(comp_coords, nets_coords).miles

In [25]:
# Write the enriched table to disk as CSV and as a Stata file, both without the row index
csv_path = 'Data/matched_with_miles.csv'
stata_path = 'Data/matched_with_miles.dta'
matched.to_csv(csv_path, index=False)
matched.to_stata(stata_path, write_index=False)