## Setup

In [1]:
from pathlib import Path
import duckdb
from dcpy.utils import duckdb as dcpduckdb
import pandas as pd

pd.set_option('display.max_columns', 50)

In [2]:
DB_PATH = Path("facdb_qa.db")

In [4]:
# # delete the database if it exists
# DB_PATH.unlink(missing_ok=True)

# # create the database with extensions and credentials
# with duckdb.connect(str(DB_PATH)) as connection:
#     connection.sql(f"INSTALL spatial")
#     connection.sql(f"LOAD spatial")


In [3]:
# test
with duckdb.connect(str(DB_PATH)) as connection:
    dcpduckdb.setup_s3_secret(DB_PATH)
    connection.sql(
        "DESCRIBE TABLE 's3://edm-recipes/datasets/test_nypl_libraries/20240124/test_nypl_libraries.csv'"
    ).show()

┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │ column_type │  null   │   key   │ default │  extra  │
│   varchar   │   varchar   │ varchar │ varchar │ varchar │ varchar │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ field_1     │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ lon         │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ lat         │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ name        │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ zipcode     │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ address     │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ locality    │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ region      │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘



## `dpr_parksproperties`

In [5]:
with duckdb.connect(str(DB_PATH)) as connection:
    dcpduckdb.setup_s3_secret(DB_PATH)
    connection.sql(
        """
            create table dpr_parksproperties_all as
            select * from
            read_csv(
                's3://edm-recipes/datasets/dpr_parksproperties/*/dpr_parksproperties.csv',
                union_by_name = true,
                filename = true
            )
        """
    )

CatalogException: Catalog Error: Table with name "dpr_parksproperties_all" already exists!

In [6]:
with duckdb.connect(str(DB_PATH)) as connection:
    connection.sql("SHOW ALL TABLES").show()

┌──────────┬─────────┬─────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────┐
│ database │ schema  │          name           │                                                                                                                                   

In [7]:
with duckdb.connect(str(DB_PATH)) as connection:
    connection.sql("describe table dpr_parksproperties_all").show(max_rows=100)

┌─────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│   column_name   │ column_type │  null   │   key   │ default │  extra  │
│     varchar     │   varchar   │ varchar │ varchar │ varchar │ varchar │
├─────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ jurisdiction    │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ mapped          │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ zipcode         │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ acres           │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ location        │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ nys_assembly    │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ councildistrict │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ url             │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ typecategory    │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ us_congress     │ BIGINT      │ YES 

In [8]:
with duckdb.connect(str(DB_PATH)) as connection:
    connection.sql("select * from dpr_parksproperties_all").show()

┌──────────────┬─────────┬──────────────────────┬────────┬──────────────────────┬──────────────┬─────────────────┬──────────────────────┬──────────────────────┬─────────────┬──────────────────────┬──────────┬────────────┬─────────┬─────────────────────┬────────────────┬──────────┬──────────────────────┬──────────────────────┬────────────┬─────────────┬──────────────────────┬──────────┬─────────┬─────────────────────┬───────────┬───────────┬──────────────────────┬──────────────────────┬──────────────┬─────────┬────────────┬────────────────┬─────────┬────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [9]:
with duckdb.connect(str(DB_PATH)) as connection:
    # connection.sql("alter table dpr_parksproperties_all add column version varchar")
    connection.sql("update dpr_parksproperties_all set version = string_split(parse_dirpath(filename), '/')[-1]")
    connection.sql("select version, filename, location, from dpr_parksproperties_all").show()

┌──────────┬────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────────────────┐
│ version  │                                    filename                                    │                           location                            │
│ varchar  │                                    varchar                                     │                            varchar                            │
├──────────┼────────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────────────────────────────────┤
│ 20210123 │ s3://edm-recipes/datasets/dpr_parksproperties/20210123/dpr_parksproperties.csv │ Gravesend Bay, Bay 44 St. to Bay 49 St., Shore Pkwy.          │
│ 20210123 │ s3://edm-recipes/datasets/dpr_parksproperties/20210123/dpr_parksproperties.csv │ Atlantic Ave., Columbia Pl., State St.                        │
│ 20210123 │ s3://edm-recipes/datasets/dpr_parksprop

In [12]:
with duckdb.connect(str(DB_PATH)) as connection:
  old_dpr_parksproperties = connection.sql("select * from dpr_parksproperties_all where version = '20240109'").df()
old_dpr_parksproperties

Unnamed: 0,jurisdiction,mapped,zipcode,acres,location,nys_assembly,councildistrict,url,typecategory,us_congress,eapply,parentid,gispropnum,retired,commissiondate,communityboard,objectid,globalid,name311,department,pip_ratable,subcategory,precinct,permit,acquisitiondate,omppropid,gisobjid,signname,address,permitparent,class,nys_senate,permitdistrict,borough,waterfront,WKT,filename,version
0,DPR/CDOT/SDOT,True,1.110311e+34,249.389,"Astoria Blvd. and 48 St. to Union Tp., Park Dr...",273536.0,1.921222e+09,http://www.nycgovparks.org/parks/Q084A/,Parkway,614.0,Grand Central Parkway Ext,Q-03,Q084A,False,NaT,4.014034e+11,15508,14567485-5E02-4AD0-B8B6-516D78FC8636,Grand Central Parkway Extension,Q-03,False,EXWY,114,True,1937-06-01,Q084A,100000441.0,Grand Central Parkway Extension,,Q-03,PARK,13141516.0,Q-03,Q,True,MULTIPOLYGON (((-73.8587476480729 40.767414466...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240109
1,DPR,False,1.169300e+04,9.375,W. 19 Rd. bet Jamaica Bay and Cross Bay Blvd.,23.0,3.200000e+01,http://www.nycgovparks.org/parks/Q498/,Nature Area,5.0,Sunset Cove Park,Q-14,Q498,False,NaT,4.140000e+02,6344,40AF4E2A-9123-4DD1-99B4-7CC3F047C71F,Sunset Cove Park,Q-14,False,,100,False,2009-11-20,Q498,100003992.0,Sunset Cove Park,,Q-14,PARK,10.0,Q-14,Q,True,MULTIPOLYGON (((-73.8221830093641 40.598920723...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240109
2,DPR/DOE,False,1.136200e+04,2.035,251 St. bet. 61 Ave. and 63 Ave.,26.0,2.300000e+01,http://www.nycgovparks.org/parks/Q346/,Jointly Operated Playground,3.0,Challenge Playground,Q-11,Q346,False,NaT,4.110000e+02,6293,F083B449-6079-40A5-A938-1771F8B763F7,Challenge Playground,Q-11,True,JOP,111,True,1949-03-24,Q346,100000009.0,Challenge Playground,61-25 LITTLE NECK PARKWAY,Q-11,PARK,11.0,Q-11,Q,False,MULTIPOLYGON (((-73.7273829319915 40.756052094...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240109
3,DPR,True,1.100311e+35,326.895,Whitestone Exwy. at 13 Ave. to the Linden Blvd...,,,http://www.nycgovparks.org/parks/Q135/,Parkway,,Cross Island Parkway,Q-13,Q135,False,NaT,4.074114e+08,69239,124B45D7-E8BD-4B8A-9573-E3BCBBE2285D,Cross Island Parkway,Q-13,False,PKWY,109,False,1938-07-15,Q135,100000039.0,Cross Island Parkway,,Q-13,PARK,,Q-13,Q,True,MULTIPOLYGON (((-73.7767219795344 40.788163779...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240109
4,DPR,True,1.121311e+24,63.636,Eastern Pkwy. bet. Grand Army Plaza and Ralph ...,43445557.0,,http://www.nycgovparks.org/parks/B029/,Parkway,910.0,Eastern Parkway,B-08,B029,False,NaT,3.083090e+05,69228,3CC7E10E-352E-427C-B9C1-35A428020204,Eastern Parkway,B-08,False,Large Park,78,True,1898-01-01,B029,100005048.0,Eastern Parkway,,B-08,PARK,20.0,B-08,B,False,MULTIPOLYGON (((-73.9662881520952 40.672550022...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2041,DPR,True,1.110100e+04,20.340,"Queensboro Bridge, 41 Rd., 40 Ave. bet. The Ea...",36.0,2.600000e+01,http://www.nycgovparks.org/parks/Q104/,Community Park,,Queensbridge Park,Q-01,Q104,False,NaT,4.014020e+05,4587,7CAFE9EC-5AA0-49C1-8689-4068B8E942D0,Queensbridge Park,Q-01,True,Large Park,114,True,1940-06-13,Q104,100000370.0,Queensbridge Park,40-50 VERNON BOULEVARD,Q-01,PARK,,Q-01,Q,True,MULTIPOLYGON (((-73.9480605861791 40.755892124...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240109
2042,DPR,True,1.046310e+09,11.689,"W 232 St , Independence Av , W 235 St ,",81.0,1.100000e+01,http://www.nycgovparks.org/parks/X201/,Community Park,15.0,Seton Park,X-08,X201,False,NaT,2.080000e+02,4822,E354DD98-F8BB-4EE8-8502-7E1F76A8426A,Seton Park,X-08,True,Large Park,50,True,1959-05-28,X201,100004694.0,Seton Park,751 WEST 232 STREET,X-08,PARK,33.0,X-08,X,False,MULTIPOLYGON (((-73.9147960786935 40.886428174...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240109
2043,DPR/DOE,False,1.030500e+04,0.892,"Sand La., Major Ave. and Mcfarland Ave.",64.0,5.000000e+01,http://www.nycgovparks.org/parks/R062/,Jointly Operated Playground,11.0,Arrochar Playground,R-02,R062,False,NaT,5.020000e+02,5056,385179D7-E7F1-4A5F-9CDC-011BCC4FE13F,Arrochar Playground,R-02,True,JOP,122,True,1945-10-24,R062,100004058.0,Arrochar Playground,200 MC FARLAND AVENUE,R-02,PARK,23.0,R-02,R,False,MULTIPOLYGON (((-74.07090104369 40.59742659450...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240109
2044,DPR/DOE,True,1.030400e+04,4.096,"Tompkins Ave., Broad St. and Hill St.",61.0,4.900000e+01,http://www.nycgovparks.org/parks/R061/,Jointly Operated Playground,11.0,Stapleton Playground,R-01,R061,False,NaT,5.010000e+02,5055,F1FE3802-1C95-4AEC-9ACE-D18E2E8B3FE4,Rev. Dr. Maggie Howard Playground,R-01,True,JOP,120,True,1958-04-07,R061,100004541.0,Rev. Dr. Maggie Howard Playground,64 TOMPKINS AVENUE,R-01,PARK,23.0,R-01,R,False,MULTIPOLYGON (((-74.0787471125525 40.622672932...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240109


In [13]:
with duckdb.connect(str(DB_PATH)) as connection:
  new_dpr_parksproperties = connection.sql("select * from dpr_parksproperties_all where version = '20240814'").df()
new_dpr_parksproperties

Unnamed: 0,jurisdiction,mapped,zipcode,acres,location,nys_assembly,councildistrict,url,typecategory,us_congress,eapply,parentid,gispropnum,retired,commissiondate,communityboard,objectid,globalid,name311,department,pip_ratable,subcategory,precinct,permit,acquisitiondate,omppropid,gisobjid,signname,address,permitparent,class,nys_senate,permitdistrict,borough,waterfront,WKT,filename,version
0,DPR,True,1.122400e+04,10.010,"Surf Ave. between W. 16 St. and W. 19 St., Pub...",46.0,4.700000e+01,http://www.nycgovparks.org/parks/B369/,Community Park,8.0,Steeplechase Park,B-13,B369,False,NaT,3.130000e+02,170747,25CF59E2-92B8-41A4-95A5-C9364C4F3E39,Steeplechase Park,B-13,False,Large Park,60,False,1969-11-19,B369,100003998.0,Steeplechase Park,1904 SURF AVENUE,B-13,PARK,23.0,B-13,B,True,MULTIPOLYGON (((-73.9832541416874 40.575307225...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240814
1,DPR,True,1.001910e+09,71.933,"12 Ave., Riverside Blvd. bet. W. 59 St. and W....",67.0,6.000000e+00,http://www.nycgovparks.org/parks/M353/,Community Park,12.0,Riverside Park South,M-07,M353,False,NaT,1.070000e+02,170746,4149E32C-59F8-4660-8199-233EF191FBE0,Riverside Park South,M-14,False,Large Park,20,True,2001-01-14,M353,100003968.0,Riverside Park South,400 RIVERSIDE DRIVE,M-07,PARK,47.0,M-14,M,True,MULTIPOLYGON (((-73.9871611903936 40.780743786...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240814
2,DPR,False,1.169200e+04,1.657,Beach 63 St. bet. Elizabeth Rd. and Thursby Ave.,31.0,3.100000e+01,http://www.nycgovparks.org/parks/Q479/,Undeveloped,5.0,Thursby Basin Park,Q-14,Q479,False,NaT,4.140000e+02,170745,FB85433F-1BD5-4A5B-9EC1-6A42A72CE5BA,Thursby Basin Park,Q-14,False,Undeveloped,100,True,2003-01-16,Q479,100000124.0,Thursby Basin Park,62-02 BEACH 63 STREET,Q-14,PARK,10.0,Q-14,Q,True,MULTIPOLYGON (((-73.791081668037 40.5953408013...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240814
3,DPR/CDOT/SDOT,True,1.110311e+34,249.389,"Astoria Blvd. and 48 St. to Union Tp., Park Dr...",273536.0,1.921222e+09,http://www.nycgovparks.org/parks/Q084A/,Parkway,614.0,Grand Central Parkway Ext,Q-03,Q084A,False,NaT,4.014034e+11,15508,14567485-5E02-4AD0-B8B6-516D78FC8636,Grand Central Parkway Extension,Q-03,False,EXWY,114,True,1937-06-01,Q084A,100000441.0,Grand Central Parkway Extension,,Q-03,PARK,13141516.0,Q-03,Q,True,MULTIPOLYGON (((-73.8587476480729 40.767414466...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240814
4,DPR,False,1.123500e+04,1.225,Brighton 2 St. bet. Brightwater Ct. and Boardw...,46.0,4.800000e+01,http://www.nycgovparks.org/parks/B169A/,Playground,8.0,Brighton Playground,B-13,B169A,False,NaT,3.130000e+02,170744,9FAECC9A-1FDF-4633-8F70-E54C4B482F9D,Brighton Playground,B-13,True,Neighborhood Plgd,60,True,1950-03-09,B169A,100004640.0,Brighton Playground,126 BRIGHTWATER COURT,B-13,PARK,23.0,B-13,B,True,MULTIPOLYGON (((-73.9659374147051 40.574910156...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4090,DPR,False,1.002110e+34,4.623,E. 59 St. To E. 97 St. and Park Ave.,6873.0,4.000000e+00,http://www.nycgovparks.org/parks/M060C/,Mall,12.0,Park Avenue Malls 59th-97th,M-08,M060C,False,NaT,1.081110e+05,6366,DC410D17-B9F4-412F-9022-747ABD8894F6,Park Avenue Malls,M-08,False,Sitting Area/Triangle/Mall,19,True,1890-06-04,M060C,100003799.0,Park Avenue Malls,,M-08,PARK,2829.0,M-08,M,False,MULTIPOLYGON (((-73.952519683472 40.7866732550...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240814
4091,DPR,True,1.121100e+04,35.533,Kent Ave. bet. Quay St. and N 9 St.,50.0,3.300000e+01,http://www.nycgovparks.org/parks/B529/,Neighborhood Park,7.0,Bushwick Inlet Park,B-01,B529,False,NaT,3.010000e+02,6427,D55AD6B5-AFD4-4825-8663-19558339588D,Bushwick Inlet Park,B-01,True,Neighborhood Park,94,True,2007-11-21,B529,100005022.0,Bushwick Inlet Park,86 KENT AVENUE,B-01,PARK,59.0,B-01,B,True,MULTIPOLYGON (((-73.9602262663453 40.725310945...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240814
4092,DPR/CDOT/TBTA,False,1.002810e+19,8.595,"FDR Drive, E 90 St. To E 125 St.",6876.0,5.800000e+01,http://www.nycgovparks.org/parks/M108T01/,Waterfront Facility,1213.0,East River Esplanade 90th-125th,M-15,M108T01,False,NaT,1.081110e+05,69220,B215AD42-CB08-4BB8-8A66-67E381CD9E7E,East River Esplanade,M-15,False,Sitting Area/Triangle/Mall,19,True,NaT,M108T01,100004777.0,East River Esplanade,,M-15,PARK,29.0,M-15,M,True,MULTIPOLYGON (((-73.9426191817124 40.783796468...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240814
4093,DPR,True,1.003500e+04,254.300,East River and Harlem River,68.0,8.000000e+00,http://www.nycgovparks.org/parks/M104/,Flagship Park,13.0,Randall's Island Park,M-11,M104,False,NaT,1.110000e+02,6300,37B937EF-C98C-4D00-846D-C45D8975DB3B,Randall's Island Park,M-11R,False,Flagship Park,25,True,1835-04-08,M104,100004488.0,Randall's Island Park,,M-11,PARK,29.0,M-11R,M,True,MULTIPOLYGON (((-73.9259450467 40.790864074149...,s3://edm-recipes/datasets/dpr_parksproperties/...,20240814


### `typecategory`

In [None]:
new_dpr_parksproperties.compare(old_dpr_parksproperties)

## `dca_operatingbusinesses`

In [14]:
with duckdb.connect(str(DB_PATH)) as connection:
    dcpduckdb.setup_s3_secret(DB_PATH)
    connection.sql(
        """
            create table dca_operatingbusinesses_all as
            select * from
            read_csv(
                's3://edm-recipes/datasets/dca_operatingbusinesses/*/dca_operatingbusinesses.csv',
                union_by_name = true,
                filename = true
            )
        """
    )

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [15]:
with duckdb.connect(str(DB_PATH)) as connection:
    connection.sql("SHOW ALL TABLES").show()

┌──────────┬─────────┬─────────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────┐
│ database │ schema  │            name             │                                                                                                                      

In [16]:
with duckdb.connect(str(DB_PATH)) as connection:
    connection.sql("describe table dca_operatingbusinesses_all").show(max_rows=100)

┌───────────────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│          column_name          │ column_type │  null   │   key   │ default │  extra  │
│            varchar            │   varchar   │ varchar │ varchar │ varchar │ varchar │
├───────────────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ WKT                           │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ dca_license_number            │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ license_type                  │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ license_expiration_date       │ DATE        │ YES     │ NULL    │ NULL    │ NULL    │
│ license_status                │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ license_creation_date         │ DATE        │ YES     │ NULL    │ NULL    │ NULL    │
│ industry                      │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ business_name                 

In [17]:
with duckdb.connect(str(DB_PATH)) as connection:
    connection.sql("select * from dca_operatingbusinesses_all").show()

┌────────────────────────────────────────────┬────────────────────┬──────────────┬─────────────────────────┬────────────────┬───────────────────────┬──────────────────────────────┬───────────────────────────────────────┬────────────────────────────────────┬──────────────────┬─────────────────────┬───────────────────────────────┬───────────────┬───────────────┬─────────────┬──────────────────────┬─────────────────┬──────────────┬─────────────────┬──────────────────┬─────────┬────────────┬─────────┬──────────────┬────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────┬──────────────────┬──────────────────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────┐
│                    WKT                     │ dca_license_number │ license_type │ license_expiration_date │ license_status │ license_creation_date │           industry           │             business_name        

In [19]:
with duckdb.connect(str(DB_PATH)) as connection:
    connection.sql("alter table dca_operatingbusinesses_all add column version varchar")
    connection.sql("update dca_operatingbusinesses_all set version = string_split(parse_dirpath(filename), '/')[-1]")
    connection.sql("select version, filename, from dca_operatingbusinesses_all").show()

┌──────────┬────────────────────────────────────────────────────────────────────────────────────────┐
│ version  │                                        filename                                        │
│ varchar  │                                        varchar                                         │
├──────────┼────────────────────────────────────────────────────────────────────────────────────────┤
│ 20210402 │ s3://edm-recipes/datasets/dca_operatingbusinesses/20210402/dca_operatingbusinesses.csv │
│ 20210402 │ s3://edm-recipes/datasets/dca_operatingbusinesses/20210402/dca_operatingbusinesses.csv │
│ 20210402 │ s3://edm-recipes/datasets/dca_operatingbusinesses/20210402/dca_operatingbusinesses.csv │
│ 20210402 │ s3://edm-recipes/datasets/dca_operatingbusinesses/20210402/dca_operatingbusinesses.csv │
│ 20210402 │ s3://edm-recipes/datasets/dca_operatingbusinesses/20210402/dca_operatingbusinesses.csv │
│ 20210402 │ s3://edm-recipes/datasets/dca_operatingbusinesses/20210402/dca_operat

In [30]:
with duckdb.connect(str(DB_PATH)) as connection:
  old_dca_operatingbusinesses = connection.sql("select * from dca_operatingbusinesses_all where version = '20230714' order by dca_license_number asc").df()
old_dca_operatingbusinesses

Unnamed: 0,WKT,dca_license_number,license_type,license_expiration_date,license_status,license_creation_date,industry,business_name,business_name_2,address_building,address_street_name,secondary_address_street_name,address_city,address_state,address_zip,contact_phone_number,address_borough,borough_code,community_board,council_district,bin,bbl,nta,census_tract,detail,longitude,latitude,location,filename,version
0,POINT (-73.9928206286642 40.723510517629),0000711-DCA,Business,2007-07-31,Inactive,1999-05-25,Secondhand Dealer - General,"M. LEVIN, INC.",,269,BOWERY,,NEW YORK,NY,10002,2126743579,Manhattan,1.0,103.0,1,1078071,1004270502,MN27,3601.0,,-73.992821,40.723511,"(40.723510517629045, -73.99282062866422)",s3://edm-recipes/datasets/dca_operatingbusines...,20230714
1,POINT (-73.9909619047039 40.7556130028902),0002902-DCA,Business,2024-04-30,Active,2007-04-18,Pawnbroker,"GEM FINANCIAL SERVICES, INC.",,608,8TH AVE,,NEW YORK,NY,10018,7182371166,Manhattan,1.0,105.0,03,1014495,1007890005,,,,-73.990962,40.755613,"(40.755613002890186, -73.99096190470391)",s3://edm-recipes/datasets/dca_operatingbusines...,20230714
2,POINT (-73.987179873865 40.733292338176),0006164-DCA,Business,2002-03-31,Inactive,1999-02-11,Newsstand,"SACKLER, ADELAIDE",,,3 AVENUE,EAST 14 STREET,NEW YORK,NY,10003,2125818546,Manhattan,1.0,,2,,,,,,-73.987180,40.733292,"(40.73329233817604, -73.98717987386496)",s3://edm-recipes/datasets/dca_operatingbusines...,20230714
3,POINT (-73.963805125184 40.7740303792405),0006840-DCA,Business,2017-07-31,Inactive,2007-11-02,Secondhand Dealer - General,FLORIAN PAPP INC,,962,MADISON AVE,,NEW YORK,NY,10021,2122886770,Manhattan,1.0,108.0,04,1072056,1013900056,MN40,130.0,,-73.963805,40.774030,"(40.77403037924055, -73.96380512518401)",s3://edm-recipes/datasets/dca_operatingbusines...,20230714
4,POINT (-73.9765016891177 40.7883117799114),0010669-DCA,Business,2024-03-31,Active,2002-03-25,Newsstand,"SAYYED, IQBAL",,S/E/C,BROADWAY,WEST 86 STREET,NEW YORK,NY,10024,6468532572,Manhattan,1.0,,6,,,,,,-73.976502,40.788312,"(40.78831177991138, -73.97650168911773)",s3://edm-recipes/datasets/dca_operatingbusines...,20230714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281239,,2115070-DCA,Individual,2024-02-28,Active,2023-07-13,Process Server Individual,"COLUMBIA, EDWARD",,,,,BROOKLYN,NY,11231,,,,,,,,,,,,,,s3://edm-recipes/datasets/dca_operatingbusines...,20230714
281240,POINT (-73.9397541328313 40.6601813773769),2115072-2-DCA,Business,2024-12-31,Active,2023-07-13,Tobacco Retail Dealer,RUTLAND DELI AND GRILL CORP,,592,ALBANY AVE,,BROOKLYN,NY,11203,,Brooklyn,3.0,309.0,40,3107065,3048040040,,,,-73.939754,40.660181,"(40.66018137737688, -73.93975413283127)",s3://edm-recipes/datasets/dca_operatingbusines...,20230714
281241,POINT (-73.8791130217252 40.8822434860754),2115073-DCA,Business,2025-02-28,Active,2023-07-13,Home Improvement Contractor,BROTHERS TOGETHER CORP,,3525,ROCHAMBEAU AVE,,BRONX,NY,10467,3473354426,Bronx,2.0,207.0,11,2017832,2033280179,,,,-73.879113,40.882243,"(40.88224348607542, -73.87911302172519)",s3://edm-recipes/datasets/dca_operatingbusines...,20230714
281242,,2115074-DCA,Business,2025-02-28,Active,2023-07-13,Home Improvement Contractor,TAKKEE INC.,,2696,BILLINGSLEY RD,,COLUMBUS,OH,43235,6142823188,Outside NYC,,,,,,,,,,,,s3://edm-recipes/datasets/dca_operatingbusines...,20230714


In [26]:
with duckdb.connect(str(DB_PATH)) as connection:
  new_dca_operatingbusinesses = connection.sql("select * from dca_operatingbusinesses_all where version = '20240809' order by dca_license_number asc").df()
new_dca_operatingbusinesses

Unnamed: 0,WKT,dca_license_number,license_type,license_expiration_date,license_status,license_creation_date,industry,business_name,business_name_2,address_building,address_street_name,secondary_address_street_name,address_city,address_state,address_zip,contact_phone_number,address_borough,borough_code,community_board,council_district,bin,bbl,nta,census_tract,detail,longitude,latitude,location,filename,version
0,POINT (-73.9928206286642 40.723510517629),0000711-DCA,Business,2007-07-31,Inactive,1999-05-25,Secondhand Dealer - General,"M. LEVIN, INC.",,269,BOWERY,,NEW YORK,NY,10002,2126743579,Manhattan,1.0,103.0,1,1078071,1004270502,MN27,3601.0,,-73.992821,40.723511,"(40.723510517629045, -73.99282062866422)",s3://edm-recipes/datasets/dca_operatingbusines...,20240809
1,POINT (-73.9909619047039 40.7556130028902),0002902-DCA,Business,2024-04-30,Active,2007-04-18,Pawnbroker,"GEM FINANCIAL SERVICES, INC.",,608,8TH AVE,,NEW YORK,NY,10018,7182371166,Manhattan,1.0,105.0,03,1014495,1007890005,,,,-73.990962,40.755613,"(40.755613002890186, -73.99096190470391)",s3://edm-recipes/datasets/dca_operatingbusines...,20240809
2,POINT (-73.987179873865 40.733292338176),0006164-DCA,Business,2002-03-31,Inactive,1999-02-11,Newsstand,"SACKLER, ADELAIDE",,,3 AVENUE,EAST 14 STREET,NEW YORK,NY,10003,2125818546,Manhattan,1.0,,2,,,,,,-73.987180,40.733292,"(40.73329233817604, -73.98717987386496)",s3://edm-recipes/datasets/dca_operatingbusines...,20240809
3,POINT (-73.963805125184 40.7740303792405),0006840-DCA,Business,2017-07-31,Inactive,2007-11-02,Secondhand Dealer - General,FLORIAN PAPP INC,,962,MADISON AVE,,NEW YORK,NY,10021,2122886770,Manhattan,1.0,108.0,04,1072056,1013900056,MN40,130.0,,-73.963805,40.774030,"(40.77403037924055, -73.96380512518401)",s3://edm-recipes/datasets/dca_operatingbusines...,20240809
4,POINT (-73.9765016891177 40.7883117799114),0010669-DCA,Business,2024-03-31,Active,2002-03-25,Newsstand,"SAYYED, IQBAL",,S/E/C,BROADWAY,WEST 86 STREET,NEW YORK,NY,10024,6468532572,Manhattan,1.0,,6,,,,,,-73.976502,40.788312,"(40.78831177991138, -73.97650168911773)",s3://edm-recipes/datasets/dca_operatingbusines...,20240809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281408,POINT (-73.8454403826724 40.6679488838184),2115285-DCA,Business,2025-02-28,Active,2023-07-21,Home Improvement Contractor,KB NY Consulting L.L.C.,,8940,151ST AVE,,HOWARD BEACH,NY,11414,6314579488,Queens,4.0,410.0,32,4434848,4114340025,,,,-73.845440,40.667949,"(40.667948883818354, -73.84544038267242)",s3://edm-recipes/datasets/dca_operatingbusines...,20240809
281409,POINT (-73.9341786734151 40.6597830737143),2115286-DCA,Business,2025-02-28,Active,2023-07-21,Home Improvement Contractor,Reliable Development and Construction LLC,,560,SCHENECTADY AVE,,BROOKLYN,NY,11203,8324398855,Brooklyn,3.0,309.0,41,3107661,3048260013,,,,-73.934179,40.659783,"(40.65978307371425, -73.93417867341513)",s3://edm-recipes/datasets/dca_operatingbusines...,20240809
281410,POINT (-73.9030255220042 40.8813822646715),2115287-DCA,Business,2025-02-28,Active,2023-07-21,Home Improvement Contractor,"Gigi's Home Improvement, LLC",,5680,BROADWAY,,BRONX,NY,10463,6464778293,Bronx,2.0,208.0,11,2016172,2032680001,,,,-73.903026,40.881382,"(40.881382264671494, -73.90302552200423)",s3://edm-recipes/datasets/dca_operatingbusines...,20240809
281411,POINT (-73.9007524984093 40.856068738716),2115288-DCA,Business,2025-02-28,Active,2023-07-21,Home Improvement Contractor,OJT CONSTRUCTION CORP,,2230,GRAND CONCOURSE,,BRONX,NY,10457,3329997474,Bronx,2.0,205.0,15,2013718,2031580001,,,,-73.900752,40.856069,"(40.856068738716, -73.90075249840933)",s3://edm-recipes/datasets/dca_operatingbusines...,20240809


In [32]:
diff_dca_operatingbusinesses = new_dca_operatingbusinesses.compare(old_dca_operatingbusinesses)
diff_dca_operatingbusinesses

ValueError: Can only compare identically-labeled (both index and columns) DataFrame objects