## Setup

In [1]:
from pathlib import Path
import duckdb
from dcpy.utils import duckdb as dcpduckdb
import pandas as pd

# Let pandas display up to 50 columns before truncating wide DataFrames.
pd.set_option("display.max_columns", 50)

In [2]:
# DuckDB database file that the cells in this notebook connect to
# (relative path, so it resolves against the current working directory).
DB_PATH = Path("facdb_qa.db")

In [4]:
# Optional reset step (currently disabled): uncomment the lines below to
# remove the database file and recreate it with the spatial extension.

# # delete the database if it exists
# DB_PATH.unlink(missing_ok=True)

# # create the database and install/load the spatial extension
# with duckdb.connect(str(DB_PATH)) as connection:
#     connection.sql(f"INSTALL spatial")
#     connection.sql(f"LOAD spatial")


In [None]:
# Test cell: configure the S3 secret, then describe the schema of one CSV
# read directly from the recipes bucket.
# NOTE(review): setup_s3_secret is handed DB_PATH rather than the open
# connection - confirm that matches the helper's signature.
sample_csv_describe = "DESCRIBE TABLE 's3://edm-recipes/datasets/test_nypl_libraries/20240124/test_nypl_libraries.csv'"
with duckdb.connect(str(DB_PATH)) as con:
    dcpduckdb.setup_s3_secret(DB_PATH)
    con.sql(sample_csv_describe).show()

## `dpr_parksproperties`

In [None]:
# Load every archived version of dpr_parksproperties from S3 into one table.
# `union_by_name` aligns columns across versions whose schemas differ, and
# `filename = true` records the source path of each row so the version can be
# derived from it later.
with duckdb.connect(str(DB_PATH)) as connection:
    dcpduckdb.setup_s3_secret(DB_PATH)
    # `if not exists` keeps this cell re-runnable: a plain `create table`
    # raises once the table is already in the database file. To refresh the
    # data, drop the table first (as is done for dca_operatingbusinesses_all).
    connection.sql(
        """
            create table if not exists dpr_parksproperties_all as
            select * from
            read_csv(
                's3://edm-recipes/datasets/dpr_parksproperties/*/dpr_parksproperties.csv',
                union_by_name = true,
                filename = true
            )
        """
    )

In [None]:
# List every table currently stored in the database.
with duckdb.connect(str(DB_PATH)) as con:
    all_tables = con.sql("SHOW ALL TABLES")
    all_tables.show()

In [None]:
# Print the column names and types of the combined table (up to 100 rows of schema).
with duckdb.connect(str(DB_PATH)) as con:
    dpr_schema = con.sql("describe table dpr_parksproperties_all")
    dpr_schema.show(max_rows=100)

In [None]:
# Preview the rows of the combined table.
with duckdb.connect(str(DB_PATH)) as con:
    dpr_rows = con.sql("select * from dpr_parksproperties_all")
    dpr_rows.show()

In [None]:
# Derive a `version` column from the source path of each row: the version is
# the last component of the directory that contains the CSV file.
with duckdb.connect(str(DB_PATH)) as connection:
    # The column has to exist before the update below can run. `if not exists`
    # makes the statement safe both on a freshly created table and on re-runs.
    connection.sql(
        "alter table dpr_parksproperties_all add column if not exists version varchar"
    )
    connection.sql(
        "update dpr_parksproperties_all set version = string_split(parse_dirpath(filename), '/')[-1]"
    )
    # Spot-check the derived version against the file path it came from.
    connection.sql(
        "select version, filename, location, from dpr_parksproperties_all"
    ).show()

In [None]:
# Pull the 20240109 version into a DataFrame as the "old" side of the comparison.
old_dpr_query = "select * from dpr_parksproperties_all where version = '20240109'"
with duckdb.connect(str(DB_PATH)) as con:
    old_dpr_parksproperties = con.sql(old_dpr_query).df()
old_dpr_parksproperties

In [None]:
# Pull the 20240814 version into a DataFrame as the "new" side of the comparison.
new_dpr_query = "select * from dpr_parksproperties_all where version = '20240814'"
with duckdb.connect(str(DB_PATH)) as con:
    new_dpr_parksproperties = con.sql(new_dpr_query).df()
new_dpr_parksproperties

### `typecategory`

In [None]:
# `DataFrame.compare` only accepts identically-labelled frames, so it raises
# whenever the two versions differ in row count, and the `filename`/`version`
# columns differ on every row regardless. Compare how often each
# `typecategory` value occurs in the two versions instead.
typecategory_counts = (
    pd.concat(
        {
            "new": new_dpr_parksproperties["typecategory"].value_counts(dropna=False),
            "old": old_dpr_parksproperties["typecategory"].value_counts(dropna=False),
        },
        axis=1,
    )
    # A category missing from one version shows up as NaN after alignment.
    .fillna(0)
    .astype(int)
)
typecategory_counts.assign(
    change=typecategory_counts["new"] - typecategory_counts["old"]
)

## `dca_operatingbusinesses`

In [12]:
# Remove any previous copy of the table so it can be created again from S3.
drop_dca_table = "drop table if exists dca_operatingbusinesses_all"
with duckdb.connect(str(DB_PATH)) as con:
    con.sql(drop_dca_table)

In [None]:
# Load every archived version of dca_operatingbusinesses from S3 into one
# table. Columns are matched by name across files, and each row keeps the
# path of the file it was read from in a `filename` column.
with duckdb.connect(str(DB_PATH)) as con:
    dcpduckdb.setup_s3_secret(DB_PATH)
    con.sql(
        """
            create table dca_operatingbusinesses_all as
            select * from
            read_csv(
                's3://edm-recipes/datasets/dca_operatingbusinesses/*/dca_operatingbusinesses.csv',
                union_by_name = true,
                filename = true
            )
        """
    )

In [None]:
# List every table currently stored in the database.
with duckdb.connect(str(DB_PATH)) as con:
    all_tables = con.sql("SHOW ALL TABLES")
    all_tables.show()

In [None]:
# Print the column names and types of the combined table (up to 100 rows of schema).
with duckdb.connect(str(DB_PATH)) as con:
    dca_schema = con.sql("describe table dca_operatingbusinesses_all")
    dca_schema.show(max_rows=100)

In [None]:
# Preview the rows of the combined table.
with duckdb.connect(str(DB_PATH)) as con:
    dca_rows = con.sql("select * from dca_operatingbusinesses_all")
    dca_rows.show()

In [None]:
# Derive a `version` column from the source path of each row: the version is
# the last component of the directory that contains the CSV file.
with duckdb.connect(str(DB_PATH)) as connection:
    # `if not exists` keeps this cell re-runnable: a plain `add column` raises
    # on the second run because the column is already there.
    connection.sql(
        "alter table dca_operatingbusinesses_all add column if not exists version varchar"
    )
    connection.sql(
        "update dca_operatingbusinesses_all set version = string_split(parse_dirpath(filename), '/')[-1]"
    )
    # Spot-check the derived version against the file path it came from.
    connection.sql("select version, filename, from dca_operatingbusinesses_all").show()

In [None]:
# Row count per archived version, ordered by version.
version_counts_query = "select version, count(*) from dca_operatingbusinesses_all group by version order by version"
with duckdb.connect(str(DB_PATH)) as con:
    dca_operatingbusinesses_versions = con.sql(version_counts_query).df()
dca_operatingbusinesses_versions

In [None]:
# All rows, across every version, whose industry is 'Scrap Metal Processor'.
scrap_metal_query = "select * from dca_operatingbusinesses_all where industry = 'Scrap Metal Processor'"
with duckdb.connect(str(DB_PATH)) as con:
    dca_operatingbusinesses_scrap_metal = con.sql(scrap_metal_query).df()
dca_operatingbusinesses_scrap_metal

In [None]:
# Number of scrap-metal rows in each version.
dca_operatingbusinesses_scrap_metal.value_counts(subset="version")

In [None]:
# Rows of the 20230714 version, sorted by licence number.
old_dca_query = "select * from dca_operatingbusinesses_all where version = '20230714' order by dca_license_number asc"
with duckdb.connect(str(DB_PATH)) as con:
    old_dca_operatingbusinesses = con.sql(old_dca_query).df()
old_dca_operatingbusinesses

In [None]:
# Rows of the 20240809 version, sorted by licence number.
new_dca_query = "select * from dca_operatingbusinesses_all where version = '20240809' order by dca_license_number asc"
with duckdb.connect(str(DB_PATH)) as con:
    new_dca_operatingbusinesses = con.sql(new_dca_query).df()
new_dca_operatingbusinesses

In [None]:
# Number of rows per industry in the 20240809 version.
new_dca_operatingbusinesses.value_counts(subset="industry")

In [None]:
# Rows of the 20241018 version, sorted by licence number.
newest_dca_query = "select * from dca_operatingbusinesses_all where version = '20241018' order by dca_license_number asc"
with duckdb.connect(str(DB_PATH)) as con:
    new_new_dca_operatingbusinesses = con.sql(newest_dca_query).df()
new_new_dca_operatingbusinesses