# DuckDBManager example usages

## Initializing

In [3]:
# Project-local module that provides the DuckDBManager class.
import duckdb_manager as ddb

# Shared manager instance; later cells call its methods (save_to_csv,
# check_row_length, get_schema).
db_manager = ddb.DuckDBManager()

### Note: If the program can't find the DuckDB file, it will automatically recreate the database using 'import_all_csv_files'.

### This will not work if you don't have the necessary CSV files. If for whatever reason you don't have the data, run the 'populate_database' notebook to get it.


## Saving the database as csv files

### Since GitHub does not allow users to upload files larger than 100 MB, the DuckDB database has to be split into separate CSV files. However, I don't recommend using the CSV files directly as pandas dataframes, since performance becomes an issue when CSV files are too large.

In [2]:
# Export the database to CSV files. In the recorded run below this took
# about 54 seconds for 12 DataEntry files.
db_manager.save_to_csv()

Exporting DataEntry CSV files: 100%|██████████| 12/12 [00:54<00:00,  4.55s/it]


### The 'check_row_length' function takes a table name as a string and returns the number of rows in that table.

In [2]:
# Report the row count of every table by looping over the table names
# rather than repeating the same print statement once per table.
for table_name in ('DataEntry', 'DimZipCode', 'DimYear', 'DimNaics'):
    print(f"{table_name} number of rows: {db_manager.check_row_length(table_name)}")

DataEntry number of rows: 36216155
DimZipCode number of rows: 39331
DimYear number of rows: 12
DimNaics number of rows: 2216


## Export to nested zip folders

In [1]:
# Standard library first, then the project-local exporter module.
import os

import dataexporter as dex

# Export destination and DuckDB file path handed to the exporter.
export_dir = 'database/nested_zip'
db_path = 'database/us_economic_data.duckdb'

db_exporter = dex.DataExporter(
    export_dir=export_dir,
    db_path=db_path,
    threads=6,
)

# Use export_geo_year_data(year) instead if you're querying for new data.
# NOTE: the recorded run below estimated several hours for the full export
# and was interrupted by the user.
db_exporter.export_all_geo_year_data()

Exporting data:   1%|          | 13886/1179930 [05:32<7:45:28, 41.75it/s] 
INFO:root:Exporting process was interrupted by user.


Interrupted by user. Exiting...


### The 'get_schema' function gives us the database schema

In [4]:
# Bare last expression so the notebook displays the result: a dict mapping
# each table name to a list of (column name, column type) tuples.
db_manager.get_schema()

{'DataEntry': [('EntryID', 'INTEGER'),
  ('GeoID', 'VARCHAR'),
  ('NaicsCode', 'VARCHAR'),
  ('Year', 'INTEGER'),
  ('Establishments', 'INTEGER'),
  ('Employees', 'INTEGER'),
  ('Payroll', 'INTEGER'),
  ('IndustryLevel', 'INTEGER')],
 'DimNaics': [('NaicsCode', 'VARCHAR'), ('industry_detail', 'VARCHAR')],
 'DimYear': [('Year', 'INTEGER'), ('YearDescription', 'VARCHAR')],
 'DimZipCode': [('GeoID', 'VARCHAR'),
  ('City', 'VARCHAR'),
  ('State', 'VARCHAR')]}

### If, in the future, you need to query the data from the database directly, you can use the DataQueryManager.

In [1]:
import query as q

# Export directory and DuckDB file path handed to the query manager.
export_dir = 'database/nested_zip'
db_path = 'database/us_economic_data.duckdb'

dqm = q.DataQueryManager(
    export_dir=export_dir,
    db_path=db_path,
)


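### The 'execute_query' function allows you to create a custom query and get a dataframe back. If needed, you can also save the pandas dataframe as a CSV file. The examples below use the 'filter' function, which is called with a zipcode and, optionally, an industry_level and a year.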

In [2]:
# Narrow by zip code, industry level, and year. The recorded run also
# reported writing the result to a CSV file (see the output below).
dqm.filter(zipcode="21076", industry_level=2, year=2016)

Data written to zip\2\1\0\7\6\US-21076-census-naics2-zipcode-2016.csv


Unnamed: 0,EntryID,GeoID,NaicsCode,Year,Establishments,Employees,Payroll,IndustryLevel
0,21045615,21076,23,2016,41,0,0,2
1,21859697,21076,42,2016,64,0,0,2
2,23159158,21076,51,2016,31,0,0,2
3,23308083,21076,52,2016,22,0,0,2
4,23762110,21076,54,2016,159,0,0,2
5,23552403,21076,53,2016,36,0,0,2
6,24151381,21076,55,2016,17,0,0,2
7,24191074,21076,56,2016,65,0,0,2
8,24471457,21076,61,2016,8,0,0,2
9,24582654,21076,62,2016,52,0,0,2


In [3]:
# Without industry_level, the recorded output includes more than one
# industry level (2 and 6 are visible) for the given zip code and year.
dqm.filter(zipcode="21076", year=2016)

Unnamed: 0,EntryID,GeoID,NaicsCode,Year,Establishments,Employees,Payroll,IndustryLevel
0,21045615,21076,23,2016,41,0,0,2
1,21859697,21076,42,2016,64,0,0,2
2,23159158,21076,51,2016,31,0,0,2
3,23308083,21076,52,2016,22,0,0,2
4,23762110,21076,54,2016,159,0,0,2
...,...,...,...,...,...,...,...,...
411,25764184,21076,813910,2016,1,0,0,6
412,25666837,21076,813110,2016,7,0,0,6
413,25771950,21076,813920,2016,3,0,0,6
414,25776377,21076,813930,2016,2,0,0,6


In [4]:
# Without year, the recorded output spans several years (2012 through 2021
# are visible) for the given zip code and industry level.
dqm.filter(zipcode="21076", industry_level=2)

Unnamed: 0,EntryID,GeoID,NaicsCode,Year,Establishments,Employees,Payroll,IndustryLevel
0,1706179,21076,23,2012,40,0,0,2
1,2520833,21076,42,2012,75,0,0,2
2,3822561,21076,51,2012,32,0,0,2
3,3968212,21076,52,2012,23,0,0,2
4,4424233,21076,54,2012,121,0,0,2
...,...,...,...,...,...,...,...,...
274,35325542,21076,72,2021,110,0,0,2
275,35105393,21076,62,2021,46,0,0,2
276,35430457,21076,81,2021,65,0,0,2
277,36187920,21076,00,2021,1000,30816,2246409,2


In [5]:
# Only the zip code is given; the recorded output mixes years and industry
# levels for that zip code.
dqm.filter(zipcode="21076")

Unnamed: 0,EntryID,GeoID,NaicsCode,Year,Establishments,Employees,Payroll,IndustryLevel
0,1706179,21076,23,2012,40,0,0,2
1,2520833,21076,42,2012,75,0,0,2
2,3822561,21076,51,2012,32,0,0,2
3,3968212,21076,52,2012,23,0,0,2
4,4424233,21076,54,2012,121,0,0,2
...,...,...,...,...,...,...,...,...
3419,35558788,21076,813110,2021,7,0,0,6
3420,35462798,21076,811111,2021,8,0,0,6
3421,35488855,21076,811490,2021,3,0,0,6
3422,35501572,21076,812112,2021,5,0,0,6
