In [1]:
import os
import zipfile
import duckdb
import pandas as pd
from time import time

conn = duckdb.connect('cnpj.db')

# Extracting Files

In [5]:
# unzip all .zip files in the current directory and save them with the same name in the unzipped folder in a new directory called unzipped
# Source: https://dados.gov.br/dados/conjuntos-dados/cadastro-nacional-da-pessoa-juridica---cnpj
folders = ()

# if there is a folder called unzipped, delete it
if os.path.exists('unzipped'):
    os.system('rm -r unzipped')
    print('unzipped folder deleted')

#for file in 'zipped'folder

for file in os.listdir(os.getcwd() + '/zipped'):
    if file.endswith('.zip'):
        new_file_name = file.replace('.zip', '')
        folder_name = new_file_name[0:-1]
        folders += (folder_name,)
        with zipfile.ZipFile(f'zipped/{file}', 'r') as zip_ref:
            zip_ref.extractall(f'unzipped/{folder_name}')

folders = list(set(folders))
print(folders)

unzipped folder deleted
['Cnae', 'Socios', 'Empresas', 'Estabelecimentos']


In [6]:
for folder in folders:
    counter = 0
    file_list = os.listdir(f'unzipped/{folder}')
    for file in file_list:
        os.rename(f'unzipped/{folder}/{file}', f'unzipped/{folder}/{counter}.csv')
        counter = counter + 1
print('unzipped files renamed')

unzipped files renamed


# Creating and populating tables

In [8]:
empresas_columns = ['cnpj_base', 'social_name', 'legal_nature', 'responsible_qualification', 'capital', 'company_size', 'federal_entity_responsible']
cnaes_columns = ['code', 'description']
estabelecimentos_columns = ['cnpj_base', 'cnpj_order', 'cnpj_dv', 'identifier', 'fantasy_name', 'registration_status', 'registration_date', 'registration_reason', 'city_name', 'country_code', 'activity_start_date', 'main_activity', 'secondary_activity', 'street_type', 'street_name', 'number', 'complement', 'neighborhood', 'zip_code', 'state', 'city_code', 'ddd1', 'phone1', 'ddd2', 'phone2', 'fax_ddd', 'fax', 'email', 'special_status', 'special_status_date']
socios_columns = ['cnpj_base', 'socio_identifier', 'socio_name', 'socio_cnpj_cpf', 'socio_qualification', 'entry_date', 'country_code', 'legal_representative_cpf', 'legal_representative_name', 'legal_representative_qualification', 'age_range']

In [9]:
# drop tables if they exist
conn.execute('DROP TABLE IF EXISTS empresas')
conn.execute('DROP TABLE IF EXISTS cnae')
conn.execute('DROP TABLE IF EXISTS estabelecimentos')
conn.execute('DROP TABLE IF EXISTS socios')
# create tables
conn.execute(f'CREATE TABLE IF NOT EXISTS cnae ({", ".join([f"{column} STRING" for column in cnaes_columns])})')
conn.execute(f'CREATE TABLE IF NOT EXISTS empresas ({", ".join([f"{column} STRING" for column in empresas_columns])})')
conn.execute(f'CREATE TABLE IF NOT EXISTS estabelecimentos ({", ".join([f"{column} STRING" for column in estabelecimentos_columns])})')
conn.execute(f'CREATE TABLE IF NOT EXISTS socios ({", ".join([f"{column} STRING" for column in socios_columns])})')

<duckdb.duckdb.DuckDBPyConnection at 0x11cc97cf0>

In [3]:
def add_csv_to_table(folder):
    print(folder)
    counter = 0
    for file in os.listdir(f'unzipped/{folder}'):
        if file.endswith('.csv'):
            df = pd.read_csv(f'unzipped/{folder}/{file}', sep=';', encoding='latin1', dtype=str)
            df = df.fillna('null')
            conn.register(f'{folder}_{counter}', df)
            counter += 1
    
    for i in range(counter):
        conn.execute(f'INSERT INTO {folder} SELECT * FROM {folder}_{i}')
        conn.execute(f'DROP VIEW {folder}_{i}')

In [13]:
for folder in folders:
        add_csv_to_table(folder)
        print(f"{conn.execute(f'SELECT COUNT(*) FROM {folder}').fetchone()[0]} rows added to {folder} table")

Cnae
5432 rows added to Cnae table
Socios
23765603 rows added to Socios table
Empresas
56554309 rows added to Empresas table
Estabelecimentos
59495409 rows added to Estabelecimentos table
