Basic Imports

In [1]:
from datetime import date
from dotenv import load_dotenv
from sqlalchemy import create_engine
import bs4 as bs
import ftplib
import gzip
import os
import pandas as pd
import psycopg2
import re
import sys
import time
import requests
import urllib.request
import pip
import zipfile



Auxiliary functions

In [2]:
def check_diff(url, file_name):
    '''
    Verifica se o arquivo no servidor existe no disco e se ele tem o mesmo
    tamanho no servidor.
    '''
    if not os.path.isfile(file_name):
        return True # ainda nao foi baixado

    response = requests.head(url)
    new_size = int(response.headers.get('content-length', 0))
    old_size = os.path.getsize(file_name)
    if new_size != old_size:
        os.remove(file_name)
        return True # tamanho diferentes

    return False # arquivos sao iguais

def makedirs(path):
    '''
    cria path caso seja necessario
    '''
    if not os.path.exists(path):
        os.makedirs(path)

def getEnv(env):
    return os.getenv(env)

Getting the directories for the data

In [3]:
local_env = r'C:\Users\Jeremias Junior\Documents\GitHub\gov_data'

dados_rf = 'http://200.152.38.155/CNPJ/'

raw_files = r'C:\Users\Jeremias Junior\Documents\GitHub\gov_data\data\raw_files'
extracted_files = r'C:\Users\Jeremias Junior\Documents\GitHub\gov_data\data\extracted_files'

raw_html = urllib.request.urlopen(dados_rf)
raw_html = raw_html.read()

# Formatar página e converter em string
page_items = bs.BeautifulSoup(raw_html, 'lxml')
html_str = str(page_items)

Files = []
text = '.zip'
for m in re.finditer(text, html_str):
    i_start = m.start()-40
    i_end = m.end()
    i_loc = html_str[i_start:i_end].find('href=')+6
    Files.append(html_str[i_start+i_loc:i_end])

Extracting files

In [None]:
i_l = 0
print('unzipping files')
for l in Files:

    try:
        i_l += 1
        
        print(str(i_l) + ' - ' + l)
        full_path = os.path.join(raw_files, l)
        with zipfile.ZipFile(full_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_files)
    except:
        pass

Setting up files and database

In [4]:
insert_start = time.time()

items = [name for name in os.listdir(extracted_files) if name.endswith('')]

files = dict()

files['empresa'] = list()
files['estabeleciemnto'] = list()
files['socios'] = list()
files['simples'] = list()
files['cnae'] = list()
files['moti'] = list()
files['munic'] = list()
files['natju'] = list()
files['pais'] = list()
files['quals'] = list()

for i in range(len(items)):

    if items[i].find('EMPRE') > -1:
        files['empresa'].append(items[i])
    if items[i].find('ESTABELE') > -1:
        files['estabeleciemnto'].append(items[i])
    if items[i].find('SOCIO') > -1:
        files['socios'].append(items[i])
    if items[i].find('SIMPLES') > -1:
        files['simples'].append(items[i])
    if items[i].find('CNAE') > -1:
        files['cnae'].append(items[i])
    if items[i].find('MOTI') > -1:
        files['moti'].append(items[i])
    if items[i].find('MUNIC') > -1:
        files['munic'].append(items[i])
    if items[i].find('NATJU') > -1:
        files['natju'].append(items[i])
    if items[i].find('PAIS') > -1:
        files['pais'].append(items[i])
    if items[i].find('QUALS') > -1:   
        files['quals'].append(items[i])

    else:
        pass

import pyodbc
import sqlalchemy

               
conn = pyodbc.connect(driver='{SQL Server}', 
                      server='(local)', 
                      database='gov_db',               
                      trusted_connection='yes')




Empresa

In [21]:
table_name = 'empresa'
table_columns = ['cnpj_basico', 
                       'razao_social', 
                       'natureza_juridica', 
                       'qualificacao_responsavel', 
                       'capital_social', 
                       'porte_empresa', 
                       'ente_federativo_responsavel']

cursor = conn.cursor()
cursor.execute('DROP TABLE IF EXISTS "empresa";')
conn.commit()


create_table= f"CREATE TABLE {table_name} ({', '.join([f'{col} VARCHAR(255)' for col in table_columns])})"
cursor.execute(create_table)
cursor.commit()


insertdata_query = f"INSERT INTO {table_name} ({', '.join(table_columns)}) VALUES ({', '.join(['?'] * len(table_columns))})"


for i in range(0, len(files['empresa'])):
    print('loading : ',files['empresa'][i])

    empresa = pd.DataFrame(columns=[0, 1, 2, 3, 4, 5, 6])
    empresa_dtypes = {0: 'object', 1: 'object', 2: 'object', 3: 'object', 4: 'object', 5: 'object', 6: 'object'}
    extracted_file_path = os.path.join(extracted_files, files['empresa'][i])

    empresa = pd.read_csv(filepath_or_buffer=extracted_file_path,
                          sep=';',
                          #nrows=100,
                          skiprows=0,
                          header=None,
                          dtype=empresa_dtypes,
                          encoding='latin-1',
                        )
    
    empresa = empresa.reset_index()

    del empresa['index']

    empresa.columns = ['cnpj_basico', 
                       'razao_social', 
                       'natureza_juridica', 
                       'qualificacao_responsavel', 
                       'capital_social', 
                       'porte_empresa', 
                       'ente_federativo_responsavel']
    
    #empresa['capital_social'] = empresa['capital_social'].apply(lambda x: x.replace(',','.'))
    empresa['ente_federativo_responsavel'] = empresa['ente_federativo_responsavel'] .astype(str)
    
    for values in empresa.values.tolist():
        cursor.execute(insertdata_query, values)
    print(i)

conn.commit()

    
    

empresa.head(20)
  




loading :  K3241.K03200Y1.D20910.EMPRECSV
['00000000', 'BANCO DO BRASIL SA', '2038', '10', '90000023475,34', '05', 'nan'] <class 'type'>


KeyboardInterrupt: 

In [19]:
empresa.values.tolist()[0]

['36627979',
 'LORENA MARIA DE BRITO CAMARGO 09776918700',
 '2135',
 '50',
 '4000,00',
 '01',
 'nan']

In [None]:
files