# Análise dos dados para predição


In [1]:
import os
import dotenv
import zipfile
import pandas as pd
import numpy as np
from datetime import date

import boto3
from io import BytesIO
import pickle

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import OneHotEncoder

import seaborn as sns
from matplotlib import pyplot as plt
import plotly.graph_objects as go

import pickle
import shutil

import locale
locale.setlocale(locale.LC_ALL, 'pt_BR.utf8')

import warnings
warnings.filterwarnings("ignore")

# Project folders are resolved against the directory the kernel was started in.
rootPath = os.getcwd()
dataPath, modelsPath, env = (
    os.path.join(rootPath, leaf) for leaf in ('data', 'models', '.env')
)
# Load the variables declared in the .env file; the call is the cell's last expression.
dotenv.load_dotenv(dotenv_path=env)

True

# 01) Importando dados

In [2]:
def formatar_moeda(valor):
    """Format a number as a currency string using the active locale.

    Delegates to ``locale.currency`` with digit grouping enabled, so the
    currency symbol and separators come from whatever locale is currently set.
    """
    texto = locale.currency(valor, grouping=True)
    return texto

def paste_intervalo(row):
    """Render the ``inf`` and ``sup`` entries of ``row`` as the text ``[inf, sup)``."""
    inferior, superior = row['inf'], row['sup']
    return f"[{inferior!s}, {superior!s})"

def up_s3_files(dataframe, bucket_name, folder_name, file_name):
    """Serialize ``dataframe`` as a ';'-separated CSV (no index) and upload it.

    The object key is ``folder_name`` immediately followed by ``file_name``
    (no separator is inserted between them). The upload goes through the
    module-level ``s3_resource`` object, which must exist before this is called.
    """
    payload = BytesIO()
    dataframe.to_csv(payload, sep=';', index=False)
    target = s3_resource.Object(bucket_name, folder_name + file_name)
    target.put(Body=payload.getvalue())

In [3]:
# Open the training archive stored in the data folder (read mode).
zip_file = os.path.join(dataPath, 'base_treino.zip')
z = zipfile.ZipFile(file=zip_file, mode='r')

In [4]:
def ler_bases_exportadas(nome_arquivo):
    """Load one CSV member of the open archive ``z`` as a DataFrame.

    The member is streamed straight out of the archive, so no temporary copy
    is written to the working directory. Nothing is left behind if parsing
    fails, and no same-named local file is overwritten or deleted.

    Parameters
    ----------
    nome_arquivo : str
        Name of the member inside the archive.

    Returns
    -------
    pandas.DataFrame
        The parsed comma-separated contents of the member.

    Raises
    ------
    KeyError
        If the archive has no member with that name.
    """
    with z.open(nome_arquivo) as arquivo:
        return pd.read_csv(arquivo, sep=',')

In [5]:
# Read the two CSV members of the training archive into DataFrames.
base_conjunta = ler_bases_exportadas(nome_arquivo='imovel_mercantil.csv')
base_notas_fiscais = ler_bases_exportadas(nome_arquivo='emissao_notas.csv')

In [6]:
# Standardize the identifier column name. Rebinding the result (instead of
# inplace=True) keeps the cell idempotent and avoids mutating the loaded frame.
base_conjunta = base_conjunta.rename(columns={'id_contribuinte': 'id_pessoa'})

In [41]:
# Show the column labels of the combined base.
base_conjunta.columns

Index(['cda', 'tipo_divida', 'id_pessoa', 'atividade_principal', 'situacao',
       'tipo_tributo', 'vlr_pago', 'valor_tot', 'vlr_tributo', 'vlr_taxa',
       'competencia_divida', 'inscricao_divida', 'arrecadacao_divida',
       'ajuizamento_divida', 'edificacao', 'cpf_cnpj_existe', 'protesto',
       'ajuizamento', 'refis', 'deb_totais', 'deb_pagos', 'idade_divida',
       'quantidade_reparcelamento', 'da_aberto'],
      dtype='object')

# Estudo sobre o histórico fechado da base

In [9]:
# Keep only the rows whose `da_aberto` flag equals 0, then display them.
da_fechada = base_conjunta.loc[base_conjunta['da_aberto'].eq(0)]
da_fechada

Unnamed: 0,cda,tipo_divida,id_pessoa,atividade_principal,situacao,tipo_tributo,vlr_pago,valor_tot,vlr_tributo,vlr_taxa,...,edificacao,cpf_cnpj_existe,protesto,ajuizamento,refis,deb_totais,deb_pagos,idade_divida,quantidade_reparcelamento,da_aberto
2,00000b44c5ba1e669ceed47545e621dd2,mercantil,96e8e553de69d7a4,COMERCIO VAREJISTA DE ARTIGOS DE ARMARINHO,INAPTO,ISS,0.00,1278.97,0.00,1278.97,...,0,1,0,0,0,4.0,0.0,21.0,0,0
4,0000331f601a73e52b46f673bf0c61251,imovel,870c08c252b25ad1,APARTAMENTO,ATIVO,IPTU,0.00,344.12,147.20,196.92,...,1,1,0,0,0,6.0,0.0,11.0,0,0
7,000040eda866e3d19ceed47545e621dd1,imovel,9f3bac8718dac1fa,LOJA,ATIVO,IPTU,5438.40,10779.87,10779.87,0.00,...,1,1,0,1,0,4.0,3.0,15.0,2,0
8,000040eda866e3d19ceed47545e621dd2,mercantil,2f8ae1b9606267b4,INSTRUTOR DE TREINAMENTOS,INAPTO,ISS,0.00,328.63,328.63,0.00,...,0,1,0,0,0,1.0,0.0,15.0,0,0
10,0000521b64bf28c8e52281a70553db461,imovel,b5df816e1786f0a3,CASA,ATIVO,IPTU,0.00,490.83,245.50,245.33,...,1,1,0,0,0,10.0,0.0,7.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575104,ffffd79d8aa612ac9fcacff532cf50c41,imovel,c6b16a1235d987d3,APARTAMENTO,ATIVO,IPTU,26104.42,26104.42,22190.68,3913.74,...,1,1,1,0,0,40.0,40.0,1.0,0,0
2575105,ffffd79d8aa612ac9fcacff532cf50c42,mercantil,cc5afb41f6fbc367,CONSULTORIA EM TECNOLOGIA DA INFORMAÇÃO,ATIVO,ISS,265.15,1431.77,1431.77,0.00,...,0,1,0,0,0,3.0,2.0,1.0,1,0
2575107,ffffd8af5760cada9fcacff532cf50c41,imovel,d411f54ad97d1f39,,ATIVO,IPTU,0.00,1728.13,790.85,937.28,...,0,0,0,0,0,19.0,0.0,4.0,0,0
2575108,ffffdb721b90282718565e26094f06ee2,mercantil,51d6043a4efa1ed1,PUBLICITARIO (NÍVEL MÉDIO),ATIVO,ISS,0.00,240.77,240.77,0.00,...,0,1,0,0,0,1.0,0.0,6.0,0,0


In [12]:
# Number of distinct `cda` values among the rows with `da_aberto` == 0.
da_fechada[['cda']].nunique()

cda    1608270
dtype: int64

Contexto das dívidas fechadas em que o contribuinte não pagou nenhum valor da dívida

In [24]:
# Rows of da_fechada where `vlr_pago` equals 0, followed by their count of distinct `cda` values.
da_fechada_0 = da_fechada.loc[da_fechada['vlr_pago'].eq(0)]
da_fechada_0[['cda']].nunique()

cda    989909
dtype: int64

In [23]:
# Sum of `valor_tot` over da_fechada_0, formatted as a currency string.
formatar_moeda(da_fechada_0['valor_tot'].sum())

'R$ 2.804.837.668,63'

Dívidas dos últimos 5 anos que se encontram fechadas, sem ação de protesto ou ajuizamento em que não houve nenhum tipo de pagamento do contribuinte

In [45]:
# From da_fechada_0, keep rows with `protesto` == 0, `ajuizamento` == 0 and `idade_divida` <= 5.
sem_protesto = da_fechada_0['protesto'].eq(0)
sem_ajuizamento = da_fechada_0['ajuizamento'].eq(0)
ate_5_anos = da_fechada_0['idade_divida'].le(5)
sem_acao = da_fechada_0.loc[sem_protesto & sem_ajuizamento & ate_5_anos]

# Display only the columns relevant to this check.
colunas_exibidas = [
    'cda', 'tipo_divida', 'atividade_principal', 'vlr_pago', 'valor_tot',
    'protesto', 'ajuizamento', 'inscricao_divida', 'idade_divida',
    'quantidade_reparcelamento',
]
sem_acao[colunas_exibidas]

Unnamed: 0,cda,tipo_divida,atividade_principal,vlr_pago,valor_tot,protesto,ajuizamento,inscricao_divida,idade_divida,quantidade_reparcelamento
11,000057f68f7b77276b3a8e268c80aedf1,imovel,CASA,0.0,1148.06,0,0,2018-07-07,5.0,0
15,0000681a0944cefb6b3a8e268c80aedf1,imovel,SALA,0.0,513.48,0,0,2019-06-28,4.0,0
42,000122d8f759a468d8a3e5cebc255ca61,imovel,APARTAMENTO,0.0,816.96,0,0,2019-06-28,4.0,0
51,0001413d2c1f2a0f9989bae6f4af91ee1,imovel,CASA,0.0,1626.23,0,0,2018-07-07,5.0,0
52,0001413d2c1f2a0f9989bae6f4af91ee2,mercantil,REPRESENTANTES COMERCIAIS E AGENTES DO COM DE ...,0.0,3708.55,0,0,2018-07-07,5.0,0
...,...,...,...,...,...,...,...,...,...,...
2575038,fffe2265ec22084db5eae1a923f0f8702,mercantil,SERVIÇOS DE MANUTENÇÃO E REPARAÇÃO MECÂNICA DE...,0.0,1856.54,0,0,2018-07-07,5.0,0
2575041,fffe2ed889b77e536b3a8e268c80aedf2,mercantil,MONTAGEM DE ESTRUTURAS METÁLICAS,0.0,1910.18,0,0,2018-07-07,5.0,0
2575057,fffe82ba945121484bd01ecb8277da0b1,imovel,SALA,0.0,649.65,0,0,2018-07-07,5.0,0
2575058,fffe82ba945121484bd01ecb8277da0b2,mercantil,Comercio varejista especializado de equipament...,0.0,3708.55,0,0,2018-07-07,5.0,0


In [44]:
# Sum of `valor_tot` over sem_acao, formatted as a currency string.
formatar_moeda(sem_acao['valor_tot'].sum())

'R$ 316.165.110,04'

In [43]:
# Export sem_acao to cdas_fechadas.csv (without the index column).
# The destination folder can be overridden with the VERIFICACAO_DIR environment
# variable; when it is unset, the original local folder is used, so the default
# behavior is unchanged.
verificacao_dir = os.environ.get(
    'VERIFICACAO_DIR',
    r'C:\Users\Consultor\Documents\bases_pesquisa\verificacao',
)
sem_acao.to_csv(os.path.join(verificacao_dir, 'cdas_fechadas.csv'), index=False)

### São 989.909 CDAS que correspondem a R$ 2.804.837.668,63 que não foram pagos e se encontram no status de fechadas

Contexto das dívidas fechadas em que o contribuinte pagou apenas parte do valor da dívida

In [25]:
# Rows of da_fechada where something was paid (`vlr_pago` > 0) but less than `valor_tot`.
pagou_algo = da_fechada['vlr_pago'].gt(0)
falta_pagar = da_fechada['valor_tot'].gt(da_fechada['vlr_pago'])
da_fechada_parcial = da_fechada.loc[pagou_algo & falta_pagar]
da_fechada_parcial[['cda']].nunique()

cda    322750
dtype: int64

In [26]:
# Sum of `valor_tot` over da_fechada_parcial, formatted as a currency string.
formatar_moeda(da_fechada_parcial['valor_tot'].sum())

'R$ 7.947.880.871,27'

In [27]:
# Sum of `vlr_pago` over da_fechada_parcial, formatted as a currency string.
formatar_moeda(da_fechada_parcial['vlr_pago'].sum())

'R$ 1.449.637.027,28'

### São 322.750 CDAS que correspondem a R$ 7.947.880.871,27, dos quais apenas R$ 1.449.637.027,28 foram pagos, e que se encontram no status de fechadas

In [28]:
# Rows of da_fechada where `vlr_pago` is exactly equal to `valor_tot`.
da_fechada_pago = da_fechada.loc[da_fechada['vlr_pago'].eq(da_fechada['valor_tot'])]
da_fechada_pago[['cda']].nunique()

cda    295616
dtype: int64

In [29]:
# Sum of `valor_tot` over da_fechada_pago, formatted as a currency string.
formatar_moeda(da_fechada_pago['valor_tot'].sum())

'R$ 999.009.494,49'

### São 295.616 CDAS que correspondem a R$ 999.009.494,49 que foram pagos e se encontram no status de fechadas

## Métricas de anos dentro de contextos específicos

In [34]:
# Average debt age, in years, across the da_fechada dataframe
media_idade_divida = da_fechada['idade_divida'].mean()
media_idade_divida

10.648363769765027

In [36]:
# How many debts there are for each value of `idade_divida`
contagem_idade_divida = da_fechada['idade_divida'].value_counts()
contagem_idade_divida.to_frame()

Unnamed: 0,idade_divida
9.0,186226
7.0,152500
6.0,151438
5.0,149449
10.0,117613
11.0,112255
4.0,89694
3.0,89665
19.0,48805
22.0,46067


In [37]:
# Rows of da_fechada whose `idade_divida` is at most 10.
filtro_10_anos = da_fechada.loc[da_fechada['idade_divida'].le(10)]
filtro_10_anos

Unnamed: 0,cda,tipo_divida,id_pessoa,atividade_principal,situacao,tipo_tributo,vlr_pago,valor_tot,vlr_tributo,vlr_taxa,...,edificacao,cpf_cnpj_existe,protesto,ajuizamento,refis,deb_totais,deb_pagos,idade_divida,quantidade_reparcelamento,da_aberto
10,0000521b64bf28c8e52281a70553db461,imovel,b5df816e1786f0a3,CASA,ATIVO,IPTU,0.00,490.83,245.50,245.33,...,1,1,0,0,0,10.0,0.0,7.0,0,0
11,000057f68f7b77276b3a8e268c80aedf1,imovel,e3df827c83c5efa8,CASA,ATIVO,IPTU,0.00,1148.06,555.67,592.39,...,1,1,0,0,0,18.0,0.0,5.0,0,0
14,0000662885c684004bd01ecb8277da0b2,mercantil,bac1362812257f05,INSTALAÇÃO E MANUTENÇÃO ELÉTRICA,INAPTO,ISS,0.00,2894.57,0.00,2894.57,...,0,1,0,0,0,2.0,0.0,9.0,0,0
15,0000681a0944cefb6b3a8e268c80aedf1,imovel,47d6248500cb7b7d,SALA,ATIVO,IPTU,0.00,513.48,264.24,249.24,...,1,1,0,0,0,7.0,0.0,4.0,0,0
17,00007af008add2c7e52281a70553db461,imovel,d22c7618647dcf14,CASA,ATIVO,IPTU,0.00,930.47,745.57,184.90,...,1,0,0,0,0,16.0,0.0,9.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575103,ffffd120a8f3f2799fcacff532cf50c41,imovel,934532c6c448a899,CASA,ATIVO,IPTU,0.00,802.85,617.95,184.90,...,1,1,0,0,0,16.0,0.0,9.0,0,0
2575104,ffffd79d8aa612ac9fcacff532cf50c41,imovel,c6b16a1235d987d3,APARTAMENTO,ATIVO,IPTU,26104.42,26104.42,22190.68,3913.74,...,1,1,1,0,0,40.0,40.0,1.0,0,0
2575105,ffffd79d8aa612ac9fcacff532cf50c42,mercantil,cc5afb41f6fbc367,CONSULTORIA EM TECNOLOGIA DA INFORMAÇÃO,ATIVO,ISS,265.15,1431.77,1431.77,0.00,...,0,1,0,0,0,3.0,2.0,1.0,1,0
2575107,ffffd8af5760cada9fcacff532cf50c41,imovel,d411f54ad97d1f39,,ATIVO,IPTU,0.00,1728.13,790.85,937.28,...,0,0,0,0,0,19.0,0.0,4.0,0,0


In [38]:
# Rows of da_fechada whose `idade_divida` is at most 5.
filtro_5_anos = da_fechada.loc[da_fechada['idade_divida'].le(5)]
filtro_5_anos

Unnamed: 0,cda,tipo_divida,id_pessoa,atividade_principal,situacao,tipo_tributo,vlr_pago,valor_tot,vlr_tributo,vlr_taxa,...,edificacao,cpf_cnpj_existe,protesto,ajuizamento,refis,deb_totais,deb_pagos,idade_divida,quantidade_reparcelamento,da_aberto
11,000057f68f7b77276b3a8e268c80aedf1,imovel,e3df827c83c5efa8,CASA,ATIVO,IPTU,0.00,1148.06,555.67,592.39,...,1,1,0,0,0,18.0,0.0,5.0,0,0
15,0000681a0944cefb6b3a8e268c80aedf1,imovel,47d6248500cb7b7d,SALA,ATIVO,IPTU,0.00,513.48,264.24,249.24,...,1,1,0,0,0,7.0,0.0,4.0,0,0
21,0000905d980177f36b3a8e268c80aedf1,imovel,f08282eba436ba83,APARTAMENTO,ATIVO,IPTU,528.49,528.49,272.20,256.29,...,1,1,1,0,0,14.0,14.0,3.0,0,0
22,0000a43dae672b166b3a8e268c80aedf1,imovel,f5fe60dbb517e68a,APARTAMENTO,ATIVO,IPTU,1692.36,1692.36,1692.36,0.00,...,1,1,1,0,0,5.0,5.0,1.0,0,0
24,0000a43dae672b166b3a8e268c80aedf2,mercantil,94ba02ded51129a5,RESTAURANTES E SIMILARES,ATIVO,ISS,1928.97,1928.97,1928.97,0.00,...,0,1,0,0,0,6.0,6.0,1.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575058,fffe82ba945121484bd01ecb8277da0b2,mercantil,5662d0e57e72d351,Comercio varejista especializado de equipament...,SUSPENSO,ISS,0.00,3708.55,0.00,3708.55,...,0,1,0,0,0,4.0,0.0,5.0,0,0
2575073,fffebfc302df64ae6b3a8e268c80aedf1,imovel,8455f043f0c21d9f,APARTAMENTO,ATIVO,IPTU,824.54,824.54,359.25,465.29,...,1,1,0,0,0,16.0,16.0,5.0,0,0
2575104,ffffd79d8aa612ac9fcacff532cf50c41,imovel,c6b16a1235d987d3,APARTAMENTO,ATIVO,IPTU,26104.42,26104.42,22190.68,3913.74,...,1,1,1,0,0,40.0,40.0,1.0,0,0
2575105,ffffd79d8aa612ac9fcacff532cf50c42,mercantil,cc5afb41f6fbc367,CONSULTORIA EM TECNOLOGIA DA INFORMAÇÃO,ATIVO,ISS,265.15,1431.77,1431.77,0.00,...,0,1,0,0,0,3.0,2.0,1.0,1,0


---

# Estudo sobre o histórico aberto da base

In [31]:
# Keep only the rows whose `da_aberto` flag equals 1, then display them.
da_aberta = base_conjunta.loc[base_conjunta['da_aberto'].eq(1)]
da_aberta

Unnamed: 0,cda,tipo_divida,id_pessoa,atividade_principal,situacao,tipo_tributo,vlr_pago,valor_tot,vlr_tributo,vlr_taxa,...,edificacao,cpf_cnpj_existe,protesto,ajuizamento,refis,deb_totais,deb_pagos,idade_divida,quantidade_reparcelamento,da_aberto
0,00000123c3d4731c6b3a8e268c80aedf1,imovel,ac28642d7c82b33f,APARTAMENTO,ATIVO,IPTU,0.0,2048.73,762.08,1286.65,...,1,1,0,0,0,40.0,0.0,1.0,0,1
1,00000123c3d4731c6b3a8e268c80aedf2,mercantil,fc4b99b807fbed41,ATIVIDADES DE TELEATENDIMENTO,SUSPENSO,ISS,0.0,2515.85,0.00,2515.85,...,0,1,0,0,0,3.0,0.0,1.0,0,1
3,000014e359592e62d8a3e5cebc255ca6E,mercantil,6dbe14da38a31dc1,Comercio varejista especializado de equipament...,ATIVO,ISS,0.0,847.02,847.02,0.00,...,0,1,0,0,0,1.0,0.0,0.0,0,1
5,0000331f601a73e52b46f673bf0c61252,mercantil,e8424494daac9641,"COM VAR DE MERC EM GERAL, COM PREDOM DE PROD A...",INAPTO,ISS,0.0,5385.81,0.00,5385.81,...,0,1,0,1,0,12.0,0.0,11.0,0,1
6,00003d46e618da886b3a8e268c80aedf1,imovel,d8b23eda9800b9e3,CASA,EM PROCESSO DE BAIXA,IPTU,0.0,6434.96,3409.70,3025.26,...,1,0,0,1,0,60.0,0.0,12.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575101,ffffca43e897bbf0d8a3e5cebc255ca61,imovel,cad70f5b8701af03,CASA,ATIVO,IPTU,0.0,5082.97,2240.39,2842.58,...,1,0,0,1,0,40.0,0.0,11.0,0,1
2575102,ffffca43e897bbf0d8a3e5cebc255ca62,mercantil,e750ab1d88feb1ba,ATIVIDADES DE ASSOCIAÇÕES DE DEFESA DE DIREITO...,ATIVO,ISS,0.0,7634.36,0.00,7634.36,...,0,1,0,1,0,8.0,0.0,11.0,0,1
2575106,ffffd79d8aa612ac9fcacff532cf50c42,mercantil,cc5afb41f6fbc367,CONSULTORIA EM TECNOLOGIA DA INFORMAÇÃO,ATIVO,ISS,0.0,1080.38,1080.38,0.00,...,0,1,0,0,0,1.0,0.0,1.0,1,1
2575109,ffffdc78aa7a90e26b3a8e268c80aedf1,imovel,9b512f210b460120,APARTAMENTO,ATIVO,IPTU,0.0,9390.43,5113.61,4276.82,...,1,1,0,1,0,30.0,0.0,25.0,0,1


In [32]:
# Number of distinct `cda` values among the rows with `da_aberto` == 1.
da_aberta[['cda']].nunique()

cda    966842
dtype: int64