# Obtenção de rating do contribuinte

Com o objetivo de direcionar o modelo que ditá quais dívidas são melhores de recuperar, será realizado um rating que envolva apenas as variáveis do contribuinte.

In [1]:
import os
import dotenv
import zipfile
import pandas as pd
import numpy as np
from datetime import date

import boto3
from io import BytesIO
import pickle

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import OneHotEncoder

import seaborn as sns
from matplotlib import pyplot as plt
import plotly.graph_objects as go

import pickle
import shutil

import locale
locale.setlocale(locale.LC_ALL, 'pt_BR.utf8')

import warnings
warnings.filterwarnings("ignore")

rootPath = os.getcwd()
dataPath = os.path.join(rootPath, 'data')
modelsPath = os.path.join(rootPath, 'models')
env = os.path.join(rootPath, '.env')
dotenv.load_dotenv(dotenv_path=env)

True

In [2]:
def formatar_moeda(valor):
    return locale.currency(valor, grouping=True)

def paste_intervalo(row):
    return "[" + str(row['inf']) + ", " + str(row['sup']) + ")"

def up_s3_files(dataframe, bucket_name, folder_name, file_name):
    csv_buffer = BytesIO()
    dataframe.to_csv(csv_buffer, sep=';', index=False)
    file_key_aws = folder_name + file_name
    s3_resource.Object(bucket_name, file_key_aws).put(Body=csv_buffer.getvalue())

In [3]:
dataPath

'c:\\Users\\BHN\\datascience\\divida_ativa\\data'

# 01) Importando dados

In [3]:
zip_file = os.path.join(dataPath, 'base_treino.zip')
z = zipfile.ZipFile(zip_file)

In [4]:
def ler_bases_exportadas(nome_arquivo):
    z.extract(nome_arquivo)
    df = pd.read_csv(nome_arquivo, sep=',')
    os.remove(nome_arquivo)
    return df

In [5]:
base_notas_fiscais = ler_bases_exportadas('emissao_notas.csv')
base_conjunta = ler_bases_exportadas('imovel_mercantil.csv')

In [7]:
base_conjunta.rename(columns={'id_contribuinte': 'id_pessoa'}, inplace=True)

In [8]:
base_conjunta[['id_pessoa', 'tipo_divida']].nunique()

id_pessoa      424210
tipo_divida         2
dtype: int64

# Há CDAs que aparecem mais de 01 vez?

In [9]:
numcda = base_conjunta.groupby('cda')['tipo_divida'].count().reset_index()
numcda = numcda.sort_values(by = "tipo_divida", ascending= False)
numcda

Unnamed: 0,cda,tipo_divida
1207889,800de02f650f30982b46f673bf0c61251,2
2127811,e172a886a4f9d3619989bae6f4af91ee1,2
308489,20d81ce4eff7ca602b46f673bf0c61251,2
2127853,e173c38003fe4572d8a3e5cebc255ca61,2
980426,67f1802913e7f42f4bd01ecb8277da0b1,2
...,...,...
832669,584dcba3426330277c8fa7a67092eaab1,1
832668,584dc692aae2212018565e26094f06ee1,1
832667,584dc61fe89b29af6b3a8e268c80aedf2,1
832666,584dc61fe89b29af6b3a8e268c80aedf1,1


In [10]:
numcda[numcda['tipo_divida'] > 1]['cda'].nunique()

159883

In [11]:
numcda['cda'].nunique()

2415779

In [12]:
numcda[numcda['tipo_divida'] > 1]['cda'].nunique()/numcda['cda'].nunique()

0.06618279238291251

### Das **2.415.779** CDAs distintas, 159.883 parecem mais de uma vez. Ou seja, 159.883 (6,61%) tem 2 valores distintos para da_aberto.

In [13]:
base_conjunta.columns

Index(['cda', 'tipo_divida', 'id_pessoa', 'atividade_principal', 'situacao',
       'tipo_tributo', 'vlr_pago', 'valor_tot', 'vlr_tributo', 'vlr_taxa',
       'competencia_divida', 'inscricao_divida', 'arrecadacao_divida',
       'ajuizamento_divida', 'edificacao', 'cpf_cnpj_existe', 'protesto',
       'ajuizamento', 'refis', 'deb_totais', 'deb_pagos', 'idade_divida',
       'quantidade_reparcelamento', 'da_aberto', 'endereco_existe'],
      dtype='object')

# Renomeando coluna de idade da dívida

In [14]:
base_conjunta = base_conjunta.rename(columns = {'idade_divida':'anos_idade_da'})
base_conjunta

Unnamed: 0,cda,tipo_divida,id_pessoa,atividade_principal,situacao,tipo_tributo,vlr_pago,valor_tot,vlr_tributo,vlr_taxa,...,cpf_cnpj_existe,protesto,ajuizamento,refis,deb_totais,deb_pagos,anos_idade_da,quantidade_reparcelamento,da_aberto,endereco_existe
0,00000123c3d4731c6b3a8e268c80aedf1,imovel,ac28642d7c82b33f,APARTAMENTO,ATIVO,IPTU,0.0,2048.73,762.08,1286.65,...,1,0,0,0,40.0,0.0,1.0,0,1,1
1,00000123c3d4731c6b3a8e268c80aedf2,mercantil,fc4b99b807fbed41,ATIVIDADES DE TELEATENDIMENTO,SUSPENSO,ISS,0.0,2515.85,0.00,2515.85,...,1,0,0,0,3.0,0.0,1.0,0,1,0
2,00000b44c5ba1e669ceed47545e621dd2,mercantil,96e8e553de69d7a4,COMERCIO VAREJISTA DE ARTIGOS DE ARMARINHO,INAPTO,ISS,0.0,1278.97,0.00,1278.97,...,1,0,0,0,4.0,0.0,21.0,0,0,0
3,000014e359592e62d8a3e5cebc255ca6E,mercantil,6dbe14da38a31dc1,Comercio varejista especializado de equipament...,ATIVO,ISS,0.0,847.02,847.02,0.00,...,1,0,0,0,1.0,0.0,0.0,0,1,0
4,0000331f601a73e52b46f673bf0c61251,imovel,870c08c252b25ad1,APARTAMENTO,ATIVO,IPTU,0.0,344.12,147.20,196.92,...,1,0,0,0,6.0,0.0,11.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575657,ffffd8af5760cada9fcacff532cf50c41,imovel,d411f54ad97d1f39,,ATIVO,IPTU,0.0,1728.13,790.85,937.28,...,0,0,0,0,19.0,0.0,4.0,0,0,1
2575658,ffffdb721b90282718565e26094f06ee2,mercantil,51d6043a4efa1ed1,PUBLICITARIO (NÍVEL MÉDIO),ATIVO,ISS,0.0,240.77,240.77,0.00,...,1,0,0,0,1.0,0.0,6.0,0,0,1
2575659,ffffdc78aa7a90e26b3a8e268c80aedf1,imovel,9b512f210b460120,APARTAMENTO,ATIVO,IPTU,0.0,9390.43,5113.61,4276.82,...,1,0,1,0,30.0,0.0,25.0,0,1,1
2575660,fffff192f894983c9ceed47545e621dd1,imovel,dd5538d2d38803a7,CASA,ATIVO,IPTU,0.0,16594.80,11374.98,5219.82,...,1,0,1,0,32.0,0.0,15.0,0,1,1


In [14]:
formatar_moeda(base_conjunta['valor_tot'].sum())

'R$ 25.612.605.304,70'

In [15]:
base_conjunta['inscricao_divida'].max()

'2041-10-25'

In [17]:
formatar_moeda(base_conjunta.loc[base_conjunta['da_aberto'] == 1, 'valor_tot'].sum())

'R$ 13.856.278.075,62'

# 02) Manipulando Dados

### 02.01) Valor total, pago e saldo

In [17]:
valor_tot = base_conjunta.groupby(['cda', 'tipo_divida', 'id_pessoa'])['valor_tot'].sum().to_frame().reset_index()
valor_pago = base_conjunta.groupby(['cda', 'tipo_divida', 'id_pessoa'])['vlr_pago'].sum().to_frame().reset_index()

# O que está em aberto
valor_aberto_tot = base_conjunta[base_conjunta['da_aberto'] == 1].groupby(['cda', 'tipo_divida', 'id_pessoa'])['valor_tot'].sum().to_frame().reset_index()
valor_aberto_pg = base_conjunta[base_conjunta['da_aberto'] == 1].groupby(['cda', 'tipo_divida', 'id_pessoa'])['vlr_pago'].sum().to_frame().reset_index()
valor_aberto = pd.merge(valor_aberto_tot, valor_aberto_pg, on = ['cda', 'tipo_divida', 'id_pessoa'], how = "left")
valor_aberto['valor_aberto'] = valor_aberto['valor_tot'] - valor_aberto['vlr_pago']
valor_aberto.drop(columns = ['valor_tot', 'vlr_pago'], inplace = True)

# Merge de valor_tot & valor_pago
valor_tot_pago = pd.merge(valor_tot, valor_pago, on = ['cda', 'tipo_divida', 'id_pessoa'], how = "left")
# Merge de (valor_tot e valor_pago) & valor_aberto
valor_tot_pago_aberto = pd.merge(valor_tot_pago, valor_aberto, on = ['cda', 'tipo_divida', 'id_pessoa'], how = "left")

# O que a gente esperava receber: dif_tot_pago
valor_tot_pago_aberto['dif_tot_pago'] = valor_tot_pago_aberto['valor_tot'] - valor_tot_pago_aberto['vlr_pago']
# O quanto perdeu entre o que a gente esperava receber e o que foi efetivamente pago
valor_tot_pago_aberto['dif_tot_pago_aberto'] = round(valor_tot_pago_aberto['dif_tot_pago'] - valor_tot_pago_aberto['valor_aberto'], 5)

valor_tot_pago_aberto.sort_values(by = 'dif_tot_pago_aberto', ascending = False)

Unnamed: 0,cda,tipo_divida,id_pessoa,valor_tot,vlr_pago,valor_aberto,dif_tot_pago,dif_tot_pago_aberto
1065506,71036654a451c4a29ceed47545e621ddE,mercantil,60bcc46b84243022,27055192.97,3579340.37,42983.52,23475852.60,23432869.08
273829,1d271205bb109cc89ceed47545e621ddE,mercantil,a5a62b17a2ca8a04,28440312.96,734070.01,7323732.13,27706242.95,20382510.82
2247592,ee27ee78f6c63e6318565e26094f06eeE,mercantil,60bcc46b84243022,21113432.77,1750182.07,2886797.53,19363250.70,16476453.17
300829,200b26abed9c0e496b3a8e268c80aedfE,mercantil,0c38fef7c41e01ce,25663880.31,182283.98,11832898.77,25481596.33,13648697.56
1216934,8104f268246d1f919ceed47545e621ddE,mercantil,b66a0dbc22eebc5a,15326768.32,1519768.82,817497.26,13806999.50,12989502.24
...,...,...,...,...,...,...,...,...
2415771,ffffd120a8f3f2799fcacff532cf50c41,imovel,934532c6c448a899,802.85,0.00,,802.85,
2415772,ffffd79d8aa612ac9fcacff532cf50c41,imovel,c6b16a1235d987d3,26104.42,26104.42,,0.00,
2415774,ffffd8af5760cada9fcacff532cf50c41,imovel,d411f54ad97d1f39,1728.13,0.00,,1728.13,
2415775,ffffdb721b90282718565e26094f06ee2,mercantil,51d6043a4efa1ed1,240.77,0.00,,240.77,


In [18]:
valor_tot_pago_aberto[~valor_tot_pago_aberto['dif_tot_pago_aberto'].isna()].sort_values(by = 'dif_tot_pago_aberto')

Unnamed: 0,cda,tipo_divida,id_pessoa,valor_tot,vlr_pago,valor_aberto,dif_tot_pago,dif_tot_pago_aberto
0,00000123c3d4731c6b3a8e268c80aedf1,imovel,ac28642d7c82b33f,2048.73,0.00,2048.73,2048.73,0.00
1563927,a5c7d57615d6caf02b46f673bf0c61252,mercantil,604b29422fe78cae,685.60,0.00,685.60,685.60,0.00
1563930,a5c7ddd9c80d7659b5eae1a923f0f8702,mercantil,d813911b33e12b47,18571.25,0.00,18571.25,18571.25,0.00
1563931,a5c7f74567ce12346b3a8e268c80aedf1,imovel,3d8e266cd39b3555,2288.36,87.67,2200.69,2200.69,0.00
1563932,a5c7f74567ce12346b3a8e268c80aedf2,mercantil,4bc69dd79f31752f,4757.16,98.52,4658.64,4658.64,0.00
...,...,...,...,...,...,...,...,...
1216934,8104f268246d1f919ceed47545e621ddE,mercantil,b66a0dbc22eebc5a,15326768.32,1519768.82,817497.26,13806999.50,12989502.24
300829,200b26abed9c0e496b3a8e268c80aedfE,mercantil,0c38fef7c41e01ce,25663880.31,182283.98,11832898.77,25481596.33,13648697.56
2247592,ee27ee78f6c63e6318565e26094f06eeE,mercantil,60bcc46b84243022,21113432.77,1750182.07,2886797.53,19363250.70,16476453.17
273829,1d271205bb109cc89ceed47545e621ddE,mercantil,a5a62b17a2ca8a04,28440312.96,734070.01,7323732.13,27706242.95,20382510.82


In [19]:
formatar_moeda(15326768.32 - 1519768.82)

'R$ 13.806.999,50'

### Quantas CDAs em que NÃO há valor em aberto? 1.448.727

In [20]:
valor_tot_pago_aberto.loc[valor_tot_pago_aberto['dif_tot_pago_aberto'].isna(), 'cda'].nunique()

1448727

### Quantas CDAs em que há valor em aberto? 967.052

In [21]:
valor_tot_pago_aberto.loc[~valor_tot_pago_aberto['dif_tot_pago_aberto'].isna(), 'cda'].nunique()

967052

### Quantas CDAs em que o valor pago é exatamente igual ao valor_tot (quitação)? 240.954

In [22]:
valor_tot_pago_aberto.loc[valor_tot_pago_aberto['dif_tot_pago'] == 0, 'cda'].nunique()

240954

### Quantas CDAs em que o valor_tot é MAIOR que o valor_pago (ainda tá devendo ou negociação)? 2.174.825

In [23]:
valor_tot_pago_aberto[valor_tot_pago_aberto['dif_tot_pago'] > 0]

Unnamed: 0,cda,tipo_divida,id_pessoa,valor_tot,vlr_pago,valor_aberto,dif_tot_pago,dif_tot_pago_aberto
0,00000123c3d4731c6b3a8e268c80aedf1,imovel,ac28642d7c82b33f,2048.73,0.00,2048.73,2048.73,0.00
1,00000123c3d4731c6b3a8e268c80aedf2,mercantil,fc4b99b807fbed41,2515.85,0.00,2515.85,2515.85,0.00
2,00000b44c5ba1e669ceed47545e621dd2,mercantil,96e8e553de69d7a4,1278.97,0.00,,1278.97,
3,000014e359592e62d8a3e5cebc255ca6E,mercantil,6dbe14da38a31dc1,847.02,0.00,847.02,847.02,0.00
4,0000331f601a73e52b46f673bf0c61251,imovel,870c08c252b25ad1,344.12,0.00,,344.12,
...,...,...,...,...,...,...,...,...
2415773,ffffd79d8aa612ac9fcacff532cf50c42,mercantil,cc5afb41f6fbc367,2512.15,265.15,1080.38,2247.00,1166.62
2415774,ffffd8af5760cada9fcacff532cf50c41,imovel,d411f54ad97d1f39,1728.13,0.00,,1728.13,
2415775,ffffdb721b90282718565e26094f06ee2,mercantil,51d6043a4efa1ed1,240.77,0.00,,240.77,
2415776,ffffdc78aa7a90e26b3a8e268c80aedf1,imovel,9b512f210b460120,9390.43,0.00,9390.43,9390.43,0.00


### Quantas CDAs em que o valor_tot é MENOR que o valor_pago? 0, NÃO há casos que o valor_pago é maior que o valor_devido

In [24]:
valor_tot_pago_aberto[valor_tot_pago_aberto['dif_tot_pago'] < 0]

Unnamed: 0,cda,tipo_divida,id_pessoa,valor_tot,vlr_pago,valor_aberto,dif_tot_pago,dif_tot_pago_aberto


### Quantas CDAs em que o dif_tot_pago (saldo) é MAIOR que o valor_aberto? 105.099. Ou seja, são 105.099 CDAs em que o valor_aberto HOJE é menor do que deveria. São casos de negociação?

In [25]:
valor_tot_pago_aberto[valor_tot_pago_aberto['dif_tot_pago_aberto'] > 0]

Unnamed: 0,cda,tipo_divida,id_pessoa,valor_tot,vlr_pago,valor_aberto,dif_tot_pago,dif_tot_pago_aberto
53,000144e266882447b5eae1a923f0f8702,mercantil,cd7f1c355024e86f,6822.26,753.54,3488.75,6068.72,2579.97
87,00023f4e4527d0d69989bae6f4af91ee1,imovel,ba888e16f9b7f31a,3825.50,1123.95,96.95,2701.55,2604.60
99,000290b09c8b6829b5eae1a923f0f8701,imovel,28389708ca180f13,30923.33,1124.00,13788.57,29799.33,16010.76
188,000512739f54c8e34bd01ecb8277da0b2,mercantil,3c45b8167c93156b,1336.76,84.96,538.89,1251.80,712.91
190,000524a39303f88cb5eae1a923f0f8701,imovel,39fc040aa0ab54aa,10011.00,385.87,4627.75,9625.13,4997.38
...,...,...,...,...,...,...,...,...
2415707,fffe2228f91399f79fcacff532cf50c41,imovel,1cc7ab3dc955d096,811.75,192.31,124.84,619.44,494.60
2415721,fffe69ada60a04979ceed47545e621dd1,imovel,910e31fa556db12c,33597.69,919.49,15601.48,32678.20,17076.72
2415724,fffe75c5e436f4e69fcacff532cf50c42,mercantil,eefe16fde5758b38,6173.14,578.55,2077.97,5594.59,3516.62
2415729,fffe8ed282868b7d18565e26094f06ee1,imovel,1eb4adad813da472,16098.88,696.05,7150.17,15402.83,8252.66


### Quantas CDAs em que a 'renegociação' aumentou o valor_aberto? 0, NÃO há casos em que isso ocorre.

In [26]:
valor_tot_pago_aberto[valor_tot_pago_aberto['dif_tot_pago_aberto'] < 0]

Unnamed: 0,cda,tipo_divida,id_pessoa,valor_tot,vlr_pago,valor_aberto,dif_tot_pago,dif_tot_pago_aberto


In [34]:
# Caso comentado no teams
valor_tot_pago_aberto[valor_tot_pago_aberto['cda'] == '8d1f5d34b36fe9dce52281a70553db462']
#base_conjunta[base_conjunta['cda'] == '8d1f5d34b36fe9dce52281a70553db462']

Unnamed: 0,cda,tipo_divida,id_pessoa,valor_tot,valor_pago,valor_aberto,dif_tot_pago,dif_tot_pago_aberto
1331558,8d1f5d34b36fe9dce52281a70553db462,mercantil,08d5af2c269cf4ba,17339.85,760.65,6139.11,16579.2,10440.09


In [35]:
# Caso comentado no teams 2
valor_tot_pago_aberto[valor_tot_pago_aberto['cda'] == 'fd285a26491e4db34bd01ecb8277da0b1']
#base_conjunta[base_conjunta['cda'] == 'fd285a26491e4db34bd01ecb8277da0b1']

Unnamed: 0,cda,tipo_divida,id_pessoa,valor_tot,valor_pago,valor_aberto,dif_tot_pago,dif_tot_pago_aberto
2388952,fd285a26491e4db34bd01ecb8277da0b1,imovel,a815998acb8c6215,7590.99,6791.23,799.76,799.76,0.0


### Criar variável: ja_reneg_essa_cda

In [30]:
num_linhas_por_cda = base_conjunta.groupby(['cda', 'tipo_divida', 'id_pessoa'])['da_aberto'].nunique().to_frame().sort_values(by = 'da_aberto').reset_index()
num_linhas_por_cda =  num_linhas_por_cda.rename(columns = {'da_aberto':'num_da_aberto'})
num_linhas_por_cda['ja_reneg_essa_cda'] = 0
# As entradas que tem mais de um valor para da_aberto é pq foram renegociadas
num_linhas_por_cda.loc[(num_linhas_por_cda['num_da_aberto'] > 1 ), 'ja_reneg_essa_cda'] = 1 

valor_tot_pago_aberto = pd.merge(valor_tot_pago_aberto, num_linhas_por_cda, on = ['cda', 'tipo_divida', 'id_pessoa'], how = 'left')

In [31]:

valor_tot_pago_aberto.sort_values(by  = 'num_da_aberto')

Unnamed: 0,cda,tipo_divida,id_pessoa,valor_tot,vlr_pago,valor_aberto,dif_tot_pago,dif_tot_pago_aberto,num_da_aberto,ja_reneg_essa_cda
0,00000123c3d4731c6b3a8e268c80aedf1,imovel,ac28642d7c82b33f,2048.73,0.00,2048.73,2048.73,0.00,1,0
1582650,a7bf605bd84906a69989bae6f4af91ee1,imovel,c023846a586364b1,1872.56,702.51,,1170.05,,1,0
1582649,a7bf386675d3965f6b3a8e268c80aedf2,mercantil,6bba8aec44c1dd8f,3341.97,0.00,,3341.97,,1,0
1582648,a7bf378b9f9f08f99ceed47545e621dd1,imovel,117ecd85abb5ff15,1663.35,328.49,,1334.86,,1,0
1582647,a7bf260d0b1acf0b18565e26094f06ee2,mercantil,bea0db608943187b,6105.68,0.00,6105.68,6105.68,0.00,1,0
...,...,...,...,...,...,...,...,...,...,...
475285,328656ebe52205da18565e26094f06ee1,imovel,6a7ff69184132d60,11345.77,3264.96,1219.21,8080.81,6861.60,2,1
1906517,ca0904304a08d90f2b46f673bf0c61252,mercantil,f2a9ef6a13438d5e,2478.17,336.28,2141.89,2141.89,0.00,2,1
475294,32867aa0610b0c426b3a8e268c80aedf1,imovel,7cbca31d9c048a90,8518.15,223.39,4118.52,8294.76,4176.24,2,1
475266,3285ff24c745814b7c8fa7a67092eaab2,mercantil,619b9b9f6869e2b4,3479.50,158.95,3320.55,3320.55,0.00,2,1


In [32]:
df_conjunta = pd.merge(base_conjunta, num_linhas_por_cda, on = ['cda', 'tipo_divida', 'id_pessoa'], how = 'left')

# CDAs SEM mudanca de status
cdas_01_apar = df_conjunta[df_conjunta['num_da_aberto'] == 1]
cdas_01_apar = cdas_01_apar.sort_values(by = ['cda', 'inscricao_divida'])

# CDAs com mudanca de status
cdas_gt_01_apar = df_conjunta[df_conjunta['num_da_aberto'] > 1]
cdas_gt_01_apar = cdas_gt_01_apar.sort_values(by = ['cda', 'inscricao_divida']) # 319034 

# # CDAs com mudanca de status - ABERTO
cdas_gt_01_apar_ABERTO = cdas_gt_01_apar[cdas_gt_01_apar['da_aberto'] == 1]

reconstroi = pd.concat([cdas_01_apar, cdas_gt_01_apar_ABERTO])

reconstroi = reconstroi[['cda', 'tipo_divida', 'id_pessoa', 'atividade_principal', 'situacao',
       'tipo_tributo', 'vlr_tributo', 'vlr_taxa',
       'competencia_divida', 'inscricao_divida', 'arrecadacao_divida',
       'ajuizamento_divida', 'edificacao', 'cpf_cnpj_existe', 'protesto',
       'ajuizamento', 'refis', 'anos_idade_da',
       'quantidade_reparcelamento', 'da_aberto', 'endereco_existe']]

reconstroi = pd.merge(reconstroi, valor_tot_pago_aberto, on = ['cda', 'tipo_divida', 'id_pessoa'], how = 'left')

reconstroi['perc_pago'] = np.round(reconstroi['vlr_pago']/reconstroi['valor_tot'], 5)

reconstroi

Unnamed: 0,cda,tipo_divida,id_pessoa,atividade_principal,situacao,tipo_tributo,vlr_tributo,vlr_taxa,competencia_divida,inscricao_divida,...,da_aberto,endereco_existe,valor_tot,vlr_pago,valor_aberto,dif_tot_pago,dif_tot_pago_aberto,num_da_aberto,ja_reneg_essa_cda,perc_pago
0,00000123c3d4731c6b3a8e268c80aedf1,imovel,ac28642d7c82b33f,APARTAMENTO,ATIVO,IPTU,762.08,1286.65,2021-01-01,2022-09-07,...,1,1,2048.73,0.00,2048.73,2048.73,0.00,1,0,0.00000
1,00000123c3d4731c6b3a8e268c80aedf2,mercantil,fc4b99b807fbed41,ATIVIDADES DE TELEATENDIMENTO,SUSPENSO,ISS,0.00,2515.85,2019-01-01,2022-12-30,...,1,0,2515.85,0.00,2515.85,2515.85,0.00,1,0,0.00000
2,00000b44c5ba1e669ceed47545e621dd2,mercantil,96e8e553de69d7a4,COMERCIO VAREJISTA DE ARTIGOS DE ARMARINHO,INAPTO,ISS,0.00,1278.97,2001-08-03,2002-09-07,...,0,0,1278.97,0.00,,1278.97,,1,0,0.00000
3,000014e359592e62d8a3e5cebc255ca6E,mercantil,6dbe14da38a31dc1,Comercio varejista especializado de equipament...,ATIVO,ISS,847.02,0.00,2022-12-01,2023-06-15,...,1,0,847.02,0.00,847.02,847.02,0.00,1,0,0.00000
4,0000331f601a73e52b46f673bf0c61251,imovel,870c08c252b25ad1,APARTAMENTO,ATIVO,IPTU,147.20,196.92,2010-01-01,2012-10-17,...,0,1,344.12,0.00,,344.12,,1,0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2415774,fffe69ada60a04979ceed47545e621dd1,imovel,910e31fa556db12c,CASA,ATIVO,IPTU,15601.48,0.00,2015-07-16,2015-08-10,...,1,1,33597.69,919.49,15601.48,32678.20,17076.72,2,1,0.02737
2415775,fffe75c5e436f4e69fcacff532cf50c42,mercantil,eefe16fde5758b38,FOTOCÓPIAS,INAPTO,ISS,2077.97,0.00,2018-12-11,2018-12-20,...,1,0,6173.14,578.55,2077.97,5594.59,3516.62,2,1,0.09372
2415776,fffe8ed282868b7d18565e26094f06ee1,imovel,1eb4adad813da472,CASA,ATIVO,IPTU,7150.17,0.00,2015-10-08,2015-11-10,...,1,1,16098.88,696.05,7150.17,15402.83,8252.66,2,1,0.04324
2415777,fffeb819f46062317c8fa7a67092eaab2,mercantil,52da39231b917f91,CARGA E DESCARGA,INAPTO,ISS,2864.02,0.00,2016-12-01,2016-12-09,...,1,0,3229.55,365.53,2864.02,2864.02,0.00,2,1,0.11318


In [33]:
reconstroi[reconstroi['cda'] == '0000a43dae672b166b3a8e268c80aedf1']

Unnamed: 0,cda,tipo_divida,id_pessoa,atividade_principal,situacao,tipo_tributo,vlr_tributo,vlr_taxa,competencia_divida,inscricao_divida,...,da_aberto,endereco_existe,valor_tot,vlr_pago,valor_aberto,dif_tot_pago,dif_tot_pago_aberto,num_da_aberto,ja_reneg_essa_cda,perc_pago
2255896,0000a43dae672b166b3a8e268c80aedf1,imovel,f5fe60dbb517e68a,APARTAMENTO,ATIVO,IPTU,2108.87,0.0,2022-08-24,2022-08-29,...,1,1,3801.23,1692.36,2108.87,2108.87,0.0,2,1,0.44521


In [34]:
reconstroi[reconstroi['cda'] == 'fd285a26491e4db34bd01ecb8277da0b1']

Unnamed: 0,cda,tipo_divida,id_pessoa,atividade_principal,situacao,tipo_tributo,vlr_tributo,vlr_taxa,competencia_divida,inscricao_divida,...,da_aberto,endereco_existe,valor_tot,vlr_pago,valor_aberto,dif_tot_pago,dif_tot_pago_aberto,num_da_aberto,ja_reneg_essa_cda,perc_pago
2413963,fd285a26491e4db34bd01ecb8277da0b1,imovel,a815998acb8c6215,APARTAMENTO,ATIVO,IPTU,7485.41,0.0,2021-06-04,2021-06-05,...,1,1,7590.99,6791.23,799.76,799.76,0.0,2,1,0.89464


# CDAs com as quais vamos treinar (da_aberto == 0)

In [36]:
# CDAs que vamos treinar
da_aberto_0 = reconstroi[reconstroi['da_aberto'] == 0]
# da_aberto_0[da_aberto_0['perc_pago'] == 0] # 989872  pagaram 0
# da_aberto_0[da_aberto_0['perc_pago'] == 1] # 240.953  pagaram tudo (semana passada: 240.883)
# da_aberto_0[(da_aberto_0['perc_pago'] > 0) & (da_aberto_0['perc_pago'] < 1)] # 217.901 pagaram algo (semana passada: 217.996)
# 3(240953 + 217996)/ (240953 + 989872 + 217996) # 31.67% dos dados de da aberto = 0 tem alguma informação de pagamento

# 03) Criando variáveis para clusterização

In [38]:
print("Gerando variáveis para identificação dos grupos de contribuintes")

dados_pessoas = reconstroi[['tipo_divida', 'cda', 'id_pessoa', 'situacao', 'cpf_cnpj_existe', 'edificacao', 
                            #'deb_totais', 'deb_pagos', 
                                'valor_tot', 'vlr_pago',      
                                'quantidade_reparcelamento', 'anos_idade_da',
                                'ja_reneg_essa_cda']]


dados_pessoas.dropna(subset = ['id_pessoa'], inplace = True)

dados_pessoas

Gerando variáveis para identificação dos grupos de contribuintes


Unnamed: 0,tipo_divida,cda,id_pessoa,situacao,cpf_cnpj_existe,edificacao,valor_tot,vlr_pago,quantidade_reparcelamento,anos_idade_da,ja_reneg_essa_cda
0,imovel,00000123c3d4731c6b3a8e268c80aedf1,ac28642d7c82b33f,ATIVO,1,1,2048.73,0.00,0,1.0,0
1,mercantil,00000123c3d4731c6b3a8e268c80aedf2,fc4b99b807fbed41,SUSPENSO,1,0,2515.85,0.00,0,1.0,0
2,mercantil,00000b44c5ba1e669ceed47545e621dd2,96e8e553de69d7a4,INAPTO,1,0,1278.97,0.00,0,21.0,0
3,mercantil,000014e359592e62d8a3e5cebc255ca6E,6dbe14da38a31dc1,ATIVO,1,0,847.02,0.00,0,0.0,0
4,imovel,0000331f601a73e52b46f673bf0c61251,870c08c252b25ad1,ATIVO,1,1,344.12,0.00,0,11.0,0
...,...,...,...,...,...,...,...,...,...,...,...
2415774,imovel,fffe69ada60a04979ceed47545e621dd1,910e31fa556db12c,ATIVO,1,1,33597.69,919.49,0,8.0,1
2415775,mercantil,fffe75c5e436f4e69fcacff532cf50c42,eefe16fde5758b38,INAPTO,1,0,6173.14,578.55,2,5.0,1
2415776,imovel,fffe8ed282868b7d18565e26094f06ee1,1eb4adad813da472,ATIVO,1,1,16098.88,696.05,2,8.0,1
2415777,mercantil,fffeb819f46062317c8fa7a67092eaab2,52da39231b917f91,INAPTO,1,0,3229.55,365.53,0,7.0,1


In [65]:
#reconstroi['perc_pago'] = np.round(reconstroi['valor_pago']/reconstroi['valor_tot'], 5)
#xx = reconstroi[(reconstroi['perc_pago'] == 0) & (reconstroi['da_aberto'] == 0)]
#xx.sort_values(by = 'perc_pago')

In [39]:
num_reneg_cda_por_id_pessoa = dados_pessoas[dados_pessoas['ja_reneg_essa_cda'] == 1].groupby(['id_pessoa', 'tipo_divida'])['cda'].nunique().to_frame().reset_index()
num_reneg_cda_por_id_pessoa

Unnamed: 0,id_pessoa,tipo_divida,cda
0,0002644d836d4141,imovel,2
1,0005d30ef81c01c4,imovel,2
2,000722117d63c81c,imovel,5
3,0007a35a19982d05,imovel,7
4,0007f1cd58c2dfb2,mercantil,1
...,...,...,...
70484,fffa5f445cc41ff7,imovel,1
70485,fffbf81ab1d18c5a,imovel,1
70486,fffc0651dd11f426,imovel,6
70487,fffc2ed4460bead4,mercantil,1


# VAMOS ANALISAR?

In [41]:
bs_conj = reconstroi[(reconstroi['id_pessoa'] == '000722117d63c81c') & (reconstroi['tipo_divida'] == 'imovel')].sort_values(by = 'cda')
bs_conj = bs_conj[['cda', 'tipo_divida', 'id_pessoa', 'inscricao_divida', 'vlr_pago',	'valor_tot', 'vlr_tributo',	'vlr_taxa', 'da_aberto']]
ja_reneg_essa_cda = dados_pessoas.loc[dados_pessoas['id_pessoa']  == '000722117d63c81c', ['cda', 'tipo_divida', 'id_pessoa', 'ja_reneg_essa_cda']]


pd.merge(bs_conj, ja_reneg_essa_cda, on = ['cda', 'tipo_divida', 'id_pessoa'], how = 'left').sort_values(by = 'cda')

Unnamed: 0,cda,tipo_divida,id_pessoa,inscricao_divida,vlr_pago,valor_tot,vlr_tributo,vlr_taxa,da_aberto,ja_reneg_essa_cda
0,0a8cab6bbf311e4818565e26094f06ee1,imovel,000722117d63c81c,2019-06-28,6294.66,6294.66,4991.94,1302.72,0,0
1,0c0939d83041f6d89989bae6f4af91ee1,imovel,000722117d63c81c,2023-02-25,536.02,8363.95,7827.93,0.0,1,1
2,1ba6b58b877b50507c8fa7a67092eaab1,imovel,000722117d63c81c,2023-02-25,1090.95,33690.38,15950.66,0.0,1,1
3,3cce22480c33111f2b46f673bf0c61251,imovel,000722117d63c81c,2023-01-25,1463.74,30676.91,14069.18,0.0,1,1
4,423e72500b92021d9fcacff532cf50c41,imovel,000722117d63c81c,2023-02-25,692.46,22282.12,10033.79,0.0,1,1
5,429180603ea716fa9ceed47545e621dd1,imovel,000722117d63c81c,2011-08-25,2606.1,7676.39,7676.39,0.0,0,0
6,4d512a19f62ed6eb9ceed47545e621dd1,imovel,000722117d63c81c,2013-08-28,12865.82,56151.37,56151.37,0.0,0,0
7,527a26eca1994b77e52281a70553db461,imovel,000722117d63c81c,2014-06-30,12352.54,178643.6,172128.03,6515.57,0,0
8,5a1e3841d7dccfcd6b3a8e268c80aedf1,imovel,000722117d63c81c,2016-05-14,5075.84,5075.84,4315.54,760.3,0,0
9,81bf93ef7d6b92987c8fa7a67092eaab1,imovel,000722117d63c81c,2023-01-25,1381.34,28871.71,13244.84,0.0,1,1


In [42]:
reconstroi[reconstroi['cda'] == '0c0939d83041f6d89989bae6f4af91ee1']

Unnamed: 0,cda,tipo_divida,id_pessoa,atividade_principal,situacao,tipo_tributo,vlr_tributo,vlr_taxa,competencia_divida,inscricao_divida,...,da_aberto,endereco_existe,valor_tot,vlr_pago,valor_aberto,dif_tot_pago,dif_tot_pago_aberto,num_da_aberto,ja_reneg_essa_cda,perc_pago
2263503,0c0939d83041f6d89989bae6f4af91ee1,imovel,000722117d63c81c,APARTAMENTO,ATIVO,IPTU,7827.93,0.0,2023-01-20,2023-02-25,...,1,1,8363.95,536.02,7827.93,7827.93,0.0,2,1,0.06409


In [43]:
base_conjunta[base_conjunta['cda'] == '0c0939d83041f6d89989bae6f4af91ee1']

Unnamed: 0,cda,tipo_divida,id_pessoa,atividade_principal,situacao,tipo_tributo,vlr_pago,valor_tot,vlr_tributo,vlr_taxa,...,cpf_cnpj_existe,protesto,ajuizamento,refis,deb_totais,deb_pagos,anos_idade_da,quantidade_reparcelamento,da_aberto,endereco_existe
120432,0c0939d83041f6d89989bae6f4af91ee1,imovel,000722117d63c81c,APARTAMENTO,ATIVO,IPTU,536.02,536.02,536.02,0.0,...,1,0,0,0,4.0,4.0,0.0,0,0,1
120433,0c0939d83041f6d89989bae6f4af91ee1,imovel,000722117d63c81c,APARTAMENTO,ATIVO,IPTU,0.0,7827.93,7827.93,0.0,...,1,0,0,0,1.0,0.0,0.0,0,1,1


In [52]:
base_conjunta.columns

Index(['cda', 'tipo_divida', 'id_pessoa', 'atividade_principal', 'situacao',
       'tipo_tributo', 'vlr_pago', 'valor_tot', 'vlr_tributo', 'vlr_taxa',
       'competencia_divida', 'inscricao_divida', 'arrecadacao_divida',
       'ajuizamento_divida', 'edificacao', 'cpf_cnpj_existe', 'protesto',
       'ajuizamento', 'refis', 'deb_totais', 'deb_pagos', 'anos_idade_da',
       'quantidade_reparcelamento', 'da_aberto', 'endereco_existe'],
      dtype='object')

# Que tipo de agrupamento faremos?

--------------

In [56]:

# Calculo que apresenta quantas cdas o contribuinte tem
frequencia_da_pessoa = reconstroi.groupby(['id_pessoa', 'tipo_divida'])['cda'].nunique().to_frame().reset_index()
total_reparcelamentos_pessoa = reconstroi.groupby(['id_pessoa', 'tipo_divida'])['quantidade_reparcelamento'].sum().to_frame().reset_index()
valor_total_pessoa = reconstroi.groupby(['id_pessoa', 'tipo_divida'])['valor_tot'].sum().to_frame().reset_index()
valor_pago_pessoa = reconstroi.groupby(['id_pessoa', 'tipo_divida'])['vlr_pago'].sum().to_frame().reset_index()

ja_reneg_essa_cda = reconstroi[reconstroi['ja_reneg_essa_cda'] == 1].groupby(['id_pessoa', 'tipo_divida'])['cda'].nunique().to_frame().reset_index()
ja_reneg_essa_cda = ja_reneg_essa_cda.rename(columns = {'cda':'num_cda_reneg'})

In [57]:
# Edificacao
edificacao = reconstroi[['tipo_divida',	'id_pessoa', 'edificacao']]
edificacao = edificacao.drop_duplicates()

In [58]:
# Situacao
situacao = reconstroi[['tipo_divida', 'id_pessoa', 'situacao']]
situacao = situacao.drop_duplicates()

In [59]:
# cpf_cnpj_existe
cpf_cnpj_existe = reconstroi[['tipo_divida', 'id_pessoa', 'cpf_cnpj_existe']]
cpf_cnpj_existe = cpf_cnpj_existe.drop_duplicates()

In [60]:
# endereco_existe
endereco_existe = reconstroi[['tipo_divida', 'id_pessoa', 'endereco_existe']]
endereco_existe = endereco_existe.drop_duplicates()

In [61]:
CHAVE = ['id_pessoa', 'tipo_divida']
freq_repal = pd.merge(frequencia_da_pessoa, total_reparcelamentos_pessoa, on = CHAVE, how = 'left') # id_pessoa	tipo_divida	cda	quantidade_reparcelamento
freq_repal = freq_repal.rename(columns = {'cda':'num_dist_cda'})

#debitos = pd.merge(total_debitos_pessoa, debitos_pagos_pessoa, on = CHAVE, how = 'outer') # id_pessoa	tipo_divida	deb_totais	deb_pagos
valor = pd.merge(valor_total_pessoa, valor_pago_pessoa, on = CHAVE, how = 'outer') # id_pessoa	tipo_divida	valor_tot	valor_pago
notas_edif = pd.merge(base_notas_fiscais, edificacao, on = 'id_pessoa', how = 'outer') # id_pessoa	qtd_notas_2anos	tipo_divida	edificacao
situ_doc = pd.merge(situacao, cpf_cnpj_existe, on = CHAVE, how = 'outer') # tipo_divida	id_pessoa	situacao	cpf_cnpj_existe
ender_sit = pd.merge(situ_doc, endereco_existe, on = CHAVE, how = 'outer')

#freq_repal_debitos = pd.merge(freq_repal, debitos, on = CHAVE, how = 'left')
valor_notas_edif = pd.merge(valor, notas_edif, on = CHAVE, how = 'left')

ja_reneg_essa_cda_situ_doc = pd.merge(ender_sit, ja_reneg_essa_cda, on = CHAVE, how = 'left')
freq_repal_debitos_valor_notas_edif = pd.merge(freq_repal, valor_notas_edif, on = CHAVE, how = 'left')
pessoas = pd.merge(freq_repal_debitos_valor_notas_edif, ja_reneg_essa_cda_situ_doc, on = CHAVE, how = 'left')

In [62]:
pessoas.columns

Index(['id_pessoa', 'tipo_divida', 'num_dist_cda', 'quantidade_reparcelamento',
       'valor_tot', 'vlr_pago', 'qtd_notas_2anos', 'edificacao', 'situacao',
       'cpf_cnpj_existe', 'endereco_existe', 'num_cda_reneg'],
      dtype='object')

In [None]:
base_conjunta[base_conjunta['id_pessoa'] == '0007a35a19982d05']

In [63]:
# Substituindo por zero os valores nulos
pessoas['qtd_notas_2anos'] = pessoas['qtd_notas_2anos'].fillna(0)
pessoas['edificacao'] = pessoas['edificacao'].fillna(0)
pessoas['num_cda_reneg'] = pessoas['num_cda_reneg'].fillna(0)

In [64]:
# REGRAS DE APLICAÇÃO REFERENTE A ACESSIBILIDADE DE COBRANÇA DO CONTRIBUINTE

# MERCANTIL
pessoas.loc[(pessoas['tipo_divida'] == 'mercantil' ) & (pessoas['qtd_notas_2anos'] > 0) & (pessoas['situacao'] == 'ATIVO'), 'perfil_acessivel'] = 2
pessoas.loc[(pessoas['tipo_divida'] == 'mercantil' ) & (pessoas['qtd_notas_2anos'] > 0) & (pessoas['situacao'] != 'ATIVO'), 'perfil_acessivel'] = 1
pessoas.loc[(pessoas['tipo_divida'] == 'mercantil' ) & (pessoas['qtd_notas_2anos'] == 0) & (pessoas['situacao'] == 'ATIVO'), 'perfil_acessivel'] = 1
pessoas.loc[(pessoas['tipo_divida'] == 'mercantil' ) & (pessoas['qtd_notas_2anos'] == 0) & (pessoas['situacao'] != 'ATIVO'), 'perfil_acessivel'] = 0

# IMOVEL
pessoas.loc[(pessoas['tipo_divida'] == 'imovel' ) & (pessoas['edificacao'] == 1), 'perfil_acessivel'] = 2
pessoas.loc[(pessoas['tipo_divida'] == 'imovel' ) & (pessoas['edificacao'] == 0), 'perfil_acessivel'] = 0

In [65]:
# SE O CONTRIBUINTE NÃO OBTÉM ENDEREÇO, O MESMO É ZERADO NO SITUACAO COBRANCA
pessoas['situacao_cobranca'] = pessoas['perfil_acessivel'] + pessoas['cpf_cnpj_existe'] + pessoas['endereco_existe']

pessoas.loc[(pessoas['endereco_existe'] == 0), 'situacao_cobranca'] = 0
pessoas.loc[(pessoas['tipo_divida'] == 'mercantil' ) & (pessoas['perfil_acessivel'] == 0), 'situacao_cobranca'] = 0

In [72]:
teste = pessoas[pessoas['situacao_cobranca'] == 0]
teste

Unnamed: 0,id_pessoa,tipo_divida,num_dist_cda,quantidade_reparcelamento,valor_tot,vlr_pago,qtd_notas_2anos,edificacao,situacao,cpf_cnpj_existe,endereco_existe,num_cda_reneg,perfil_acessivel,situacao_cobranca
0,00000b449b8ad90b,mercantil,6,0,32725.94,0.00,0.0,0.0,SUSPENSO,1,0,0.0,0.0,0.0
2,000032f6d93a0abd,mercantil,9,0,69091.42,0.00,0.0,0.0,INAPTO,1,0,0.0,0.0,0.0
3,00006a29d93b27bb,mercantil,1,0,1714.47,0.00,0.0,0.0,BAIXADO,1,0,0.0,0.0,0.0
8,0000e13a7d8d7537,mercantil,5,6,8306.33,1980.75,76.0,0.0,ATIVO,1,0,0.0,2.0,0.0
9,0000fe7e1c1dbbe4,mercantil,2,0,13269.01,0.00,0.0,0.0,INAPTO,1,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494607,fffead7b88a44de8,mercantil,7,0,50421.93,0.00,0.0,0.0,SUSPENSO,1,0,0.0,0.0,0.0
494609,fffeb78fd42d2624,mercantil,3,0,13050.71,0.00,0.0,0.0,SUSPENSO,1,0,0.0,0.0,0.0
494610,fffee3cacb174a68,mercantil,1,0,486.92,486.92,0.0,0.0,SUSPENSO,1,1,0.0,0.0,0.0
494611,ffff460622e1e234,mercantil,2,0,692.98,0.00,0.0,0.0,INAPTO,1,1,0.0,0.0,0.0


In [97]:
# Faz o calculo do historico de pagamento
pessoas.loc[(pessoas['valor_tot'].isna()) | (pessoas['valor_tot'] == 0) , 'valor_tot'] = 1

pessoas['historico_pagamento_em_valor'] = pessoas['valor_pago'] / (pessoas['valor_tot'])

pessoas = pessoas.sort_values(by = 'historico_pagamento_em_valor', ascending = False)

In [98]:
pessoas.columns

Index(['id_pessoa', 'tipo_divida', 'num_dist_cda', 'quantidade_reparcelamento',
       'valor_tot', 'valor_pago', 'qtd_notas_2anos', 'edificacao', 'situacao',
       'cpf_cnpj_existe', 'num_cda_reneg', 'perfil_acessivel',
       'situacao_cobranca', 'historico_pagamento_em_valor'],
      dtype='object')

In [138]:
df_pipe_cluster = pessoas.query("num_dist_cda > 1")
    
df_pipe_cluster = df_pipe_cluster[['id_pessoa',
                                       'tipo_divida',
                                       'situacao_cobranca',
                                       'num_dist_cda',              # nova (antigo frequencia_da_pessoa)
                                       'quantidade_reparcelamento', # nova
                                       #'historico_pagamento_em_qtd', 
                                       'historico_pagamento_em_valor',
                                       #'num_cda_reneg'
                                       ]]
df_pipe_cluster = df_pipe_cluster.set_index(['id_pessoa', 'tipo_divida'])

In [139]:
#pessoas[pessoas['id_pessoa'] == '000032f6d93a0abd'] # 000032f6d93a0abd # def2e7cb1e2f6ae1
pessoas.sort_values(by = "num_cda_reneg", ascending = False)

Unnamed: 0,id_pessoa,tipo_divida,num_dist_cda,quantidade_reparcelamento,valor_tot,valor_pago,qtd_notas_2anos,edificacao,situacao,cpf_cnpj_existe,num_cda_reneg,perfil_acessivel,situacao_cobranca,historico_pagamento_em_valor
123026,3f964cf3a3ef0aa0,mercantil,112,173,16513037.24,795792.16,0.0,0.0,SUSPENSO,1,96.0,0.0,0.0,0.048192
235252,799a139c3a15f9b9,mercantil,40,103,713369.73,30398.62,1037.0,0.0,ATIVO,1,31.0,2.0,3.0,0.042613
430581,df0ef300b498bbdf,mercantil,54,221,1364943.91,15060.81,342.0,0.0,ATIVO,1,29.0,2.0,3.0,0.011034
385645,c78fa21b361cd62c,mercantil,37,84,1866936.64,356052.98,598.0,0.0,ATIVO,1,28.0,2.0,3.0,0.190715
281620,919b137114328d10,mercantil,44,93,793276.96,26895.87,655.0,0.0,SUSPENSO,1,25.0,1.0,2.0,0.033905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343077,b1aceed765a713e6,imovel,2,0,302980.66,13855.44,0.0,1.0,ATIVO,1,0.0,2.0,3.0,0.045730
192531,63ad6ddca7c7dace,imovel,8,1,7005.07,320.35,0.0,1.0,ATIVO,1,0.0,2.0,3.0,0.045731
270537,8bcc8f8d2edf8b76,mercantil,4,1,16748.05,765.91,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.045731
156526,50f70cacc478c27b,imovel,8,2,7365.29,336.87,0.0,1.0,ATIVO,1,0.0,2.0,3.0,0.045738


In [140]:
df_pipe_cluster[df_pipe_cluster['id_pessoa'] == '000722117d63c81c']

KeyError: 'id_pessoa'

In [141]:
df_pipe_cluster[df_pipe_cluster['situacao_cobranca'] == 4]

Unnamed: 0_level_0,Unnamed: 1_level_0,situacao_cobranca,num_dist_cda,quantidade_reparcelamento,historico_pagamento_em_valor
id_pessoa,tipo_divida,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


----------------------------------------------------------------------------------------------

In [142]:
df_pipe_cluster.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,situacao_cobranca,num_dist_cda,quantidade_reparcelamento,historico_pagamento_em_valor
id_pessoa,tipo_divida,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ce990c05be9093d5,imovel,3.0,2,0,1.0
5999ff47795cba4d,imovel,3.0,2,2,1.0
fab8ab9b234e6e6e,imovel,3.0,5,0,1.0
b41df34003d09142,imovel,3.0,6,3,1.0
ce984494bc04e76a,imovel,3.0,3,2,1.0


In [143]:
df_pipe_cluster.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
situacao_cobranca,390647.0,1.452593,1.382894,0.0,0.0,2.0,3.0,3.0
num_dist_cda,390647.0,5.917905,3.490871,2.0,3.0,5.0,8.0,112.0
quantidade_reparcelamento,390647.0,1.433197,3.619606,0.0,0.0,0.0,1.0,221.0
historico_pagamento_em_valor,390647.0,0.146797,0.272086,0.0,0.0,0.0,0.167384,1.0


# 03) Clusterização dos contribuintes

In [144]:
faixa_n_clusters = [i for i in range(2,16)]
valores_inercia = []
valores_score = []

for k in faixa_n_clusters:
    agrupador = KMeans(n_clusters=k, random_state=1337)
    label = agrupador.fit_predict(df_pipe_cluster)
    print(f"Treinamento do agrupador para K= {k} finalizado")
    
    media_inercia = agrupador.inertia_
    valores_inercia.append(media_inercia)
    print(f"Inércia calculada para o agrupador de K= {k}. Inércia: {media_inercia}")

    media_score = agrupador.score(df_pipe_cluster)
    valores_score.append(media_score)
    print(f"Score calculado para o agrupador de K= {k}. Socre: {media_score}")

Treinamento do agrupador para K= 2 finalizado
Inércia calculada para o agrupador de K= 2. Inércia: 7008947.002189524
Score calculado para o agrupador de K= 2. Socre: -7008947.002189524
Treinamento do agrupador para K= 3 finalizado
Inércia calculada para o agrupador de K= 3. Inércia: 4339712.109864904
Score calculado para o agrupador de K= 3. Socre: -4339712.109864904
Treinamento do agrupador para K= 4 finalizado
Inércia calculada para o agrupador de K= 4. Inércia: 3441581.7271740693
Score calculado para o agrupador de K= 4. Socre: -3441581.7271740693
Treinamento do agrupador para K= 5 finalizado
Inércia calculada para o agrupador de K= 5. Inércia: 2889273.6659675245
Score calculado para o agrupador de K= 5. Socre: -2889273.6659675245
Treinamento do agrupador para K= 6 finalizado
Inércia calculada para o agrupador de K= 6. Inércia: 2441573.89979015
Score calculado para o agrupador de K= 6. Socre: -2441573.89979015
Treinamento do agrupador para K= 7 finalizado
Inércia calculada para o ag

In [145]:
fig = go.Figure()
fig.add_trace(go.Scatter(x = faixa_n_clusters, y = valores_inercia))
fig.update_layout(
    title = "INDICADOR: Inercia para K grupos",
    xaxis_title = "Valores de K",
    yaxis_title = "Inércia",
    font = dict(
        family = "Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)
fig.show()

In [146]:
fig =go.Figure()
fig.add_trace(go.Scatter(x = faixa_n_clusters, y = valores_score))
fig.update_layout(
    title="Indicador: Erro quadratico médio para K grupos",
    xaxis_title="Valores de K",
    yaxis_title="MSE",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)
fig.show()

In [147]:
def optimal_number_of_clusters(wcss):
    x1, y1 = 2, wcss[0]
    x2, y2 = 15, wcss[len(wcss)-1]

    distances = []
    for i in range(len(wcss)):
        x0 = i+2
        y0 = wcss[i]
        numerator = abs((y2-y1)*x0 - (x2-x1)*y0 + x2*y1 - y2*x1)
        denominator = np.sqrt((y2 - y1)**2 + (x2 - x1)**2)
        distances.append(numerator/denominator)
    
    return distances.index(max(distances)) + 2

In [148]:
valor_ideal_k = optimal_number_of_clusters(valores_inercia)

print("Melhor valor de K:", valor_ideal_k)

Melhor valor de K: 5


In [149]:
# Construindo o melhor agrupador de clusteres
VALOR_K = valor_ideal_k

agrupador = KMeans(n_clusters=VALOR_K, random_state=1337)
agrupador.fit_transform(df_pipe_cluster)

# Obtendo o ponto central dos clusteres
centros = agrupador.cluster_centers_
df_centroide = pd.DataFrame(centros, columns = df_pipe_cluster.columns).round(3)
df_centroide['cluster'] = df_centroide.index

# Obtendo o label para cada pessoa
df_pipe_cluster['label_cluster'] = agrupador.labels_

In [150]:
df_pipe_cluster

Unnamed: 0_level_0,Unnamed: 1_level_0,situacao_cobranca,num_dist_cda,quantidade_reparcelamento,historico_pagamento_em_valor,label_cluster
id_pessoa,tipo_divida,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ce990c05be9093d5,imovel,3.0,2,0,1.0,1
5999ff47795cba4d,imovel,3.0,2,2,1.0,1
fab8ab9b234e6e6e,imovel,3.0,5,0,1.0,1
b41df34003d09142,imovel,3.0,6,3,1.0,0
ce984494bc04e76a,imovel,3.0,3,2,1.0,1
...,...,...,...,...,...,...
cc0b2f4889f1899e,mercantil,0.0,5,0,0.0,1
cc0b203816bff9f9,mercantil,0.0,7,0,0.0,0
cc0a4c00fcbc23bb,imovel,3.0,11,0,0.0,3
cc0e13ae39899bf7,mercantil,0.0,6,0,0.0,0


In [151]:
df_pipe_cluster.groupby('label_cluster').count()

Unnamed: 0_level_0,situacao_cobranca,num_dist_cda,quantidade_reparcelamento,historico_pagamento_em_valor
label_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,95563,95563,95563,95563
1,203994,203994,203994,203994
2,4814,4814,4814,4814
3,52237,52237,52237,52237
4,34039,34039,34039,34039


# Centroide

In [152]:
df_centroide

Unnamed: 0,situacao_cobranca,num_dist_cda,quantidade_reparcelamento,historico_pagamento_em_valor,cluster
0,1.024,7.635,0.319,0.05,0
1,1.272,3.297,0.585,0.182,1
2,2.613,11.119,23.379,0.239,2
3,2.094,12.091,0.824,0.033,3
4,2.586,6.594,7.478,0.367,4


------------------------

# Primeira Divida

In [119]:
# Para ter uma noção de quem seria o centroide
df_primeira_divida = pessoas.query("num_dist_cda == 1")

# Imputando historico_pagamento_em_valor = 1 nos casos que passa de 1
# ESCOLHA DA LARISSA
df_primeira_divida.loc[df_primeira_divida['historico_pagamento_em_valor'] > 1, 'historico_pagamento_em_valor'] = 1
df_primeira_divida

Unnamed: 0,id_pessoa,tipo_divida,num_dist_cda,quantidade_reparcelamento,valor_tot,valor_pago,qtd_notas_2anos,edificacao,situacao,cpf_cnpj_existe,num_cda_reneg,perfil_acessivel,situacao_cobranca,historico_pagamento_em_valor
244201,7e3e470e48111cef,imovel,1,1,2692.83,2692.83,0.0,1.0,ATIVO,1,0.0,2.0,3.0,1.0
293547,97d358eb46a1405d,mercantil,1,0,68.13,68.13,0.0,0.0,ATIVO,1,0.0,1.0,2.0,1.0
369994,bf8854ad4819396a,imovel,1,1,1016.49,1016.49,0.0,1.0,ATIVO,1,0.0,2.0,3.0,1.0
240052,7c147242d698e270,mercantil,1,0,547.84,547.84,0.0,0.0,ATIVO,1,0.0,1.0,2.0,1.0
224320,73ef53db7b018bb3,imovel,1,0,152.83,152.83,0.0,1.0,ATIVO,1,0.0,2.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394140,cc0d1a090c2a576b,mercantil,1,0,1034.33,0.00,0.0,0.0,ATIVO,1,0.0,1.0,2.0,0.0
394139,cc0cecb1b1362893,mercantil,1,0,1833.70,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0
394133,cc0c916e9be26fc5,imovel,1,0,2538.26,0.00,0.0,0.0,BAIXADO,1,0.0,0.0,1.0,0.0
394127,cc0aced2ea4715a0,mercantil,1,0,1369.74,0.00,0.0,0.0,ATIVO,1,0.0,1.0,2.0,0.0


### Comentários 1ª Dívida

#### 39% inacessível

#### 46% completamente acessível

In [121]:
prim_div_status_sit = df_primeira_divida.groupby('situacao_cobranca')['id_pessoa'].nunique().to_frame().reset_index()
total = prim_div_status_sit['id_pessoa'].sum()
prim_div_status_sit['total'] = total
prim_div_status_sit['perc'] = np.round(prim_div_status_sit['id_pessoa']/total, 5)
prim_div_status_sit

Unnamed: 0,situacao_cobranca,id_pessoa,total,perc
0,0.0,41261,103568,0.3984
1,1.0,2256,103568,0.02178
2,2.0,11844,103568,0.11436
3,3.0,48207,103568,0.46546


In [123]:
data_percentil = {
    'inf': [0, 0.01, 0.25, 0.5, 0.75, 0.99, 1],
    'sup': [0.01, 0.25, 0.5, 0.75, 0.99, 1, 1.01]
}

# Creating a DataFrame from the dictionary
df_cut_percentil = pd.DataFrame(data_percentil)
df_cut_percentil['intervalo'] = df_cut_percentil.apply(paste_intervalo, axis = 1)

# Define the bins and labels for groups
igr_bins = [0, 0.01, 0.25, 0.5, 0.75, 0.99, 1, 1.01]
igr_labels = df_cut_percentil['intervalo']


# df_primeira_divida['historico_pagamento_em_qtd_faixas'] = pd.cut(df_primeira_divida['historico_pagamento_em_qtd'],
#                                 bins = igr_bins,
#                                 labels = igr_labels,
#                                 right = False)


df_primeira_divida['historico_pagamento_em_valor_faixas'] = pd.cut(df_primeira_divida['historico_pagamento_em_valor'],
                                bins = igr_bins,
                                labels = igr_labels,
                                right = False)

df_hist_pg_vlr = df_primeira_divida.groupby('historico_pagamento_em_valor_faixas')['historico_pagamento_em_valor_faixas'].count().to_frame()
total = df_hist_pg_vlr['historico_pagamento_em_valor_faixas'].sum()  
df_hist_pg_vlr['total'] = total
df_hist_pg_vlr['perc'] = np.round(df_hist_pg_vlr['historico_pagamento_em_valor_faixas']/total, 4)
df_hist_pg_vlr

Unnamed: 0_level_0,historico_pagamento_em_valor_faixas,total,perc
historico_pagamento_em_valor_faixas,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"[0.0, 0.01)",53893,103967,0.5184
"[0.01, 0.25)",7888,103967,0.0759
"[0.25, 0.5)",8710,103967,0.0838
"[0.5, 0.75)",3450,103967,0.0332
"[0.75, 0.99)",225,103967,0.0022
"[0.99, 1.0)",0,103967,0.0
"[1.0, 1.01)",29801,103967,0.2866


In [125]:
# df_hist_pg_qtd = df_primeira_divida.groupby('historico_pagamento_em_qtd_faixas')['historico_pagamento_em_qtd_faixas'].count().to_frame()
# total = df_hist_pg_qtd['historico_pagamento_em_qtd_faixas'].sum()  
# df_hist_pg_qtd['total'] = total
# df_hist_pg_qtd['perc'] = np.round(df_hist_pg_qtd['historico_pagamento_em_qtd_faixas']/total, 4)
# df_hist_pg_qtd

In [127]:
df_primeira_divida['situacao_cobranca'].mean()

1.6510815931978415

In [128]:
df_primeira_divida['num_dist_cda'].mean()

1.0

In [129]:
df_primeira_divida['quantidade_reparcelamento'].mean()

0.35732492040743696

In [130]:
df_primeira_divida['historico_pagamento_em_valor'].mean()

0.3497702846945392

In [131]:
df_primeira_divida['num_cda_reneg'].mean()

0.038560312406821395

Na totalidade temos +1 cluster especial que é do contribuinte que apareceu a primeira vez em DA

In [41]:
# dicionario_clusteres ={            # status       # qtd   # vlr
#     1: 'PIOR PAGADOR',          C  # 0.006	    0.113	0.045   --->  paga pouquíssimo & tá inacessível     
#     3: 'PAGADOR INTERMEDIARIO', B  # 1.999	    0.177	0.077   --->  paga pouquíssimo & tá menos acessível
#     2: 'BOM PAGADOR',           A  # 3.000 	    0.139	0.067   --->  paga pouquíssimo & tá acessível         
#     0: 'MELHOR PAGADOR',        AA # 2.954	    0.874	0.606   --->  paga bem         & tá acessível	        
#     4: 'PRIMEIRA DIVIDA'           # 1.668        0.437   0.351   --->  paga médio       & tá mais ou menos acessível
# }

# Com K = 4
#status_situacao 	num_dist_cda	quantidade_reparc	historico_pagamento_em_qtd	historico_pagamento_em_valor	cluster
#2.595	            6.761	        7.513	                0.713	                        0.361	                        0
#1.548	            9.855	        0.486	                0.073	                        0.037	                        1
#2.627	            11.095	        23.413	                0.698	                        0.239	                        2
#1.277	            3.601	        0.562	                0.251	                        0.170	                        3

# Com k = 5
#status_situacao num_dist_cda	quantidade_reparc	hist_pg_qtd	   hist_pg_valor   cluster  Num.Contrib
#0.768	         6.601	         0.259	            0.077	        0.041	        0   --- 108.634  # pg mau + nao repar + num médio de CDAs + pouquíssimo acessível [CONTRIB PESSIMO]
#1.490	         2.998	         0.672	            0.307	        0.212	        1   --- 173.614  # pg médio + nao repar + num baixo de CDAs + pouco acessível     [MÉDIO INACESSÍVEL]
#2.606	         6.629	         7.518	            0.721	        0.367	        2   --- 33.602   # pg melhor + repar bem + num médio de CDAs + mt acessível       [CONTRIB EXCEL]
#2.627	         11.115	         23.339	            0.696	        0.238	        3   --- 4.793    # pg bem + repar mt + num alto de CDAs + mt acessível            [CONTRIB NEGOC]
#2.019	         11.414	         0.693	            0.072	        0.034	        4   --- 69.859   # pg mau + nao repar + num alto de CDAs + médio acessível        [RUIM ACESSÍVEL]


situacao_cobranca	num_dist_cda	quantidade_reparcelamento	historico_pagamento_em_valor	cluster
1.024	            7.635	        0.319	                    0.050	                        0           # pg mau, não reparcela e tá inacessível
1.272	            3.297	        0.585	                    0.182	                        1           # pg algo, não reparcela e tá inacessível
2.613	            11.119	        23.379	                    0.239	                        2           # Negociador (tá cessível)
2.094	            12.091	        0.824	                    0.033	                        3           # pg mau, não reparcela e tá acessível
2.586	            6.594	        7.478	                    0.367	                        4           # pg bem + tá acessível


# Criar um dicionário com valores escalares
dicionario_clusteres = {
    'class_contribuinte': [0, 1, 2, 3, 4, 5],
    'class_contribuinte_nome': ['CONTRIB PESSIMO', 
                                'MEDIO INACESSIVEL', 
                                'CONTRIB EXCELENTE', 
                                'CONTRIB NEGOCIADOR', 
                                'RUIM ACESSIVEL',
                                'PRIMEIRA DIVIDA']
}

df_dicionario_clusteres = pd.DataFrame(dicionario_clusteres)
df_dicionario_clusteres

Unnamed: 0,class_contribuinte,class_contribuinte_nome
0,0,CONTRIB PESSIMO
1,1,MEDIO INACESSIVEL
2,2,CONTRIB EXCELENTE
3,3,CONTRIB NEGOCIADOR
4,4,RUIM ACESSIVEL
5,5,PRIMEIRA DIVIDA


In [42]:
df_pipe_cluster.groupby('label_cluster').count()

Unnamed: 0_level_0,status_situacao,num_dist_cda,quantidade_reparcelamento,historico_pagamento_em_qtd,historico_pagamento_em_valor
label_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,108634,108634,108634,108634,108634
1,173614,173614,173614,173614,173614
2,33602,33602,33602,33602,33602
3,4793,4793,4793,4793,4793
4,69859,69859,69859,69859,69859


In [43]:
df_centroide

Unnamed: 0,status_situacao,num_dist_cda,quantidade_reparcelamento,historico_pagamento_em_qtd,historico_pagamento_em_valor,cluster
0,0.768,6.601,0.259,0.077,0.041,0
1,1.49,2.998,0.672,0.307,0.212,1
2,2.606,6.629,7.518,0.721,0.367,2
3,2.627,11.115,23.339,0.696,0.238,3
4,2.019,11.414,0.693,0.072,0.034,4


In [44]:
# Salve o DataFrame em um arquivo CSV
if AGRUP_COM_FREQ_PESSOAS == 1:
    df_dicionario_clusteres.to_csv('data/df_dicionario_clusteres_AGRUP_COM_FREQ_PESSOAS.csv', index = False)  
if AGRUP_COM_STATUS_SITUACAO == 1:
    df_dicionario_clusteres.to_csv('data/df_dicionario_clusteres_AGRUP_COM_STATUS_SITUACAO.csv', index = False) 

Classificações dos contribuintes com base no melhor ao pior pagador, baseado no seu histórico e na sua situação atual.

# 04) Cria classificador de class do contribuinte

In [45]:
# Constroi o modelo que prevê qual o grupo do contribuinte

x_cluster = df_pipe_cluster.drop(columns=['label_cluster'])
y_cluster = df_pipe_cluster['label_cluster']

X_train, X_test, y_train, y_test = train_test_split(x_cluster, y_cluster, test_size=0.3, random_state=1337)

In [46]:
model_predict_contribuinte = RandomForestClassifier(random_state=1337)
model_predict_contribuinte.fit(X_train, y_train)

score_validacao = model_predict_contribuinte.score(X_test, y_test)
print("Score de validacao:", score_validacao)

Score de validacao: 0.9998890321038659


In [47]:
# Previsão da classificação para a base total de contribuintes
matriz_previsao_class = pessoas[['status_situacao', 'num_dist_cda', 'quantidade_reparcelamento', 'historico_pagamento_em_qtd', 'historico_pagamento_em_valor']]
pessoas['class_contribuinte'] = model_predict_contribuinte.predict(matriz_previsao_class)

pessoas.loc[pessoas['num_dist_cda'] == 1, 'class_contribuinte'] = 5

In [48]:
#dados_pessoas = dados_pessoas.reset_index()
pessoas

Unnamed: 0,id_pessoa,tipo_divida,num_dist_cda,quantidade_reparcelamento,deb_totais,deb_pagos,valor_tot,valor_pago,qtd_notas_2anos,edificacao,situacao,cpf_cnpj_existe,situacao_ativa,status_situacao,historico_pagamento_em_qtd,historico_pagamento_em_valor,class_contribuinte
247206,7fcd58cfe066e299,mercantil,1,0,4.0,4.0,533.37,533.37,0.0,0.0,ATIVO,1,1.0,2.0,1.0,1.0,5
467137,f1e9dde2a3053d51,imovel,4,0,79.0,79.0,7987.43,7987.43,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,1
183548,5f225ff188d71bf6,imovel,1,0,12.0,12.0,454.28,454.28,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,5
183545,5f21929a71c17a08,mercantil,1,0,1.0,1.0,550.47,550.47,0.0,0.0,BAIXADO,1,0.0,0.0,1.0,1.0,5
49086,196d80b1b82edbc0,mercantil,2,2,8.0,8.0,493.97,493.97,0.0,0.0,SUSPENSO,1,0.0,0.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212557,6dec5209fbc2f813,imovel,4,0,136.0,0.0,14185.80,0.00,0.0,1.0,ATIVO,1,2.0,3.0,0.0,0.0,1
212556,6dec36900dea378c,mercantil,7,0,52.0,0.0,19190.68,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0
212552,6debc091e4bf4d84,mercantil,2,0,6.0,0.0,14930.45,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,1
212551,6debbb4cc485f25f,mercantil,8,0,25.0,0.0,29849.66,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0


# Exemplo com hash de pessoas

In [49]:
pessoas[pessoas['id_pessoa'] == '000032f6d93a0abd'] 
#status_situacao num_dist_cda	quantidade_reparc	hist_pg_qtd	   hist_pg_valor   cluster  Num.Contrib
#0.768	         6.601	         0.259	            0.077	        0.041	        0   --- 108.634  # pg mau + nao repar + num médio de CDAs + pouquíssimo acessível [CONTRIB PESSIMO]
#1.490	         2.998	         0.672	            0.307	        0.212	        1   --- 173.614  # pg médio + nao repar + num baixo de CDAs + pouco acessível     [MÉDIO INACESSÍVEL]


Unnamed: 0,id_pessoa,tipo_divida,num_dist_cda,quantidade_reparcelamento,deb_totais,deb_pagos,valor_tot,valor_pago,qtd_notas_2anos,edificacao,situacao,cpf_cnpj_existe,situacao_ativa,status_situacao,historico_pagamento_em_qtd,historico_pagamento_em_valor,class_contribuinte
1,000032f6d93a0abd,imovel,2,2,10.0,8.0,12715.63,3339.31,0.0,1.0,ATIVO,1,2.0,3.0,0.8,0.262615,1
2,000032f6d93a0abd,mercantil,9,0,66.0,0.0,69091.42,0.0,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0


In [50]:
pessoas[pessoas['id_pessoa'] == 'def2e7cb1e2f6ae1'] 
#status_situacao num_dist_cda	quantidade_reparc	hist_pg_qtd	   hist_pg_valor   cluster  Num.Contrib
#1.490	         2.998	         0.672	            0.307	        0.212	        1   --- 173.614  # pg médio + nao repar + num baixo de CDAs + pouco acessível     [MÉDIO INACESSÍVEL]
#2.606	         6.629	         7.518	            0.721	        0.367	        2   --- 33.602   # pg melhor + repar bem + num médio de CDAs + mt acessível       [CONTRIB EXCEL]


Unnamed: 0,id_pessoa,tipo_divida,num_dist_cda,quantidade_reparcelamento,deb_totais,deb_pagos,valor_tot,valor_pago,qtd_notas_2anos,edificacao,situacao,cpf_cnpj_existe,situacao_ativa,status_situacao,historico_pagamento_em_qtd,historico_pagamento_em_valor,class_contribuinte
430200,def2e7cb1e2f6ae1,imovel,7,7,45.0,43.0,4980.41,4137.32,0.0,1.0,ATIVO,1,2.0,3.0,0.955556,0.830719,2
430201,def2e7cb1e2f6ae1,mercantil,2,0,6.0,0.0,33600.12,0.0,0.0,0.0,SUSPENSO,1,0.0,0.0,0.0,0.0,1


# Quantos contribuintes tem CDA em imóvel e mercantil?

### 70.354

In [51]:
contrib_num_tipo_divida = dados_pessoas.groupby('id_pessoa')['tipo_divida'].nunique().to_frame().reset_index()
contrib_num_tipo_divida = contrib_num_tipo_divida.sort_values(by = 'tipo_divida', ascending = False)
contrib_02_tipo_divida = contrib_num_tipo_divida[contrib_num_tipo_divida['tipo_divida'] > 1]
contrib_02_tipo_divida = contrib_02_tipo_divida.drop(columns = 'tipo_divida')

In [52]:
dados_pessoas_aux = dados_pessoas[["tipo_divida", "cda", "id_pessoa", "valor_tot"]]
contrib_02_tipo_divida = pd.merge(dados_pessoas_aux, contrib_02_tipo_divida, how = "right", on = "id_pessoa")
contrib_02_tipo_divida = contrib_02_tipo_divida.sort_values(by = "id_pessoa")
contrib_02_tipo_divida

Unnamed: 0,tipo_divida,cda,id_pessoa,valor_tot
579016,mercantil,fcab2f9c51259d2ed8a3e5cebc255ca62,000032f6d93a0abd,1789.80
579015,mercantil,db950944ce8b2b706b3a8e268c80aedf2,000032f6d93a0abd,22147.06
579014,mercantil,bfb98e5cd3a4eb2e7c8fa7a67092eaab2,000032f6d93a0abd,7519.63
579013,mercantil,b269b3c8a08334b29fcacff532cf50c42,000032f6d93a0abd,1897.32
579012,mercantil,b208dd365dbb6b4c6b3a8e268c80aedf2,000032f6d93a0abd,9769.89
...,...,...,...,...
687897,imovel,858bdcca9dce6e914bd01ecb8277da0b1,fffeb78fd42d2624,1015.13
687898,imovel,996bf4fc3cbc77229ceed47545e621dd1,fffeb78fd42d2624,4162.30
687900,mercantil,bb2daefbbfd5c13fe52281a70553db462,fffeb78fd42d2624,885.21
687902,imovel,fc3f9d49392410942b46f673bf0c61251,fffeb78fd42d2624,11056.72


In [53]:
contrib_02_tipo_divida['id_pessoa'].nunique() /494413

0.14229803827973778

In [54]:
contrib_02_tipo_divida_metrics = contrib_02_tipo_divida.groupby(['id_pessoa', 'tipo_divida']).agg({
    'cda': ['nunique'], 
    'valor_tot': ['sum']    
})
contrib_02_tipo_divida_metrics = contrib_02_tipo_divida_metrics.reset_index()
contrib_02_tipo_divida_metrics

Unnamed: 0_level_0,id_pessoa,tipo_divida,cda,valor_tot
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,nunique,sum
0,000032f6d93a0abd,imovel,2,12715.63
1,000032f6d93a0abd,mercantil,9,69091.42
2,0000e13a7d8d7537,imovel,9,12683.16
3,0000e13a7d8d7537,mercantil,5,8306.33
4,0001cc5c538504ce,imovel,6,13948.46
...,...,...,...,...
140703,fffc4c97f1f6c954,mercantil,2,1651.71
140704,fffe4c8195b51bd9,imovel,2,1060.03
140705,fffe4c8195b51bd9,mercantil,1,1833.70
140706,fffeb78fd42d2624,imovel,7,48370.82


In [55]:
# Salve o DataFrame em um arquivo CSV
if AGRUP_COM_FREQ_PESSOAS == 1:
    contrib_02_tipo_divida_metrics.to_csv('data/contrib_02_tipo_divida_metrics_AGRUP_COM_FREQ_PESSOAS.csv', index = False)  

In [56]:
# Vendo as dívidas desses pessoas
contrib_02_tipo_divida_aux = contrib_02_tipo_divida[['tipo_divida',	'id_pessoa']]
contrib_02_tipo_divida_pessoas = pd.merge(contrib_02_tipo_divida_aux, pessoas, on = ['tipo_divida', 'id_pessoa'], how = 'left')
contrib_02_tipo_divida_pessoas = contrib_02_tipo_divida_pessoas.drop_duplicates()
contrib_02_tipo_divida_pessoas

Unnamed: 0,tipo_divida,id_pessoa,num_dist_cda,quantidade_reparcelamento,deb_totais,deb_pagos,valor_tot,valor_pago,qtd_notas_2anos,edificacao,situacao,cpf_cnpj_existe,situacao_ativa,status_situacao,historico_pagamento_em_qtd,historico_pagamento_em_valor,class_contribuinte
0,mercantil,000032f6d93a0abd,9,0,66.0,0.0,69091.42,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.000000,0.000000,0
9,imovel,000032f6d93a0abd,2,2,10.0,8.0,12715.63,3339.31,0.0,1.0,ATIVO,1,2.0,3.0,0.800000,0.262615,1
11,mercantil,0000e13a7d8d7537,5,6,42.0,27.0,8306.33,1980.75,76.0,0.0,ATIVO,1,2.0,3.0,0.642857,0.238463,2
12,imovel,0000e13a7d8d7537,9,0,177.0,0.0,12683.16,0.00,76.0,1.0,ATIVO,0,2.0,2.0,0.000000,0.000000,4
25,imovel,0001cc5c538504ce,6,0,103.0,0.0,13948.46,0.00,0.0,1.0,ATIVO,1,2.0,3.0,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690520,mercantil,fffc4c97f1f6c954,2,1,7.0,2.0,1651.71,46.47,0.0,0.0,BAIXADO,1,0.0,0.0,0.285714,0.028134,1
690530,imovel,fffe4c8195b51bd9,2,0,32.0,32.0,1060.03,1060.03,0.0,1.0,ATIVO,1,2.0,3.0,1.000000,1.000000,1
690531,mercantil,fffe4c8195b51bd9,1,0,4.0,0.0,1833.70,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.000000,0.000000,5
690533,mercantil,fffeb78fd42d2624,3,0,15.0,0.0,13050.71,0.00,0.0,0.0,SUSPENSO,1,0.0,0.0,0.000000,0.000000,1


# Voltando para os agrupamentos

In [57]:
# Nomeando a classificação com label de prioridade

pessoas = pd.merge(pessoas, 
         df_dicionario_clusteres, 
         on = "class_contribuinte",
         how = "left")

pessoas

Unnamed: 0,id_pessoa,tipo_divida,num_dist_cda,quantidade_reparcelamento,deb_totais,deb_pagos,valor_tot,valor_pago,qtd_notas_2anos,edificacao,situacao,cpf_cnpj_existe,situacao_ativa,status_situacao,historico_pagamento_em_qtd,historico_pagamento_em_valor,class_contribuinte,class_contribuinte_nome
0,7fcd58cfe066e299,mercantil,1,0,4.0,4.0,533.37,533.37,0.0,0.0,ATIVO,1,1.0,2.0,1.0,1.0,5,PRIMEIRA DIVIDA
1,f1e9dde2a3053d51,imovel,4,0,79.0,79.0,7987.43,7987.43,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,1,MEDIO INACESSIVEL
2,5f225ff188d71bf6,imovel,1,0,12.0,12.0,454.28,454.28,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,5,PRIMEIRA DIVIDA
3,5f21929a71c17a08,mercantil,1,0,1.0,1.0,550.47,550.47,0.0,0.0,BAIXADO,1,0.0,0.0,1.0,1.0,5,PRIMEIRA DIVIDA
4,196d80b1b82edbc0,mercantil,2,2,8.0,8.0,493.97,493.97,0.0,0.0,SUSPENSO,1,0.0,0.0,1.0,1.0,1,MEDIO INACESSIVEL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494408,6dec5209fbc2f813,imovel,4,0,136.0,0.0,14185.80,0.00,0.0,1.0,ATIVO,1,2.0,3.0,0.0,0.0,1,MEDIO INACESSIVEL
494409,6dec36900dea378c,mercantil,7,0,52.0,0.0,19190.68,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0,CONTRIB PESSIMO
494410,6debc091e4bf4d84,mercantil,2,0,6.0,0.0,14930.45,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,1,MEDIO INACESSIVEL
494411,6debbb4cc485f25f,mercantil,8,0,25.0,0.0,29849.66,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0,CONTRIB PESSIMO


In [58]:
pessoas.columns

Index(['id_pessoa', 'tipo_divida', 'num_dist_cda', 'quantidade_reparcelamento',
       'deb_totais', 'deb_pagos', 'valor_tot', 'valor_pago', 'qtd_notas_2anos',
       'edificacao', 'situacao', 'cpf_cnpj_existe', 'situacao_ativa',
       'status_situacao', 'historico_pagamento_em_qtd',
       'historico_pagamento_em_valor', 'class_contribuinte',
       'class_contribuinte_nome'],
      dtype='object')

In [59]:
# Incluindo variável 'tipo_divida'
df_classificao_contribuinte = pessoas[['id_pessoa', 'tipo_divida','class_contribuinte_nome']]
df_classificao_contribuinte

Unnamed: 0,id_pessoa,tipo_divida,class_contribuinte_nome
0,7fcd58cfe066e299,mercantil,PRIMEIRA DIVIDA
1,f1e9dde2a3053d51,imovel,MEDIO INACESSIVEL
2,5f225ff188d71bf6,imovel,PRIMEIRA DIVIDA
3,5f21929a71c17a08,mercantil,PRIMEIRA DIVIDA
4,196d80b1b82edbc0,mercantil,MEDIO INACESSIVEL
...,...,...,...
494408,6dec5209fbc2f813,imovel,MEDIO INACESSIVEL
494409,6dec36900dea378c,mercantil,CONTRIB PESSIMO
494410,6debc091e4bf4d84,mercantil,MEDIO INACESSIVEL
494411,6debbb4cc485f25f,mercantil,CONTRIB PESSIMO


In [60]:
df_classificao_contribuinte.groupby('class_contribuinte_nome').count()

Unnamed: 0_level_0,id_pessoa,tipo_divida
class_contribuinte_nome,Unnamed: 1_level_1,Unnamed: 2_level_1
CONTRIB EXCELENTE,33610,33610
CONTRIB NEGOCIADOR,4794,4794
CONTRIB PESSIMO,108633,108633
MEDIO INACESSIVEL,173614,173614
PRIMEIRA DIVIDA,103911,103911
RUIM ACESSIVEL,69851,69851


In [61]:
base_conjunta.columns

Index(['cda', 'tipo_divida', 'id_pessoa', 'atividade_principal', 'situacao',
       'tipo_tributo', 'valor_pago', 'valor_tot', 'vlr_tributo', 'vlr_taxa',
       'competencia_divida', 'inscricao_divida', 'arrecadacao_divida',
       'ajuizamento_divida', 'edificacao', 'cpf_cnpj_existe', 'protesto',
       'ajuizamento', 'refis', 'deb_totais', 'deb_pagos', 'anos_idade_da',
       'quantidade_reparcelamento'],
      dtype='object')

# 05) Análise discriminante da classificação do contribuintes para o % Pago das dívidas de cada CDA

In [62]:
# Calcula variável target y
base_conjunta.loc[(base_conjunta['valor_pago'].isna()) | (base_conjunta['valor_pago'] == 0) , 'valor_pago'] = 1
base_conjunta.loc[(base_conjunta['valor_tot'].isna()) | (base_conjunta['valor_tot'] == 0) , 'valor_tot'] = 1

base_conjunta['percentual_pago_cda'] = base_conjunta['valor_pago'] / base_conjunta['valor_tot']

In [63]:
# Imputando historico_pagamento_em_valor = 1 nos casos que passa de 1
# ESCOLHA DA LARISSA
base_conjunta.loc[base_conjunta['percentual_pago_cda'] > 1, 'percentual_pago_cda'] = 1

In [64]:
df_classificao_contribuinte

Unnamed: 0,id_pessoa,tipo_divida,class_contribuinte_nome
0,7fcd58cfe066e299,mercantil,PRIMEIRA DIVIDA
1,f1e9dde2a3053d51,imovel,MEDIO INACESSIVEL
2,5f225ff188d71bf6,imovel,PRIMEIRA DIVIDA
3,5f21929a71c17a08,mercantil,PRIMEIRA DIVIDA
4,196d80b1b82edbc0,mercantil,MEDIO INACESSIVEL
...,...,...,...
494408,6dec5209fbc2f813,imovel,MEDIO INACESSIVEL
494409,6dec36900dea378c,mercantil,CONTRIB PESSIMO
494410,6debc091e4bf4d84,mercantil,MEDIO INACESSIVEL
494411,6debbb4cc485f25f,mercantil,CONTRIB PESSIMO


In [65]:
# Separação dos dados para a análise discriminante dos grupos de contribuintes
df_analise_discriminante = pd.merge(
    left = base_conjunta, on=['id_pessoa', 'tipo_divida'], right=df_classificao_contribuinte, how='left'
)

df_analise_discriminante = df_analise_discriminante[['id_pessoa', 'tipo_divida', 'percentual_pago_cda', 'class_contribuinte_nome']]

In [66]:
base_conjunta

Unnamed: 0,cda,tipo_divida,id_pessoa,atividade_principal,situacao,tipo_tributo,valor_pago,valor_tot,vlr_tributo,vlr_taxa,...,edificacao,cpf_cnpj_existe,protesto,ajuizamento,refis,deb_totais,deb_pagos,anos_idade_da,quantidade_reparcelamento,percentual_pago_cda
0,00000123c3d4731c6b3a8e268c80aedf1,imovel,ac28642d7c82b33f,APARTAMENTO,ATIVO,IPTU,1.0,2048.73,762.08,1286.65,...,1,1,0,0,0,40.0,0.0,1.0,0,0.000488
1,00000123c3d4731c6b3a8e268c80aedf2,mercantil,fc4b99b807fbed41,ATIVIDADES DE TELEATENDIMENTO,SUSPENSO,ISS,1.0,2515.85,0.00,2515.85,...,0,1,0,0,0,3.0,0.0,1.0,0,0.000397
2,00000b44c5ba1e669ceed47545e621dd2,mercantil,96e8e553de69d7a4,COMERCIO VAREJISTA DE ARTIGOS DE ARMARINHO,INAPTO,ISS,1.0,1278.97,0.00,1278.97,...,0,1,0,0,0,4.0,0.0,21.0,0,0.000782
3,000014e359592e62d8a3e5cebc255ca6E,mercantil,6dbe14da38a31dc1,Comercio varejista especializado de equipament...,ATIVO,ISS,1.0,847.02,847.02,0.00,...,0,1,0,0,0,1.0,0.0,0.0,0,0.001181
4,0000331f601a73e52b46f673bf0c61251,imovel,870c08c252b25ad1,APARTAMENTO,ATIVO,IPTU,1.0,344.12,147.20,196.92,...,1,1,0,0,0,6.0,0.0,11.0,0,0.002906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2414893,ffffd8af5760cada9fcacff532cf50c41,imovel,d411f54ad97d1f39,,ATIVO,IPTU,1.0,1728.13,790.85,937.28,...,0,0,0,0,0,19.0,0.0,4.0,0,0.000579
2414894,ffffdb721b90282718565e26094f06ee2,mercantil,51d6043a4efa1ed1,PUBLICITARIO (NÍVEL MÉDIO),ATIVO,ISS,1.0,240.77,240.77,0.00,...,0,1,0,0,0,1.0,0.0,6.0,0,0.004153
2414895,ffffdc78aa7a90e26b3a8e268c80aedf1,imovel,9b512f210b460120,APARTAMENTO,ATIVO,IPTU,1.0,9390.43,5113.61,4276.82,...,1,1,0,1,0,30.0,0.0,25.0,0,0.000106
2414896,fffff192f894983c9ceed47545e621dd1,imovel,dd5538d2d38803a7,CASA,ATIVO,IPTU,1.0,16594.80,11374.98,5219.82,...,1,1,0,1,0,32.0,0.0,15.0,0,0.000060


In [67]:
df_analise_discriminante = df_analise_discriminante.set_index(['id_pessoa', 'tipo_divida'])
df_analise_discriminante

Unnamed: 0_level_0,Unnamed: 1_level_0,percentual_pago_cda,class_contribuinte_nome
id_pessoa,tipo_divida,Unnamed: 2_level_1,Unnamed: 3_level_1
ac28642d7c82b33f,imovel,0.000488,MEDIO INACESSIVEL
fc4b99b807fbed41,mercantil,0.000397,CONTRIB PESSIMO
96e8e553de69d7a4,mercantil,0.000782,MEDIO INACESSIVEL
6dbe14da38a31dc1,mercantil,0.001181,CONTRIB EXCELENTE
870c08c252b25ad1,imovel,0.002906,RUIM ACESSIVEL
...,...,...,...
d411f54ad97d1f39,imovel,0.000579,RUIM ACESSIVEL
51d6043a4efa1ed1,mercantil,0.004153,MEDIO INACESSIVEL
9b512f210b460120,imovel,0.000106,CONTRIB EXCELENTE
dd5538d2d38803a7,imovel,0.000060,CONTRIB EXCELENTE


In [68]:
df_analise_discriminante = df_analise_discriminante.reset_index()#.groupby("class_contribuinte_nome").describe().T
df_analise_discriminante

Unnamed: 0,id_pessoa,tipo_divida,percentual_pago_cda,class_contribuinte_nome
0,ac28642d7c82b33f,imovel,0.000488,MEDIO INACESSIVEL
1,fc4b99b807fbed41,mercantil,0.000397,CONTRIB PESSIMO
2,96e8e553de69d7a4,mercantil,0.000782,MEDIO INACESSIVEL
3,6dbe14da38a31dc1,mercantil,0.001181,CONTRIB EXCELENTE
4,870c08c252b25ad1,imovel,0.002906,RUIM ACESSIVEL
...,...,...,...,...
2414893,d411f54ad97d1f39,imovel,0.000579,RUIM ACESSIVEL
2414894,51d6043a4efa1ed1,mercantil,0.004153,MEDIO INACESSIVEL
2414895,9b512f210b460120,imovel,0.000106,CONTRIB EXCELENTE
2414896,dd5538d2d38803a7,imovel,0.000060,CONTRIB EXCELENTE


In [69]:
# Dummyzando a variável de classificação 

ohe = OneHotEncoder(dtype=int)

colunas_ohe = ohe.fit_transform(df_analise_discriminante[['class_contribuinte_nome']]).toarray()
df_2 = pd.DataFrame(colunas_ohe, columns=ohe.get_feature_names_out(['class_contribuinte_nome']))
df_2 

Unnamed: 0,class_contribuinte_nome_CONTRIB EXCELENTE,class_contribuinte_nome_CONTRIB NEGOCIADOR,class_contribuinte_nome_CONTRIB PESSIMO,class_contribuinte_nome_MEDIO INACESSIVEL,class_contribuinte_nome_PRIMEIRA DIVIDA,class_contribuinte_nome_RUIM ACESSIVEL
0,0,0,0,1,0,0
1,0,0,1,0,0,0
2,0,0,0,1,0,0
3,1,0,0,0,0,0
4,0,0,0,0,0,1
...,...,...,...,...,...,...
2414893,0,0,0,0,0,1
2414894,0,0,0,1,0,0
2414895,1,0,0,0,0,0
2414896,1,0,0,0,0,0


In [70]:
df_n_categorico = df_analise_discriminante.drop(columns=['class_contribuinte_nome'], axis=1)
df_n_categorico

Unnamed: 0,id_pessoa,tipo_divida,percentual_pago_cda
0,ac28642d7c82b33f,imovel,0.000488
1,fc4b99b807fbed41,mercantil,0.000397
2,96e8e553de69d7a4,mercantil,0.000782
3,6dbe14da38a31dc1,mercantil,0.001181
4,870c08c252b25ad1,imovel,0.002906
...,...,...,...
2414893,d411f54ad97d1f39,imovel,0.000579
2414894,51d6043a4efa1ed1,mercantil,0.004153
2414895,9b512f210b460120,imovel,0.000106
2414896,dd5538d2d38803a7,imovel,0.000060


In [71]:
df_pipe_discriminante = pd.concat([df_n_categorico, df_2], axis=1)

In [72]:
df_pipe_discriminante = df_pipe_discriminante.set_index(['id_pessoa', 'tipo_divida'])
df_pipe_discriminante

Unnamed: 0_level_0,Unnamed: 1_level_0,percentual_pago_cda,class_contribuinte_nome_CONTRIB EXCELENTE,class_contribuinte_nome_CONTRIB NEGOCIADOR,class_contribuinte_nome_CONTRIB PESSIMO,class_contribuinte_nome_MEDIO INACESSIVEL,class_contribuinte_nome_PRIMEIRA DIVIDA,class_contribuinte_nome_RUIM ACESSIVEL
id_pessoa,tipo_divida,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ac28642d7c82b33f,imovel,0.000488,0,0,0,1,0,0
fc4b99b807fbed41,mercantil,0.000397,0,0,1,0,0,0
96e8e553de69d7a4,mercantil,0.000782,0,0,0,1,0,0
6dbe14da38a31dc1,mercantil,0.001181,1,0,0,0,0,0
870c08c252b25ad1,imovel,0.002906,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...
d411f54ad97d1f39,imovel,0.000579,0,0,0,0,0,1
51d6043a4efa1ed1,mercantil,0.004153,0,0,0,1,0,0
9b512f210b460120,imovel,0.000106,1,0,0,0,0,0
dd5538d2d38803a7,imovel,0.000060,1,0,0,0,0,0


In [73]:
x_analise_discriminante = df_pipe_discriminante.drop(columns=['percentual_pago_cda'])
y_analise_discriminante = df_pipe_discriminante['percentual_pago_cda'].astype('int')

In [74]:
analise_discriminante = LinearDiscriminantAnalysis()
analise_discriminante.fit(x_analise_discriminante, y_analise_discriminante)

LinearDiscriminantAnalysis()

In [75]:
dados_analise_disc = {'variavel': analise_discriminante.feature_names_in_, 'coeficiente' : analise_discriminante.coef_[0].round(5)}
pesos_analise_disc = pd.DataFrame(dados_analise_disc).sort_values('variavel').reset_index().drop(columns=['index'])

In [76]:
pesos_analise_disc

Unnamed: 0,variavel,coeficiente
0,class_contribuinte_nome_CONTRIB EXCELENTE,1.53379
1,class_contribuinte_nome_CONTRIB NEGOCIADOR,0.18144
2,class_contribuinte_nome_CONTRIB PESSIMO,-0.79172
3,class_contribuinte_nome_MEDIO INACESSIVEL,0.84104
4,class_contribuinte_nome_PRIMEIRA DIVIDA,2.14301
5,class_contribuinte_nome_RUIM ACESSIVEL,-0.89373


In [77]:
pesos_analise_disc['class_contribuinte_nome'] = pesos_analise_disc['variavel'].str.replace('class_contribuinte_nome_', '')
pesos_analise_disc

Unnamed: 0,variavel,coeficiente,class_contribuinte_nome
0,class_contribuinte_nome_CONTRIB EXCELENTE,1.53379,CONTRIB EXCELENTE
1,class_contribuinte_nome_CONTRIB NEGOCIADOR,0.18144,CONTRIB NEGOCIADOR
2,class_contribuinte_nome_CONTRIB PESSIMO,-0.79172,CONTRIB PESSIMO
3,class_contribuinte_nome_MEDIO INACESSIVEL,0.84104,MEDIO INACESSIVEL
4,class_contribuinte_nome_PRIMEIRA DIVIDA,2.14301,PRIMEIRA DIVIDA
5,class_contribuinte_nome_RUIM ACESSIVEL,-0.89373,RUIM ACESSIVEL


In [78]:
# Salve o DataFrame em um arquivo CSV
if AGRUP_COM_FREQ_PESSOAS == 1:
    pesos_analise_disc.to_csv('data/pesos_analise_disc_AGRUP_COM_FREQ_PESSOAS.csv', index = False)  
if AGRUP_COM_STATUS_SITUACAO == 1:
    pesos_analise_disc.to_csv('data/pesos_analise_disc_AGRUP_COM_STATUS_SITUACAO.csv', index = False) 

# 06) Salva o modelo de classificação dos contribuintes

In [79]:
def salva_modelo_serializado(nome_modelo_serializado, modelo):
    sav_best_model = open(nome_modelo_serializado, 'wb')
    pickle.dump(modelo, sav_best_model)
    sav_best_model.close()

    pathModelo = modelsPath+"\\"+os.path.join(nome_modelo_serializado)
    shutil.move(os.path.abspath(nome_modelo_serializado), pathModelo)

In [80]:
if AGRUP_COM_FREQ_PESSOAS == 1:
    salva_modelo_serializado("classificador-contribuinte_prime.pkl", model_predict_contribuinte)
if AGRUP_COM_STATUS_SITUACAO == 1:
    salva_modelo_serializado("classificador-contribuinte-AGRUP_COM_STATUS_SITUACAO_v4.pkl", model_predict_contribuinte)

# SALVAR PESOS

In [81]:
# Use o método str.replace() para substituir o caractere '#' por uma string vazia na coluna 'texto'
pesos_analise_disc['class_contribuinte_nome'] = pesos_analise_disc['variavel'].str.replace('class_contribuinte_nome_', '')
pesos_analise_disc 

Unnamed: 0,variavel,coeficiente,class_contribuinte_nome
0,class_contribuinte_nome_CONTRIB EXCELENTE,1.53379,CONTRIB EXCELENTE
1,class_contribuinte_nome_CONTRIB NEGOCIADOR,0.18144,CONTRIB NEGOCIADOR
2,class_contribuinte_nome_CONTRIB PESSIMO,-0.79172,CONTRIB PESSIMO
3,class_contribuinte_nome_MEDIO INACESSIVEL,0.84104,MEDIO INACESSIVEL
4,class_contribuinte_nome_PRIMEIRA DIVIDA,2.14301,PRIMEIRA DIVIDA
5,class_contribuinte_nome_RUIM ACESSIVEL,-0.89373,RUIM ACESSIVEL


In [82]:
pesos_analise_disc = pd.merge(pesos_analise_disc, 
         df_dicionario_clusteres, 
         on = "class_contribuinte_nome",
         how = "left")

pesos_analise_disc  

Unnamed: 0,variavel,coeficiente,class_contribuinte_nome,class_contribuinte
0,class_contribuinte_nome_CONTRIB EXCELENTE,1.53379,CONTRIB EXCELENTE,2
1,class_contribuinte_nome_CONTRIB NEGOCIADOR,0.18144,CONTRIB NEGOCIADOR,3
2,class_contribuinte_nome_CONTRIB PESSIMO,-0.79172,CONTRIB PESSIMO,0
3,class_contribuinte_nome_MEDIO INACESSIVEL,0.84104,MEDIO INACESSIVEL,1
4,class_contribuinte_nome_PRIMEIRA DIVIDA,2.14301,PRIMEIRA DIVIDA,5
5,class_contribuinte_nome_RUIM ACESSIVEL,-0.89373,RUIM ACESSIVEL,4


In [83]:
# Salve o DataFrame em um arquivo CSV
if AGRUP_COM_FREQ_PESSOAS == 1:
    pesos_analise_disc.to_csv('data/pesos_analise_disc_AGRUP_COM_FREQ_PESSOAS.csv', index = False)  
if AGRUP_COM_STATUS_SITUACAO == 1:
    pesos_analise_disc.to_csv('data/pesos_analise_disc_AGRUP_COM_STATUS_SITUACAO.csv', index = False)  

In [84]:
df_pipe_cluster

Unnamed: 0_level_0,Unnamed: 1_level_0,status_situacao,num_dist_cda,quantidade_reparcelamento,historico_pagamento_em_qtd,historico_pagamento_em_valor,label_cluster
id_pessoa,tipo_divida,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
f1e9dde2a3053d51,imovel,3.0,4,0,1.0,1.0,1
196d80b1b82edbc0,mercantil,0.0,2,2,1.0,1.0,1
5f21929a71c17a08,imovel,3.0,4,3,1.0,1.0,1
5f202122440e32a5,mercantil,0.0,2,2,1.0,1.0,1
cb7ae840eac57e1d,mercantil,3.0,2,2,1.0,1.0,1
...,...,...,...,...,...,...,...
6dec5209fbc2f813,imovel,3.0,4,0,0.0,0.0,1
6dec36900dea378c,mercantil,0.0,7,0,0.0,0.0,0
6debc091e4bf4d84,mercantil,0.0,2,0,0.0,0.0,1
6debbb4cc485f25f,mercantil,0.0,8,0,0.0,0.0,0


# ANALISANDO

In [85]:
# df_pipe_cluster2 = df_pipe_cluster.reset_index()
# df = df_pipe_cluster2.groupby('label_cluster')['id_pessoa'].nunique().to_frame().reset_index()
# total = df['id_pessoa'].sum()
# df['perc'] = df['id_pessoa']/total
# df

Unnamed: 0,label_cluster,id_pessoa,perc
0,0,106011,0.280301
1,1,165172,0.436727
2,2,33373,0.088241
3,3,4786,0.012655
4,4,68862,0.182076


In [87]:
#dados_pessoas.reset_index(inplace = True)

# Distribuição dos Contribuintes

In [89]:
freq_classecontrib = pessoas.groupby(['class_contribuinte_nome'])['id_pessoa'].nunique().to_frame().reset_index()
tot = freq_classecontrib['id_pessoa'].sum()
freq_classecontrib['perc'] = np.round(freq_classecontrib['id_pessoa']/tot, 4)
freq_classecontrib

Unnamed: 0,class_contribuinte_nome,id_pessoa,perc
0,CONTRIB EXCELENTE,33381,0.0697
1,CONTRIB NEGOCIADOR,4787,0.01
2,CONTRIB PESSIMO,106010,0.2214
3,MEDIO INACESSIVEL,165172,0.345
4,PRIMEIRA DIVIDA,100601,0.2101
5,RUIM ACESSIVEL,68854,0.1438


In [90]:
# dict = {"class_contribuinte_nome": ['PRIMEIRA DIVIDA', 'MELHOR PAGADOR', 
#                                     'BOM PAGADOR', 'PAGADOR INTERMEDIARIO', 'PIOR PAGADOR'],
#         "ordem_contrib": [0, 1, 2 ,3, 4]}

# df_ordenar_class_contrib = pd.DataFrame(dict)
# df_ordenar_class_contrib

dict = {"class_contribuinte_nome": ['CONTRIB EXCELENTE', 'CONTRIB NEGOCIADOR', 
                                    'MEDIO INACESSIVEL', 'RUIM ACESSIVEL', 'CONTRIB PESSIMO', 'PRIMEIRA DIVIDA'],
         "ordem_contrib": [0, 1, 2 ,3, 4, 5]}

df_ordenar_class_contrib = pd.DataFrame(dict)
df_ordenar_class_contrib

Unnamed: 0,class_contribuinte_nome,ordem_contrib
0,CONTRIB EXCELENTE,0
1,CONTRIB NEGOCIADOR,1
2,MEDIO INACESSIVEL,2
3,RUIM ACESSIVEL,3
4,CONTRIB PESSIMO,4
5,PRIMEIRA DIVIDA,5


In [None]:
# data_ord_rat_div = {
#     'ord_rat_div': [0, 1, 2, 3],
#     'rating_divida': ['ALTISSIMA', 'ALTA', 'MEDIA', 'BAIXISSIMA'],
#     'rating_divida_label_ord': ['00_ALTISSIMA', '01_ALTA', '02_MEDIA', '03_BAIXISSIMA']
# }

# df_ord_rat_div = pd.DataFrame(data_ord_rat_div)

In [91]:
freq_classecontrib = pd.merge(
    left = freq_classecontrib, 
    right = df_ordenar_class_contrib,
    on = 'class_contribuinte_nome',
    how = "left"
)

freq_classecontrib = freq_classecontrib.sort_values(by = ['ordem_contrib'])
total = freq_classecontrib['id_pessoa'].sum()
freq_classecontrib['total'] = total
freq_classecontrib

Unnamed: 0,class_contribuinte_nome,id_pessoa,perc,ordem_contrib,total
0,CONTRIB EXCELENTE,33381,0.0697,0,478805
1,CONTRIB NEGOCIADOR,4787,0.01,1,478805
3,MEDIO INACESSIVEL,165172,0.345,2,478805
5,RUIM ACESSIVEL,68854,0.1438,3,478805
2,CONTRIB PESSIMO,106010,0.2214,4,478805
4,PRIMEIRA DIVIDA,100601,0.2101,5,478805


In [92]:
# Salve o DataFrame em um arquivo CSV
if AGRUP_COM_FREQ_PESSOAS == 1:
    freq_classecontrib.to_csv('data/distribuicao_contribuintes_AGRUP_COM_FREQ_PESSOAS.csv', index = False)  
if AGRUP_COM_STATUS_SITUACAO == 1:
    freq_classecontrib.to_csv('data/distribuicao_contribuintes_AGRUP_COM_STATUS_SITUACAO.csv', index = False)  

# Previsão

In [93]:
matriz_previsao_class = pessoas[['status_situacao', 'num_dist_cda', 'quantidade_reparcelamento', 'historico_pagamento_em_qtd', 'historico_pagamento_em_valor']]
matriz_previsao_class

Unnamed: 0,status_situacao,num_dist_cda,quantidade_reparcelamento,historico_pagamento_em_qtd,historico_pagamento_em_valor
0,2.0,1,0,1.0,1.0
1,3.0,4,0,1.0,1.0
2,3.0,1,0,1.0,1.0
3,0.0,1,0,1.0,1.0
4,0.0,2,2,1.0,1.0
...,...,...,...,...,...
494408,3.0,4,0,0.0,0.0
494409,0.0,7,0,0.0,0.0
494410,0.0,2,0,0.0,0.0
494411,0.0,8,0,0.0,0.0


In [94]:
pessoas['class_contribuinte'] = model_predict_contribuinte.predict(matriz_previsao_class)
pessoas

Unnamed: 0,id_pessoa,tipo_divida,num_dist_cda,quantidade_reparcelamento,deb_totais,deb_pagos,valor_tot,valor_pago,qtd_notas_2anos,edificacao,situacao,cpf_cnpj_existe,situacao_ativa,status_situacao,historico_pagamento_em_qtd,historico_pagamento_em_valor,class_contribuinte,class_contribuinte_nome
0,7fcd58cfe066e299,mercantil,1,0,4.0,4.0,533.37,533.37,0.0,0.0,ATIVO,1,1.0,2.0,1.0,1.0,1,PRIMEIRA DIVIDA
1,f1e9dde2a3053d51,imovel,4,0,79.0,79.0,7987.43,7987.43,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,1,MEDIO INACESSIVEL
2,5f225ff188d71bf6,imovel,1,0,12.0,12.0,454.28,454.28,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,1,PRIMEIRA DIVIDA
3,5f21929a71c17a08,mercantil,1,0,1.0,1.0,550.47,550.47,0.0,0.0,BAIXADO,1,0.0,0.0,1.0,1.0,1,PRIMEIRA DIVIDA
4,196d80b1b82edbc0,mercantil,2,2,8.0,8.0,493.97,493.97,0.0,0.0,SUSPENSO,1,0.0,0.0,1.0,1.0,1,MEDIO INACESSIVEL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494408,6dec5209fbc2f813,imovel,4,0,136.0,0.0,14185.80,0.00,0.0,1.0,ATIVO,1,2.0,3.0,0.0,0.0,1,MEDIO INACESSIVEL
494409,6dec36900dea378c,mercantil,7,0,52.0,0.0,19190.68,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0,CONTRIB PESSIMO
494410,6debc091e4bf4d84,mercantil,2,0,6.0,0.0,14930.45,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,1,MEDIO INACESSIVEL
494411,6debbb4cc485f25f,mercantil,8,0,25.0,0.0,29849.66,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0,CONTRIB PESSIMO


In [95]:
pessoas.loc[pessoas['num_dist_cda'] == 1, 'class_contribuinte'] = 5
pessoas

Unnamed: 0,id_pessoa,tipo_divida,num_dist_cda,quantidade_reparcelamento,deb_totais,deb_pagos,valor_tot,valor_pago,qtd_notas_2anos,edificacao,situacao,cpf_cnpj_existe,situacao_ativa,status_situacao,historico_pagamento_em_qtd,historico_pagamento_em_valor,class_contribuinte,class_contribuinte_nome
0,7fcd58cfe066e299,mercantil,1,0,4.0,4.0,533.37,533.37,0.0,0.0,ATIVO,1,1.0,2.0,1.0,1.0,5,PRIMEIRA DIVIDA
1,f1e9dde2a3053d51,imovel,4,0,79.0,79.0,7987.43,7987.43,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,1,MEDIO INACESSIVEL
2,5f225ff188d71bf6,imovel,1,0,12.0,12.0,454.28,454.28,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,5,PRIMEIRA DIVIDA
3,5f21929a71c17a08,mercantil,1,0,1.0,1.0,550.47,550.47,0.0,0.0,BAIXADO,1,0.0,0.0,1.0,1.0,5,PRIMEIRA DIVIDA
4,196d80b1b82edbc0,mercantil,2,2,8.0,8.0,493.97,493.97,0.0,0.0,SUSPENSO,1,0.0,0.0,1.0,1.0,1,MEDIO INACESSIVEL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494408,6dec5209fbc2f813,imovel,4,0,136.0,0.0,14185.80,0.00,0.0,1.0,ATIVO,1,2.0,3.0,0.0,0.0,1,MEDIO INACESSIVEL
494409,6dec36900dea378c,mercantil,7,0,52.0,0.0,19190.68,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0,CONTRIB PESSIMO
494410,6debc091e4bf4d84,mercantil,2,0,6.0,0.0,14930.45,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,1,MEDIO INACESSIVEL
494411,6debbb4cc485f25f,mercantil,8,0,25.0,0.0,29849.66,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0,CONTRIB PESSIMO


# Pesos dos Contribuintes

In [96]:
pesos_analise_disc_aux = pesos_analise_disc[["coeficiente",	"class_contribuinte_nome",	"class_contribuinte"]]

In [97]:
# Nomeando a classificação com label de prioridade
pessoas = pd.merge(pessoas, 
         pesos_analise_disc_aux, 
         on = ["class_contribuinte", "class_contribuinte_nome"],
         how = "left")

pessoas = pessoas.rename(columns = {'coeficiente':'class_contribuinte_peso'})
pessoas

Unnamed: 0,id_pessoa,tipo_divida,num_dist_cda,quantidade_reparcelamento,deb_totais,deb_pagos,valor_tot,valor_pago,qtd_notas_2anos,edificacao,situacao,cpf_cnpj_existe,situacao_ativa,status_situacao,historico_pagamento_em_qtd,historico_pagamento_em_valor,class_contribuinte,class_contribuinte_nome,class_contribuinte_peso
0,7fcd58cfe066e299,mercantil,1,0,4.0,4.0,533.37,533.37,0.0,0.0,ATIVO,1,1.0,2.0,1.0,1.0,5,PRIMEIRA DIVIDA,2.14301
1,f1e9dde2a3053d51,imovel,4,0,79.0,79.0,7987.43,7987.43,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,1,MEDIO INACESSIVEL,0.84104
2,5f225ff188d71bf6,imovel,1,0,12.0,12.0,454.28,454.28,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,5,PRIMEIRA DIVIDA,2.14301
3,5f21929a71c17a08,mercantil,1,0,1.0,1.0,550.47,550.47,0.0,0.0,BAIXADO,1,0.0,0.0,1.0,1.0,5,PRIMEIRA DIVIDA,2.14301
4,196d80b1b82edbc0,mercantil,2,2,8.0,8.0,493.97,493.97,0.0,0.0,SUSPENSO,1,0.0,0.0,1.0,1.0,1,MEDIO INACESSIVEL,0.84104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494408,6dec5209fbc2f813,imovel,4,0,136.0,0.0,14185.80,0.00,0.0,1.0,ATIVO,1,2.0,3.0,0.0,0.0,1,MEDIO INACESSIVEL,0.84104
494409,6dec36900dea378c,mercantil,7,0,52.0,0.0,19190.68,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0,CONTRIB PESSIMO,-0.79172
494410,6debc091e4bf4d84,mercantil,2,0,6.0,0.0,14930.45,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,1,MEDIO INACESSIVEL,0.84104
494411,6debbb4cc485f25f,mercantil,8,0,25.0,0.0,29849.66,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0,CONTRIB PESSIMO,-0.79172


# Feature store do Contribuinte

In [99]:
df_feature_store_contribuinte = pessoas#.reset_index()
df_feature_store_contribuinte
# df_feature_store_contribuinte = df_feature_store_contribuinte[['id_pessoa', 'situacao', 'cpf_existe', 'edificacao', 'qtd_notas_2anos', 
#                                                                'situacao_ativa', 'status_situacao', 
#                                                                'deb_totais','deb_pagos', 'valor_tot', 'valor_pago', 
#                                                                'frequencia_da_pessoa', 'total_debitos_pessoa', 'debitos_pagos_pessoa', 'valor_total_pessoa', 'valor_pago_pessoa', 
#                                                                'historico_pagamento_em_qtd', 'historico_pagamento_em_valor', 
#                                                                'class_contribuinte', 'class_contribuinte_nome', 'class_contribuinte_peso']]

Unnamed: 0,id_pessoa,tipo_divida,num_dist_cda,quantidade_reparcelamento,deb_totais,deb_pagos,valor_tot,valor_pago,qtd_notas_2anos,edificacao,situacao,cpf_cnpj_existe,situacao_ativa,status_situacao,historico_pagamento_em_qtd,historico_pagamento_em_valor,class_contribuinte,class_contribuinte_nome,class_contribuinte_peso
0,7fcd58cfe066e299,mercantil,1,0,4.0,4.0,533.37,533.37,0.0,0.0,ATIVO,1,1.0,2.0,1.0,1.0,5,PRIMEIRA DIVIDA,2.14301
1,f1e9dde2a3053d51,imovel,4,0,79.0,79.0,7987.43,7987.43,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,1,MEDIO INACESSIVEL,0.84104
2,5f225ff188d71bf6,imovel,1,0,12.0,12.0,454.28,454.28,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,5,PRIMEIRA DIVIDA,2.14301
3,5f21929a71c17a08,mercantil,1,0,1.0,1.0,550.47,550.47,0.0,0.0,BAIXADO,1,0.0,0.0,1.0,1.0,5,PRIMEIRA DIVIDA,2.14301
4,196d80b1b82edbc0,mercantil,2,2,8.0,8.0,493.97,493.97,0.0,0.0,SUSPENSO,1,0.0,0.0,1.0,1.0,1,MEDIO INACESSIVEL,0.84104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494408,6dec5209fbc2f813,imovel,4,0,136.0,0.0,14185.80,0.00,0.0,1.0,ATIVO,1,2.0,3.0,0.0,0.0,1,MEDIO INACESSIVEL,0.84104
494409,6dec36900dea378c,mercantil,7,0,52.0,0.0,19190.68,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0,CONTRIB PESSIMO,-0.79172
494410,6debc091e4bf4d84,mercantil,2,0,6.0,0.0,14930.45,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,1,MEDIO INACESSIVEL,0.84104
494411,6debbb4cc485f25f,mercantil,8,0,25.0,0.0,29849.66,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0,CONTRIB PESSIMO,-0.79172


In [100]:
df_feature_store_contribuinte.loc[df_feature_store_contribuinte['historico_pagamento_em_valor'] > 1, 'historico_pagamento_em_valor'] = 1
df_feature_store_contribuinte

Unnamed: 0,id_pessoa,tipo_divida,num_dist_cda,quantidade_reparcelamento,deb_totais,deb_pagos,valor_tot,valor_pago,qtd_notas_2anos,edificacao,situacao,cpf_cnpj_existe,situacao_ativa,status_situacao,historico_pagamento_em_qtd,historico_pagamento_em_valor,class_contribuinte,class_contribuinte_nome,class_contribuinte_peso
0,7fcd58cfe066e299,mercantil,1,0,4.0,4.0,533.37,533.37,0.0,0.0,ATIVO,1,1.0,2.0,1.0,1.0,5,PRIMEIRA DIVIDA,2.14301
1,f1e9dde2a3053d51,imovel,4,0,79.0,79.0,7987.43,7987.43,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,1,MEDIO INACESSIVEL,0.84104
2,5f225ff188d71bf6,imovel,1,0,12.0,12.0,454.28,454.28,0.0,1.0,ATIVO,1,2.0,3.0,1.0,1.0,5,PRIMEIRA DIVIDA,2.14301
3,5f21929a71c17a08,mercantil,1,0,1.0,1.0,550.47,550.47,0.0,0.0,BAIXADO,1,0.0,0.0,1.0,1.0,5,PRIMEIRA DIVIDA,2.14301
4,196d80b1b82edbc0,mercantil,2,2,8.0,8.0,493.97,493.97,0.0,0.0,SUSPENSO,1,0.0,0.0,1.0,1.0,1,MEDIO INACESSIVEL,0.84104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494408,6dec5209fbc2f813,imovel,4,0,136.0,0.0,14185.80,0.00,0.0,1.0,ATIVO,1,2.0,3.0,0.0,0.0,1,MEDIO INACESSIVEL,0.84104
494409,6dec36900dea378c,mercantil,7,0,52.0,0.0,19190.68,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0,CONTRIB PESSIMO,-0.79172
494410,6debc091e4bf4d84,mercantil,2,0,6.0,0.0,14930.45,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,1,MEDIO INACESSIVEL,0.84104
494411,6debbb4cc485f25f,mercantil,8,0,25.0,0.0,29849.66,0.00,0.0,0.0,INAPTO,1,0.0,0.0,0.0,0.0,0,CONTRIB PESSIMO,-0.79172


In [101]:
df_feature_store_contribuinte.groupby('class_contribuinte_nome')['id_pessoa'].nunique().to_frame().reset_index()

Unnamed: 0,class_contribuinte_nome,id_pessoa
0,CONTRIB EXCELENTE,33381
1,CONTRIB NEGOCIADOR,4787
2,CONTRIB PESSIMO,106010
3,MEDIO INACESSIVEL,165172
4,PRIMEIRA DIVIDA,100601
5,RUIM ACESSIVEL,68854


# Salvar no S3

In [108]:
print("Inicia a conexão com S3 para inscrição dos dados")
# Cria conexão ao s3 e preenche a tabela com os dados
s3_resource = boto3.resource(
    service_name='s3',
    region_name='us-east-1',
    aws_access_key_id=os.getenv("AWS_ACESS_KEY"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACESS_KEY")
    )

NOME_ARQ_SALVAR_S3 = 'feature_store_contribuinte_prime.csv'

up_s3_files(dataframe=df_feature_store_contribuinte, 
            bucket_name=os.getenv("S3_BUCKET_NAME"), 
            folder_name=os.getenv("S3_FOLDER_NAME"), 
            file_name= NOME_ARQ_SALVAR_S3)

print("Dados upados no s3")
print("Processo finalizado")

Inicia a conexão com S3 para inscrição dos dados
Dados upados no s3
Processo finalizado
