# Obtenção de rating do contribuinte

Com o objetivo de direcionar o modelo que ditará quais dívidas são melhores de recuperar, será realizado um rating que envolva apenas as variáveis do contribuinte.

In [7]:
import os
import pandas as pd
import zipfile
from datetime import date
import dotenv
import boto3
from io import BytesIO
import locale
locale.setlocale(locale.LC_ALL, 'pt_BR.utf8')

import warnings
warnings.filterwarnings("ignore")

# Every project location is derived from the directory the kernel runs in.
rootPath = os.getcwd()
dataPath, modelsPath, env = (
    os.path.join(rootPath, entry) for entry in ('data', 'models', '.env')
)
# Load the variables declared in the project's .env file into the environment.
dotenv.load_dotenv(dotenv_path=env)

print("Iniciando carregamento dos dados")
# Archive with the exported CSV bases. It is left open because
# ler_bases_exportadas reads its members through the module-level `z`.
zip_file = os.path.join(dataPath, 'rating_igr_18_09.zip')
z = zipfile.ZipFile(zip_file)

def ler_bases_exportadas(nome_arquivo, arquivo_zip=None):
    """Read one CSV member of the export archive into a DataFrame.

    The member is streamed straight from the archive instead of being
    extracted to the working directory and deleted afterwards, so nothing
    is written to disk and no stray file is left behind if parsing fails.

    Parameters
    ----------
    nome_arquivo : str
        Name of the comma-separated CSV member inside the archive.
    arquivo_zip : zipfile.ZipFile, optional
        Archive to read from. When omitted, the module-level ``z`` is used.

    Returns
    -------
    pandas.DataFrame
        Parsed contents of the member.

    Raises
    ------
    KeyError
        If the archive has no member called ``nome_arquivo``.
    """
    origem = z if arquivo_zip is None else arquivo_zip
    with origem.open(nome_arquivo) as conteudo:
        return pd.read_csv(conteudo, sep=',')

# # Transforma as chaves de tempo em data
# def coleta_datas(df, left_on, nova_coluna_data):
#     df = pd.merge(
#         left=df, right=base_dim_datas,
#         left_on=left_on, right_on='chave_tempo',
#         how='left'
#     ).rename(
#         columns={"date": nova_coluna_data})
#     return df

def up_s3_files(dataframe, bucket_name, folder_name, file_name, resource=None):
    """Serialize a DataFrame as a ';'-separated CSV and upload it to S3.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to upload; it is written without the index.
    bucket_name : str
        Destination bucket.
    folder_name : str
        Key prefix. It is concatenated to ``file_name`` as-is, no separator
        is inserted here.
    file_name : str
        Name of the object inside the prefix.
    resource : optional
        Object exposing ``Object(bucket, key).put(Body=...)``, such as a
        boto3 S3 resource. When omitted, the module-level ``s3_resource``
        is looked up at call time.

    Raises
    ------
    NameError
        If ``resource`` is omitted and ``s3_resource`` has not been created.
    """
    # Resolve the target first so a missing resource fails before the
    # (potentially large) frame is serialized.
    destino = s3_resource if resource is None else resource
    csv_buffer = BytesIO()
    dataframe.to_csv(csv_buffer, sep=';', index=False)
    file_key_aws = folder_name + file_name
    destino.Object(bucket_name, file_key_aws).put(Body=csv_buffer.getvalue())

# Load the exported bases that are in use. The 'parcelas.csv' and
# 'dim_datas.csv' exports are currently not loaded.
base_imovel, base_mercantil, base_notas_fiscais = (
    ler_bases_exportadas(nome_csv)
    for nome_csv in ('imovel.csv', 'mercantil.csv', 'emissao_notas.csv')
)

# Stack the imovel and mercantil bases into one frame; each base keeps its
# own row labels, so the resulting index is not unique.
base_conjunta = pd.concat((base_imovel, base_mercantil))



Iniciando carregamento dos dados


In [8]:
print("Inicia transformação das variáveis sobre a dívida")

# Build the time variables from the debt registration date.
# `infer_datetime_format` is no longer passed: it is deprecated since
# pandas 2.0, where a strict version of that inference is the default.
base_conjunta['data_divida'] = pd.to_datetime(base_conjunta['inscricao_divida'])
base_conjunta['ano_inscricao_da'] = base_conjunta['data_divida'].dt.year

# Guarantee one row per CDA (the first occurrence is kept). The frame is
# rebound instead of being modified with `inplace=True`.
base_conjunta = base_conjunta.drop_duplicates(subset='cda')



Inicia transformação das variáveis sobre a dívida


In [11]:
# Select the debt-level columns, discard rows without a taxpayer id and
# store the id as text. The chain yields an independent frame, so nothing is
# dropped or assigned in place on a slice of base_conjunta (the original
# pattern triggers SettingWithCopyWarning, which the global warnings filter
# was hiding).
dados_divida = (
    base_conjunta[['cda', 'id_pessoa', 'tipo_divida', 'valor_tot', 'valor_pago',
                   'protesto', 'divida_ajuizada', 'ano_inscricao_da']]
    .dropna(subset=['id_pessoa'])
    .astype({'id_pessoa': str})
)
dados_divida

# # cria coluna e instancia como zero para preencher
# dados_divida['quantidade_reparcelamentos'] = 0

# # concateno as bases
# dados_divida = base_conjunta.merge(base_parcelas,how='left',on=['cda', 'tipo_divida', 'id_pessoa'],suffixes=('', '_PARC'))
# dados_divida.loc[ dados_divida['total_valor_pago'].isna(), 'total_valor_pago' ] = 0
# dados_divida

Unnamed: 0,cda,id_pessoa,tipo_divida,valor_tot,valor_pago,protesto,divida_ajuizada,ano_inscricao_da
0,f451a74747c571f418565e26094f06ee1,befea92c212b53a5,imovel,2374.82,0.00,1,0,2019
1,88c943a518b64f4e2b46f673bf0c61251,d052efa16322ece6,imovel,6013.17,0.00,1,0,2022
2,5bea514d024b70a89ceed47545e621dd1,d585d69233729912,imovel,8729.53,8729.53,0,1,2002
3,76ced639f5fe21292b46f673bf0c61251,083875f6253f6a4c,imovel,2489.85,2489.85,0,1,1999
4,dc51df46517a0ae9e52281a70553db461,1d90a6a8283dfbdf,imovel,2904.37,0.00,0,1,1998
...,...,...,...,...,...,...,...,...
1085182,a1d667b98a3933cf9989bae6f4af91ee2,6adb004fb9328f79,mercantil,837.96,0.00,0,0,2016
1085183,74627de79bddbb8d6b3a8e268c80aedf2,e8205704db1d8bd3,mercantil,8386.99,0.00,0,1,2012
1085184,b5ddb33df344c2439fcacff532cf50c42,8d4101a266ba2fa3,mercantil,791.80,0.00,0,0,2017
1085185,fe7175f3beabf3116b3a8e268c80aedf2,0b040e03a02139f0,mercantil,2040.71,0.00,0,0,2017


In [5]:
# # Somando valores a vista e aprcelados em nova coluna
# dados_divida['valor_pago_vista_parc'] = dados_divida['valor_pago'] + dados_divida['total_valor_pago']
# dados_divida['valor_pago_vista_parc'].sum() / 1000000

1343.1835638800003

In [15]:
# Collapse the debt rows to one line per (cda, id_pessoa, tipo_divida):
# monetary values are summed, the protest / lawsuit flags and the
# registration year take their maximum.
# A single groupby pass replaces five separate groupbys followed by four
# left merges on the same keys; the result keeps the three keys as a
# MultiIndex and the same column order. Only `aux5` is produced here.
# The quantidade_reparcelamentos aggregation stays disabled, as that column
# is not created anywhere in this notebook.
chaves_divida = ['cda', 'id_pessoa', 'tipo_divida']
aux5 = dados_divida.groupby(chaves_divida).agg({
    'valor_tot': 'sum',
    'valor_pago': 'sum',
    'protesto': 'max',
    'divida_ajuizada': 'max',
    'ano_inscricao_da': 'max',
})


In [None]:
# # renomeia a coluna criada para valor_pago usada no modelo
# aux5.rename( columns={'valor_pago_vista_parc':'valor_pago'}, inplace=True)
# aux5

In [16]:
# Age of the active debt in whole years: current calendar year minus the
# registration year. `assign` returns a new frame, so aux5 is left untouched
# and no throwaway 'ano_atual' column has to be created and dropped.
# NOTE(review): date.today() makes this value depend on the day the notebook
# is executed.
dados_divida = aux5.assign(
    anos_idade_da=date.today().year - aux5['ano_inscricao_da']
)
dados_divida

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,valor_tot,valor_pago,protesto,divida_ajuizada,ano_inscricao_da,anos_idade_da
cda,id_pessoa,tipo_divida,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00000123c3d4731c6b3a8e268c80aedf1,ac28642d7c82b33f,imovel,2048.73,0.00,0,0,2022,1
00000123c3d4731c6b3a8e268c80aedf2,fc4b99b807fbed41,mercantil,2515.85,0.00,0,0,2022,1
00000b44c5ba1e669ceed47545e621dd2,96e8e553de69d7a4,mercantil,1278.97,0.00,0,0,2002,21
000014e359592e62d8a3e5cebc255ca6E,6dbe14da38a31dc1,mercantil,847.02,0.00,0,0,2023,0
0000331f601a73e52b46f673bf0c61251,870c08c252b25ad1,imovel,344.12,0.00,0,0,2012,11
...,...,...,...,...,...,...,...,...
ffffd79d8aa612ac9fcacff532cf50c41,c6b16a1235d987d3,imovel,26104.42,26104.42,1,0,2022,1
ffffd8af5760cada9fcacff532cf50c41,d411f54ad97d1f39,imovel,1728.13,0.00,0,0,2019,4
ffffdb721b90282718565e26094f06ee2,51d6043a4efa1ed1,mercantil,240.77,0.00,0,0,2017,6
ffffdc78aa7a90e26b3a8e268c80aedf1,9b512f210b460120,imovel,9390.43,0.00,0,1,1998,25


In [17]:
# Move the three grouping keys out of the index and back into ordinary
# columns, leaving a default integer index.
dados_divida = dados_divida.reset_index(drop=False)
dados_divida

Unnamed: 0,cda,id_pessoa,tipo_divida,valor_tot,valor_pago,protesto,divida_ajuizada,ano_inscricao_da,anos_idade_da
0,00000123c3d4731c6b3a8e268c80aedf1,ac28642d7c82b33f,imovel,2048.73,0.00,0,0,2022,1
1,00000123c3d4731c6b3a8e268c80aedf2,fc4b99b807fbed41,mercantil,2515.85,0.00,0,0,2022,1
2,00000b44c5ba1e669ceed47545e621dd2,96e8e553de69d7a4,mercantil,1278.97,0.00,0,0,2002,21
3,000014e359592e62d8a3e5cebc255ca6E,6dbe14da38a31dc1,mercantil,847.02,0.00,0,0,2023,0
4,0000331f601a73e52b46f673bf0c61251,870c08c252b25ad1,imovel,344.12,0.00,0,0,2012,11
...,...,...,...,...,...,...,...,...,...
2170545,ffffd79d8aa612ac9fcacff532cf50c41,c6b16a1235d987d3,imovel,26104.42,26104.42,1,0,2022,1
2170546,ffffd8af5760cada9fcacff532cf50c41,d411f54ad97d1f39,imovel,1728.13,0.00,0,0,2019,4
2170547,ffffdb721b90282718565e26094f06ee2,51d6043a4efa1ed1,mercantil,240.77,0.00,0,0,2017,6
2170548,ffffdc78aa7a90e26b3a8e268c80aedf1,9b512f210b460120,imovel,9390.43,0.00,0,1,1998,25


In [24]:
# Debts whose age is below 10 years.
# NOTE(review): the original remark on this cell reads "here we have the
# 56,475 CDAs that we had been treating as worst payers, mapped as
# installment plans"; that figure cannot be confirmed from this filter - verify.
df = dados_divida.loc[dados_divida['anos_idade_da'] < 10]
df

Unnamed: 0,cda,id_pessoa,tipo_divida,valor_tot,valor_pago,protesto,divida_ajuizada,ano_inscricao_da,anos_idade_da
0,00000123c3d4731c6b3a8e268c80aedf1,ac28642d7c82b33f,imovel,2048.73,0.00,0,0,2022,1
1,00000123c3d4731c6b3a8e268c80aedf2,fc4b99b807fbed41,mercantil,2515.85,0.00,0,0,2022,1
3,000014e359592e62d8a3e5cebc255ca6E,6dbe14da38a31dc1,mercantil,847.02,0.00,0,0,2023,0
8,00005193f0c1f57c6b3a8e268c80aedf1,0bff42777c16d00c,imovel,424.62,0.00,0,0,2021,2
9,0000521b64bf28c8e52281a70553db461,b5df816e1786f0a3,imovel,490.83,0.00,0,0,2016,7
...,...,...,...,...,...,...,...,...,...
2170540,ffffafbbcfae89607c8fa7a67092eaab2,172160e5eb98bf3c,mercantil,2118.05,0.00,0,0,2016,7
2170544,ffffd120a8f3f2799fcacff532cf50c41,934532c6c448a899,imovel,802.85,0.00,0,0,2014,9
2170545,ffffd79d8aa612ac9fcacff532cf50c41,c6b16a1235d987d3,imovel,26104.42,26104.42,1,0,2022,1
2170546,ffffd8af5760cada9fcacff532cf50c41,d411f54ad97d1f39,imovel,1728.13,0.00,0,0,2019,4


In [25]:
# Display the complete debt table (not the filtered `df`).
dados_divida

Unnamed: 0,cda,id_pessoa,tipo_divida,valor_tot,valor_pago,protesto,divida_ajuizada,ano_inscricao_da,anos_idade_da
0,00000123c3d4731c6b3a8e268c80aedf1,ac28642d7c82b33f,imovel,2048.73,0.00,0,0,2022,1
1,00000123c3d4731c6b3a8e268c80aedf2,fc4b99b807fbed41,mercantil,2515.85,0.00,0,0,2022,1
2,00000b44c5ba1e669ceed47545e621dd2,96e8e553de69d7a4,mercantil,1278.97,0.00,0,0,2002,21
3,000014e359592e62d8a3e5cebc255ca6E,6dbe14da38a31dc1,mercantil,847.02,0.00,0,0,2023,0
4,0000331f601a73e52b46f673bf0c61251,870c08c252b25ad1,imovel,344.12,0.00,0,0,2012,11
...,...,...,...,...,...,...,...,...,...
2170545,ffffd79d8aa612ac9fcacff532cf50c41,c6b16a1235d987d3,imovel,26104.42,26104.42,1,0,2022,1
2170546,ffffd8af5760cada9fcacff532cf50c41,d411f54ad97d1f39,imovel,1728.13,0.00,0,0,2019,4
2170547,ffffdb721b90282718565e26094f06ee2,51d6043a4efa1ed1,mercantil,240.77,0.00,0,0,2017,6
2170548,ffffdc78aa7a90e26b3a8e268c80aedf1,9b512f210b460120,imovel,9390.43,0.00,0,1,1998,25


In [26]:
# Give the total-value column a more descriptive name and, in the same chain,
# derive the target variable Y to be predicted: the paid share of each CDA.
# NOTE(review): a row with valor_total_da equal to 0 would produce inf/NaN
# here - confirm the data has none.
colunas_nome = {'valor_tot': 'valor_total_da'}
df_divida_ativa = dados_divida.rename(columns=colunas_nome).assign(
    percentual_pago_cda=lambda tabela: tabela['valor_pago'] / tabela['valor_total_da']
)


print("Inicia a conexão com S3 para inscrição dos dados")
# Open the S3 resource used by up_s3_files; both credentials are read from
# environment variables (names are spelled "ACESS" on purpose, matching what
# the code has always looked up).
s3_resource = boto3.resource(
    's3',
    aws_secret_access_key=os.getenv("AWS_SECRET_ACESS_KEY"),
    aws_access_key_id=os.getenv("AWS_ACESS_KEY"),
    region_name='us-east-1',
)

def up_s3_files(dataframe, bucket_name, folder_name, file_name, resource=None):
    """Serialize a DataFrame as a ';'-separated CSV and upload it to S3.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to upload; it is written without the index.
    bucket_name : str
        Destination bucket.
    folder_name : str
        Key prefix. It is concatenated to ``file_name`` as-is, no separator
        is inserted here.
    file_name : str
        Name of the object inside the prefix.
    resource : optional
        Object exposing ``Object(bucket, key).put(Body=...)``, such as a
        boto3 S3 resource. When omitted, the module-level ``s3_resource``
        is looked up at call time.

    Raises
    ------
    NameError
        If ``resource`` is omitted and ``s3_resource`` has not been created.
    """
    # Resolve the target first so a missing resource fails before the
    # (potentially large) frame is serialized.
    destino = s3_resource if resource is None else resource
    csv_buffer = BytesIO()
    dataframe.to_csv(csv_buffer, sep=';', index=False)
    file_key_aws = folder_name + file_name
    destino.Object(bucket_name, file_key_aws).put(Body=csv_buffer.getvalue())

# Persist the feature table; bucket and key prefix come from the environment.
up_s3_files(
    df_divida_ativa,
    os.getenv("S3_BUCKET_NAME"),
    os.getenv("S3_FOLDER_NAME"),
    'feature_store_divida.csv',
)

print("Dados atualizados e persistidos no bucket S3", "Processo finalizado", sep="\n")

Inicia a conexão com S3 para inscrição dos dados
Dados atualizados e persistidos no bucket S3
Processo finalizado
