## Libraries


In [1]:
import pandas as pd
from src.feature_engineering import features
import numpy as np

## Data


In [2]:
clients = pd.read_csv("data/portfolio_clientes.csv")
tpv = pd.read_csv("data/portfolio_tpv.csv")
comunicados = pd.read_csv("data/portfolio_comunicados.csv")
geral = pd.read_csv("data/portfolio_geral.csv")

### Apenas contratos que possuem comunicados

In [3]:
unique_contratos = comunicados["contrato_id"].unique()
geral_comunicados = geral[geral["contrato_id"].isin(unique_contratos)]

In [4]:
comunicados_grouped = (
    comunicados.groupby(["contrato_id", "dt_ref_portfolio", "data_acao"])[
        ["tipo_acao", "status", "acao"]
    ]
    .agg(list)
    .reset_index()
)

In [5]:
geral_comunicados_grouped = geral_comunicados.merge(
    right=comunicados_grouped, how="left", on=["contrato_id", "dt_ref_portfolio"]
)

In [6]:
geral_and_comunicados_sorted_df = geral_comunicados_grouped.sort_values(
    ["contrato_id", "dt_ref_portfolio"]
)

In [7]:
geral_and_comunicados_sorted_df

Unnamed: 0,contrato_id,dt_ref_portfolio,safra,nr_documento,status_contrato,dt_contrato,dt_desembolso,dt_vencimento,dt_wo,prazo,...,vlr_pgto_esperado,vlr_saldo_devedor,vlr_saldo_devedor_esperado,dsp,dspp,flag_transacao,data_acao,tipo_acao,status,acao
6618696,000180509391a5ac66ff83cae603ffb8,2020-06-15,2020-06,7996daab1bbe000bb5d1cc1bf317f390,Active,2020-06-09,2020-06-15,2021-09-09,2022-09-09,25.65,...,27.45,6932.34,6924.81,0,0,0,,,,
3787516,000180509391a5ac66ff83cae603ffb8,2020-06-16,2020-06,7996daab1bbe000bb5d1cc1bf317f390,Active,2020-06-09,2020-06-15,2021-09-09,2022-09-09,25.65,...,27.45,6952.26,6917.27,1,1,1,,,,
3252426,000180509391a5ac66ff83cae603ffb8,2020-06-17,2020-06,7996daab1bbe000bb5d1cc1bf317f390,Active,2020-06-09,2020-06-15,2021-09-09,2022-09-09,25.65,...,27.45,6946.81,6909.70,0,2,1,,,,
3773240,000180509391a5ac66ff83cae603ffb8,2020-06-18,2020-06,7996daab1bbe000bb5d1cc1bf317f390,Active,2020-06-09,2020-06-15,2021-09-09,2022-09-09,25.65,...,27.45,6925.38,6902.11,0,0,1,,,,
2338801,000180509391a5ac66ff83cae603ffb8,2020-06-19,2020-06,7996daab1bbe000bb5d1cc1bf317f390,Active,2020-06-09,2020-06-15,2021-09-09,2022-09-09,25.65,...,27.45,6883.79,6894.50,0,0,1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4422232,fffc9af28349e8ded3af8e2dcbf9c3bc,2022-04-14,2020-08,cee34a02820f1120a5046f416a3e0967,Settled,2020-08-19,2020-08-21,2022-02-19,2023-02-19,30.78,...,108.33,0.00,0.00,0,0,1,,,,
5558563,fffc9af28349e8ded3af8e2dcbf9c3bc,2022-04-15,2020-08,cee34a02820f1120a5046f416a3e0967,Settled,2020-08-19,2020-08-21,2022-02-19,2023-02-19,30.78,...,108.33,0.00,0.00,0,0,0,,,,
586422,fffc9af28349e8ded3af8e2dcbf9c3bc,2022-04-16,2020-08,cee34a02820f1120a5046f416a3e0967,Settled,2020-08-19,2020-08-21,2022-02-19,2023-02-19,30.78,...,108.33,0.00,0.00,0,0,1,,,,
564107,fffc9af28349e8ded3af8e2dcbf9c3bc,2022-04-17,2020-08,cee34a02820f1120a5046f416a3e0967,Settled,2020-08-19,2020-08-21,2022-02-19,2023-02-19,30.78,...,108.33,0.00,0.00,0,0,0,,,,


1. vamos fazer uma quebra entre dsp e dspp
2. vamos considerar se uma das duas foi entregue ou ainda lido

In [38]:
temp_df = geral_and_comunicados_sorted_df[["dsp", "status"]]

true_dsp_status = []
for idx, values in temp_df.iterrows():
    # values[0] dsp
    # values[1] status
    dsp_allowed = [5, 10, 15, 30, 60, 90]
    if isinstance(values[1], list) and values[0] in dsp_allowed:
        true_dsp_status.append(values[1])
    else:
        true_dsp_status.append(None)

print(len(true_dsp_status), temp_df.shape)

6647557 (6647557, 2)


In [41]:
temp_df = geral_and_comunicados_sorted_df[["dspp", "status"]]

true_dspp_status = []
for idx, values in temp_df.iterrows():
    # values[0] dsp
    # values[1] status
    dspp_allowed = [15, 30, 45]
    if isinstance(values[1], list) and values[0] in dspp_allowed:
        true_dspp_status.append(values[1])
    else:
        true_dspp_status.append(None)

print(len(true_dspp_status), temp_df.shape)

6647557 (6647557, 2)


In [43]:
geral_and_comunicados_sorted_df["status_dspp"] = true_dspp_status
geral_and_comunicados_sorted_df["status_dsp"] = true_dsp_status
geral_and_comunicados_sorted_df.head()

Unnamed: 0,contrato_id,dt_ref_portfolio,safra,nr_documento,status_contrato,dt_contrato,dt_desembolso,dt_vencimento,dt_wo,prazo,...,vlr_saldo_devedor_esperado,dsp,dspp,flag_transacao,data_acao,tipo_acao,status,acao,status_dsp,status_dspp
6618696,000180509391a5ac66ff83cae603ffb8,2020-06-15,2020-06,7996daab1bbe000bb5d1cc1bf317f390,Active,2020-06-09,2020-06-15,2021-09-09,2022-09-09,25.65,...,6924.81,0,0,0,,,,,,
3787516,000180509391a5ac66ff83cae603ffb8,2020-06-16,2020-06,7996daab1bbe000bb5d1cc1bf317f390,Active,2020-06-09,2020-06-15,2021-09-09,2022-09-09,25.65,...,6917.27,1,1,1,,,,,,
3252426,000180509391a5ac66ff83cae603ffb8,2020-06-17,2020-06,7996daab1bbe000bb5d1cc1bf317f390,Active,2020-06-09,2020-06-15,2021-09-09,2022-09-09,25.65,...,6909.7,0,2,1,,,,,,
3773240,000180509391a5ac66ff83cae603ffb8,2020-06-18,2020-06,7996daab1bbe000bb5d1cc1bf317f390,Active,2020-06-09,2020-06-15,2021-09-09,2022-09-09,25.65,...,6902.11,0,0,1,,,,,,
2338801,000180509391a5ac66ff83cae603ffb8,2020-06-19,2020-06,7996daab1bbe000bb5d1cc1bf317f390,Active,2020-06-09,2020-06-15,2021-09-09,2022-09-09,25.65,...,6894.5,0,0,1,,,,,,


Verificar se houveram listas de tamanho quatro e considerar duas pro dsp e duas últimas pro dspp

In [49]:
# vamos considerar que o agrupamento foi feito sequencialmente: primeiro dsp e segundo dspp
def separate_dsp_status(x):
    if isinstance(x, list):
        if len(x) > 2:
            value = [x[0], x[1]]
            return value
        else:
            return x


geral_and_comunicados_sorted_df["status_dsp_v2"] = geral_and_comunicados_sorted_df[
    "status_dsp"
].apply(separate_dsp_status)

In [50]:
def separate_dspp_status(x):
    if isinstance(x, list):
        if len(x) > 2:
            value = [x[2], x[3]]
            return value
        else:
            return x


geral_and_comunicados_sorted_df["status_dspp_v2"] = geral_and_comunicados_sorted_df[
    "status_dspp"
].apply(separate_dspp_status)

Agora, vamos simplificar os status, e considerar alguns níveis de acordo com o que está presente na lista:
- Respondido = 3
- Lido = 2
- Entregue = 1
- N-entregue = 0

Aqui existe uma ordem de preferência:
- Respondido > Lido > Entregue > Não entregue 

In [62]:
def transcoding_status(x):
    status = ["NAO ENTREGUE", "ENTREGUE", "LIDO", "RESPONDIDO"]
    codes = []
    if isinstance(x, list):
        if status[3] in x:
            codes.append(3)
        if status[2] in x:
            codes.append(2)
        if status[1] in x:
            codes.append(1)
        if status[0] in x:
            codes.append(0)

        # to return
        if 3 in codes:
            return 3
        elif 2 in codes:
            return 2
        elif 1 in codes:
            return 1
        elif 0 in codes:
            return 0
    else:
        return x


geral_and_comunicados_sorted_df["status_dsp_encoded"] = geral_and_comunicados_sorted_df[
    "status_dsp_v2"
].apply(transcoding_status)
geral_and_comunicados_sorted_df[
    "status_dspp_encoded"
] = geral_and_comunicados_sorted_df["status_dspp_v2"].apply(transcoding_status)

In [65]:
geral_and_comunicados_sorted_refined = geral_and_comunicados_sorted_df.drop(
    ["status", "status_dsp", "status_dspp", "status_dsp_v2", "status_dspp_v2"], axis=1
)

In [69]:
geral_and_comunicados_sorted_refined["status_dsp_encoded"].value_counts()

1.0    66606
2.0    48497
0.0    34048
3.0     2388
Name: status_dsp_encoded, dtype: int64

Vamos filtrar os 0s, por que entendemos que mensagem que não foi entregue, n apresenta nenhuma influência na nossa análise

### Criação de Features DSP e DSPP

In [70]:
geral_and_comunicados_sorted_refined_filtered_dsp = (
    geral_and_comunicados_sorted_refined[
        geral_and_comunicados_sorted_refined["status_dsp_encoded"] != 0
    ]
)
contrato_dsp_features = (
    geral_and_comunicados_sorted_refined_filtered_dsp.groupby(["contrato_id"])["dsp"]
    .agg(
        [
            features.total_success_dsp5,
            features.total_success_dsp10,
            features.total_success_dsp15,
            features.total_success_dsp30,
            features.total_success_dsp60,
            features.total_success_dsp90,
            features.prop_success_dsp5,
            features.prop_success_dsp10,
            features.prop_success_dsp15,
            features.prop_success_dsp30,
            features.prop_success_dsp60,
            features.prop_success_dsp90,
        ]
    )
    .reset_index()
)

In [71]:
geral_and_comunicados_sorted_refined_filtered_dspp = (
    geral_and_comunicados_sorted_refined[
        geral_and_comunicados_sorted_refined["status_dspp_encoded"] != 0
    ]
)
contrato_dspp_features = (
    geral_and_comunicados_sorted_refined_filtered_dspp.groupby(["contrato_id"])["dspp"]
    .agg(
        [
            features.total_success_dspp15,
            features.total_success_dspp30,
            features.total_success_dspp45,
            features.prop_success_dspp15,
            features.prop_success_dspp30,
            features.prop_success_dspp45,
        ]
    )
    .reset_index()
)

In [72]:
contrato_dsp_dspp = contrato_dsp_features.merge(
    right=contrato_dspp_features, on="contrato_id", how="inner"
)

### Score DSP e DSPP

In [73]:
means_dsp = []
means_dspp = []
for i, row in contrato_dsp_dspp.iterrows():
    means_dsp.append(np.nanmean(row[7:13]))
    means_dspp.append(np.nanmean(row[16:19]))

contrato_dsp_dspp["score_dsp"] = means_dsp
contrato_dsp_dspp["score_dspp"] = means_dspp

  means_dspp.append(np.nanmean(row[16:19]))
  means_dsp.append(np.nanmean(row[7:13]))


In [74]:
contrato_dsp_dspp

Unnamed: 0,contrato_id,total_success_dsp5,total_success_dsp10,total_success_dsp15,total_success_dsp30,total_success_dsp60,total_success_dsp90,prop_success_dsp5,prop_success_dsp10,prop_success_dsp15,...,prop_success_dsp60,prop_success_dsp90,total_success_dspp15,total_success_dspp30,total_success_dspp45,prop_success_dspp15,prop_success_dspp30,prop_success_dspp45,score_dsp,score_dspp
0,000180509391a5ac66ff83cae603ffb8,1,1,0,0,0,0,0.000000,1.000000,,...,,,0,0,0,,,,0.500000,
1,000c35a61297edadc2842f6d5b4028e1,4,1,1,1,1,1,0.750000,0.000000,0.0,...,0.0,1.0,1,1,1,0.0,0.0,0.00,0.291667,0.000000
2,000dcdc93a545ee45a1aee85ef85c34a,10,6,5,2,1,0,0.400000,0.166667,0.8,...,0.0,,2,1,1,1.0,0.0,0.00,0.373333,0.333333
3,0014cccd47b66a47af4e62c4dcbe95f1,2,1,1,0,0,0,1.000000,0.000000,0.0,...,,,2,1,0,0.5,1.0,,0.333333,0.750000
4,001621e2d725ab3d3773692745be79b2,3,1,0,0,0,0,1.000000,1.000000,,...,,,2,0,0,1.0,,,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12197,fff4bf15d5b48a5c1bda798ea77290d1,1,0,0,0,0,0,1.000000,,,...,,,0,0,0,,,,1.000000,
12198,fff6edc99168d63a10ea51a7f808ff38,3,1,0,0,0,0,0.666667,1.000000,,...,,,0,0,0,,,,0.833333,
12199,fffb0d6ec51d163ca8c69900f748d61b,4,3,2,0,0,0,0.250000,0.333333,1.0,...,,,4,3,4,0.0,0.0,0.75,0.527778,0.250000
12200,fffb47b9ce3835c85513ef5b19165f6e,1,1,1,1,0,0,0.000000,0.000000,0.0,...,,,0,0,0,,,,0.250000,


### Entregou? Não entregou? Leu?

Essa é uma média que computa os tipos de acionamentos em cada contrato 

In [77]:
acionamentos_delivery_dsp = (
    geral_and_comunicados_sorted_refined_filtered_dsp.groupby(["contrato_id"])[
        "status_dsp_encoded"
    ]
    .agg([np.nanmean])
    .reset_index()
)
acionamentos_delivery_dspp = (
    geral_and_comunicados_sorted_refined_filtered_dspp.groupby(["contrato_id"])[
        "status_dspp_encoded"
    ]
    .agg([np.nanmean])
    .reset_index()
)

In [80]:
acionamentos_delivery_dsp.columns = ["contrato_id", "mean_dsp_contrato"]
acionamentos_delivery_dspp.columns = ["contrato_id", "mean_dspp_contrato"]

In [81]:
contrato_dsp_dspp_qtd_acoes = contrato_dsp_dspp.merge(
    right=acionamentos_delivery_dsp, how="inner", on="contrato_id"
).merge(right=acionamentos_delivery_dspp, how="inner", on="contrato_id")
contrato_dsp_dspp_qtd_acoes

Unnamed: 0,contrato_id,total_success_dsp5,total_success_dsp10,total_success_dsp15,total_success_dsp30,total_success_dsp60,total_success_dsp90,prop_success_dsp5,prop_success_dsp10,prop_success_dsp15,...,total_success_dspp15,total_success_dspp30,total_success_dspp45,prop_success_dspp15,prop_success_dspp30,prop_success_dspp45,score_dsp,score_dspp,mean_dsp_contrato,mean_dspp_contrato
0,000180509391a5ac66ff83cae603ffb8,1,1,0,0,0,0,0.000000,1.000000,,...,0,0,0,,,,0.500000,,1.500000,
1,000c35a61297edadc2842f6d5b4028e1,4,1,1,1,1,1,0.750000,0.000000,0.0,...,1,1,1,0.0,0.0,0.00,0.291667,0.000000,1.333333,1.000000
2,000dcdc93a545ee45a1aee85ef85c34a,10,6,5,2,1,0,0.400000,0.166667,0.8,...,2,1,1,1.0,0.0,0.00,0.373333,0.333333,1.416667,1.750000
3,0014cccd47b66a47af4e62c4dcbe95f1,2,1,1,0,0,0,1.000000,0.000000,0.0,...,2,1,0,0.5,1.0,,0.333333,0.750000,1.000000,1.666667
4,001621e2d725ab3d3773692745be79b2,3,1,0,0,0,0,1.000000,1.000000,,...,2,0,0,1.0,,,1.000000,1.000000,1.500000,1.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12197,fff4bf15d5b48a5c1bda798ea77290d1,1,0,0,0,0,0,1.000000,,,...,0,0,0,,,,1.000000,,1.000000,
12198,fff6edc99168d63a10ea51a7f808ff38,3,1,0,0,0,0,0.666667,1.000000,,...,0,0,0,,,,0.833333,,1.500000,
12199,fffb0d6ec51d163ca8c69900f748d61b,4,3,2,0,0,0,0.250000,0.333333,1.0,...,4,3,4,0.0,0.0,0.75,0.527778,0.250000,1.555556,1.272727
12200,fffb47b9ce3835c85513ef5b19165f6e,1,1,1,1,0,0,0.000000,0.000000,0.0,...,0,0,0,,,,0.250000,,1.750000,


### Valor devedor esperado

In [82]:
# features de vlr_saldo_devedor
vlr_saldo_devedor_inicial = geral_and_comunicados_sorted_refined.drop_duplicates(
    ["contrato_id"]
)[["contrato_id", "vlr_saldo_devedor_esperado"]]

c_dsp_dspp_qtd_acoes_devedor = contrato_dsp_dspp_qtd_acoes.merge(
    right=vlr_saldo_devedor_inicial, how="inner", on="contrato_id"
)

### Dados cadastrais

In [83]:
x_contrato_id_nr_documento = geral_and_comunicados_sorted_refined.drop_duplicates(
    ["contrato_id", "nr_documento"]
)[["contrato_id", "nr_documento"]]

In [84]:
c_dsp_dspp_qtd_acoes_devedor_w_doc = c_dsp_dspp_qtd_acoes_devedor.merge(
    right=x_contrato_id_nr_documento, how="inner", on="contrato_id"
)

In [85]:
# clientes_unique
clientes_unique_nr_doc = (
    clients.groupby("nr_documento")[
        ["tipo_empresa", "cidade", "estado", "subsegmento", "segmento"]
    ]
    .agg(lambda x: list(x) if len(x) > 1 else x)
    .reset_index()
)

In [86]:
c_dsp_dspp_qtd_acoes_devedor_w_doc_and_clients = (
    c_dsp_dspp_qtd_acoes_devedor_w_doc.merge(
        right=clientes_unique_nr_doc, on="nr_documento", how="inner"
    )
)

### TPV

In [87]:
qtd_trans_tpv = tpv.groupby("nr_documento")[["qtd_transacoes", "vlr_tpv"]].agg(
    ["mean", "min", "max", np.median, "sum"]
)

In [88]:
final_df = c_dsp_dspp_qtd_acoes_devedor_w_doc_and_clients.merge(
    right=qtd_trans_tpv, how="left", on="nr_documento"
)

  final_df = c_dsp_dspp_qtd_acoes_devedor_w_doc_and_clients.merge(


In [89]:
final_df.to_csv("data/to_analysis_v2.csv", index=False)