In [1]:
import pandas as pd
import gc
rcc_train = pd.read_csv("../data/rcc_train.csv")
y_train = pd.read_csv("../data/y_train.csv", index_col="key_value")
rcc_test = pd.read_csv("../data/rcc_test.csv")

In [2]:
### se puede observar que en el rcc_train aquellos clientes que tienen un solo registro, es porque se refiere al ultimo
### periodo de la data de train que es del febrero del 2018, cuyo RIESGO_DIRECTO es 1 y COD_CLASIFICACION_DEUDOR es 0 
### su condicion (días de atraso) es 0 siendo estos cerca de 623 clientes. Pero OJO el 9% de los nuevos clientes tienen un target igual a 1

### el target de la data train representa el score crediticio al febrero del 2018 
### por lo que se sobre-entiende que lo que se quiere predecir es el score crediticio al febrero del 2019

### La data train abarca desde 201703 hasta 201802
### La data test comprende desde 201803 hasta 201902
### Los primeros registros de los clientes tienen un RIESGO DIRECTO de 1 ***

### La distribucion del target general es de 16% para 1 y 84% para los 0
### A mayor COD_CLASIFICACION_DEUDOR y codmes mes cercano, el target se aproxima a 1. Ejm COD_CLASIFICACION_DEUDOR=5 y codmes=201802
### el target esta representado por el 97% de 1 y 3% de cero 

In [3]:
### condicion, saldo   ..... son variables cuantitativas
### tipo_credito en train no tiene la clase de 7 como en el test, pero este tiene solo 1 valor (8 clases) ### Frecuencia 11
### cod_instit_financiera , 12 clases más se encuentran en el train que en el test (mas de 100 clases) ### Frecuencia 34 ----
### PRODUCTO, el train tiene la clase 255 y el test no (42 clases) ### Frecuencia 1  ----
### RIESGO_DIRECTO es lo mismo , es lo mismo en el test y en el train (5 clases) ### Frecuencia -1
### COD_CLASIFICACION_DEUDOR , es lo mismo en el test y en el train (6 clases) ### Frecuencia 0
### en el rcc_test, faltan valores en la columma de cod_instit_financiera y producto

In [4]:
bins = [-1, 0, 10, 20, 30, 60, 90, 180, 360, 720, float("inf")]
rcc_train["condicion"] = pd.cut(rcc_train.condicion, bins, labels = range(0,10))
rcc_test["condicion"] = pd.cut(rcc_test.condicion, bins, labels = range(0,10))
rcc_test['cod_instit_financiera'] = rcc_test['cod_instit_financiera'].fillna(34)
rcc_test['PRODUCTO'] = rcc_test['PRODUCTO'].fillna(1)

In [5]:
rcc_train = rcc_train.astype({'condicion':'int32',
                  'tipo_credito':'int32',
                  'cod_instit_financiera':'int32',
                  'PRODUCTO':'int32',
                  'RIESGO_DIRECTO':'int32',
                  'COD_CLASIFICACION_DEUDOR':'int32'})
rcc_test = rcc_test.astype({'condicion':'int32',
                  'tipo_credito':'int32',
                  'cod_instit_financiera':'int32',
                  'PRODUCTO':'int32',
                  'RIESGO_DIRECTO':'int32',
                  'COD_CLASIFICACION_DEUDOR':'int32'})

In [6]:
from tqdm.notebook import tqdm 
list_rcc_train_agg = []
for n,i in enumerate(tqdm(sorted(set(rcc_train.codmes),reverse=True))):
    if i in [201802,201712,201709,201703]:
        for c in ['tipo_credito','cod_instit_financiera','PRODUCTO','RIESGO_DIRECTO','COD_CLASIFICACION_DEUDOR','condicion']:
            rcc_train_agg = rcc_train[rcc_train.codmes>=i].groupby(['key_value', c]).saldo.sum().unstack(level = 1, fill_value=0)
            rcc_train_agg.columns = [f'{rcc_train_agg.columns.name}_{c}_ult{n+1}meses' for c in rcc_train_agg.columns]
            list_rcc_train_agg.append(rcc_train_agg)
        gc.collect()

rcc_train_ = pd.concat(list_rcc_train_agg, axis=1)
del rcc_train, list_rcc_train_agg

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [7]:
from tqdm.notebook import tqdm 
list_rcc_test_agg = []
for n,i in enumerate(tqdm(sorted(set(rcc_test.codmes),reverse=True))):
    if i in [201902,201812,201809,201803]:
        for c in ['tipo_credito','cod_instit_financiera','PRODUCTO','RIESGO_DIRECTO','COD_CLASIFICACION_DEUDOR','condicion']:
            rcc_test_agg = rcc_test[rcc_test.codmes>=i].groupby(['key_value', c]).saldo.sum().unstack(level = 1, fill_value=0)
            rcc_test_agg.columns = [f'{rcc_test_agg.columns.name}_{c}_ult{n+1}meses' for c in rcc_test_agg.columns]
            list_rcc_test_agg.append(rcc_test_agg)
        gc.collect()
rcc_test_ = pd.concat(list_rcc_test_agg, axis=1)
del rcc_test, list_rcc_test_agg

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [8]:
keep_cols = list(set(rcc_train_.columns).intersection(set(rcc_test_.columns)))
rcc_train_ = rcc_train_[keep_cols].copy()
rcc_test_ = rcc_test_[keep_cols].copy()
len(set(rcc_train_.columns) - set(rcc_test_.columns)) , len(set(rcc_test_.columns) - set(rcc_train_.columns))
gc.collect()

26

In [9]:
#Función para obtener variables con alta correlación
def get_correlated(data, cut=0.85):
    correlated_features = set()
    correlation_matrix = data.corr()
    
    for i in range(len(correlation_matrix .columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > cut:
                colname = correlation_matrix.columns[i]
                correlated_features.add(colname)
            
    return correlated_features
print(rcc_train_.shape)
correlated = get_correlated(rcc_train_,cut=0.9)
rcc_train_ = rcc_train_.drop(columns=correlated)
rcc_test_ = rcc_test_.drop(columns=correlated)
print(rcc_train_.shape)

(358487, 642)
(358487, 263)


In [13]:
rcc_train_

Unnamed: 0_level_0,tipo_credito_8_ult3meses,cod_instit_financiera_59_ult3meses,cod_instit_financiera_7_ult1meses,cod_instit_financiera_71_ult12meses,cod_instit_financiera_60_ult1meses,cod_instit_financiera_37_ult1meses,PRODUCTO_0_ult6meses,COD_CLASIFICACION_DEUDOR_0_ult1meses,PRODUCTO_36_ult1meses,condicion_9_ult6meses,...,condicion_4_ult3meses,tipo_credito_12_ult1meses,PRODUCTO_28_ult3meses,tipo_credito_6_ult12meses,condicion_3_ult1meses,condicion_4_ult1meses,condicion_1_ult1meses,condicion_4_ult12meses,cod_instit_financiera_20_ult1meses,PRODUCTO_38_ult1meses
key_value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.462743,-0.170666,0.0,0.0,...,0.000000,-0.021401,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,-0.270619,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,-0.150384,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.117343,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.117343,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,-0.710497,-0.523501,0.0,0.0,...,-0.071022,-0.508788,0.0,0.0,0.0,0.0,-0.118821,-0.071022,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358482,0.0,0.0,0.0,0.0,0.0,0.0,-0.050498,-0.195957,0.0,0.0,...,0.000000,-0.123312,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
358483,0.0,0.0,0.0,0.0,0.0,0.0,-0.061673,-0.207356,0.0,0.0,...,0.000000,-0.207356,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
358484,0.0,0.0,0.0,0.0,0.0,0.0,-0.059344,-0.259724,0.0,0.0,...,0.000000,-0.259724,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
358485,0.0,0.0,0.0,0.0,0.0,0.0,-0.058336,-0.058336,0.0,0.0,...,0.000000,-0.058336,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0


In [14]:
rcc_train_.to_csv('../data/intermediate/rcc_train.csv')
rcc_test_.to_csv('../data/intermediate/rcc_test.csv')