In [1]:
import pandas as pd
import gc

In [2]:
# sunat_train = pd.read_csv('../data/sunat_train.csv')
rcc_train = pd.read_csv('../data/rcc_train.csv')
se_train = pd.read_csv('../data/se_train.csv', index_col = 'key_value')
censo_train = pd.read_csv('../data/censo_train.csv', index_col = 'key_value')
y_train = pd.read_csv('../data/y_train.csv', index_col = 'key_value')

# sunat_test = pd.read_csv('../data/sunat_test.csv')
rcc_test = pd.read_csv('../data/rcc_test.csv')
se_test = pd.read_csv('../data/se_test.csv', index_col = 'key_value')
censo_test = pd.read_csv('../data/censo_test.csv', index_col = 'key_value')

In [3]:
### se puede observar que en el rcc_train aquellos clientes que tienen un solo registro, es porque se refiere al ultimo
### periodo de la data de train que es del febrero del 2018, cuyo RIESGO_DIRECTO es 1 y COD_CLASIFICACION_DEUDOR es 0 
### su condicion (días de atraso) es 0 siendo estos cerca de 623 clientes. Pero OJO el 9% de los nuevos clientes tienen un target igual a 1

### el target de la data train representa el score crediticio al febrero del 2018 
### por lo que se sobre-entiende que lo que se quiere predecir es el score crediticio al febrero del 2019

### La data train abarca desde 201703 hasta 201802
### La data test comprende desde 201803 hasta 201902
### Los primeros registros de los clientes tienen un RIESGO DIRECTO de 1 ***

### La distribucion del target general es de 16% para 1 y 84% para los 0
### A mayor COD_CLASIFICACION_DEUDOR y codmes mes cercano, el target se aproxima a 1. Ejm COD_CLASIFICACION_DEUDOR=5 y codmes=201802
### el target esta representado por el 97% de 1 y 3% de cero 

In [4]:
### condicion, saldo   ..... son variables cuantitativas
### tipo_credito en train no tiene la clase de 7 como en el test, pero este tiene solo 1 valor (8 clases) ### Frecuencia 11
### cod_instit_financiera , 12 clases más se encuentran en el train que en el test (mas de 100 clases) ### Frecuencia 34 ----
### PRODUCTO, el train tiene la clase 255 y el test no (42 clases) ### Frecuencia 1  ----
### RIESGO_DIRECTO es lo mismo , es lo mismo en el test y en el train (5 clases) ### Frecuencia -1
### COD_CLASIFICACION_DEUDOR , es lo mismo en el test y en el train (6 clases) ### Frecuencia 0
### en el rcc_test, faltan valores en la columma de cod_instit_financiera y producto

In [5]:
rcc_train = rcc_train.astype({'condicion':'int32',
                  'tipo_credito':'int32',
                  'RIESGO_DIRECTO':'int32',
                  'COD_CLASIFICACION_DEUDOR':'int32'})
rcc_test = rcc_test.astype({'condicion':'int32',
                  'tipo_credito':'int32',
                  'RIESGO_DIRECTO':'int32',
                  'COD_CLASIFICACION_DEUDOR':'int32'})

In [6]:
import statistics as stat
import numpy as np
def calculate_mode(x):
    try:
        moda=stat.mode(x)
    except:
        moda=-99
    return moda

In [7]:
from tqdm.notebook import tqdm 
moda=lambda x: calculate_mode(x)
moda.__name__='mode'
agg_rcc = {'saldo':['sum','std'],
           'condicion':['sum','max','std'],
           'tipo_credito':['nunique', moda],
           'cod_instit_financiera':['nunique',moda],
           'PRODUCTO':['nunique',moda],
           'RIESGO_DIRECTO':['nunique',moda],
            'COD_CLASIFICACION_DEUDOR':['nunique','max',moda]
          }
list_rcc_train_agg = []
for n,i in enumerate(tqdm(sorted(set(rcc_train.codmes),reverse=True))):
    rcc_train_agg = rcc_train[rcc_train.codmes>=i].groupby('key_value').agg(agg_rcc)
    rcc_train_agg.columns = [i+'_'+j+f'_ult{n+1}mes' for i,j in rcc_train_agg.columns]
    list_rcc_train_agg.append(rcc_train_agg)
rcc_train_ = pd.concat(list_rcc_train_agg, axis=1)
del rcc_train, list_rcc_train_agg

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [8]:
list_rcc_test_agg = []
for n,i in enumerate(tqdm(sorted(set(rcc_test.codmes),reverse=True))):
    rcc_test_agg = rcc_test[rcc_test.codmes>=i].groupby('key_value').agg(agg_rcc)
    rcc_test_agg.columns = [i+'_'+j+f'_ult{n+1}mes' for i,j in rcc_test_agg.columns]
    list_rcc_test_agg.append(rcc_test_agg)
rcc_test_ = pd.concat(list_rcc_test_agg, axis=1)
del rcc_test, list_rcc_test_agg

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [9]:
train = rcc_train_.join(censo_train).join(se_train)
test = rcc_test_.join(censo_test).join(se_test)

In [10]:
keep_cols = list(set(train.columns).intersection(set(test.columns)))
train = train[keep_cols]
test = test[keep_cols]
len(set(train.columns) - set(test.columns)) , len(set(test.columns) - set(train.columns))

(0, 0)

In [11]:
import re
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_-]+', '', x))
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_-]+', '', x))

In [12]:
#Función para obtener variables con alta correlación
def get_correlated(data, cut=0.85):
    correlated_features = set()
    correlation_matrix = data.corr()
    
    for i in range(len(correlation_matrix .columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > cut:
                colname = correlation_matrix.columns[i]
                correlated_features.add(colname)
            
    return correlated_features 

In [13]:
train.shape

(358487, 461)

In [14]:
correlated=get_correlated(train,cut=0.9)
train=train.drop(columns=correlated)
test= test.drop(columns=correlated)

In [15]:
train.isna().sum()

COD_CLASIFICACION_DEUDOR_max_ult11mes            0
COD_CLASIFICACION_DEUDOR_nunique_ult1mes         0
acteco15                                    205348
saldo_std_ult12mes                             623
adu                                         205348
                                             ...  
mattec7                                     205348
matpar2                                     205348
hig4                                        205348
tipviv1                                     205348
hig6                                        205348
Length: 193, dtype: int64

### Probando con CatBoost

In [16]:
# !pip install catboost
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

folds = [train.index[t] for t, v in KFold(5).split(train)]
test_probs = []
train_probs = []
fi = []
for i, idx in enumerate(folds):
    print("*"*10, i, "*"*10)
    Xt = train.loc[idx]
    yt = y_train.loc[Xt.index]

    Xv = train.drop(Xt.index)
    yv = y_train.loc[Xv.index]
    
    learner = CatBoostClassifier(n_estimators=1000, depth = 6, eval_metric = 'AUC' )
    learner.fit(Xt, yt,  early_stopping_rounds=10,
                eval_set=[(Xt, yt), (Xv, yv)], verbose=50)
    test_probs.append(pd.Series(learner.predict_proba(test)[:, -1], index=test.index, name="fold_" + str(i)))
    train_probs.append(pd.Series(learner.predict_proba(Xv)[:, -1], index=Xv.index, name="probs"))
    fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))
          
test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
train_probs = pd.concat(train_probs)
fi = pd.concat(fi, axis=1).mean(axis=1)

print("*" * 21)
print("roc auc estimado: ", roc_auc_score(y_train, train_probs.loc[y_train.index]))

********** 0 **********
Learning rate set to 0.128212
0:	test: 0.7674718	test1: 0.7756717	best: 0.7756717 (0)	total: 234ms	remaining: 3m 53s
50:	test: 0.8367936	test1: 0.8387051	best: 0.8387051 (50)	total: 9.1s	remaining: 2m 49s
100:	test: 0.8455141	test1: 0.8457027	best: 0.8457027 (100)	total: 17.7s	remaining: 2m 37s
150:	test: 0.8511525	test1: 0.8489618	best: 0.8489618 (150)	total: 25.9s	remaining: 2m 25s
200:	test: 0.8548389	test1: 0.8506452	best: 0.8506452 (200)	total: 33.9s	remaining: 2m 14s
250:	test: 0.8576717	test1: 0.8515234	best: 0.8515234 (250)	total: 41.6s	remaining: 2m 4s
300:	test: 0.8602101	test1: 0.8521503	best: 0.8521503 (300)	total: 49.4s	remaining: 1m 54s
350:	test: 0.8626540	test1: 0.8526497	best: 0.8526503 (347)	total: 57.1s	remaining: 1m 45s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.852760389
bestIteration = 369

Shrink model to first 370 iterations.
********** 1 **********
Learning rate set to 0.128212
0:	test: 0.7693328	test1: 0.7688910

In [17]:
fi.sort_values(ascending=False).head(30)#.plot.bar()

COD_CLASIFICACION_DEUDOR_nunique_ult1mes    0.106535
cod_instit_financiera_nunique_ult7mes       0.056005
saldo_sum_ult12mes                          0.050954
PRODUCTO_mode_ult1mes                       0.031634
cod_instit_financiera_mode_ult1mes          0.030985
agundia4                                    0.029895
PRODUCTO_nunique_ult7mes                    0.029384
saldo_std_ult2mes                           0.029234
COD_CLASIFICACION_DEUDOR_nunique_ult2mes    0.029217
condicion_max_ult6mes                       0.027680
acteco12                                    0.026442
saldo_std_ult12mes                          0.022639
edad                                        0.022363
acteco2                                     0.020359
tip_lvledu                                  0.016809
COD_CLASIFICACION_DEUDOR_max_ult11mes       0.016758
tipo_credito_mode_ult1mes                   0.015293
cod_ubi                                     0.013419
sexo                                        0.

In [19]:
test_probs.name = "target"
test_probs.to_csv("test3_v2.csv")