In [1]:
import pandas as pd
from tqdm.notebook import tqdm 
import gc
import numpy as np

In [2]:
import statistics as stat
def calculate_mode(x):
    try:
        moda=stat.mode(x)
    except:
        moda=np.nan
    return moda

In [3]:
path = '../data'
y_train = pd.read_csv(f'{path}/y_train.csv', index_col = 'key_value')

## Procesar RCC

In [None]:
rcc_train = pd.read_csv(f'{path}/rcc_train.csv')
rcc_test = pd.read_csv(f'{path}/rcc_test.csv')

In [None]:
rcc_test['cod_instit_financiera'].fillna(rcc_test['cod_instit_financiera'].value_counts().index[0], inplace=True)
rcc_test['PRODUCTO'].fillna(rcc_test['PRODUCTO'].value_counts().index[0], inplace=True)

In [None]:
dict_ = {'codmes': 'int32',
 'key_value': 'int32',
 'condicion': 'int32',
 'tipo_credito': 'int32',
 'cod_instit_financiera': 'int32',
 'PRODUCTO': 'int32',
 'RIESGO_DIRECTO': 'int32',
 'COD_CLASIFICACION_DEUDOR': 'int32'}
rcc_train = rcc_train.astype(dict_)
rcc_test = rcc_test.astype(dict_)

In [None]:
bins = [-1, 0, 10, 20, 30, 60, 90, 180, 360, 720, float("inf")]
rcc_train["condicion"] = pd.cut(rcc_train.condicion, bins)
rcc_train["condicion"] = rcc_train["condicion"].cat.codes
rcc_test["condicion"] = pd.cut(rcc_test.condicion, bins)
rcc_test["condicion"] = rcc_test["condicion"].cat.codes

In [None]:
moda=lambda x: calculate_mode(x)
moda.__name__='mode'
agg_rcc = {'cod_instit_financiera':['nunique','min','max',moda],
           'PRODUCTO':['nunique','min','max',moda],}

In [None]:
list_rcc_train_agg = []
for n,i in enumerate(sorted(set(rcc_train.codmes),reverse=True)):
    for c in ['tipo_credito','RIESGO_DIRECTO','COD_CLASIFICACION_DEUDOR','condicion']:
        print(f'haciendo {c} desde {i}')
        rcc_train_agg = rcc_train[rcc_train.codmes>=i].groupby(['key_value', c]).saldo.sum().unstack(level = 1, fill_value=np.nan)
        rcc_train_agg.columns = [f'{rcc_train_agg.columns.name}_{c}_saldoSum_ult{n+1}meses' for c in rcc_train_agg.columns]
        list_rcc_train_agg.append(rcc_train_agg)
        gc.collect()

        rcc_train_agg = rcc_train[rcc_train.codmes>=i].groupby(['key_value', c]).saldo.size().unstack(level = 1, fill_value=np.nan)
        rcc_train_agg.columns = [f'{rcc_train_agg.columns.name}_{c}_saldoUnique_ult{n+1}meses' for c in rcc_train_agg.columns]
        list_rcc_train_agg.append(rcc_train_agg)
        gc.collect()

        rcc_train_agg = rcc_train[rcc_train.codmes>=i].groupby(['key_value', c]).saldo.min().unstack(level = 1, fill_value=np.nan)
        rcc_train_agg.columns = [f'{rcc_train_agg.columns.name}_{c}_saldoMin_ult{n+1}meses' for c in rcc_train_agg.columns]
        list_rcc_train_agg.append(rcc_train_agg)
        gc.collect()

        rcc_train_agg = rcc_train[rcc_train.codmes>=i].groupby(['key_value', c]).saldo.max().unstack(level = 1, fill_value=np.nan)
        rcc_train_agg.columns = [f'{rcc_train_agg.columns.name}_{c}_saldoMax_ult{n+1}meses' for c in rcc_train_agg.columns]
        list_rcc_train_agg.append(rcc_train_agg)
        gc.collect()

        rcc_train_agg = rcc_train[rcc_train.codmes>=i].groupby(['key_value', c]).saldo.std().unstack(level = 1, fill_value=np.nan)
        rcc_train_agg.columns = [f'{rcc_train_agg.columns.name}_{c}_saldoStd_ult{n+1}meses' for c in rcc_train_agg.columns]
        list_rcc_train_agg.append(rcc_train_agg)
        gc.collect()

        rcc_train_agg = rcc_train[rcc_train.codmes>=i].groupby(['key_value', c]).saldo.mean().unstack(level = 1, fill_value=np.nan)
        rcc_train_agg.columns = [f'{rcc_train_agg.columns.name}_{c}_saldoMean_ult{n+1}meses' for c in rcc_train_agg.columns]
        list_rcc_train_agg.append(rcc_train_agg)
        gc.collect()
        
        rcc_train_agg = rcc_train[rcc_train.codmes>=i].groupby(['key_value', c]).saldo.median().unstack(level = 1, fill_value=np.nan)
        rcc_train_agg.columns = [f'{rcc_train_agg.columns.name}_{c}_saldoMedian_ult{n+1}meses' for c in rcc_train_agg.columns]
        list_rcc_train_agg.append(rcc_train_agg)
        gc.collect()
        
    print(f'haciendo aggregate de {i}')
    rcc_train_agg = rcc_train[rcc_train.codmes>=i].groupby('key_value').agg(agg_rcc)
    rcc_train_agg.columns = [i+'_'+j+f'_ult{n+1}mes' for i,j in rcc_train_agg.columns]
    list_rcc_train_agg.append(rcc_train_agg)
    gc.collect()
        
rcc_train_ = pd.concat(list_rcc_train_agg, axis=1)
del rcc_train, list_rcc_train_agg

In [None]:
list_rcc_test_agg = []
for n,i in enumerate(sorted(set(rcc_test.codmes),reverse=True)):
    for c in ['tipo_credito','RIESGO_DIRECTO','COD_CLASIFICACION_DEUDOR','condicion']:
        print(f'haciendo {c} desde {i}')
        rcc_test_agg = rcc_test[rcc_test.codmes>=i].groupby(['key_value', c]).saldo.sum().unstack(level = 1, fill_value=np.nan)
        rcc_test_agg.columns = [f'{rcc_test_agg.columns.name}_{c}_saldoSum_ult{n+1}meses' for c in rcc_test_agg.columns]
        list_rcc_test_agg.append(rcc_test_agg)
        gc.collect()

        rcc_test_agg = rcc_test[rcc_test.codmes>=i].groupby(['key_value', c]).saldo.size().unstack(level = 1, fill_value=np.nan)
        rcc_test_agg.columns = [f'{rcc_test_agg.columns.name}_{c}_saldoUnique_ult{n+1}meses' for c in rcc_test_agg.columns]
        list_rcc_test_agg.append(rcc_test_agg)
        gc.collect()

        rcc_test_agg = rcc_test[rcc_test.codmes>=i].groupby(['key_value', c]).saldo.min().unstack(level = 1, fill_value=np.nan)
        rcc_test_agg.columns = [f'{rcc_test_agg.columns.name}_{c}_saldoMin_ult{n+1}meses' for c in rcc_test_agg.columns]
        list_rcc_test_agg.append(rcc_test_agg)
        gc.collect()

        rcc_test_agg = rcc_test[rcc_test.codmes>=i].groupby(['key_value', c]).saldo.max().unstack(level = 1, fill_value=np.nan)
        rcc_test_agg.columns = [f'{rcc_test_agg.columns.name}_{c}_saldoMax_ult{n+1}meses' for c in rcc_test_agg.columns]
        list_rcc_test_agg.append(rcc_test_agg)
        gc.collect()

        rcc_test_agg = rcc_test[rcc_test.codmes>=i].groupby(['key_value', c]).saldo.std().unstack(level = 1, fill_value=np.nan)
        rcc_test_agg.columns = [f'{rcc_test_agg.columns.name}_{c}_saldoStd_ult{n+1}meses' for c in rcc_test_agg.columns]
        list_rcc_test_agg.append(rcc_test_agg)
        gc.collect()

        rcc_test_agg = rcc_test[rcc_test.codmes>=i].groupby(['key_value', c]).saldo.mean().unstack(level = 1, fill_value=np.nan)
        rcc_test_agg.columns = [f'{rcc_test_agg.columns.name}_{c}_saldoMean_ult{n+1}meses' for c in rcc_test_agg.columns]
        list_rcc_test_agg.append(rcc_test_agg)
        gc.collect()

        rcc_test_agg = rcc_test[rcc_test.codmes>=i].groupby(['key_value', c]).saldo.median().unstack(level = 1, fill_value=np.nan)
        rcc_test_agg.columns = [f'{rcc_test_agg.columns.name}_{c}_saldoMedian_ult{n+1}meses' for c in rcc_test_agg.columns]
        list_rcc_test_agg.append(rcc_test_agg)
        gc.collect()

        
    print(f'haciendo aggregate de {i}')
    rcc_test_agg = rcc_test[rcc_test.codmes>=i].groupby('key_value').agg(agg_rcc)
    rcc_test_agg.columns = [i+'_'+j+f'_ult{n+1}mes' for i,j in rcc_test_agg.columns]
    list_rcc_test_agg.append(rcc_test_agg)
    gc.collect()

rcc_test_ = pd.concat(list_rcc_test_agg, axis=1)
del rcc_test, list_rcc_test_agg

In [None]:
### asegurar que las columnas esten en ambas bases (train/test)
print(rcc_train_.shape, rcc_test_.shape)
keep_columns = list(set(rcc_train_.columns).intersection(rcc_test_.columns))
rcc_train_ = rcc_train_[keep_columns].copy()
rcc_test_ = rcc_test_[keep_columns].copy()
print(rcc_train_.shape, rcc_test_.shape)

In [None]:
### unir RCC en la base final
train = rcc_train_.copy()
test = rcc_test_.copy()
del rcc_train_, rcc_test_

In [None]:
# train.to_pickle('train_2532features.pkl')
# test.to_pickle('test_2532features.pkl')

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

def cross_validation_lightgbm_fi(train, y_train, test):
    folds = [train.index[t] for t, v in KFold(5).split(train)]
    test_probs = []
    train_probs = []
    fi = []
    for i, idx in enumerate(folds):
        print("*"*10, i, "*"*10)
        Xt = train.loc[idx]
        yt = y_train.loc[Xt.index].target

        Xv = train.drop(Xt.index)
        yv = y_train.loc[Xv.index].target

        learner = LGBMClassifier(n_estimators=1000, boosting_type='gbdt',min_child_samples=1500, 
                       colsample_bytree=0.8,subsample=0.8, max_bin=200, learning_rate=0.1)
        learner.fit(Xt, yt,  early_stopping_rounds=10, eval_metric="auc",
                    eval_set=[(Xt, yt), (Xv, yv)], verbose=50)
        test_probs.append(pd.Series(learner.predict_proba(test)[:, -1], index=test.index, name="fold_" + str(i)))
        train_probs.append(pd.Series(learner.predict_proba(Xv)[:, -1], index=Xv.index, name="probs"))
        fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))

    test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
    train_probs = pd.concat(train_probs)
    fi = pd.concat(fi, axis=1).mean(axis=1)
    print("*" * 21)
    print("roc auc estimado: ", roc_auc_score(y_train, train_probs.loc[y_train.index]))
    print("roc auc varianza: ", np.std([roc_auc_score(y_train.loc[folds[i]], train_probs.iloc[folds[i]]) for i in range(len(folds))])) 
    return fi

In [None]:
# eliminar variables con zero_importance
while True:
    fi = cross_validation_lightgbm_fi(train, y_train, test)
    zero_importance = fi[fi==0]
    aux = fi[fi>0].sort_values(ascending=False)
    keep_columns = []
    count = 0
    for feature,values in zip(aux.index, aux.values):
        count+=values
        if count<=0.99:
            keep_columns.append(feature)
            
    print(f'total de variables : {len(train.columns)}')
    print(f'variables con importancia acumulada al 99% : {len(keep_columns)}')
    print(f'variables con zero importancia : {len(zero_importance)}')
    train = train[keep_columns]
    test = test[keep_columns]
    if len(zero_importance)==0:
        break

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

folds = [train.index[t] for t, v in KFold(5).split(train)]
test_probs = []
train_probs = []
fi = []
for i, idx in enumerate(folds):
    print("*"*10, i, "*"*10)
    Xt = train.loc[idx]
    yt = y_train.loc[Xt.index].target

    Xv = train.drop(Xt.index)
    yv = y_train.loc[Xv.index].target

    learner = LGBMClassifier(n_estimators=1000, boosting_type='gbdt',min_child_samples=1500, 
                   colsample_bytree=0.8,subsample=0.8, max_bin=200, learning_rate=0.1)
    learner.fit(Xt, yt,  early_stopping_rounds=10, eval_metric="auc",
                eval_set=[(Xt, yt), (Xv, yv)], verbose=50)
    test_probs.append(pd.Series(learner.predict_proba(test)[:, -1], index=test.index, name="fold_" + str(i)))
    train_probs.append(pd.Series(learner.predict_proba(Xv)[:, -1], index=Xv.index, name="probs"))
    fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))

test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
train_probs = pd.concat(train_probs)
fi = pd.concat(fi, axis=1).mean(axis=1)
print("*" * 21)
print("roc auc estimado: ", roc_auc_score(y_train, train_probs.loc[y_train.index]))
print("roc auc varianza: ", np.std([roc_auc_score(y_train.loc[folds[i]], train_probs.iloc[folds[i]]) for i in range(len(folds))])) 

In [None]:
test_probs.name = 'target'
test_probs.to_csv('../results/lightgbm_with_1064features_0.83955.csv') ### score de 0.84242 en la tabla publica

In [None]:
# train.to_pickle('train_1064features.pkl')
# test.to_pickle('test_1064features.pkl')

### Cargar dataset preprocesado del RCC

In [4]:
train = pd.read_pickle('train_1064features.pkl')
test = pd.read_pickle('test_1064features.pkl')

## Procesar SE

In [5]:
se_train = pd.read_csv(f'{path}/se_train.csv', index_col = 'key_value')
se_test = pd.read_csv(f'{path}/se_test.csv', index_col = 'key_value')

In [6]:
dict_ = {'sexo':'int32',
         'est_cvl':'int32',
         'sit_lab':'int32',
         'cod_ocu':'int32',
         'ctd_hijos':'int32',
         'flg_sin_email':'int32',
         'ctd_veh':'int32',
         'lgr_vot':'int32',
         'prv':'int32',
         'dto':'int32',
         'rgn':'int32',
         'tip_lvledu':'int32'}
se_train = se_train.astype(dict_)
se_test = se_test.astype(dict_)

In [7]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

def cross_validation_lightgbm(train, test, y_train):
    folds = [train.index[t] for t, v in KFold(5).split(train)]
    test_probs = []
    train_probs = []
    fi = []
    for i, idx in enumerate(folds):
        print("*"*10, i, "*"*10)
        Xt = train.loc[idx]
        yt = y_train.loc[Xt.index].target

        Xv = train.drop(Xt.index)
        yv = y_train.loc[Xv.index].target

        learner = LGBMClassifier(n_estimators=1000, boosting_type='gbdt',min_child_samples=1500, 
                       colsample_bytree=0.8,subsample=0.8, max_bin=200, learning_rate=0.1)
        learner.fit(Xt, yt,  early_stopping_rounds=10, eval_metric="auc",
                    eval_set=[(Xt, yt), (Xv, yv)], verbose=50)
        test_probs.append(pd.Series(learner.predict_proba(test)[:, -1], index=test.index, name="fold_" + str(i)))
        train_probs.append(pd.Series(learner.predict_proba(Xv)[:, -1], index=Xv.index, name="probs"))
        fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))
        gc.collect()
    test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
    train_probs = pd.concat(train_probs)
    fi = pd.concat(fi, axis=1).mean(axis=1)
    print("*" * 21)
    print("roc auc estimado: ", roc_auc_score(y_train, train_probs.loc[y_train.index]))
    print("roc auc varianza: ", np.std([roc_auc_score(y_train.loc[folds[i]], train_probs.iloc[folds[i]]) for i in range(len(folds))]))
    return [test_probs, fi]

In [8]:
def get_one_hot(df, feature):
    one_hot = pd.get_dummies(df[feature])
    one_hot.columns = [feature+'_'+str(i) for i in one_hot.columns]
    return one_hot
one_hot_sexo_train = get_one_hot(se_train, 'sexo')
one_hot_sexo_test = get_one_hot(se_test, 'sexo')
one_hot_est_cvl_train = get_one_hot(se_train, 'est_cvl')
one_hot_est_cvl_test = get_one_hot(se_test, 'est_cvl')
one_hot_rgn_train = get_one_hot(se_train, 'rgn')
one_hot_rgn_test = get_one_hot(se_test, 'rgn')

In [9]:
train = train.join(one_hot_sexo_train).join(one_hot_est_cvl_train).join(one_hot_rgn_train).join(se_train[['edad','ctd_veh']])
test = test.join(one_hot_sexo_test).join(one_hot_est_cvl_test).join(one_hot_rgn_test).join(se_test[['edad','ctd_veh']])

In [11]:
[test_probs,fi] = cross_validation_lightgbm(train, test, y_train)

********** 0 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.841961	training's binary_logloss: 0.303502	valid_1's auc: 0.833136	valid_1's binary_logloss: 0.309781
[100]	training's auc: 0.855334	training's binary_logloss: 0.293031	valid_1's auc: 0.840443	valid_1's binary_logloss: 0.303984
[150]	training's auc: 0.86342	training's binary_logloss: 0.286803	valid_1's auc: 0.842834	valid_1's binary_logloss: 0.302076
[200]	training's auc: 0.869938	training's binary_logloss: 0.281867	valid_1's auc: 0.844005	valid_1's binary_logloss: 0.301129
[250]	training's auc: 0.875516	training's binary_logloss: 0.277572	valid_1's auc: 0.844612	valid_1's binary_logloss: 0.300671
[300]	training's auc: 0.88052	training's binary_logloss: 0.273588	valid_1's auc: 0.845113	valid_1's binary_logloss: 0.300427
Early stopping, best iteration is:
[308]	training's auc: 0.881344	training's binary_logloss: 0.272952	valid_1's auc: 0.845203	valid_1's binary_logloss: 0.300395
*

In [12]:
test_probs.name = 'target'
test_probs.to_csv('../results/test_withRccSe_0.84422.csv') #### score de 0.84738 en la tabla publica

## Procesar SUNAT

In [10]:
sunat_train = pd.read_csv(f'{path}/sunat_train.csv')
sunat_test = pd.read_csv(f'{path}/sunat_test.csv')

In [11]:
### eliminar registros duplicados
sunat_train.drop_duplicates(inplace=True)
sunat_test.drop_duplicates(inplace=True)
sunat_train.shape, sunat_test.shape

((292479, 18), (318821, 18))

In [12]:
dict_ = {'tipcontribuyente': 'int32',
         'tippersona': 'int32',
         'ciiu': 'int32',
         'ubigeo': 'int32',
         'condiciondomicilio': 'int32',
         'estadocontribuyente': 'int32',
         'codvia': 'int32',
         'codzona': 'int32',
         'contabilidad': 'int32',
         'facturacion': 'int32',
         'domiciliado': 'int32',
         'comercioexterior': 'int32',
         'cargorele': 'int32',
         'codentidadtributo': 'int32',
         'estadotributo': 'int32'}
sunat_train = sunat_train.astype(dict_)
sunat_test = sunat_test.astype(dict_)

In [13]:
sunat_train['diff_fech'] = sunat_train['fecbaja'] - sunat_train['fecalta']
sunat_test['diff_fech'] = sunat_test['fecbaja'] - sunat_test['fecalta']

In [14]:
moda=lambda x: calculate_mode(x)
moda.__name__='mode'
agg_sunat = {
            'tipcontribuyente':['nunique',moda],
           'tippersona':['nunique',moda],
           'ciiu':['nunique', moda],
           'ubigeo':['nunique',moda],
           'condiciondomicilio':['nunique',moda],
#            'estadocontribuyente':['nunique',moda],
           'codvia':['nunique',moda],
           'codzona':['nunique',moda],
           'contabilidad':['nunique',moda],
           'facturacion':['nunique',moda],
           'domiciliado':['nunique',moda],
           'comercioexterior':['nunique',moda],
           'cargorele':['nunique',moda],
           'codentidadtributo':['nunique',moda],
           'estadotributo':['nunique',moda],
           'fecalta':['mean','max', 'nunique','min','std'],
           'fecbaja':['mean','max', 'nunique','min'], 
           'diff_fech':['mean','max', 'nunique','min'], 
            }
sunat_train_ = sunat_train.groupby('key_value').agg(agg_sunat)
sunat_train_.columns = [i+'_'+j for i,j in sunat_train_.columns]
sunat_test_ = sunat_test.groupby('key_value').agg(agg_sunat)
sunat_test_.columns = [i+'_'+j for i,j in sunat_test_.columns]
# del sunat_train, sunat_test

In [15]:
def get_keep_columns(train, test):
    print(train.shape, test.shape)
    keep_columns = list(set(train.columns).intersection(test.columns))
    train = train[keep_columns]
    test = test[keep_columns]
    print(train.shape, test.shape)
    return  train, test
def get_crosstab(df, index, feature):
    df = pd.crosstab(df[index], df[feature])
    df.columns = [f'{df.columns.name}_{i}' for i in df.columns]
    return df

import category_encoders as ce
def get_binec(df, feature):
    X_binenc = df[feature]
    encoder = ce.BinaryEncoder(cols=feature)
    X_binenc = encoder.fit_transform(X_binenc)
    return X_binenc

def get_one_hot(df, feature):
    one_hot = pd.get_dummies(df[feature])
    one_hot.columns = [feature+'_'+str(i) for i in one_hot.columns]
    return one_hot

In [16]:
crosstab_estadocontribuyente_train = get_crosstab(sunat_train, 'key_value', 'estadocontribuyente')
crosstab_estadocontribuyente_test = get_crosstab(sunat_test, 'key_value', 'estadocontribuyente')

In [17]:
train = train.join(crosstab_estadocontribuyente_train).join(sunat_train_)
test = test.join(crosstab_estadocontribuyente_test).join(sunat_test_)
train, test = get_keep_columns(train, test)

(358487, 1137) (396666, 1136)
(358487, 1136) (396666, 1136)


In [18]:
[test_probs, fi] = cross_validation_lightgbm(train, test, y_train)

********** 0 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.842656	training's binary_logloss: 0.303229	valid_1's auc: 0.834396	valid_1's binary_logloss: 0.3093
[100]	training's auc: 0.856885	training's binary_logloss: 0.292173	valid_1's auc: 0.842629	valid_1's binary_logloss: 0.302872
[150]	training's auc: 0.865332	training's binary_logloss: 0.285703	valid_1's auc: 0.845335	valid_1's binary_logloss: 0.300704
[200]	training's auc: 0.872167	training's binary_logloss: 0.280538	valid_1's auc: 0.846596	valid_1's binary_logloss: 0.299734
[250]	training's auc: 0.877745	training's binary_logloss: 0.276136	valid_1's auc: 0.847317	valid_1's binary_logloss: 0.299193
[300]	training's auc: 0.88267	training's binary_logloss: 0.272096	valid_1's auc: 0.847638	valid_1's binary_logloss: 0.298928
Early stopping, best iteration is:
[294]	training's auc: 0.882096	training's binary_logloss: 0.272538	valid_1's auc: 0.847688	valid_1's binary_logloss: 0.298904
**

In [22]:
fi.sort_values(ascending=False).head(50)

RIESGO_DIRECTO_1_saldoUnique_ult1meses               0.012741
cod_instit_financiera_max_ult1mes                    0.011633
edad                                                 0.010780
ubigeo_mode                                          0.009118
COD_CLASIFICACION_DEUDOR_0_saldoUnique_ult12meses    0.008692
RIESGO_DIRECTO_-1_saldoUnique_ult12meses             0.008199
RIESGO_DIRECTO_1_saldoStd_ult1meses                  0.008176
RIESGO_DIRECTO_-1_saldoMedian_ult1meses              0.007871
RIESGO_DIRECTO_-1_saldoMedian_ult2meses              0.006790
cod_instit_financiera_min_ult1mes                    0.006697
RIESGO_DIRECTO_1_saldoMin_ult1meses                  0.006585
cod_instit_financiera_mode_ult1mes                   0.006355
sexo_0                                               0.005697
RIESGO_DIRECTO_1_saldoMax_ult12meses                 0.005697
condicion_0_saldoUnique_ult12meses                   0.005648
tipo_credito_11_saldoStd_ult1meses                   0.005628
tipo_cre

In [21]:
test_probs.name = 'target'
test_probs.to_csv('../results/lightgbm_with_1136features_0.84646.csv') ### score de  0.84862 en la tabla publica

In [45]:
train.to_pickle('../data/train_score_0.84862.pkl')
test.to_pickle('../data/test_score_0.84862.pkl')

In [None]:
# test_probs.name = 'target'
# test_probs.to_csv('../results/lightgbm_with_1136features_0.84646.csv') ### score de  0.84862 en la tabla publica

# test_probs.name = 'target'
# test_probs.to_csv('../results/test_withRccSe_0.84422.csv') #### score de 0.84738 en la tabla publica

# test_probs.name = 'target'
# test_probs.to_csv('../results/lightgbm_with_1064features_0.83955.csv') ### score de 0.84242 en la tabla publica

#### Podemos  ver que agregando la base del SE, el modelo mejora en tan solo casi 0.00496
#### Podemos  ver que agregando la base del SE, el modelo mejora en tan solo casi 0.00124
#### Podemos la diferencia entre el test final y test validacion difiere en solo difiere 0.003 , evitando asi el overfitting 

In [None]:
# binenc_ubigeo_mode   *********
# roc auc estimado:  0.8451880633349523
# roc auc varianza:  0.0006811048507579711

# binenc_estadocontribuyente_mode **************
# roc auc estimado:  0.8454404252384857
# roc auc varianza:  0.0007407556976310795

# binenc_ciiu_mode
# roc auc estimado:  0.8452323957840445 *************
# roc auc varianza:  0.0006930959347139597

# binenc_codzona_mode *************************
# roc auc estimado:  0.8452934126596989
# roc auc varianza:  0.0006864251485789392

# binenc_condiciondomicilio
# roc auc estimado:  0.8449534357725325
# roc auc varianza:  0.0006904443537461785

# onehot_codentidadtributo_mode
# roc auc estimado:  0.8448728908251362
# roc auc varianza:  0.0006799673871728364

# binenc_tipcontribuyente_mode_train
# roc auc estimado:  0.8452350383631082
# roc auc varianza:  0.000742731434028307

# onehot_contabilidad_mode_train
# roc auc estimado:  0.8450975852481284
# roc auc varianza:  0.000563203090351886

# onehot_domiciliado_mode
# roc auc estimado:  0.8450611911417196
# roc auc varianza:  0.0006969148526762507

# onehot_tippersona_mode
# roc auc estimado:  0.845083437988461
# roc auc varianza:  0.0007284692072829943

# binenc_facturacion_mode
# roc auc estimado:  0.8450877054575421
# roc auc varianza:  0.0007139054949380171

# binenc_comercioexterior_mode
# roc auc estimado:  0.8452255931055561
# roc auc varianza:  0.0006762403588663643

In [25]:
train_ = train.copy()
test_ = test.copy()
# del train, test

### all_training_data

In [29]:
learner = LGBMClassifier(n_estimators=1000, boosting_type='gbdt',min_child_samples=1500, 
               colsample_bytree=0.8,subsample=0.8, max_bin=200, learning_rate=0.1)
learner.fit(train_, y_train, eval_metric="auc")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
               importance_type='split', learning_rate=0.1, max_bin=200,
               max_depth=-1, min_child_samples=1500, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=1000, n_jobs=-1, num_leaves=31,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=0.8, subsample_for_bin=200000,
               subsample_freq=0)

In [33]:
test_probs = pd.Series(learner.predict_proba(test_)[:,-1], index=test_.index, name='target')
test_probs.to_csv('../results/lightgbm_training_allData_with_1136features.csv')  ### score de  0.84567 en la tabla publica  

### Probando con cat_features

In [19]:
var_mode = [ i for i in train_.columns if 'mode' in i ]
for i in var_mode:
    train_[i] = train_[i].astype('category')
    test_[i] = test_[i].astype('category')

In [20]:
[test_probs, fi] = cross_validation_lightgbm(train_,
                                             test_, 
                                             y_train)

********** 0 **********




Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.853018	training's binary_logloss: 0.297856	valid_1's auc: 0.83479	valid_1's binary_logloss: 0.309108
[100]	training's auc: 0.86851	training's binary_logloss: 0.284755	valid_1's auc: 0.841822	valid_1's binary_logloss: 0.303217
[150]	training's auc: 0.878911	training's binary_logloss: 0.276473	valid_1's auc: 0.844192	valid_1's binary_logloss: 0.301272
[200]	training's auc: 0.887114	training's binary_logloss: 0.269921	valid_1's auc: 0.84546	valid_1's binary_logloss: 0.300323
Early stopping, best iteration is:
[200]	training's auc: 0.887114	training's binary_logloss: 0.269921	valid_1's auc: 0.84546	valid_1's binary_logloss: 0.300323
********** 1 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.853105	training's binary_logloss: 0.297583	valid_1's auc: 0.83342	valid_1's binary_logloss: 0.309619
[100]	training's auc: 0.868977	training's binary_logloss: 0.284423	val

In [22]:
fi.sort_values(ascending=False).head(50)

ciiu_mode                                            0.113727
cod_instit_financiera_mode_ult1mes                   0.048391
ubigeo_mode                                          0.047601
cod_instit_financiera_mode_ult12mes                  0.028814
cod_instit_financiera_mode_ult2mes                   0.026436
cod_instit_financiera_mode_ult3mes                   0.019045
cod_instit_financiera_mode_ult11mes                  0.018853
cod_instit_financiera_mode_ult4mes                   0.016344
RIESGO_DIRECTO_1_saldoUnique_ult1meses               0.014236
cod_instit_financiera_mode_ult10mes                  0.013086
cod_instit_financiera_mode_ult5mes                   0.013068
cod_instit_financiera_mode_ult8mes                   0.012467
cod_instit_financiera_mode_ult6mes                   0.011805
cod_instit_financiera_mode_ult9mes                   0.011791
cod_instit_financiera_mode_ult7mes                   0.011316
COD_CLASIFICACION_DEUDOR_0_saldoUnique_ult12meses    0.009576
edad    

In [21]:
test_probs.name = 'target'
test_probs.to_csv('../results/lightgbm_with_1136features_CategoricalEncoding_0.84445.csv') ### score de  0.84789 en la tabla publica

### Probando con mean-encoding, pero no funciona

In [34]:
var_mode = ['tipcontribuyente_mode', 'contabilidad_mode', 'condiciondomicilio_mode', 'cargorele_mode', 'comercioexterior_mode',                 
'codentidadtributo_mode', 'codvia_mode', 'domiciliado_mode', 'ciiu_mode', 'tippersona_mode', 'facturacion_mode',                       
'ubigeo_mode', 'codzona_mode', 'estadotributo_mode']  
train_['target'] = y_train.loc[train_.index]
for i in var_mode:
    me = train_.groupby([i])['target'].mean().to_dict() 
    train_[f'{i}_me'] = train_[i].map(me) 
    test_[f'{i}_me'] = test_[i].map(me)
    ### imputar datos faltantes
#     test_[f'{i}_me'] =  test_[f'{i}_me'].fillna(sunat_train_[f'{i}_me'].median())
train_.drop('target', axis=1, inplace=True)
train_.drop(var_mode, axis=1, inplace=True)
test_.drop(var_mode, axis=1, inplace=True)

In [38]:
[test_probs, fi] = cross_validation_lightgbm(train_,
                                             test_, 
                                             y_train)

********** 0 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.844739	training's binary_logloss: 0.302081	valid_1's auc: 0.836933	valid_1's binary_logloss: 0.308151
[100]	training's auc: 0.858633	training's binary_logloss: 0.291064	valid_1's auc: 0.84481	valid_1's binary_logloss: 0.301901
[150]	training's auc: 0.867347	training's binary_logloss: 0.28442	valid_1's auc: 0.847874	valid_1's binary_logloss: 0.299494
[200]	training's auc: 0.873678	training's binary_logloss: 0.279384	valid_1's auc: 0.849272	valid_1's binary_logloss: 0.298374
[250]	training's auc: 0.879284	training's binary_logloss: 0.274931	valid_1's auc: 0.850144	valid_1's binary_logloss: 0.297768
Early stopping, best iteration is:
[280]	training's auc: 0.882243	training's binary_logloss: 0.272459	valid_1's auc: 0.850424	valid_1's binary_logloss: 0.297529
********** 1 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.844802	training's b

In [41]:
fi.sort_values(ascending=False).head(50)

ubigeo_mode_me                                       0.018583
RIESGO_DIRECTO_1_saldoUnique_ult1meses               0.011942
cod_instit_financiera_max_ult1mes                    0.011176
edad                                                 0.011128
COD_CLASIFICACION_DEUDOR_0_saldoUnique_ult12meses    0.008531
RIESGO_DIRECTO_-1_saldoUnique_ult12meses             0.008339
RIESGO_DIRECTO_1_saldoStd_ult1meses                  0.008166
RIESGO_DIRECTO_-1_saldoMedian_ult1meses              0.008096
ciiu_mode_me                                         0.007292
cod_instit_financiera_mode_ult1mes                   0.006379
RIESGO_DIRECTO_1_saldoMin_ult1meses                  0.006263
RIESGO_DIRECTO_-1_saldoMedian_ult2meses              0.006237
cod_instit_financiera_min_ult1mes                    0.005938
sexo_0                                               0.005311
RIESGO_DIRECTO_1_saldoMax_ult12meses                 0.005276
condicion_0_saldoUnique_ult12meses                   0.005157
RIESGO_D

In [39]:
test_probs.name = 'target'
test_probs.to_csv('../results/lightgbm_with_1136features_me_0.84887.csv') ### score de  0.84649 en la tabla publica

### drop 5 features related in cod_instit_financiera_mode_ultxxmes

In [37]:
drop_columns = ['cod_instit_financiera_mode_ult11mes', 'cod_instit_financiera_mode_ult10mes',
'cod_instit_financiera_mode_ult8mes', 'cod_instit_financiera_mode_ult9mes', 'cod_instit_financiera_mode_ult7mes']

In [38]:
[test_probs, fi_] = cross_validation_lightgbm(train_.drop(drop_columns, axis=1),
                                             test_.drop(drop_columns, axis=1), 
                                             y_train)

********** 0 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.842846	training's binary_logloss: 0.303058	valid_1's auc: 0.834692	valid_1's binary_logloss: 0.309181
[100]	training's auc: 0.856809	training's binary_logloss: 0.29212	valid_1's auc: 0.842807	valid_1's binary_logloss: 0.30278
[150]	training's auc: 0.865174	training's binary_logloss: 0.28571	valid_1's auc: 0.84551	valid_1's binary_logloss: 0.300698
[200]	training's auc: 0.871963	training's binary_logloss: 0.280493	valid_1's auc: 0.846676	valid_1's binary_logloss: 0.299735
[250]	training's auc: 0.877632	training's binary_logloss: 0.276077	valid_1's auc: 0.847408	valid_1's binary_logloss: 0.299166
[300]	training's auc: 0.882612	training's binary_logloss: 0.272098	valid_1's auc: 0.847794	valid_1's binary_logloss: 0.298804
Early stopping, best iteration is:
[298]	training's auc: 0.882378	training's binary_logloss: 0.272284	valid_1's auc: 0.847826	valid_1's binary_logloss: 0.298784
***

In [39]:
test_probs.name='target'
test_probs.to_csv('../results/lightgbm_with_1136features_drop5features__0.84650.csv') ### score de  0.84812 en la tabla publica 