In [1]:
import pandas as pd
from tqdm.notebook import tqdm 
import gc
import numpy as np

In [2]:
import statistics as stat
def calculate_mode(x):
    """Return the most common value of ``x``, or NaN when it cannot be computed.

    Args:
        x: Iterable of hashable values (e.g. one group of a column handed
            over by a pandas ``agg`` call).

    Returns:
        The result of ``statistics.mode(x)``, or ``np.nan`` if that call
        raises an ordinary exception (for example ``StatisticsError`` on
        empty data).
    """
    try:
        return stat.mode(x)
    except Exception:
        # Best-effort fallback. Catching Exception instead of using a bare
        # ``except`` lets KeyboardInterrupt/SystemExit propagate, so a long
        # aggregation can still be interrupted from the notebook.
        return np.nan

In [3]:
# Directory that holds every input CSV, relative to the notebook.
path = '../data'
# Labels indexed by key_value; the modelling cells read their `target` column.
y_train = pd.read_csv(f'{path}/y_train.csv', index_col = 'key_value')

## Procesar RCC

In [4]:
# Load the raw RCC records for both splits. key_value is kept as a regular
# column here because later cells group by it.
rcc_train = pd.read_csv(f'{path}/rcc_train.csv')
rcc_test = pd.read_csv(f'{path}/rcc_test.csv')

In [5]:
# Fill missing values in the test split with each column's most frequent
# value (value_counts() sorts by count, so index[0] is the top value).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: under pandas Copy-on-Write the
# inplace form updates a temporary object and leaves rcc_test unchanged.
for col in ['cod_instit_financiera', 'PRODUCTO']:
    rcc_test[col] = rcc_test[col].fillna(rcc_test[col].value_counts().index[0])

In [6]:
# Cast the month, key and coded RCC columns to 32-bit integers in both splits.
dict_ = {
    column: 'int32'
    for column in (
        'codmes',
        'key_value',
        'condicion',
        'tipo_credito',
        'cod_instit_financiera',
        'PRODUCTO',
        'RIESGO_DIRECTO',
        'COD_CLASIFICACION_DEUDOR',
    )
}
rcc_train = rcc_train.astype(dict_)
rcc_test = rcc_test.astype(dict_)

In [7]:
# Replace `condicion` by the integer code of the right-closed interval it
# falls into; values outside every interval get code -1.
bins = [-1, 0, 10, 20, 30, 60, 90, 180, 360, 720, float("inf")]
for rcc_frame in (rcc_train, rcc_test):
    rcc_frame["condicion"] = pd.cut(rcc_frame["condicion"], bins).cat.codes

In [8]:
def moda(x):
    """Forward to calculate_mode; exists so the aggregate can be labelled 'mode'."""
    return calculate_mode(x)


# The callable's __name__ is what ends up in the aggregated column labels.
moda.__name__ = 'mode'

# Per-client aggregations applied to the institution and product codes.
agg_rcc = {
    column: ['nunique', 'min', 'max', moda]
    for column in ('cod_instit_financiera', 'PRODUCTO')
}

In [9]:
# Build per-client RCC features over expanding windows of recent months:
# window n+1 holds every row whose codmes is >= the (n+1)-th latest month.
#
# (method called on the grouped `saldo`, label used in the column name).
# Note that the 'size' statistic is published under the label 'saldoUnique';
# the label is kept as-is so the feature names do not change.
saldo_stats = [
    ('sum', 'saldoSum'),
    ('size', 'saldoUnique'),
    ('min', 'saldoMin'),
    ('max', 'saldoMax'),
    ('std', 'saldoStd'),
    ('mean', 'saldoMean'),
    ('median', 'saldoMedian'),
]

list_rcc_train_agg = []
for n, i in enumerate(sorted(set(rcc_train.codmes), reverse=True)):
    # Filter the window once per month instead of once per statistic.
    rcc_train_window = rcc_train[rcc_train.codmes >= i]

    for c in ['tipo_credito', 'RIESGO_DIRECTO', 'COD_CLASIFICACION_DEUDOR', 'condicion']:
        print(f'haciendo {c} desde {i}')
        saldo_by_code = rcc_train_window.groupby(['key_value', c]).saldo
        for stat_method, stat_label in saldo_stats:
            # One row per key_value, one column per distinct code of `c`.
            rcc_train_agg = getattr(saldo_by_code, stat_method)().unstack(level=1, fill_value=np.nan)
            rcc_train_agg.columns = [
                f'{c}_{code}_{stat_label}_ult{n+1}meses' for code in rcc_train_agg.columns
            ]
            list_rcc_train_agg.append(rcc_train_agg)
        gc.collect()

    print(f'haciendo aggregate de {i}')
    rcc_train_agg = rcc_train_window.groupby('key_value').agg(agg_rcc)
    # Flatten the (column, function) MultiIndex into a single label.
    rcc_train_agg.columns = [f'{col}_{func}_ult{n+1}mes' for col, func in rcc_train_agg.columns]
    list_rcc_train_agg.append(rcc_train_agg)
    del rcc_train_window
    gc.collect()

# Align every feature block on key_value, then release the raw records.
rcc_train_ = pd.concat(list_rcc_train_agg, axis=1)
del rcc_train, list_rcc_train_agg

haciendo tipo_credito desde 201802
haciendo RIESGO_DIRECTO desde 201802
haciendo COD_CLASIFICACION_DEUDOR desde 201802
haciendo condicion desde 201802
haciendo aggregate de 201802
haciendo tipo_credito desde 201801
haciendo RIESGO_DIRECTO desde 201801
haciendo COD_CLASIFICACION_DEUDOR desde 201801
haciendo condicion desde 201801
haciendo aggregate de 201801
haciendo tipo_credito desde 201712
haciendo RIESGO_DIRECTO desde 201712
haciendo COD_CLASIFICACION_DEUDOR desde 201712
haciendo condicion desde 201712
haciendo aggregate de 201712
haciendo tipo_credito desde 201711
haciendo RIESGO_DIRECTO desde 201711
haciendo COD_CLASIFICACION_DEUDOR desde 201711
haciendo condicion desde 201711
haciendo aggregate de 201711
haciendo tipo_credito desde 201710
haciendo RIESGO_DIRECTO desde 201710
haciendo COD_CLASIFICACION_DEUDOR desde 201710
haciendo condicion desde 201710
haciendo aggregate de 201710
haciendo tipo_credito desde 201709
haciendo RIESGO_DIRECTO desde 201709
haciendo COD_CLASIFICACION_D

In [10]:
# Build per-client RCC features for the test split over expanding windows of
# recent months: window n+1 holds every row whose codmes is >= the (n+1)-th
# latest month.
#
# (method called on the grouped `saldo`, label used in the column name).
# Note that the 'size' statistic is published under the label 'saldoUnique';
# the label is kept as-is so the feature names do not change.
saldo_stats = [
    ('sum', 'saldoSum'),
    ('size', 'saldoUnique'),
    ('min', 'saldoMin'),
    ('max', 'saldoMax'),
    ('std', 'saldoStd'),
    ('mean', 'saldoMean'),
    ('median', 'saldoMedian'),
]

list_rcc_test_agg = []
for n, i in enumerate(sorted(set(rcc_test.codmes), reverse=True)):
    # Filter the window once per month instead of once per statistic.
    rcc_test_window = rcc_test[rcc_test.codmes >= i]

    for c in ['tipo_credito', 'RIESGO_DIRECTO', 'COD_CLASIFICACION_DEUDOR', 'condicion']:
        print(f'haciendo {c} desde {i}')
        saldo_by_code = rcc_test_window.groupby(['key_value', c]).saldo
        for stat_method, stat_label in saldo_stats:
            # One row per key_value, one column per distinct code of `c`.
            rcc_test_agg = getattr(saldo_by_code, stat_method)().unstack(level=1, fill_value=np.nan)
            rcc_test_agg.columns = [
                f'{c}_{code}_{stat_label}_ult{n+1}meses' for code in rcc_test_agg.columns
            ]
            list_rcc_test_agg.append(rcc_test_agg)
        gc.collect()

    print(f'haciendo aggregate de {i}')
    rcc_test_agg = rcc_test_window.groupby('key_value').agg(agg_rcc)
    # Flatten the (column, function) MultiIndex into a single label.
    rcc_test_agg.columns = [f'{col}_{func}_ult{n+1}mes' for col, func in rcc_test_agg.columns]
    list_rcc_test_agg.append(rcc_test_agg)
    del rcc_test_window
    gc.collect()

# Align every feature block on key_value, then release the raw records.
rcc_test_ = pd.concat(list_rcc_test_agg, axis=1)
del rcc_test, list_rcc_test_agg

haciendo tipo_credito desde 201902
haciendo RIESGO_DIRECTO desde 201902
haciendo COD_CLASIFICACION_DEUDOR desde 201902
haciendo condicion desde 201902
haciendo aggregate de 201902
haciendo tipo_credito desde 201901
haciendo RIESGO_DIRECTO desde 201901
haciendo COD_CLASIFICACION_DEUDOR desde 201901
haciendo condicion desde 201901
haciendo aggregate de 201901
haciendo tipo_credito desde 201812
haciendo RIESGO_DIRECTO desde 201812
haciendo COD_CLASIFICACION_DEUDOR desde 201812
haciendo condicion desde 201812
haciendo aggregate de 201812
haciendo tipo_credito desde 201811
haciendo RIESGO_DIRECTO desde 201811
haciendo COD_CLASIFICACION_DEUDOR desde 201811
haciendo condicion desde 201811
haciendo aggregate de 201811
haciendo tipo_credito desde 201810
haciendo RIESGO_DIRECTO desde 201810
haciendo COD_CLASIFICACION_DEUDOR desde 201810
haciendo condicion desde 201810
haciendo aggregate de 201810
haciendo tipo_credito desde 201809
haciendo RIESGO_DIRECTO desde 201809
haciendo COD_CLASIFICACION_D

In [11]:
### make sure both splits (train/test) end up with exactly the same columns
print(rcc_train_.shape, rcc_test_.shape)
# Keep the shared columns in train's original order. A plain set intersection
# iterates in string-hash order, which changes between kernel sessions and
# would make the column order (and anything that depends on it) vary from
# run to run.
test_columns = set(rcc_test_.columns)
keep_columns = [column for column in rcc_train_.columns if column in test_columns]
rcc_train_ = rcc_train_[keep_columns].copy()
rcc_test_ = rcc_test_[keep_columns].copy()
print(rcc_train_.shape, rcc_test_.shape)

(358487, 2532) (396666, 2616)
(358487, 2532) (396666, 2532)


In [12]:
### the RCC features become the starting point of the final tables
# Rebind the names instead of calling .copy(): the old names are deleted on
# the next line, so a deep copy would only hold a second full-size frame in
# memory for no benefit.
train = rcc_train_
test = rcc_test_
del rcc_train_, rcc_test_

In [13]:
# train.to_pickle('train_2532features.pkl')
# test.to_pickle('test_2532features.pkl')

In [15]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

def cross_validation_lightgbm(train, y_train, test):
    """Run 5-fold cross-validation with LightGBM and return mean feature importances.

    For every fold a classifier is fitted on the training part, with early
    stopping monitored on the held-out part, and the held-out rows are scored.
    The overall out-of-fold ROC AUC and the standard deviation of the
    per-fold validation AUCs are printed.

    Args:
        train: Feature DataFrame.
        y_train: DataFrame with a ``target`` column, indexed with the same
            labels as ``train``.
        test: Test feature DataFrame. Not used: only the importances are
            returned, so no test-set predictions are computed. The parameter
            is kept so existing calls keep working.

    Returns:
        pandas Series indexed by feature name holding the importance of each
        feature, normalised to sum to 1 within a fold and averaged over folds.
    """
    # Each entry holds the index labels of one fold's TRAINING rows.
    folds = [train.index[t] for t, v in KFold(5).split(train)]
    train_probs = []  # out-of-fold predictions
    fold_aucs = []    # validation AUC of each fold
    fi = []
    for i, idx in enumerate(folds):
        print("*"*10, i, "*"*10)
        Xt = train.loc[idx]
        yt = y_train.loc[Xt.index].target

        # Held-out part: every row that is not in this fold's training set.
        Xv = train.drop(Xt.index)
        yv = y_train.loc[Xv.index].target

        learner = LGBMClassifier(n_estimators=1000, boosting_type='gbdt',min_child_samples=1500, 
                       colsample_bytree=0.8,subsample=0.8, max_bin=200, learning_rate=0.1)
        learner.fit(Xt, yt,  early_stopping_rounds=10, eval_metric="auc",
                    eval_set=[(Xt, yt), (Xv, yv)], verbose=50)
        valid_probs = pd.Series(learner.predict_proba(Xv)[:, -1], index=Xv.index, name="probs")
        train_probs.append(valid_probs)
        # Score the fold on its own held-out rows. Indexing the concatenated
        # predictions with folds[i] would use training-row labels as positions.
        fold_aucs.append(roc_auc_score(yv, valid_probs))
        fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))

    train_probs = pd.concat(train_probs)
    fi = pd.concat(fi, axis=1).mean(axis=1)
    print("*" * 21)
    print("roc auc estimado: ", roc_auc_score(y_train, train_probs.loc[y_train.index]))
    # Printed under the label "varianza", but the value is a standard deviation.
    print("roc auc varianza: ", np.std(fold_aucs))
    return fi

In [22]:
# Prune features repeatedly until a cross-validation run reports no feature
# with zero importance. Every pass keeps only the top-ranked features whose
# cumulative importance stays within 99%.
while True:
    fi = cross_validation_lightgbm(train, y_train, test)
    zero_importance = fi[fi == 0]
    ranked = fi[fi > 0].sort_values(ascending=False)
    keep_columns = ranked[ranked.cumsum() <= 0.99].index.tolist()

    print(f'total de variables : {len(train.columns)}')
    print(f'variables con importancia acumulada al 99% : {len(keep_columns)}')
    print(f'variables con zero importancia : {len(zero_importance)}')
    train, test = train[keep_columns], test[keep_columns]
    if not len(zero_importance):
        break

********** 0 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.83882	training's binary_logloss: 0.304788	valid_1's auc: 0.830858	valid_1's binary_logloss: 0.310663
[100]	training's auc: 0.851767	training's binary_logloss: 0.295217	valid_1's auc: 0.837418	valid_1's binary_logloss: 0.305689
[150]	training's auc: 0.85948	training's binary_logloss: 0.289459	valid_1's auc: 0.83942	valid_1's binary_logloss: 0.304103
[200]	training's auc: 0.865892	training's binary_logloss: 0.284753	valid_1's auc: 0.84049	valid_1's binary_logloss: 0.303278
Early stopping, best iteration is:
[216]	training's auc: 0.867463	training's binary_logloss: 0.283446	valid_1's auc: 0.840636	valid_1's binary_logloss: 0.303178
********** 1 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.838444	training's binary_logloss: 0.30488	valid_1's auc: 0.830145	valid_1's binary_logloss: 0.310827
[100]	training's auc: 0.85122	training's binar

[100]	training's auc: 0.851265	training's binary_logloss: 0.295491	valid_1's auc: 0.836937	valid_1's binary_logloss: 0.306101
[150]	training's auc: 0.859193	training's binary_logloss: 0.289689	valid_1's auc: 0.839099	valid_1's binary_logloss: 0.304503
[200]	training's auc: 0.86571	training's binary_logloss: 0.284861	valid_1's auc: 0.84018	valid_1's binary_logloss: 0.303777
[250]	training's auc: 0.871272	training's binary_logloss: 0.280664	valid_1's auc: 0.840795	valid_1's binary_logloss: 0.303378
Early stopping, best iteration is:
[269]	training's auc: 0.873068	training's binary_logloss: 0.279276	valid_1's auc: 0.840894	valid_1's binary_logloss: 0.303313
********** 1 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.838426	training's binary_logloss: 0.304897	valid_1's auc: 0.829758	valid_1's binary_logloss: 0.311015
[100]	training's auc: 0.851152	training's binary_logloss: 0.295415	valid_1's auc: 0.836629	valid_1's binary_logloss: 0.305945
[

In [28]:
# Display the (rows, columns) shape of both tables.
train.shape,  test.shape

((358487, 1064), (396666, 1064))

In [29]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# 5-fold cross-validation. Each entry of `folds` holds the index labels of
# one fold's TRAINING rows.
folds = [train.index[t] for t, v in KFold(5).split(train)]
test_probs = []   # per-fold predictions on the test set
train_probs = []  # out-of-fold predictions on the training set
fold_aucs = []    # validation AUC of each fold
fi = []           # per-fold feature importances, normalised to sum to 1
for i, idx in enumerate(folds):
    print("*"*10, i, "*"*10)
    Xt = train.loc[idx]
    yt = y_train.loc[Xt.index].target

    # Held-out part: every row that is not in this fold's training set.
    Xv = train.drop(Xt.index)
    yv = y_train.loc[Xv.index].target

    learner = LGBMClassifier(n_estimators=1000, boosting_type='gbdt',min_child_samples=1500, 
                   colsample_bytree=0.8,subsample=0.8, max_bin=200, learning_rate=0.1)
    learner.fit(Xt, yt,  early_stopping_rounds=10, eval_metric="auc",
                eval_set=[(Xt, yt), (Xv, yv)], verbose=50)
    test_probs.append(pd.Series(learner.predict_proba(test)[:, -1], index=test.index, name="fold_" + str(i)))
    valid_probs = pd.Series(learner.predict_proba(Xv)[:, -1], index=Xv.index, name="probs")
    train_probs.append(valid_probs)
    # Score the fold on its own held-out rows. Indexing the concatenated
    # predictions with folds[i] would use training-row labels as positions.
    fold_aucs.append(roc_auc_score(yv, valid_probs))
    fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))

# Average the test predictions over folds; stack the out-of-fold predictions.
test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
train_probs = pd.concat(train_probs)
fi = pd.concat(fi, axis=1).mean(axis=1)
print("*" * 21)
print("roc auc estimado: ", roc_auc_score(y_train, train_probs.loc[y_train.index]))
# Printed under the label "varianza", but the value is a standard deviation.
print("roc auc varianza: ", np.std(fold_aucs))

********** 0 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.838761	training's binary_logloss: 0.304822	valid_1's auc: 0.830556	valid_1's binary_logloss: 0.310838
[100]	training's auc: 0.851409	training's binary_logloss: 0.295462	valid_1's auc: 0.836985	valid_1's binary_logloss: 0.306073
[150]	training's auc: 0.85943	training's binary_logloss: 0.289585	valid_1's auc: 0.839204	valid_1's binary_logloss: 0.304374
[200]	training's auc: 0.865853	training's binary_logloss: 0.284853	valid_1's auc: 0.840292	valid_1's binary_logloss: 0.303612
[250]	training's auc: 0.871309	training's binary_logloss: 0.280703	valid_1's auc: 0.840861	valid_1's binary_logloss: 0.303241
Early stopping, best iteration is:
[247]	training's auc: 0.871011	training's binary_logloss: 0.280922	valid_1's auc: 0.84091	valid_1's binary_logloss: 0.303218
********** 1 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.838125	training's b

In [32]:
# Name the fold-averaged test predictions `target` and write them to the results folder.
test_probs.name = 'target'
test_probs.to_csv('../results/lightgbm_with_1064features_0.83955.csv') ### public leaderboard score: 0.84242

In [33]:
# train.to_pickle('train_1064features.pkl')
# test.to_pickle('test_1064features.pkl')

## Procesar SE

In [34]:
# Load the SE tables for both splits, indexed by key_value so they can be
# joined onto the feature tables by index.
se_train = pd.read_csv(f'{path}/se_train.csv', index_col = 'key_value')
se_test = pd.read_csv(f'{path}/se_test.csv', index_col = 'key_value')

In [35]:
# Cast every coded SE column to a 32-bit integer in both splits.
dict_ = {
    column: 'int32'
    for column in (
        'sexo',
        'est_cvl',
        'sit_lab',
        'cod_ocu',
        'ctd_hijos',
        'flg_sin_email',
        'ctd_veh',
        'lgr_vot',
        'prv',
        'dto',
        'rgn',
        'tip_lvledu',
    )
}
se_train = se_train.astype(dict_)
se_test = se_test.astype(dict_)

In [36]:
### join the SE features onto the final tables
# DataFrame.join aligns on the index, which is key_value for the SE frames.
train = train.join(se_train) 
test = test.join(se_test)
# The SE frames are no longer needed once joined.
del se_train, se_test

In [37]:
# Display the (rows, columns) shape of both tables after the SE join.
train.shape, test.shape

((358487, 1078), (396666, 1078))

In [38]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# 5-fold cross-validation. Each entry of `folds` holds the index labels of
# one fold's TRAINING rows.
folds = [train.index[t] for t, v in KFold(5).split(train)]
test_probs = []   # per-fold predictions on the test set
train_probs = []  # out-of-fold predictions on the training set
fold_aucs = []    # validation AUC of each fold
fi = []           # per-fold feature importances, normalised to sum to 1
for i, idx in enumerate(folds):
    print("*"*10, i, "*"*10)
    Xt = train.loc[idx]
    yt = y_train.loc[Xt.index].target

    # Held-out part: every row that is not in this fold's training set.
    Xv = train.drop(Xt.index)
    yv = y_train.loc[Xv.index].target

    learner = LGBMClassifier(n_estimators=1000, boosting_type='gbdt',min_child_samples=1500, 
                   colsample_bytree=0.8,subsample=0.8, max_bin=200, learning_rate=0.1)
    learner.fit(Xt, yt,  early_stopping_rounds=10, eval_metric="auc",
                eval_set=[(Xt, yt), (Xv, yv)], verbose=50)
    test_probs.append(pd.Series(learner.predict_proba(test)[:, -1], index=test.index, name="fold_" + str(i)))
    valid_probs = pd.Series(learner.predict_proba(Xv)[:, -1], index=Xv.index, name="probs")
    train_probs.append(valid_probs)
    # Score the fold on its own held-out rows. Indexing the concatenated
    # predictions with folds[i] would use training-row labels as positions.
    fold_aucs.append(roc_auc_score(yv, valid_probs))
    fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))

# Average the test predictions over folds; stack the out-of-fold predictions.
test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
train_probs = pd.concat(train_probs)
fi = pd.concat(fi, axis=1).mean(axis=1)
print("*" * 21)
print("roc auc estimado: ", roc_auc_score(y_train, train_probs.loc[y_train.index]))
# Printed under the label "varianza", but the value is a standard deviation.
print("roc auc varianza: ", np.std(fold_aucs))

********** 0 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.845062	training's binary_logloss: 0.301597	valid_1's auc: 0.83638	valid_1's binary_logloss: 0.308295
[100]	training's auc: 0.859504	training's binary_logloss: 0.290151	valid_1's auc: 0.845125	valid_1's binary_logloss: 0.301465
[150]	training's auc: 0.868168	training's binary_logloss: 0.283312	valid_1's auc: 0.848301	valid_1's binary_logloss: 0.298846
[200]	training's auc: 0.874831	training's binary_logloss: 0.278033	valid_1's auc: 0.849607	valid_1's binary_logloss: 0.29769
[250]	training's auc: 0.880484	training's binary_logloss: 0.273542	valid_1's auc: 0.850544	valid_1's binary_logloss: 0.296978
[300]	training's auc: 0.88537	training's binary_logloss: 0.269528	valid_1's auc: 0.851007	valid_1's binary_logloss: 0.296578
Early stopping, best iteration is:
[298]	training's auc: 0.885205	training's binary_logloss: 0.269669	valid_1's auc: 0.851064	valid_1's binary_logloss: 0.296548
**

In [39]:
# Name the fold-averaged test predictions `target` and write them to the results folder.
test_probs.name = 'target'
test_probs.to_csv('../results/lightgbm_with_1078features_0.84992.csv') ### public leaderboard score: 0.84599

In [None]:
# train.to_pickle('train_1078features.pkl')
# test.to_pickle('test_1078features.pkl')

## Procesar SUNAT

In [40]:
# Load the raw SUNAT records for both splits. key_value stays a regular
# column because a later cell groups by it.
sunat_train = pd.read_csv(f'{path}/sunat_train.csv')
sunat_test = pd.read_csv(f'{path}/sunat_test.csv')

In [41]:
### remove duplicated records
# Done in place so the frames keep their identity for the column
# assignments in the following cells.
sunat_train.drop_duplicates(inplace=True)
sunat_test.drop_duplicates(inplace=True)
# Display the shapes that remain after de-duplication.
sunat_train.shape, sunat_test.shape

((292479, 18), (318821, 18))

In [42]:
# Cast every coded SUNAT column to a 32-bit integer in both splits.
dict_ = {
    column: 'int32'
    for column in (
        'tipcontribuyente',
        'tippersona',
        'ciiu',
        'ubigeo',
        'condiciondomicilio',
        'estadocontribuyente',
        'codvia',
        'codzona',
        'contabilidad',
        'facturacion',
        'domiciliado',
        'comercioexterior',
        'cargorele',
        'codentidadtributo',
        'estadotributo',
    )
}
sunat_train = sunat_train.astype(dict_)
sunat_test = sunat_test.astype(dict_)

In [43]:
# Add the difference between fecbaja and fecalta as a new column in both splits.
for sunat_frame in (sunat_train, sunat_test):
    sunat_frame['diff_fech'] = sunat_frame['fecbaja'] - sunat_frame['fecalta']

In [44]:
# Wrapper whose __name__ becomes the label of the aggregated column.
moda = lambda x: calculate_mode(x)
moda.__name__ = 'mode'

# Coded columns get the number of distinct values and the mode; the two
# fec* columns and their difference get numeric summary statistics.
sunat_code_columns = [
    'tipcontribuyente', 'tippersona', 'ciiu', 'ubigeo', 'condiciondomicilio',
    'estadocontribuyente', 'codvia', 'codzona', 'contabilidad', 'facturacion',
    'domiciliado', 'comercioexterior', 'cargorele', 'codentidadtributo',
    'estadotributo',
]
sunat_numeric_columns = ['fecalta', 'fecbaja', 'diff_fech']

agg_sunat = {}
for column in sunat_code_columns:
    agg_sunat[column] = ['nunique', moda]
for column in sunat_numeric_columns:
    agg_sunat[column] = ['mean', 'max', 'nunique', 'min', 'std']

# One row per key_value; flatten the (column, function) MultiIndex labels.
sunat_train_ = sunat_train.groupby('key_value').agg(agg_sunat)
sunat_train_.columns = ['_'.join(pair) for pair in sunat_train_.columns]
sunat_test_ = sunat_test.groupby('key_value').agg(agg_sunat)
sunat_test_.columns = ['_'.join(pair) for pair in sunat_test_.columns]
del sunat_train, sunat_test

In [47]:
### join the SUNAT features onto the final tables
# DataFrame.join aligns on the index; the aggregated SUNAT frames are
# indexed by key_value (the groupby key).
train = train.join(sunat_train_)
test = test.join(sunat_test_)
# The aggregated frames are no longer needed once joined.
del sunat_train_, sunat_test_

In [49]:
# Display the (rows, columns) shape of both tables after the SUNAT join.
train.shape, test.shape

((358487, 1123), (396666, 1123))

In [48]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# 5-fold cross-validation. Each entry of `folds` holds the index labels of
# one fold's TRAINING rows.
folds = [train.index[t] for t, v in KFold(5).split(train)]
test_probs = []   # per-fold predictions on the test set
train_probs = []  # out-of-fold predictions on the training set
fold_aucs = []    # validation AUC of each fold
fi = []           # per-fold feature importances, normalised to sum to 1
for i, idx in enumerate(folds):
    print("*"*10, i, "*"*10)
    Xt = train.loc[idx]
    yt = y_train.loc[Xt.index].target

    # Held-out part: every row that is not in this fold's training set.
    Xv = train.drop(Xt.index)
    yv = y_train.loc[Xv.index].target

    learner = LGBMClassifier(n_estimators=1000, boosting_type='gbdt',min_child_samples=1500, 
                   colsample_bytree=0.8,subsample=0.8, max_bin=200, learning_rate=0.1)
    learner.fit(Xt, yt,  early_stopping_rounds=10, eval_metric="auc",
                eval_set=[(Xt, yt), (Xv, yv)], verbose=50)
    test_probs.append(pd.Series(learner.predict_proba(test)[:, -1], index=test.index, name="fold_" + str(i)))
    valid_probs = pd.Series(learner.predict_proba(Xv)[:, -1], index=Xv.index, name="probs")
    train_probs.append(valid_probs)
    # Score the fold on its own held-out rows. Indexing the concatenated
    # predictions with folds[i] would use training-row labels as positions.
    fold_aucs.append(roc_auc_score(yv, valid_probs))
    fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))

# Average the test predictions over folds; stack the out-of-fold predictions.
test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
train_probs = pd.concat(train_probs)
fi = pd.concat(fi, axis=1).mean(axis=1)
print("*" * 21)
print("roc auc estimado: ", roc_auc_score(y_train, train_probs.loc[y_train.index]))
# Printed under the label "varianza", but the value is a standard deviation.
print("roc auc varianza: ", np.std(fold_aucs))

********** 0 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.845801	training's binary_logloss: 0.301333	valid_1's auc: 0.837149	valid_1's binary_logloss: 0.307805
[100]	training's auc: 0.860437	training's binary_logloss: 0.289712	valid_1's auc: 0.846237	valid_1's binary_logloss: 0.300648
[150]	training's auc: 0.869258	training's binary_logloss: 0.282753	valid_1's auc: 0.849549	valid_1's binary_logloss: 0.297963
[200]	training's auc: 0.875982	training's binary_logloss: 0.277379	valid_1's auc: 0.851073	valid_1's binary_logloss: 0.296692
[250]	training's auc: 0.881774	training's binary_logloss: 0.272785	valid_1's auc: 0.851484	valid_1's binary_logloss: 0.296332
[300]	training's auc: 0.88701	training's binary_logloss: 0.268602	valid_1's auc: 0.852077	valid_1's binary_logloss: 0.295892
Early stopping, best iteration is:
[316]	training's auc: 0.888411	training's binary_logloss: 0.2674	valid_1's auc: 0.852183	valid_1's binary_logloss: 0.295783
**

In [50]:
# Name the fold-averaged test predictions `target` and write them to the results folder.
test_probs.name = 'target'
test_probs.to_csv('../results/lightgbm_with_1123features_0.85108.csv') ### public leaderboard score: 0.84723

In [51]:
# train.to_pickle('train_1123features.pkl')
# test.to_pickle('test_1123features.pkl')

In [55]:
# Rank the features with non-zero importance and keep the leading ones whose
# cumulative importance stays within 99%.
zero_importance = fi[fi == 0]
ranked = fi[fi > 0].sort_values(ascending=False)
keep_columns = ranked[ranked.cumsum() <= 0.99].index.tolist()

print(f'total de variables : {len(train.columns)}')
print(f'variables con importancia acumulada al 99% : {len(keep_columns)}')
print(f'variables con zero importancia : {len(zero_importance)}')

total de variables : 1123
variables con importancia acumulada al 99% : 1048
variables con zero importancia : 4


In [56]:
# Restrict both tables to the features selected in `keep_columns`.
train = train[keep_columns]
test = test[keep_columns]

In [57]:
# Display the (rows, columns) shape of both tables after the feature selection.
train.shape, test.shape

((358487, 1048), (396666, 1048))

In [58]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# 5-fold cross-validation. Each entry of `folds` holds the index labels of
# one fold's TRAINING rows.
folds = [train.index[t] for t, v in KFold(5).split(train)]
test_probs = []   # per-fold predictions on the test set
train_probs = []  # out-of-fold predictions on the training set
fold_aucs = []    # validation AUC of each fold
fi = []           # per-fold feature importances, normalised to sum to 1
for i, idx in enumerate(folds):
    print("*"*10, i, "*"*10)
    Xt = train.loc[idx]
    yt = y_train.loc[Xt.index].target

    # Held-out part: every row that is not in this fold's training set.
    Xv = train.drop(Xt.index)
    yv = y_train.loc[Xv.index].target

    learner = LGBMClassifier(n_estimators=1000, boosting_type='gbdt',min_child_samples=1500, 
                   colsample_bytree=0.8,subsample=0.8, max_bin=200, learning_rate=0.1)
    learner.fit(Xt, yt,  early_stopping_rounds=10, eval_metric="auc",
                eval_set=[(Xt, yt), (Xv, yv)], verbose=50)
    test_probs.append(pd.Series(learner.predict_proba(test)[:, -1], index=test.index, name="fold_" + str(i)))
    valid_probs = pd.Series(learner.predict_proba(Xv)[:, -1], index=Xv.index, name="probs")
    train_probs.append(valid_probs)
    # Score the fold on its own held-out rows. Indexing the concatenated
    # predictions with folds[i] would use training-row labels as positions.
    fold_aucs.append(roc_auc_score(yv, valid_probs))
    fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))

# Average the test predictions over folds; stack the out-of-fold predictions.
test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
train_probs = pd.concat(train_probs)
fi = pd.concat(fi, axis=1).mean(axis=1)
print("*" * 21)
print("roc auc estimado: ", roc_auc_score(y_train, train_probs.loc[y_train.index]))
# Printed under the label "varianza", but the value is a standard deviation.
print("roc auc varianza: ", np.std(fold_aucs))

********** 0 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.845481	training's binary_logloss: 0.301328	valid_1's auc: 0.836866	valid_1's binary_logloss: 0.3081
[100]	training's auc: 0.860185	training's binary_logloss: 0.289838	valid_1's auc: 0.846036	valid_1's binary_logloss: 0.301014
[150]	training's auc: 0.869007	training's binary_logloss: 0.282863	valid_1's auc: 0.849365	valid_1's binary_logloss: 0.298266
[200]	training's auc: 0.875714	training's binary_logloss: 0.27756	valid_1's auc: 0.851014	valid_1's binary_logloss: 0.296969
[250]	training's auc: 0.881521	training's binary_logloss: 0.272957	valid_1's auc: 0.851811	valid_1's binary_logloss: 0.29636
[300]	training's auc: 0.886627	training's binary_logloss: 0.268763	valid_1's auc: 0.85235	valid_1's binary_logloss: 0.295868
Early stopping, best iteration is:
[297]	training's auc: 0.886302	training's binary_logloss: 0.269021	valid_1's auc: 0.852364	valid_1's binary_logloss: 0.295882
****

In [59]:
# Name the fold-averaged test predictions `target` and write them to the results folder.
test_probs.name = 'target'
test_probs.to_csv('../results/lightgbm_with_1048features_0.85100.csv') ### public leaderboard score: 0.84695

In [60]:
# Rank the features with non-zero importance and keep the leading ones whose
# cumulative importance stays within 99%.
zero_importance = fi[fi == 0]
ranked = fi[fi > 0].sort_values(ascending=False)
keep_columns = ranked[ranked.cumsum() <= 0.99].index.tolist()

print(f'total de variables : {len(train.columns)}')
print(f'variables con importancia acumulada al 99% : {len(keep_columns)}')
print(f'variables con zero importancia : {len(zero_importance)}')

total de variables : 1048
variables con importancia acumulada al 99% : 997
variables con zero importancia : 0
