In [1]:
import pandas as pd
from tqdm.notebook import tqdm 
import gc
import numpy as np

In [2]:
import statistics as stat
def calculate_mode(x):
    """Return the most common value of ``x``, or NaN when it cannot be computed.

    Parameters
    ----------
    x : iterable
        Values to inspect. In this notebook it is called through
        ``groupby(...).agg`` on the SUNAT columns.

    Returns
    -------
    object or float
        The result of ``statistics.mode(x)``; ``np.nan`` when that call raises an
        ordinary exception (for example ``StatisticsError`` on empty data).
    """
    try:
        return stat.mode(x)
    except Exception:
        # Best-effort fallback kept from the original code, but narrowed from a
        # bare ``except:`` so that KeyboardInterrupt / SystemExit are no longer
        # swallowed and a long-running aggregation can actually be interrupted.
        return np.nan

In [3]:
# Folder holding the raw CSV inputs, relative to the notebook's working directory.
path = '../data'
# Label table indexed by key_value; later cells read its `target` column.
y_train = pd.read_csv(f'{path}/y_train.csv', index_col = 'key_value')

In [4]:
# Load the feature matrices saved earlier (presumably by a previous notebook — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickles you produced yourself.
train = pd.read_pickle('train_1064features.pkl')
test = pd.read_pickle('test_1064features.pkl')
# Show both shapes as the cell output.
train.shape, test.shape

((358487, 1064), (396666, 1064))

## Procesar SUNAT

In [5]:
# Raw SUNAT tables for train and test; key_value stays a regular column here
# because the aggregation below groups by it.
sunat_train = pd.read_csv(f'{path}/sunat_train.csv')
sunat_test = pd.read_csv(f'{path}/sunat_test.csv')

In [6]:
### eliminar registros duplicados
sunat_train.drop_duplicates(inplace=True)
sunat_test.drop_duplicates(inplace=True)
sunat_train.shape, sunat_test.shape

((292479, 18), (318821, 18))

In [7]:
# Every coded SUNAT column is cast to int32; the mapping is built from the column
# list so the target dtype is written only once.
dict_ = dict.fromkeys(
    [
        'tipcontribuyente',
        'tippersona',
        'ciiu',
        'ubigeo',
        'condiciondomicilio',
        'estadocontribuyente',
        'codvia',
        'codzona',
        'contabilidad',
        'facturacion',
        'domiciliado',
        'comercioexterior',
        'cargorele',
        'codentidadtributo',
        'estadotributo',
    ],
    'int32',
)
sunat_train = sunat_train.astype(dict_)
sunat_test = sunat_test.astype(dict_)

In [8]:
# New column diff_fech = fecbaja - fecalta, appended to both SUNAT tables.
sunat_train = sunat_train.assign(diff_fech=sunat_train['fecbaja'] - sunat_train['fecalta'])
sunat_test = sunat_test.assign(diff_fech=sunat_test['fecbaja'] - sunat_test['fecalta'])

In [9]:
def moda(x):
    # Thin wrapper around calculate_mode: it gets its own __name__ so the aggregated
    # columns end in "_mode" without renaming calculate_mode itself.
    return calculate_mode(x)
moda.__name__='mode'

# Coded columns: number of distinct values plus the most common one per key_value.
agg_sunat = {
    column: ['nunique', moda]
    for column in [
        'tipcontribuyente', 'tippersona', 'ciiu', 'ubigeo', 'condiciondomicilio',
        'estadocontribuyente', 'codvia', 'codzona', 'contabilidad', 'facturacion',
        'domiciliado', 'comercioexterior', 'cargorele', 'codentidadtributo',
        'estadotributo',
    ]
}
# fecalta / fecbaja / diff_fech: summary statistics per key_value.
agg_sunat.update({
    column: ['mean', 'max', 'nunique', 'min', 'std']
    for column in ['fecalta', 'fecbaja', 'diff_fech']
})

# One row per key_value; the two-level column index is flattened to "<column>_<stat>".
sunat_train_ = sunat_train.groupby('key_value').agg(agg_sunat)
sunat_train_.columns = ['_'.join(pair) for pair in sunat_train_.columns]
sunat_test_ = sunat_test.groupby('key_value').agg(agg_sunat)
sunat_test_.columns = ['_'.join(pair) for pair in sunat_test_.columns]
# The raw tables are no longer needed once aggregated.
del sunat_train, sunat_test

In [10]:
### join the SUNAT aggregates into the final table
# Index-on-index left join: rows of train/test without SUNAT records get NaN
# in the new columns.
train = train.join(sunat_train_)
test = test.join(sunat_test_)
# Release the aggregated frames once they have been merged.
del sunat_train_, sunat_test_
# Show both shapes as the cell output.
train.shape, test.shape

((358487, 1109), (396666, 1109))

In [11]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# 5-fold cross-validation. KFold is used without shuffling; `folds` holds, for each
# fold, the index labels of the rows used for FITTING.
folds = [train.index[t] for t, v in KFold(5).split(train)]
test_probs = []
train_probs = []
fi = []
fold_aucs = []  # validation AUC of each fold, used for the spread printed at the end
for i, idx in enumerate(folds):
    print("*"*10, i, "*"*10)
    Xt = train.loc[idx]
    yt = y_train.loc[Xt.index].target

    # Validation rows = every row that was not used for fitting in this fold.
    Xv = train.drop(Xt.index)
    yv = y_train.loc[Xv.index].target

    # NOTE(review): per the LightGBM parameter docs, `subsample` only takes effect when
    # `subsample_freq` > 0; left untouched so scores stay comparable with earlier runs.
    learner = LGBMClassifier(n_estimators=1000, boosting_type='gbdt',min_child_samples=1500,
                   colsample_bytree=0.8,subsample=0.8, max_bin=200, learning_rate=0.1)
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords were removed in
    # LightGBM 4.0; switch to the early_stopping / log_evaluation callbacks when upgrading.
    learner.fit(Xt, yt,  early_stopping_rounds=10, eval_metric="auc",
                eval_set=[(Xt, yt), (Xv, yv)], verbose=50)
    test_probs.append(pd.Series(learner.predict_proba(test)[:, -1], index=test.index, name="fold_" + str(i)))
    val_probs = pd.Series(learner.predict_proba(Xv)[:, -1], index=Xv.index, name="probs")
    train_probs.append(val_probs)
    fold_aucs.append(roc_auc_score(yv, val_probs))
    fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))

# Test prediction = mean over folds; train_probs = out-of-fold prediction for every row;
# fi = mean normalised feature importance.
test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
train_probs = pd.concat(train_probs)
fi = pd.concat(fi, axis=1).mean(axis=1)
print("*" * 21)
print("roc auc estimado: ", roc_auc_score(y_train.target, train_probs.loc[y_train.index]))
# Fix: the spread used to be computed with `train_probs.iloc[folds[i]]`, i.e. positional
# indexing fed with index labels, and over the FITTING rows of each fold. It is now the
# standard deviation of the per-fold validation AUCs collected in the loop
# (the printed label is kept as-is; the value is a standard deviation).
print("roc auc varianza: ", np.std(fold_aucs))

********** 0 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.838898	training's binary_logloss: 0.304707	valid_1's auc: 0.831139	valid_1's binary_logloss: 0.310739
[100]	training's auc: 0.852605	training's binary_logloss: 0.294738	valid_1's auc: 0.838601	valid_1's binary_logloss: 0.305226
[150]	training's auc: 0.861071	training's binary_logloss: 0.288522	valid_1's auc: 0.841252	valid_1's binary_logloss: 0.303124
[200]	training's auc: 0.867923	training's binary_logloss: 0.28343	valid_1's auc: 0.842414	valid_1's binary_logloss: 0.30222
[250]	training's auc: 0.874006	training's binary_logloss: 0.278962	valid_1's auc: 0.84297	valid_1's binary_logloss: 0.301801
[300]	training's auc: 0.879019	training's binary_logloss: 0.275043	valid_1's auc: 0.84343	valid_1's binary_logloss: 0.301518
Early stopping, best iteration is:
[319]	training's auc: 0.880877	training's binary_logloss: 0.273608	valid_1's auc: 0.84363	valid_1's binary_logloss: 0.301381
****

In [12]:
# test_probs.name = 'target'
# test_probs.to_csv('../results/lightgbm_with_1109features_sinSE_0.84199.csv') 
# score de 0.8424 ---------a ---  0.84479  en la tabla publica

## Procesar SE

In [11]:
# SE tables, indexed by key_value so they can be joined onto train/test by index.
se_train = pd.read_csv(f'{path}/se_train.csv', index_col = 'key_value')
se_test = pd.read_csv(f'{path}/se_test.csv', index_col = 'key_value')

In [12]:
# Every coded SE column is cast to int32; the mapping is built from the column list
# so the target dtype is written only once.
dict_ = dict.fromkeys(
    [
        'sexo',
        'est_cvl',
        'sit_lab',
        'cod_ocu',
        'ctd_hijos',
        'flg_sin_email',
        'ctd_veh',
        'lgr_vot',
        'prv',
        'dto',
        'rgn',
        'tip_lvledu',
    ],
    'int32',
)
se_train = se_train.astype(dict_)
se_test = se_test.astype(dict_)

In [13]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

def cross_validation_lightgbm(train, test, y_train):
    """Run 5-fold cross-validation with LightGBM and return averaged test predictions.

    For every fold a fresh ``LGBMClassifier`` is fitted on the fitting rows with AUC
    early stopping evaluated on the held-out rows; the held-out rows and the whole
    ``test`` frame are then scored. Two summary lines are printed: the AUC of the
    out-of-fold predictions over all rows, and the standard deviation of the
    per-fold validation AUCs.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features; its index labels are looked up in ``y_train``
        (assumed unique — TODO confirm).
    test : pandas.DataFrame
        Rows to score; passed as-is to ``predict_proba``.
    y_train : pandas.DataFrame
        Labels with a ``target`` column, indexed like ``train``.

    Returns
    -------
    pandas.Series
        Mean over the folds of the last ``predict_proba`` column, indexed like ``test``.
    """
    # KFold without shuffling; each entry holds the index labels of the FITTING rows.
    folds = [train.index[t] for t, v in KFold(5).split(train)]
    test_probs = []
    train_probs = []
    fi = []
    fold_aucs = []  # validation AUC of each fold
    for i, idx in enumerate(folds):
        print("*"*10, i, "*"*10)
        Xt = train.loc[idx]
        yt = y_train.loc[Xt.index].target

        # Validation rows = every row that was not used for fitting in this fold.
        Xv = train.drop(Xt.index)
        yv = y_train.loc[Xv.index].target

        # NOTE(review): per the LightGBM parameter docs, `subsample` only takes effect
        # when `subsample_freq` > 0; left untouched so scores stay comparable.
        learner = LGBMClassifier(n_estimators=1000, boosting_type='gbdt',min_child_samples=1500,
                       colsample_bytree=0.8,subsample=0.8, max_bin=200, learning_rate=0.1)
        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords were removed
        # in LightGBM 4.0; switch to the callbacks API when upgrading.
        learner.fit(Xt, yt,  early_stopping_rounds=10, eval_metric="auc",
                    eval_set=[(Xt, yt), (Xv, yv)], verbose=50)
        test_probs.append(pd.Series(learner.predict_proba(test)[:, -1], index=test.index, name="fold_" + str(i)))
        val_probs = pd.Series(learner.predict_proba(Xv)[:, -1], index=Xv.index, name="probs")
        train_probs.append(val_probs)
        fold_aucs.append(roc_auc_score(yv, val_probs))
        fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))
        gc.collect()
    test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
    train_probs = pd.concat(train_probs)
    # Mean normalised feature importance; computed for inspection but not returned.
    fi = pd.concat(fi, axis=1).mean(axis=1)
    print("*" * 21)
    print("roc auc estimado: ", roc_auc_score(y_train.target, train_probs.loc[y_train.index]))
    # Fix: the spread used to be computed with `train_probs.iloc[folds[i]]`, i.e.
    # positional indexing fed with index labels, and over the FITTING rows of each
    # fold. It is now the standard deviation of the per-fold validation AUCs
    # (the printed label is kept as-is; the value is a standard deviation).
    print("roc auc varianza: ", np.std(fold_aucs))
    return test_probs

In [14]:
def get_one_hot(df, feature):
    """One-hot encode a single column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``feature``.
    feature : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        Indicator columns named ``"<feature>_<value>"``, indexed like ``df``.
    """
    dummies = pd.get_dummies(df[feature])
    # Prefix every generated column with the source column name.
    return dummies.add_prefix(feature + '_')
# One-hot encode sexo, est_cvl and rgn for train and test.
# The dummies are built independently per table, so a category present in only one of
# them would give train and test different columns. The test dummies are therefore
# aligned to the train columns: missing ones are added as 0, and categories unseen in
# train are dropped. This changes nothing when both tables share the same categories.
one_hot_sexo_train = get_one_hot(se_train, 'sexo')
one_hot_sexo_test = get_one_hot(se_test, 'sexo').reindex(
    columns=one_hot_sexo_train.columns, fill_value=0)
one_hot_est_cvl_train = get_one_hot(se_train, 'est_cvl')
one_hot_est_cvl_test = get_one_hot(se_test, 'est_cvl').reindex(
    columns=one_hot_est_cvl_train.columns, fill_value=0)
one_hot_rgn_train = get_one_hot(se_train, 'rgn')
one_hot_rgn_test = get_one_hot(se_test, 'rgn').reindex(
    columns=one_hot_rgn_train.columns, fill_value=0)

In [24]:
# Append the SE features kept for the model (one-hot sexo / est_cvl / rgn plus the raw
# edad and ctd_veh columns) by index-on-index left joins.
# NOTE(review): this cell overwrites train/test, so running it a second time on the
# already-joined frames fails on the overlapping column names.
train = train.join(one_hot_sexo_train).join(one_hot_est_cvl_train).join(one_hot_rgn_train).join(se_train[['edad','ctd_veh']])
test = test.join(one_hot_sexo_test).join(one_hot_est_cvl_test).join(one_hot_rgn_test).join(se_test[['edad','ctd_veh']])

In [25]:
# Cross-validate on the enlarged feature set; the result is the fold-averaged test prediction.
test_probs = cross_validation_lightgbm(train, test, y_train)

********** 0 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.842519	training's binary_logloss: 0.303194	valid_1's auc: 0.834209	valid_1's binary_logloss: 0.309338
[100]	training's auc: 0.856883	training's binary_logloss: 0.292244	valid_1's auc: 0.842555	valid_1's binary_logloss: 0.302936
[150]	training's auc: 0.865238	training's binary_logloss: 0.285795	valid_1's auc: 0.84521	valid_1's binary_logloss: 0.300813
[200]	training's auc: 0.872005	training's binary_logloss: 0.280588	valid_1's auc: 0.846986	valid_1's binary_logloss: 0.299555
[250]	training's auc: 0.877827	training's binary_logloss: 0.276056	valid_1's auc: 0.847567	valid_1's binary_logloss: 0.299087
Early stopping, best iteration is:
[268]	training's auc: 0.879506	training's binary_logloss: 0.27466	valid_1's auc: 0.847734	valid_1's binary_logloss: 0.298954
********** 1 **********
Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.842644	training's b

In [26]:
# Name the series 'target' so the CSV header carries that column name, then write it out.
test_probs.name = 'target'
test_probs.to_csv('../results/lightgbm_with_oneHotSexo_oneHotEstCvl_Edad_OneHotRgn_CtdVeh_0.84646.csv') 

#                           rcc de 0.8420                  . auc estimado 0.83999

# score de 0.8424 ---------a ---  0.84479  en la tabla pública. Una mejora de 0.0023 respecto al score anterior y una mejora
#                                 de 0.00279 respecto al test de validación . auc estimado 0.84199 ***** rcc y sunat
# score de 0.84479 ---------a ---  0.84696  en la tabla pública. Una mejora de 0.0021 respecto al score anterior y una mejora
#                                 de 0.00264 respecto al test de validación . auc estimado 0.84432 *******

# score de 0.84696  --------a---- 0.84583 . Es decir ha reducido . auc estimado 0.84640 --- one_hot_tip_lvledu_train
# score de 0.84696  --------a---- 0.84594 . Es decir ha reducido . auc estimado 0.84664 --- tip_lvledu_train

# score de 0.84696 ---------a ---  0.84760  en la tabla pública. Una mejora de 0.00063 respecto al score anterior y una mejora
#                                 de 0.00240 respecto al test de validación . auc estimado 0.84519 ****

# score de 0.84760  --------a---- 0.84614 . Es decir ha reducido . auc estimado 0.84717 --- flg_sin_email

# score de 0.84760  --------a---- 0.84786 en la tabla publica. Una mejora de 0.00026 respecto al score anterior y una mejora de
#                                   0.00218 respecto al test de validacion. auc estimado de 0.84568 *******

# score de 0.84786  --------a---- 0.84757 . Es decir ha reducido . auc estimado 0.84627 --- OneHotSitLab
# score de 0.84786  --------a---- 0.84779 . Es decir ha reducido . auc estimado 0.84541 --- Dto

# score de 0.84786  --------a---- 0.84816 en la tabla publica. Una mejora de 0.00030 respecto al score anterior y una mejora de
#                                 0.00204 respecto al test de validacion. auc estimado de  0.84612 *******

# score de 0.84816  --------a---- 0.84829 en la tabla publica. Una mejora de 0.00013 respecto al score anterior y una mejora de
#                                 0.00183 respecto al test de validacion. auc estimado de  0.84646 *******


In [None]:
### sexo tiene 4 clases  ### freq 0
### est_cvl tiene 6 clases  ### freq 4
### sit_lab tiene 4 clases  ### freq 1
### cod_ocu tiene 39 clases  ### freq 22
### ctd_hijos tiene 12 clases  ### freq 0
### flg_sin_email tiene 2 clases  ### freq 1
### lgr_vot tiene 2070 clases  ### freq 18
### prv tiene 197 clases  ### freq 111
### dto tiene 1662 clases  ### freq 949
### rgn tiene 7 clases  ### freq 3
### tip_lvledu tiene 8 clases  ### freq 0

In [24]:
# import category_encoders as ce
# X_binenc_train = train[['dto']]
# encoder = ce.BinaryEncoder(cols=['dto'])
# X_binenc_train = encoder.fit_transform(X_binenc_train)
# X_binenc_train.head()

In [None]:
# se_columns = ['edad', 'sexo', 'est_cvl', 'sit_lab', 'cod_ocu', 'ctd_hijos', 'flg_sin_email',
#                 'ctd_veh', 'cod_ubi', 'lgr_vot', 'prv', 'dto', 'rgn', 'tip_lvledu', 'edad']
# for feature in se_columns[9:]:
#     keep_columns = list(set(train.columns).difference([feature]))
#     print("*"*40, f'eliminando {feature}', "*"*40)
#     cross_validation_lightgbm(train[keep_columns], test[keep_columns], y_train)

In [None]:
#### eliminar
#lgr_vot , cod_ocu, cod_ubi, prv

### probados
# sexo, tip_lvledu, edad, flg_sin_email, est_cvl, sit_lab, dto, rgn, ctd_hijos, ctd_veh
####### mantener oficialmente
# one hot sexo , edad , one hot est_cvl, one hot rgn

In [30]:
# Check the final shapes before saving; both frames should have the same number of columns.
train.shape, test.shape

((358487, 1128), (396666, 1128))

In [31]:
# Save the enlarged feature matrices to the working directory for later steps.
train.to_pickle('train_1128features.pkl')
test.to_pickle('test_1128features.pkl')