In [1]:
import warnings
warnings.filterwarnings('ignore')

import os, sys, gc, warnings, random, datetime

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import  accuracy_score ,log_loss
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgb
# from catboost import CatBoostClassifier, Pool
# import catboost as cb

from matplotlib import pyplot as plt
import seaborn as sns

from IPython.display import display
from sklearn.model_selection import train_test_split

# pd.options.display.max_rows = 10000
pd.options.display.max_columns = 1000
# pd.options.display.max_colwidth = 1000

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
SEED = 2021
seed_everything(SEED)
TARGET = 'credit'
LOCAL_TEST = True
MAKE_MODEL_TEST = True

In [4]:
train = pd.read_pickle("data/train_adjusted.pkl")
test = pd.read_pickle("data/test_adjusted.pkl")
sub = pd.read_csv("data/sample_submission.csv")

In [5]:
df = pd.concat([train.drop(['index','credit'],1), test.drop(['index'],1)], axis=0)
print(df.shape)
df.head()

(36457, 18)


Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,card_ID
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,,2.0,6.0,4709_202500.0_13899_6.0
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,11380,1540,0,0,1,Laborers,3.0,5.0,1540_247500.0_11380_5.0
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,19087,4434,0,1,0,Managers,2.0,22.0,4434_450000.0_19087_22.0
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,15088,2092,0,1,0,Sales staff,2.0,37.0,2092_202500.0_15088_37.0
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,15037,2105,0,0,0,Managers,2.0,26.0,2105_157500.0_15037_26.0


In [6]:
cat_cols = [x for x in df.columns if df[x].dtype == 'object']
num_cols = [x for x in df.columns if x not in cat_cols + [TARGET]]
feature_cols = num_cols + cat_cols
print(len(feature_cols), len(cat_cols), len(num_cols))

18 9 9


In [7]:
num_cols

['child_num',
 'income_total',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'work_phone',
 'phone',
 'email',
 'family_size',
 'begin_month']

In [8]:
lbe = LabelEncoder()
for col in cat_cols:
    df[col] = lbe.fit_transform(df[col])

In [9]:
df[cat_cols].head()

Unnamed: 0,gender,car,reality,income_type,edu_type,family_type,house_type,occyp_type,card_ID
0,0,0,0,0,1,1,2,18,24679
1,0,0,1,0,4,0,1,8,4690
2,1,1,1,4,1,1,1,10,23929
3,0,0,1,0,4,1,1,14,8501
4,0,1,1,2,1,1,1,10,8571


In [10]:
from tensorflow import keras

encoding_dim = 32

def get_model(encoding_dim, dropout=.2):
    num_dim = len(num_cols)
    num_input = keras.layers.Input((num_dim,), name='num_input')
    cat_inputs = []
    cat_embs = []
    emb_dims = 0
    for col in cat_cols:
        cat_input = keras.layers.Input((1,), name=f'{col}_input')
        emb_dim = max(8, int(np.log2(1 + df[col].nunique()) * 4))
        cat_emb = keras.layers.Embedding(input_dim=df[col].max() + 1, output_dim=emb_dim)(cat_input)
        cat_emb = keras.layers.Dropout(dropout)(cat_emb)
        cat_emb = keras.layers.Reshape((emb_dim,))(cat_emb)

        cat_inputs.append(cat_input)
        cat_embs.append(cat_emb)
        emb_dims += emb_dim

    merged_inputs = keras.layers.Concatenate()([num_input] + cat_embs)

    encoded = keras.layers.Dense(encoding_dim * 3, activation='relu')(merged_inputs)
    encoded = keras.layers.Dropout(dropout)(encoded)
    encoded = keras.layers.Dense(encoding_dim * 2, activation='relu')(encoded)
    encoded = keras.layers.Dropout(dropout)(encoded)    
    encoded = keras.layers.Dense(encoding_dim, activation='relu')(encoded)
    
    decoded = keras.layers.Dense(encoding_dim * 2, activation='relu')(encoded)
    decoded = keras.layers.Dropout(dropout)(decoded)
    decoded = keras.layers.Dense(encoding_dim * 3, activation='relu')(decoded)
    decoded = keras.layers.Dropout(dropout)(decoded)    
    decoded = keras.layers.Dense(num_dim + emb_dims, activation='linear')(encoded)

    encoder = keras.Model([num_input] + cat_inputs, encoded)
    ae = keras.Model([num_input] + cat_inputs, decoded)
    ae.add_loss(keras.losses.mean_squared_error(merged_inputs, decoded))
    ae.compile(optimizer='adam')
    return ae, encoder

In [11]:
ae, encoder = get_model(encoding_dim)
ae.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
gender_input (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
car_input (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
reality_input (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
income_type_input (InputLayer)  [(None, 1)]          0                                            
____________________________________________________________________________________________

In [12]:
inputs = [df[num_cols].values] + [df[x].values for x in cat_cols]
ae.fit(inputs, inputs,
      epochs=100,
      batch_size=16384,
      shuffle=True,
      validation_split=.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100


Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x24281c197f0>

In [13]:
encoding = encoder.predict(inputs)
print(encoding.shape)

(36457, 32)


In [14]:
y = train[TARGET]
n_trn = train.shape[0]

In [15]:
df.reset_index(drop=True,inplace=True)

In [16]:
df_enc = pd.concat([df[feature_cols], pd.DataFrame(encoding, columns=[f'enc_{x}' for x in range(encoding_dim)])], axis=1)
df_enc['credit'] = y
train_enc = df_enc.iloc[:n_trn]
test_enc = df_enc.iloc[n_trn:]
print(train_enc.shape, test_enc.shape)

(26457, 51) (10000, 51)


In [17]:
df_train, df_val = train_test_split(train_enc, test_size=0.2, random_state=SEED, stratify=y)

In [18]:
def make_test(old_score = 0 , output =False):
    features_columns = [col for col in list(df_train) if col!='credit']
    X,y = df_train[features_columns], df_train[TARGET]    
    P,P_y = df_val[features_columns], df_val[TARGET]
    
    tt_df = df_val[[TARGET]]        
    y_preds = np.zeros((df_val.shape[0],3))
    
    tr_data = lgb.Dataset(X, label=y)
    vl_data = lgb.Dataset(P, label=P_y) 
    estimator = lgb.train(
            lgb_params,
            tr_data,
            valid_sets = [tr_data, vl_data],
            verbose_eval = 100,
        )
    y_preds += estimator.predict(P)
    feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(),X.columns)), columns=['Value','Feature'])
    
    m_results = []
    print('#'*20)

    
    g_logloss = log_loss(tt_df[TARGET], y_preds)
    score_diff = g_logloss - old_score
    print('Global log loss', g_logloss)
    m_results.append(g_logloss)
    
    print('#'*20)
    print('Features Preformance:', g_logloss)
    print('Diff with previous__:', score_diff)
    
    return tt_df, feature_imp, m_results, estimator ,g_logloss

In [19]:
###### Model params
lgb_params = {
                    'objective':'multiclass',
                    'num_class': 3,
                    'boosting_type':'gbdt',
                    'metric':'multi_logloss',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 256,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    "min_child_weight": 0.01,
                    'colsample_bytree': 0.8,
                    'subsample_freq':1,
                    'subsample':0.8,
                    'n_estimators':100000,
                    'max_bin':800,
                    'verbose':-1,
                    'seed': SEED,
                    'early_stopping_rounds':500, 
                } 

## Start FE

In [20]:
tt_df, feature_imp, m_results, model ,g_logloss= make_test()

Training until validation scores don't improve for 500 rounds
[100]	training's multi_logloss: 0.657041	valid_1's multi_logloss: 0.767871
[200]	training's multi_logloss: 0.545637	valid_1's multi_logloss: 0.735907
[300]	training's multi_logloss: 0.4716	valid_1's multi_logloss: 0.724456
[400]	training's multi_logloss: 0.415554	valid_1's multi_logloss: 0.722505
[500]	training's multi_logloss: 0.370558	valid_1's multi_logloss: 0.725578
[600]	training's multi_logloss: 0.333186	valid_1's multi_logloss: 0.731287
[700]	training's multi_logloss: 0.301629	valid_1's multi_logloss: 0.738745
[800]	training's multi_logloss: 0.274455	valid_1's multi_logloss: 0.748347
Early stopping, best iteration is:
[381]	training's multi_logloss: 0.425216	valid_1's multi_logloss: 0.72206
####################
Global log loss 0.722059670075446
####################
Features Preformance: 0.722059670075446
Diff with previous__: 0.722059670075446


In [21]:
best_score = m_results[0]

In [22]:
for df in [df_train,df_val]:
    df['DAYS_BIRTH_DAYS_EMPLOYED_ratio'] = df['DAYS_BIRTH'] / df['DAYS_EMPLOYED']
    df['income_total_DAYS_BIRTH_ratio'] = df['income_total'] / df['DAYS_BIRTH']
    df['income_total_DAYS_EMPLOYED_ratio'] = df['income_total'] / df['DAYS_EMPLOYED']
    df['parents'] = df['family_size'] - df['child_num']
    df['Workingdays'] = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']
    df['income_total_workingday_ratio'] = df['income_total'] / df['Workingdays']
    df['income_total_family_size_ratio'] = df['income_total'] / df['family_size']
    

In [None]:
tt_df, feature_imp, m_results, model ,g_logloss= make_test(best_score)

Training until validation scores don't improve for 500 rounds
[100]	training's multi_logloss: 0.646388	valid_1's multi_logloss: 0.761592
[200]	training's multi_logloss: 0.531828	valid_1's multi_logloss: 0.729433
[300]	training's multi_logloss: 0.45688	valid_1's multi_logloss: 0.719533
[400]	training's multi_logloss: 0.400364	valid_1's multi_logloss: 0.719329
[500]	training's multi_logloss: 0.355428	valid_1's multi_logloss: 0.723388
[600]	training's multi_logloss: 0.318552	valid_1's multi_logloss: 0.730626
[700]	training's multi_logloss: 0.287375	valid_1's multi_logloss: 0.740045


In [None]:
best_score = m_results[0]

In [None]:
for df in [df_train,df_val]:
    drop_cols = ['DAYS_BIRTH','DAYS_EMPLOYED','income_total','family_size','child_num','Workingdays']
    df.drop(drop_cols, axis=1, inplace=True)

In [None]:
tt_df, feature_imp, m_results, model ,g_logloss= make_test(best_score)
best_score = m_results[0]

In [None]:
train_enc.describe()

In [None]:
train_enc.child_num.value_counts()

In [None]:
for df in [df_train,df_val]:
    df['DAYS_BIRTH_DAYS_EMPLOYED_ratio'] = df['DAYS_BIRTH'] / df['DAYS_EMPLOYED']

In [None]:
tt_df, feature_imp, m_results, model ,g_logloss= make_test(best_score)

In [None]:
for df in [df_train,df_val]:
    df['income_total_DAYS_BIRTH_ratio'] = df['income_total'] / df['DAYS_BIRTH']

In [None]:
tt_df, feature_imp, m_results, model ,g_logloss= make_test(best_score)

In [None]:
best_score = m_results[0]

In [None]:
for df in [df_train,df_val]:
    df['income_total_DAYS_EMPLOYED_ratio'] = df['income_total'] / df['DAYS_EMPLOYED']

In [None]:
tt_df, feature_imp, m_results, model ,g_logloss= make_test(best_score)

In [None]:
best_score = m_results[0]

In [None]:
for df in [df_train,df_val]:
    df['income_total_family_size_ratio'] = df['income_total'] / df['family_size']

In [None]:
tt_df, feature_imp, m_results, model ,g_logloss= make_test(best_score)

In [None]:
for df in [df_train,df_val]:
    df['parents'] = df['family_size'] - df['child_num']

In [None]:
tt_df, feature_imp, m_results, model ,g_logloss= make_test(best_score)

In [None]:
best_score = m_results[0]

In [None]:
for df in [df_train,df_val]:
    df['income_total_workingday_ratio'] = df['income_total'] / (df['DAYS_BIRTH'] - df['DAYS_EMPLOYED'])

In [None]:
tt_df, feature_imp, m_results, model ,g_logloss= make_test(best_score)