In [1]:

import gc
import time
import warnings
from contextlib import contextmanager

import featuretools as ft
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score


In [2]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category=True):
    """Expand every object-dtype column of ``df`` into indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. Columns whose dtype compares equal to ``'object'``
        are treated as categorical.
    nan_as_category : bool, default True
        Forwarded to ``pd.get_dummies`` as ``dummy_na``.

    Returns
    -------
    tuple
        ``(encoded, added_columns)``: the frame returned by
        ``pd.get_dummies`` and the list of its column names that were not
        present in the input frame.
    """
    columns_before = list(df.columns)

    # Collect the object-dtype columns in their original order.
    object_columns = []
    for name in columns_before:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

In [3]:
# Row cap handed to the pd.read_csv calls as nrows; None reads each file in full.
num_rows = None
# dummy_na setting for the one_hot_encoder calls that receive this flag.
nan_as_category = False

In [4]:
# Load the application tables; num_rows caps the number of rows read.
df = pd.read_csv('../../data/application_train.csv', nrows=num_rows)
# NOTE(review): test_df is loaded here but not transformed in this cell.
test_df = pd.read_csv('../../data/application_test.csv', nrows=num_rows)
# Drop the rows whose CODE_GENDER is 'XNA'. The explicit copy makes df an
# independent frame, so the column assignments below write to df itself
# instead of to a slice of the frame that was read from disk.
df = df[df['CODE_GENDER'] != 'XNA'].copy()

# Column groups used by the aggregate features below.
docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
live = [_f for _f in df.columns if ('FLAG_' in _f) and ('FLAG_DOC' not in _f) and ('_FLAG_' not in _f)]

# DAYS_EMPLOYED: the value 365243 stands for "missing" -> NaN.
# The result is assigned back to the column. Calling
# df['DAYS_EMPLOYED'].replace(..., inplace=True) operates on a temporary
# selection and does not reliably update df (it is a no-op under pandas
# Copy-on-Write).
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

# Median income per organization type, mapped back onto each row further down.
inc_by_org = df[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']

# Hand-crafted ratio / aggregate features.
df['NEW_CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
df['NEW_CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
df['NEW_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
df['NEW_LIVE_IND_SUM'] = df[live].sum(axis=1)
df['NEW_INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)
df['NEW_EMPLOY_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
df['NEW_ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL'])
df['NEW_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
df['NEW_EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
df['NEW_SCORES_STD'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
# Rows where the std is NaN receive the column mean.
df['NEW_SCORES_STD'] = df['NEW_SCORES_STD'].fillna(df['NEW_SCORES_STD'].mean())
df['NEW_CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
df['NEW_CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
df['NEW_PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
df['NEW_PHONE_TO_BIRTH_RATIO_EMPLOYER'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
df['NEW_CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']

# Categorical features with Binary encode (0 or 1; two categories)
for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
    df[bin_feature], uniques = pd.factorize(df[bin_feature])
# Categorical features with One-Hot encode
df, cat_cols = one_hot_encoder(df, nan_as_category)
# Document flags removed from the frame before feature synthesis.
dropcolum = ['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4',
             'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7',
             'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10',
             'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13',
             'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16',
             'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19',
             'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']
df = df.drop(dropcolum, axis=1)

In [5]:
# bureau table: read the CSV, then one-hot encode its object columns.
bureau = pd.read_csv('../../data/bureau.csv', nrows=num_rows)
bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

# bureau_balance table: same treatment, plus the row index copied into an
# 'index' column.
bb = pd.read_csv('../../data/bureau_balance.csv', nrows=num_rows)
bb, bb_cat = one_hot_encoder(bb, nan_as_category)
bb['index'] = bb.index

In [6]:
prev = pd.read_csv('../../data/previous_application.csv', nrows=num_rows)
prev, cat_cols = one_hot_encoder(prev, nan_as_category=True)
# In these DAYS_* columns the value 365243 stands for "missing" -> NaN.
# The replacement is assigned back to the frame. Calling
# prev[col].replace(..., inplace=True) operates on a temporary selection and
# does not reliably update prev (it is a no-op under pandas Copy-on-Write).
days_sentinel_cols = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                      'DAYS_LAST_DUE', 'DAYS_TERMINATION']
prev[days_sentinel_cols] = prev[days_sentinel_cols].replace(365243, np.nan)
# Add feature: value ask / value received percentage
prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']

In [7]:
# POS_CASH_balance table: read and one-hot encode in one step, then copy the
# row index into an 'index' column.
pos, cat_cols = one_hot_encoder(
    pd.read_csv('../../data/POS_CASH_balance.csv', nrows=num_rows),
    nan_as_category=True,
)
pos['index'] = pos.index

In [8]:
# installments_payments table: read and one-hot encode in one step.
ins, cat_cols = one_hot_encoder(
    pd.read_csv('../../data/installments_payments.csv', nrows=num_rows),
    nan_as_category=True,
)

In [9]:
# Copy the row index of ins into an 'index' column.
ins['index'] = ins.index

In [10]:
# credit_card_balance table: read and one-hot encode in one step, then copy
# the row index into an 'index' column.
cc, cat_cols = one_hot_encoder(
    pd.read_csv('../../data/credit_card_balance.csv', nrows=num_rows),
    nan_as_category=True,
)
cc['index'] = cc.index

In [11]:
# Tables handed to ft.dfs, keyed by entity name. Each value pairs a frame
# with the name of the column used as its id (presumably the featuretools
# "index" variable - confirm against the installed featuretools version).
# bb, pos, ins and cc use the 'index' column copied from their row index.
entities = dict(
    df=(df, "SK_ID_CURR"),
    bureau=(bureau, "SK_ID_BUREAU"),
    bb=(bb, 'index'),
    prev=(prev, 'SK_ID_PREV'),
    pos=(pos, 'index'),
    ins=(ins, 'index'),
    cc=(cc, 'index'),
)


In [12]:
# Table links handed to ft.dfs, one 4-tuple per link
# (presumably parent table, parent key, child table, child key - confirm
# against the installed featuretools version).
# pos, ins and cc are linked through prev on SK_ID_PREV only; the direct
# df -> pos / ins / cc links on SK_ID_CURR are left disabled.
relationships = [
    ('df', 'SK_ID_CURR', 'bureau', 'SK_ID_CURR'),
    ('bureau', 'SK_ID_BUREAU', 'bb', 'SK_ID_BUREAU'),
    ('df', 'SK_ID_CURR', 'prev', 'SK_ID_CURR'),
]
relationships += [
    ('prev', 'SK_ID_PREV', child, 'SK_ID_PREV') for child in ('pos', 'ins', 'cc')
]

In [25]:
# Show the (rows, columns) shape of the application frame df.
df.shape

(307507, 239)

In [None]:
# Takes a very long time to run.
# Deep feature synthesis with 'df' as the target entity, up to depth 2.
feature_matrix_customers, features_defs = ft.dfs(
    entities=entities,
    relationships=relationships,
    target_entity="df",
    max_depth=2,
)

In [43]:
# Display the feature definitions returned by ft.dfs.
features_defs

[<Feature: TARGET>,
 <Feature: CODE_GENDER>,
 <Feature: FLAG_OWN_CAR>,
 <Feature: FLAG_OWN_REALTY>,
 <Feature: CNT_CHILDREN>,
 <Feature: AMT_INCOME_TOTAL>,
 <Feature: AMT_CREDIT>,
 <Feature: AMT_ANNUITY>,
 <Feature: AMT_GOODS_PRICE>,
 <Feature: REGION_POPULATION_RELATIVE>,
 <Feature: DAYS_BIRTH>,
 <Feature: DAYS_EMPLOYED>,
 <Feature: DAYS_REGISTRATION>,
 <Feature: DAYS_ID_PUBLISH>,
 <Feature: OWN_CAR_AGE>,
 <Feature: FLAG_MOBIL>,
 <Feature: FLAG_EMP_PHONE>,
 <Feature: FLAG_WORK_PHONE>,
 <Feature: FLAG_CONT_MOBILE>,
 <Feature: FLAG_PHONE>,
 <Feature: FLAG_EMAIL>,
 <Feature: CNT_FAM_MEMBERS>,
 <Feature: REGION_RATING_CLIENT>,
 <Feature: REGION_RATING_CLIENT_W_CITY>,
 <Feature: HOUR_APPR_PROCESS_START>,
 <Feature: REG_REGION_NOT_LIVE_REGION>,
 <Feature: REG_REGION_NOT_WORK_REGION>,
 <Feature: LIVE_REGION_NOT_WORK_REGION>,
 <Feature: REG_CITY_NOT_LIVE_CITY>,
 <Feature: REG_CITY_NOT_WORK_CITY>,
 <Feature: LIVE_CITY_NOT_WORK_CITY>,
 <Feature: EXT_SOURCE_1>,
 <Feature: EXT_SOURCE_2>,
 <Featur

In [65]:
# Cross-validation setup on the DFS feature matrix.
train_df = feature_matrix_customers
folds = KFold(n_splits=5, shuffle=True, random_state=47)
# Every column except the label is used as a model input.
feats = [f for f in train_df.columns if f != 'TARGET']
# One out-of-fold prediction slot per row of train_df.
oof_preds = np.zeros(len(train_df))
# Starts empty; the training loop appends one block of importances per fold.
feature_importance_df = pd.DataFrame()

In [37]:
def display_importances(feature_importance_df_):
    """Plot the 40 features with the highest mean importance and save the figure.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Frame with at least a ``feature`` and an ``importance`` column; a
        feature may appear on several rows.

    The features are ranked by their mean ``importance``. All rows that
    belong to the top 40 are passed to ``sns.barplot``, and the figure is
    written to ``lgbm_importances01.png``. Nothing is returned.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance.index[:40]

    # Keep every row of the selected features, not just their means.
    is_top = feature_importance_df_.feature.isin(top_features)
    rows_to_plot = feature_importance_df_.loc[is_top].sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=rows_to_plot)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')

In [66]:
# K-fold training loop: fit one LightGBM model per fold, store its
# out-of-fold predictions in oof_preds and append its feature importances to
# feature_importance_df.
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
    train_x = train_df[feats].iloc[train_idx]
    train_y = train_df['TARGET'].iloc[train_idx]
    valid_x = train_df[feats].iloc[valid_idx]
    valid_y = train_df['TARGET'].iloc[valid_idx]

    # Options is_unbalance=True and scale_pos_weight=11 are left disabled.
    clf = LGBMClassifier(
        nthread=4,
        n_estimators=10000,
        learning_rate=0.02,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40,
        silent=-1,
        verbose=-1,
    )

    # AUC is evaluated on both the training and the validation split.
    clf.fit(
        train_x,
        train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        eval_metric='auc',
        verbose=1000,
        early_stopping_rounds=200,
    )

    # Positive-class probability at the best iteration for the held-out rows.
    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

    fold_importance_df = pd.DataFrame({
        "feature": feats,
        "importance": clf.feature_importances_,
        "fold": n_fold + 1,
    })
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

    # Release the per-fold objects before the next fold.
    del clf, train_x, train_y, valid_x, valid_y
    gc.collect()
print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
display_importances(feature_importance_df)

Training until validation scores don't improve for 200 rounds.
[1000]	training's auc: 0.862473	valid_1's auc: 0.792882
Early stopping, best iteration is:
[1277]	training's auc: 0.875965	valid_1's auc: 0.793167
Fold  1 AUC : 0.793167
Training until validation scores don't improve for 200 rounds.
[1000]	training's auc: 0.861972	valid_1's auc: 0.789143
Early stopping, best iteration is:
[1610]	training's auc: 0.888529	valid_1's auc: 0.789609
Fold  2 AUC : 0.789609
Training until validation scores don't improve for 200 rounds.
[1000]	training's auc: 0.862255	valid_1's auc: 0.789603


KeyboardInterrupt: 

In [62]:
# Mean importance of each feature over the rows of feature_importance_df.
mean = feature_importance_df.groupby('feature')['importance'].mean()

# Keep the features whose mean importance is non-zero, most important first.
# The filter is applied to the Series itself so mask and labels stay aligned.
# Indexing mean.index with a boolean mask built from the *sorted* Series is
# positional, so it would pair the mask with the wrong feature names.
select_features = mean[mean != 0].sort_values(ascending=False).index
feat = select_features

In [59]:
# Keep only the feature names that contain an opening parenthesis.
select_features = [name for name in select_features if "(" in name]

In [61]:
# Number of feature names kept in select_features.
len(select_features)

815

(307507,)

In [19]:
# Stand-alone LightGBM classifier with the same hyper-parameters as the
# per-fold models in the training loop.
# Options is_unbalance=True and scale_pos_weight=11 are left disabled.
clf = LGBMClassifier(
    nthread=4,
    n_estimators=10000,
    learning_rate=0.02,
    num_leaves=32,
    colsample_bytree=0.9497036,
    subsample=0.8715623,
    max_depth=8,
    reg_alpha=0.04,
    reg_lambda=0.073,
    min_split_gain=0.0222415,
    min_child_weight=40,
    silent=-1,
    verbose=-1,
)


In [90]:
# Split the DFS feature matrix into model inputs and label.
# TARGET is dropped by name rather than by position (iloc[:, 1:]), so the
# label cannot end up among the inputs if the column order ever changes.
x_train = feature_matrix_customers.drop(columns=['TARGET'])
y_train = feature_matrix_customers['TARGET'].values

In [91]:
# One-hot encode x_train with pd.get_dummies using its default arguments.
x_train = pd.get_dummies(x_train)

In [92]:
# Cross-validated ROC AUC of clf on the DFS features.
# cross_val_score is imported in the notebook's top import cell.
# NOTE(review): cv is left at the scikit-learn default, which differs between
# releases; the recorded output has three scores - confirm the intended number
# of folds.
cross_val_score(clf, x_train, y_train, scoring='roc_auc')

array([0.75820804, 0.75553215, 0.75660627])

In [65]:
# (Leftover interactive `?np.column_stack` help lookup removed; it is not part of the analysis.)