In [1]:
import numpy as np
import pandas as pd
import os
import time
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")
plt.style.use("seaborn")
sns.set(font_scale=1)

In [2]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

In [None]:
var_list = ["var_108","var_34","var_9","var_94","var_127","var_13","var_123","var_1","var_80","var_2"]

In [None]:
def ext_feat(list_, df_):
    for i, values in enumerate(var_list):
        if i<(len(var_list)-1):
            df[var_list[i]+"+"+var_list[i+1]] = df[var_list[i]] + df[var_list[i+1]]
            df[var_list[i]+"-"+var_list[i+1]] = df[var_list[i]] - df[var_list[i+1]]
            df[var_list[i]+"*"+var_list[i+1]] = df[var_list[i]] * df[var_list[i+1]]
            df[var_list[i]+"/"+var_list[i+1]] = df[var_list[i]] / df[var_list[i+1]]
        if i<(len(var_list)-2):
            df[var_list[i]+"+"+var_list[i+2]] = df[var_list[i]] + df[var_list[i+2]]
            df[var_list[i]+"-"+var_list[i+2]] = df[var_list[i]] - df[var_list[i+2]]
            df[var_list[i]+"*"+var_list[i+2]] = df[var_list[i]] * df[var_list[i+2]]
            df[var_list[i]+"/"+var_list[i+2]] = df[var_list[i]] / df[var_list[i+2]]
        if i<(len(var_list)-3):
            df[var_list[i]+"+"+var_list[i+3]] = df[var_list[i]] + df[var_list[i+3]]
            df[var_list[i]+"-"+var_list[i+3]] = df[var_list[i]] - df[var_list[i+3]]
            df[var_list[i]+"*"+var_list[i+3]] = df[var_list[i]] * df[var_list[i+3]]
            df[var_list[i]+"/"+var_list[i+3]] = df[var_list[i]] / df[var_list[i+3]]
        if i<(len(var_list)-4):
            df[var_list[i]+"+"+var_list[i+4]] = df[var_list[i]] + df[var_list[i+4]]
            df[var_list[i]+"-"+var_list[i+4]] = df[var_list[i]] - df[var_list[i+4]]
            df[var_list[i]+"*"+var_list[i+4]] = df[var_list[i]] * df[var_list[i+4]]
            df[var_list[i]+"/"+var_list[i+4]] = df[var_list[i]] / df[var_list[i+4]]
        if i<(len(var_list)-5):
            df[var_list[i]+"+"+var_list[i+5]] = df[var_list[i]] + df[var_list[i+5]]
            df[var_list[i]+"-"+var_list[i+5]] = df[var_list[i]] - df[var_list[i+5]]
            df[var_list[i]+"*"+var_list[i+5]] = df[var_list[i]] * df[var_list[i+5]]
            df[var_list[i]+"/"+var_list[i+5]] = df[var_list[i]] / df[var_list[i+5]]
        if i<(len(var_list)-6):
            df[var_list[i]+"+"+var_list[i+6]] = df[var_list[i]] + df[var_list[i+6]]
            df[var_list[i]+"-"+var_list[i+6]] = df[var_list[i]] - df[var_list[i+6]]
            df[var_list[i]+"*"+var_list[i+6]] = df[var_list[i]] * df[var_list[i+6]]
            df[var_list[i]+"/"+var_list[i+6]] = df[var_list[i]] / df[var_list[i+6]]
        if i<(len(var_list)-7):
            df[var_list[i]+"+"+var_list[i+7]] = df[var_list[i]] + df[var_list[i+7]]
            df[var_list[i]+"-"+var_list[i+7]] = df[var_list[i]] - df[var_list[i+7]]
            df[var_list[i]+"*"+var_list[i+7]] = df[var_list[i]] * df[var_list[i+7]]
            df[var_list[i]+"/"+var_list[i+7]] = df[var_list[i]] / df[var_list[i+7]]
        if i<(len(var_list)-8):
            df[var_list[i]+"+"+var_list[i+8]] = df[var_list[i]] + df[var_list[i+8]]
            df[var_list[i]+"-"+var_list[i+8]] = df[var_list[i]] - df[var_list[i+8]]
            df[var_list[i]+"*"+var_list[i+8]] = df[var_list[i]] * df[var_list[i+8]]
            df[var_list[i]+"/"+var_list[i+8]] = df[var_list[i]] / df[var_list[i+8]]
        if i<(len(var_list)-9):
            df[var_list[i]+"+"+var_list[i+9]] = df[var_list[i]] + df[var_list[i+9]]
            df[var_list[i]+"-"+var_list[i+9]] = df[var_list[i]] - df[var_list[i+9]]
            df[var_list[i]+"*"+var_list[i+9]] = df[var_list[i]] * df[var_list[i+9]]
            df[var_list[i]+"/"+var_list[i+9]] = df[var_list[i]] / df[var_list[i+9]]
        else:
            continue
    
    return df

In [None]:
df_train = ext_feat(var_list, df_train)
df_test = ext_feat(var_list, df_test)

In [3]:
# random_state = 42
# np.random.seed(random_state)

def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [4]:
lgb_params = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.0083,
    "bagging_freq": 5,
    "bagging_fraction" : 0.335,
    "feature_fraction" : 0.041,
    "min_data_in_leaf": 80,
#     'subsample': 0.85,
#     'min_child_weight': 1.5,
#     'num_leaves': 2 ** 5,
#     'colsample_bytree': 0.8,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
#     "lambda_l1" : 5,
#     "lambda_l2" : 5,
#     "bagging_seed" : random_state,
    "verbosity" : 1,
#     "seed": random_state,
    "n_jobs": -1
}



In [5]:
skf = StratifiedKFold(n_splits=13, shuffle=True)
oof = df_train[['ID_code', 'target']]
oof['predict'] = 0
predictions = df_test[['ID_code']]
val_aucs = []
feature_importance_df = pd.DataFrame()

In [6]:
features = [col for col in df_train.columns if col not in ['target', 'ID_code']]
X_test = df_test[features].values

In [None]:
for fold, (trn_idx, val_idx) in enumerate(skf.split(df_train, df_train['target'])):
    X_train, y_train = df_train.iloc[trn_idx][features], df_train.iloc[trn_idx]['target']
    X_valid, y_valid = df_train.iloc[val_idx][features], df_train.iloc[val_idx]['target']
    
    N = 3
    p_valid,yp = 0,0
    for i in range(N):
        X_t, y_t = augment(X_train.values, y_train.values)
        X_t = pd.DataFrame(X_t)
        X_t = X_t.add_prefix('var_')
    
        trn_data = lgb.Dataset(X_t, label=y_t)
        val_data = lgb.Dataset(X_valid, label=y_valid)
        evals_result = {}
        lgb_clf = lgb.train(lgb_params,
                        trn_data,
                        1000000,
                        valid_sets = [trn_data, val_data],
                        early_stopping_rounds=4000,
                        verbose_eval = 1000,
                        evals_result=evals_result
                       )
        p_valid += lgb_clf.predict(X_valid)
        yp += lgb_clf.predict(X_test)
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = lgb_clf.feature_importance()
    fold_importance_df["fold"] = fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    oof['predict'][val_idx] = p_valid/N
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)
    
    predictions['fold{}'.format(fold+1)] = yp/N

Training until validation scores don't improve for 4000 rounds.
[1000]	training's auc: 0.891705	valid_1's auc: 0.889905
[2000]	training's auc: 0.899379	valid_1's auc: 0.895426
[3000]	training's auc: 0.904493	valid_1's auc: 0.898882
[4000]	training's auc: 0.908256	valid_1's auc: 0.901246
[5000]	training's auc: 0.91115	valid_1's auc: 0.902823
[6000]	training's auc: 0.913623	valid_1's auc: 0.904059
[7000]	training's auc: 0.915749	valid_1's auc: 0.904877
[8000]	training's auc: 0.917658	valid_1's auc: 0.905517
[9000]	training's auc: 0.91947	valid_1's auc: 0.905844
[10000]	training's auc: 0.921153	valid_1's auc: 0.906112
[11000]	training's auc: 0.922764	valid_1's auc: 0.906404
[12000]	training's auc: 0.924337	valid_1's auc: 0.906421
[13000]	training's auc: 0.925858	valid_1's auc: 0.906286
[14000]	training's auc: 0.92736	valid_1's auc: 0.906313
[15000]	training's auc: 0.928846	valid_1's auc: 0.906287
[16000]	training's auc: 0.930277	valid_1's auc: 0.9063
Early stopping, best iteration is:
[12

In [None]:
mean_auc = np.mean(val_aucs)
std_auc = np.std(val_aucs)
all_auc = roc_auc_score(oof['target'], oof['predict'])
print("Mean auc: %.9f, std: %.9f. All auc: %.9f." % (mean_auc, std_auc, all_auc))

In [None]:
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:1000].index)
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14,26))
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance",ascending=False))
plt.title('LightGBM Features (averaged over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

In [None]:
predictions['target'] = np.mean(predictions[[col for col in predictions.columns if col not in ['ID_code', 'target']]].values, axis=1)
predictions.to_csv('lgb_all_predictions.csv', index=None)
sub_df = pd.DataFrame({"ID_code":df_test["ID_code"].values})
sub_df["target"] = predictions['target']
sub_df.to_csv("lgb_submission.csv", index=False)
predictions[["ID_code", "target"]].to_csv("../submission/lgb_oof"+str(int(time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))))+".csv", index=False)