In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import os
import gc
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from skopt.space import Real, Categorical, Integer
import warnings
warnings.simplefilter('ignore')
OriginDataDir = '../../Data/OriginData'
TrainTestDataDir = '../../Data/TrainTestData'
ModelSaveDir = 'models'
ResultSaveDir = 'result'

In [2]:
def loadData(DataName):
    """Read one CSV file from ``TrainTestDataDir`` into a DataFrame.

    Parameters
    ----------
    DataName : str
        File name, joined onto the module-level ``TrainTestDataDir``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed file, using its first column as the index. When the path
        does not exist a message is printed and ``None`` is returned.
    """
    csv_path = os.path.join(TrainTestDataDir, DataName)
    if os.path.exists(csv_path):
        # Rows are returned in file order; no shuffling is applied.
        return pd.read_csv(csv_path, index_col=0)
    print('%s does not exist!' % csv_path)
    return None


def NormalData(TrainData, TestData, threshold=1):
    """Standardise large-valued columns of ``TrainData`` using pooled statistics.

    For every numeric column of ``TrainData`` whose mean over the stacked
    train + test rows exceeds ``threshold`` in absolute value, the column is
    replaced by ``(x - mean) / std``, where mean and population standard
    deviation (``ddof=0``) are computed over the stacked rows with missing
    values ignored. This mirrors what a default scikit-learn ``StandardScaler``
    fitted on the stacked column would produce, without requiring that import.

    Parameters
    ----------
    TrainData : pandas.DataFrame
        Frame to scale. It is modified in place and also returned.
    TestData : pandas.DataFrame
        Extra rows that contribute to the statistics only; never modified.
    threshold : float, default 1
        A column is scaled only when ``abs(pooled mean) > threshold``.
        NOTE(review): the original comment spoke of means above 100 while the
        code compared against 1; the default keeps the coded behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``TrainData`` object, with the selected columns standardised.
    """
    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
    pooled = pd.concat([TrainData, TestData], sort=False)
    # Iterate over the training columns only: those are the ones written back.
    for col in TrainData.columns:
        pooled_col = pooled[col]
        if not pd.api.types.is_numeric_dtype(pooled_col):
            # Text/categorical columns have no meaningful mean; leave untouched.
            continue
        center = pooled_col.mean()
        # Written as "not >" so an all-missing column (NaN mean) is skipped too.
        if not abs(center) > threshold:
            continue
        spread = pooled_col.std(ddof=0)
        if not spread > 0:
            # Constant column: divide by 1 so the result is 0 instead of NaN/inf.
            spread = 1.0
        TrainData[col] = (TrainData[col] - center) / spread
    return TrainData

In [3]:
def status_print(optim_result):
    """Status callback during Bayesian hyperparameter search.

    Prints how many candidates have been evaluated so far together with the
    best score and best parameters, then writes the full results table to
    ``<EstimatorClassName>_cv_results.csv`` in the working directory.

    Parameters
    ----------
    optim_result : object
        Passed in by the search when it invokes the callback; not used here.

    Notes
    -----
    All information is read from the module-level ``bayes_cv_tuner`` object,
    which must exist before the search starts (it is not defined in the cells
    visible alongside this function).
    """
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)

    # Report progress and the best configuration found so far
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

    # Save all model results (the file is overwritten on every call)
    clf_name = bayes_cv_tuner.estimator.__class__.__name__
    all_models.to_csv(clf_name+"_cv_results.csv")

In [4]:
# Test set: remember the SK_ID_CURR values for the submission file, then
# remove that column so it is not used as a model input.
TestData = loadData('AllTest_withnull.csv')
TestID = TestData.SK_ID_CURR.values
TestData = TestData.drop('SK_ID_CURR', axis=1)

# Training set: replace the loaded index with a fresh 0..n-1 range and
# remove the SK_ID_CURR column as well.
AllTrainData = (
    loadData('AllTrain_withnull.csv')
    .reset_index(drop=True)
    .drop('SK_ID_CURR', axis=1)
)

In [6]:
# Model input columns: everything except the label and the row identifier.
features = [col for col in AllTrainData.columns if col not in ('TARGET', 'SK_ID_CURR')]

In [7]:
# Sanity check: number of model input columns (the recorded run shows 847).
len(features)

847

In [8]:
# Settings for the cross-validation cell.
# df_train / df_test are plain aliases: no copy is made, so they refer to the
# same DataFrame objects as AllTrainData / TestData.
df_train = AllTrainData
df_test = TestData
num_folds = 5  # number of cross-validation splits
stratified = False  # True -> StratifiedKFold on TARGET, False -> plain KFold
debug= False  # not referenced by any cell visible in this notebook excerpt

In [9]:
# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Draw and save a bar chart of the 40 features with the highest mean importance.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Table with at least the columns ``feature`` and ``importance``; a
        feature may appear on several rows (for example once per fold).

    The features are ranked by their mean ``importance``; every row belonging
    to one of the top 40 is passed to ``seaborn.barplot``. The figure is
    written to ``lgbm_importances01.png`` in the working directory. Nothing is
    returned.
    """
    ranked_means = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_feature_names = ranked_means[:40].index

    in_top = feature_importance_df_.feature.isin(top_feature_names)
    rows_to_plot = feature_importance_df_.loc[in_top].sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=rows_to_plot)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')

In [12]:
# Parameter set for a DART-boosted binary LightGBM model.
# NOTE(review): no cell visible in this excerpt reads this dictionary; the
# cross-validation cell passes its own keyword arguments to LGBMClassifier,
# and several values differ (e.g. num_leaves). Confirm whether it is still needed.
lgbm_params = dict(
    # booster type and objective
    boosting='dart',
    application='binary',
    learning_rate=0.02,
    # tree shape
    min_data_in_leaf=30,
    num_leaves=34,
    max_depth=8,
    # column sampling, class weighting and DART dropout
    feature_fraction=0.9497036,
    scale_pos_weight=2,
    drop_rate=0.02,
    # regularisation and split constraints
    reg_alpha=0.041545473,
    reg_lambda=0.0735294,
    min_split_gain=0.0222415,
    min_child_weight=39.3259775,
)

In [16]:
# K-fold cross-validated LightGBM training.
# Produces: oof_preds (out-of-fold predictions for every training row),
# sub_preds (test predictions averaged over the folds) and
# feature_importance_df (one row per feature per completed fold).
print("Starting LightGBM. Train shape: {}, test shape: {}".format(df_train.shape, df_test.shape))
# Cross validation model
if stratified:
    folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=47)
else:
    folds = KFold(n_splits= num_folds, shuffle=True, random_state=47)
# Create arrays and dataframes to store results
oof_preds = np.zeros(df_train.shape[0])
sub_preds = np.zeros(df_test.shape[0])
feature_importance_df = pd.DataFrame()
# Per-fold importance tables are collected here and concatenated from the list,
# instead of repeatedly concatenating onto an initially empty DataFrame
# (concatenation with empty frames is deprecated in recent pandas and can
# change column dtypes).
fold_importances = []
feats = [f for f in df_train.columns if f not in ['TARGET','SK_ID_CURR']]

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train[feats], df_train['TARGET'])):
    train_x, train_y = df_train[feats].iloc[train_idx], df_train['TARGET'].iloc[train_idx]
    valid_x, valid_y = df_train[feats].iloc[valid_idx], df_train['TARGET'].iloc[valid_idx]

    # LightGBM parameters found by Bayesian optimization
    # NOTE(review): per the LightGBM parameter docs, row subsampling is only
    # applied when subsample_freq (bagging_freq) is non-zero; none is set here,
    # so `subsample` probably has no effect -- confirm before relying on it.
    # NOTE(review): `silent` and `nthread` are legacy argument names; newer
    # LightGBM releases use `verbose` and `n_jobs` -- confirm against the
    # installed version.
    clf =  LGBMClassifier(
        nthread=-1,
        #is_unbalance=True,
        n_estimators=10000,
        learning_rate=0.02,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40,
        silent=-1,
        verbose=-1,
        #scale_pos_weight=11
        )

    # Log AUC every 200 rounds and stop once the validation AUC has not
    # improved for 200 rounds.
    # NOTE(review): LightGBM 4.x removed the `verbose` / `early_stopping_rounds`
    # arguments of fit() in favour of callbacks; this call matches the version
    # that produced the recorded output.
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
        eval_metric= 'auc', verbose= 200, early_stopping_rounds= 200)

    # Out-of-fold predictions fill only this fold's validation rows; test
    # predictions are accumulated as a running average over the folds.
    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(df_test[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold + 1
    fold_importances.append(fold_importance_df)
    # Rebuilt after every fold so a partially completed run (e.g. interrupted
    # by hand) still leaves the finished folds in feature_importance_df.
    feature_importance_df = pd.concat(fold_importances, axis=0)
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
    # Release the per-fold objects before the next fold allocates its own.
    del clf, train_x, train_y, valid_x, valid_y
    gc.collect()

Starting LightGBM. Train shape: (307511, 848), test shape: (48744, 847)
Training until validation scores don't improve for 200 rounds.
[200]	training's auc: 0.804193	valid_1's auc: 0.778819
[400]	training's auc: 0.831035	valid_1's auc: 0.789218
[600]	training's auc: 0.849771	valid_1's auc: 0.792629
[800]	training's auc: 0.864713	valid_1's auc: 0.793828
[1000]	training's auc: 0.877706	valid_1's auc: 0.794113
[1200]	training's auc: 0.889249	valid_1's auc: 0.794302
Early stopping, best iteration is:
[1188]	training's auc: 0.888636	valid_1's auc: 0.794332
Fold  1 AUC : 0.794332
Training until validation scores don't improve for 200 rounds.


KeyboardInterrupt: 

In [None]:
# Overall AUC of the out-of-fold predictions against the training labels.
# NOTE: oof_preds starts as zeros and is filled fold by fold, so this figure is
# only meaningful when every fold of the training cell has finished.
print('Full AUC score %.6f' % roc_auc_score(df_train['TARGET'], oof_preds))
# Write submission file and plot feature importance
gender_submission = pd.DataFrame({'SK_ID_CURR':TestID,'TARGET':sub_preds})
# Create the output directory if needed; to_csv does not create missing folders.
os.makedirs(ResultSaveDir, exist_ok=True)
gender_submission.to_csv(os.path.join(ResultSaveDir,'result_lgbm_new.csv'), index = False)
display_importances(feature_importance_df)