In [1]:
"""
    导入相关包
"""
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import Ridge

class SBBTree():
    """
        SBBTree
        Stacking,Bootstap,Bagging
    """
    def __init__(
                    self, 
                    params,
                    stacking_num,
                    bagging_num,
                    bagging_test_size,
                    num_boost_round,
                    early_stopping_rounds
                ):
        """
            Initializes the SBBTree.
            Args:
              params : lgb params.
              stacking_num : k_flod stacking.
              bagging_num : bootstrap num.
              bagging_test_size : bootstrap sample rate.
              num_boost_round : boost num.
              early_stopping_rounds : early_stopping_rounds.
        """
        self.params = params
        self.stacking_num = stacking_num
        self.bagging_num = bagging_num
        self.bagging_test_size = bagging_test_size
        self.num_boost_round = num_boost_round
        self.early_stopping_rounds = early_stopping_rounds

        self.model = lgb
        self.stacking_model = []
        self.bagging_model = []

    def fit(self, X, y):
        """ fit model. """
        if self.stacking_num > 1:
            layer_train = np.zeros((X.shape[0], 2))
            self.SK = StratifiedKFold(n_splits=self.stacking_num, shuffle=True, random_state=1)
            for k,(train_index, test_index) in enumerate(self.SK.split(X, y)):
                X_train = X[train_index]
                y_train = y[train_index]
                X_test = X[test_index]
                y_test = y[test_index]

                lgb_train = lgb.Dataset(X_train, y_train)
                lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

                gbm = lgb.train(self.params,
                            lgb_train,
                            num_boost_round=self.num_boost_round,
                            valid_sets=lgb_eval,
                            early_stopping_rounds=self.early_stopping_rounds)

                self.stacking_model.append(gbm)

                pred_y = gbm.predict(X_test, num_iteration=gbm.best_iteration)
                layer_train[test_index, 1] = pred_y

            X = np.hstack((X, layer_train[:,1].reshape((-1,1)))) 
        else:
            pass
        for bn in range(self.bagging_num):
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.bagging_test_size, random_state=bn)

            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

            gbm = lgb.train(self.params,
                        lgb_train,
                        num_boost_round=10000,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=200)

            self.bagging_model.append(gbm)

    def predict(self, X_pred):
        """ predict test data. """
        if self.stacking_num > 1:
            test_pred = np.zeros((X_pred.shape[0], self.stacking_num))
            for sn,gbm in enumerate(self.stacking_model):
                pred = gbm.predict(X_pred, num_iteration=gbm.best_iteration)
                test_pred[:, sn] = pred
            X_pred = np.hstack((X_pred, test_pred.mean(axis=1).reshape((-1,1))))  
        else:
            pass 
        for bn,gbm in enumerate(self.bagging_model):
            pred = gbm.predict(X_pred, num_iteration=gbm.best_iteration)
            if bn == 0:
                pred_out=pred
            else:
                pred_out+=pred
        return pred_out/self.bagging_num

In [10]:
import pre_process
#读取相关文件
train_path = '../data/train.csv'
test_path = '../data/test.csv'
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

submit_path = '../data/车辆贷款违约预测挑战赛sample_submit.csv'
submit_data = pd.read_csv(submit_path)
#生成提交文件
submit_data['customer_id'] = test_data['customer_id']
submit_data['loan_default'] = 0
train_data,test_data = pre_process.fill_inf(train_data,test_data)#填补inf值
train_data,test_data = pre_process.del_singular_feature(train_data,test_data)#删除单值属性

In [5]:
def find_outliers(model,X,y,sigma=3):
    # predict y values using model
    try:
        y_pred = pd.Series(model.predict(X),index=y.index)
    # if predicting fails, try fitting the model first
    except:
        model.fit(X,y)
        y_pred = pd.Series(model.predict(X),index=y.index)
    
    # calculate residuals between the model prediction and true y values
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid  = resid.std()
    
    # calculate z statistic, define outliers to be where |z|>sigma
    z = (resid-mean_resid)/std_resid
    outliers = z[abs(z)>sigma].index
    return outliers

# 通过岭回归模型找出异常值，并绘制其分布

X_train = train_data.iloc[:,0:-1]
y_train = train_data.iloc[:,-1]
outliers = find_outliers(Ridge(),X_train,y_train)
outlier_index = list(outliers)
outlier_sum = np.sum(train_data.iloc[outlier_index]['loan_default'].values)
print(outlier_sum/len(outlier_index)*100)
train_data=train_data.drop(labels=outlier_index)

nan




In [11]:
train_data

Unnamed: 0,main_account_loan_no,main_account_active_loan_no,main_account_overdue_no,main_account_outstanding_loan,main_account_sanction_loan,main_account_disbursed_loan,sub_account_loan_no,sub_account_active_loan_no,sub_account_overdue_no,sub_account_outstanding_loan,...,total_monthly_payment,outstanding_disburse_ratio,main_account_tenure,sub_account_tenure,disburse_to_sactioned_ratio,active_to_inactive_act_ratio,Credit_level,employment_type,age,loan_default
0,4,3,0,384989,666207,666207,0,0,0,0,...,8169,1.73,81,0,1.00,2.50,1,0,51,0
1,7,2,0,268670,387994,387994,0,0,0,0,...,2400,1.44,161,0,1.00,1.33,9,0,27,0
2,5,4,1,3519013,3613854,3576048,0,0,0,0,...,0,1.02,3576048,0,0.99,3.00,13,1,28,0
3,43,13,6,1867106,2484678,2486856,0,0,0,0,...,4320912,1.33,0,0,1.00,1.42,3,1,55,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1.00,0,0,1.00,1.00,-1,0,24,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,1,0,0,0,0,0,0,0,0,0,...,0,1.00,0,0,1.00,1.00,13,0,33,0
149996,1,0,0,0,0,0,0,0,0,0,...,1996,1.00,0,0,1.00,1.00,6,1,24,0
149997,21,4,0,60522,119000,119000,0,0,0,0,...,30703,1.97,3,0,1.00,1.22,7,0,38,0
149998,0,0,0,0,0,0,0,0,0,0,...,0,1.00,0,0,1.00,1.00,-1,0,31,0


In [5]:
# from sklearn.feature_selection import VarianceThreshold
# from sklearn.datasets import load_iris

# #方差选择法，返回值为特征选择后的数据
# #参数threshold为方差的阈值
# train_data = VarianceThreshold(threshold=3).fit_transform(train_data)
# test_data = VarianceThreshold(threshold=3).fit_transform(test_data)

In [12]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold


features_columns = [col for col in train_data.columns if col not in ['loan_default']]
train = train_data[features_columns].values
test = test_data[features_columns].values
target =train_data['loan_default'].values

In [10]:
# from sklearn.feature_selection import SelectFromModel
# from sklearn.ensemble import GradientBoostingClassifier
 
# # GBDT作为基模型的特征选择
# SelectFromModel(GradientBoostingClassifier()).fit_transform(train,target)

array([[0.0000e+00, 3.1324e+04, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        5.1000e+01],
       [0.0000e+00, 5.3078e+04, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        2.7000e+01],
       [1.0000e+00, 5.3639e+04, 5.6000e+01, ..., 9.9000e-01, 1.0000e+00,
        2.8000e+01],
       ...,
       [0.0000e+00, 5.3278e+04, 1.1000e+01, ..., 1.0000e+00, 0.0000e+00,
        3.8000e+01],
       [0.0000e+00, 5.9066e+04, 2.6000e+01, ..., 1.0000e+00, 0.0000e+00,
        3.1000e+01],
       [0.0000e+00, 4.8349e+04, 1.1000e+01, ..., 1.0000e+00, 1.0000e+00,
        3.1000e+01]])

In [17]:
# params = {
#         'task': 'train',
#         'boosting_type': 'gbdt',
#         'objective': 'binary',
#         'metric': 'auc',
#         'num_leaves': 9,
#         'learning_rate': 0.03,
#         'feature_fraction_seed': 2,
#         'feature_fraction': 0.9,
#         'bagging_fraction': 0.8,
#         'bagging_freq': 5,
#         'min_data': 20,
#         'min_hessian': 1,
#         'verbose': -1,
#         'silent': 0
#         }
params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'num_leaves': 2 ** 7,
            'metric': 'auc',
            'min_child_weight': 5,
            'learning_rate': 0.01,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.9,
            'seed': 2021,
            'n_jobs':-1
        }
model = SBBTree(params=params,
                stacking_num=5,
                bagging_num=3,
                bagging_test_size=0.33,
                num_boost_round=10000,
                early_stopping_rounds=200)

model.fit(train, target)

[LightGBM] [Info] Number of positive: 21236, number of negative: 98764
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 120000.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6595
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 47
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 30000.000000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176967 -> initscore=-1.537035
[LightGBM] [Info] Start training from score -1.537035
[1]	valid_0's cross_entropy: 0.466439
Training until validation scores don't improve for 200 rounds
[2]	valid_0's cross_entropy: 0.466135
[3]	valid_0's cross_entropy: 0.465871
[4]	valid_0's cross_entropy: 0.465574
[5]	valid_0's cross_

[191]	valid_0's cross_entropy: 0.447392
[192]	valid_0's cross_entropy: 0.447359
[193]	valid_0's cross_entropy: 0.447332
[194]	valid_0's cross_entropy: 0.4473
[195]	valid_0's cross_entropy: 0.447264
[196]	valid_0's cross_entropy: 0.44724
[197]	valid_0's cross_entropy: 0.4472
[198]	valid_0's cross_entropy: 0.44719
[199]	valid_0's cross_entropy: 0.447168
[200]	valid_0's cross_entropy: 0.447144
[201]	valid_0's cross_entropy: 0.447115
[202]	valid_0's cross_entropy: 0.447091
[203]	valid_0's cross_entropy: 0.447061
[204]	valid_0's cross_entropy: 0.447031
[205]	valid_0's cross_entropy: 0.447005
[206]	valid_0's cross_entropy: 0.446981
[207]	valid_0's cross_entropy: 0.446964
[208]	valid_0's cross_entropy: 0.446941
[209]	valid_0's cross_entropy: 0.446916
[210]	valid_0's cross_entropy: 0.446894
[211]	valid_0's cross_entropy: 0.44686
[212]	valid_0's cross_entropy: 0.446844
[213]	valid_0's cross_entropy: 0.446812
[214]	valid_0's cross_entropy: 0.446783
[215]	valid_0's cross_entropy: 0.446747
[216]	v

[401]	valid_0's cross_entropy: 0.444317
[402]	valid_0's cross_entropy: 0.444312
[403]	valid_0's cross_entropy: 0.444314
[404]	valid_0's cross_entropy: 0.444304
[405]	valid_0's cross_entropy: 0.444293
[406]	valid_0's cross_entropy: 0.444292
[407]	valid_0's cross_entropy: 0.444288
[408]	valid_0's cross_entropy: 0.444281
[409]	valid_0's cross_entropy: 0.444275
[410]	valid_0's cross_entropy: 0.444274
[411]	valid_0's cross_entropy: 0.444267
[412]	valid_0's cross_entropy: 0.44427
[413]	valid_0's cross_entropy: 0.444254
[414]	valid_0's cross_entropy: 0.444258
[415]	valid_0's cross_entropy: 0.44426
[416]	valid_0's cross_entropy: 0.444258
[417]	valid_0's cross_entropy: 0.444246
[418]	valid_0's cross_entropy: 0.444233
[419]	valid_0's cross_entropy: 0.444232
[420]	valid_0's cross_entropy: 0.44422
[421]	valid_0's cross_entropy: 0.444215
[422]	valid_0's cross_entropy: 0.444215
[423]	valid_0's cross_entropy: 0.444212
[424]	valid_0's cross_entropy: 0.444218
[425]	valid_0's cross_entropy: 0.444203
[42

[612]	valid_0's cross_entropy: 0.443855
[613]	valid_0's cross_entropy: 0.443857
[614]	valid_0's cross_entropy: 0.443855
[615]	valid_0's cross_entropy: 0.443854
[616]	valid_0's cross_entropy: 0.443843
[617]	valid_0's cross_entropy: 0.443842
[618]	valid_0's cross_entropy: 0.443832
[619]	valid_0's cross_entropy: 0.443837
[620]	valid_0's cross_entropy: 0.443834
[621]	valid_0's cross_entropy: 0.443841
[622]	valid_0's cross_entropy: 0.443843
[623]	valid_0's cross_entropy: 0.443836
[624]	valid_0's cross_entropy: 0.44383
[625]	valid_0's cross_entropy: 0.443836
[626]	valid_0's cross_entropy: 0.44383
[627]	valid_0's cross_entropy: 0.443824
[628]	valid_0's cross_entropy: 0.443825
[629]	valid_0's cross_entropy: 0.44383
[630]	valid_0's cross_entropy: 0.443834
[631]	valid_0's cross_entropy: 0.443829
[632]	valid_0's cross_entropy: 0.443829
[633]	valid_0's cross_entropy: 0.443832
[634]	valid_0's cross_entropy: 0.443832
[635]	valid_0's cross_entropy: 0.443832
[636]	valid_0's cross_entropy: 0.443833
[63

[819]	valid_0's cross_entropy: 0.443872
[820]	valid_0's cross_entropy: 0.443882
[821]	valid_0's cross_entropy: 0.443889
[822]	valid_0's cross_entropy: 0.44389
[823]	valid_0's cross_entropy: 0.443892
[824]	valid_0's cross_entropy: 0.443899
[825]	valid_0's cross_entropy: 0.443901
[826]	valid_0's cross_entropy: 0.443905
[827]	valid_0's cross_entropy: 0.443907
[828]	valid_0's cross_entropy: 0.443911
[829]	valid_0's cross_entropy: 0.443915
[830]	valid_0's cross_entropy: 0.443913
[831]	valid_0's cross_entropy: 0.443913
[832]	valid_0's cross_entropy: 0.443913
[833]	valid_0's cross_entropy: 0.443917
[834]	valid_0's cross_entropy: 0.443916
[835]	valid_0's cross_entropy: 0.443913
[836]	valid_0's cross_entropy: 0.443917
[837]	valid_0's cross_entropy: 0.443923
[838]	valid_0's cross_entropy: 0.443921
[839]	valid_0's cross_entropy: 0.443913
[840]	valid_0's cross_entropy: 0.443913
[841]	valid_0's cross_entropy: 0.443915
[842]	valid_0's cross_entropy: 0.443917
[843]	valid_0's cross_entropy: 0.443917
[

[94]	valid_0's cross_entropy: 0.450142
[95]	valid_0's cross_entropy: 0.450061
[96]	valid_0's cross_entropy: 0.449972
[97]	valid_0's cross_entropy: 0.449876
[98]	valid_0's cross_entropy: 0.449786
[99]	valid_0's cross_entropy: 0.449707
[100]	valid_0's cross_entropy: 0.449634
[101]	valid_0's cross_entropy: 0.449546
[102]	valid_0's cross_entropy: 0.449483
[103]	valid_0's cross_entropy: 0.449403
[104]	valid_0's cross_entropy: 0.449312
[105]	valid_0's cross_entropy: 0.449233
[106]	valid_0's cross_entropy: 0.449166
[107]	valid_0's cross_entropy: 0.449092
[108]	valid_0's cross_entropy: 0.449017
[109]	valid_0's cross_entropy: 0.448942
[110]	valid_0's cross_entropy: 0.448865
[111]	valid_0's cross_entropy: 0.448795
[112]	valid_0's cross_entropy: 0.448707
[113]	valid_0's cross_entropy: 0.44864
[114]	valid_0's cross_entropy: 0.448572
[115]	valid_0's cross_entropy: 0.448495
[116]	valid_0's cross_entropy: 0.448422
[117]	valid_0's cross_entropy: 0.448359
[118]	valid_0's cross_entropy: 0.448295
[119]	v

[304]	valid_0's cross_entropy: 0.441902
[305]	valid_0's cross_entropy: 0.441884
[306]	valid_0's cross_entropy: 0.441857
[307]	valid_0's cross_entropy: 0.441841
[308]	valid_0's cross_entropy: 0.441833
[309]	valid_0's cross_entropy: 0.441821
[310]	valid_0's cross_entropy: 0.441799
[311]	valid_0's cross_entropy: 0.441777
[312]	valid_0's cross_entropy: 0.441766
[313]	valid_0's cross_entropy: 0.441752
[314]	valid_0's cross_entropy: 0.441727
[315]	valid_0's cross_entropy: 0.441698
[316]	valid_0's cross_entropy: 0.441683
[317]	valid_0's cross_entropy: 0.441676
[318]	valid_0's cross_entropy: 0.441657
[319]	valid_0's cross_entropy: 0.441639
[320]	valid_0's cross_entropy: 0.441619
[321]	valid_0's cross_entropy: 0.441603
[322]	valid_0's cross_entropy: 0.441594
[323]	valid_0's cross_entropy: 0.441581
[324]	valid_0's cross_entropy: 0.441572
[325]	valid_0's cross_entropy: 0.441553
[326]	valid_0's cross_entropy: 0.441537
[327]	valid_0's cross_entropy: 0.441521
[328]	valid_0's cross_entropy: 0.441505


[513]	valid_0's cross_entropy: 0.439819
[514]	valid_0's cross_entropy: 0.439818
[515]	valid_0's cross_entropy: 0.439812
[516]	valid_0's cross_entropy: 0.439815
[517]	valid_0's cross_entropy: 0.439817
[518]	valid_0's cross_entropy: 0.439812
[519]	valid_0's cross_entropy: 0.439804
[520]	valid_0's cross_entropy: 0.439802
[521]	valid_0's cross_entropy: 0.439795
[522]	valid_0's cross_entropy: 0.43979
[523]	valid_0's cross_entropy: 0.439789
[524]	valid_0's cross_entropy: 0.439788
[525]	valid_0's cross_entropy: 0.439781
[526]	valid_0's cross_entropy: 0.43978
[527]	valid_0's cross_entropy: 0.439776
[528]	valid_0's cross_entropy: 0.439769
[529]	valid_0's cross_entropy: 0.439766
[530]	valid_0's cross_entropy: 0.439758
[531]	valid_0's cross_entropy: 0.439746
[532]	valid_0's cross_entropy: 0.439731
[533]	valid_0's cross_entropy: 0.439726
[534]	valid_0's cross_entropy: 0.439727
[535]	valid_0's cross_entropy: 0.439727
[536]	valid_0's cross_entropy: 0.439725
[537]	valid_0's cross_entropy: 0.439731
[5

[722]	valid_0's cross_entropy: 0.43944
[723]	valid_0's cross_entropy: 0.439442
[724]	valid_0's cross_entropy: 0.439448
[725]	valid_0's cross_entropy: 0.439448
[726]	valid_0's cross_entropy: 0.439454
[727]	valid_0's cross_entropy: 0.439452
[728]	valid_0's cross_entropy: 0.439453
[729]	valid_0's cross_entropy: 0.439439
[730]	valid_0's cross_entropy: 0.43944
[731]	valid_0's cross_entropy: 0.439441
[732]	valid_0's cross_entropy: 0.439439
[733]	valid_0's cross_entropy: 0.439444
[734]	valid_0's cross_entropy: 0.439449
[735]	valid_0's cross_entropy: 0.439445
[736]	valid_0's cross_entropy: 0.439439
[737]	valid_0's cross_entropy: 0.439437
[738]	valid_0's cross_entropy: 0.439443
[739]	valid_0's cross_entropy: 0.439445
[740]	valid_0's cross_entropy: 0.439447
[741]	valid_0's cross_entropy: 0.439436
[742]	valid_0's cross_entropy: 0.439433
[743]	valid_0's cross_entropy: 0.439437
[744]	valid_0's cross_entropy: 0.43944
[745]	valid_0's cross_entropy: 0.439438
[746]	valid_0's cross_entropy: 0.439437
[74

[930]	valid_0's cross_entropy: 0.439326
[931]	valid_0's cross_entropy: 0.439323
[932]	valid_0's cross_entropy: 0.439324
[933]	valid_0's cross_entropy: 0.439331
[934]	valid_0's cross_entropy: 0.439329
[935]	valid_0's cross_entropy: 0.439321
[936]	valid_0's cross_entropy: 0.439317
[937]	valid_0's cross_entropy: 0.439314
[938]	valid_0's cross_entropy: 0.439312
[939]	valid_0's cross_entropy: 0.439316
[940]	valid_0's cross_entropy: 0.43931
[941]	valid_0's cross_entropy: 0.439307
[942]	valid_0's cross_entropy: 0.439305
[943]	valid_0's cross_entropy: 0.439301
[944]	valid_0's cross_entropy: 0.439297
[945]	valid_0's cross_entropy: 0.439292
[946]	valid_0's cross_entropy: 0.439292
[947]	valid_0's cross_entropy: 0.439288
[948]	valid_0's cross_entropy: 0.439289
[949]	valid_0's cross_entropy: 0.439284
[950]	valid_0's cross_entropy: 0.439282
[951]	valid_0's cross_entropy: 0.439276
[952]	valid_0's cross_entropy: 0.439278
[953]	valid_0's cross_entropy: 0.439276
[954]	valid_0's cross_entropy: 0.439267
[

[1136]	valid_0's cross_entropy: 0.439154
[1137]	valid_0's cross_entropy: 0.439157
[1138]	valid_0's cross_entropy: 0.439154
[1139]	valid_0's cross_entropy: 0.439155
[1140]	valid_0's cross_entropy: 0.439161
[1141]	valid_0's cross_entropy: 0.439152
[1142]	valid_0's cross_entropy: 0.439154
[1143]	valid_0's cross_entropy: 0.439154
[1144]	valid_0's cross_entropy: 0.439148
[1145]	valid_0's cross_entropy: 0.439152
[1146]	valid_0's cross_entropy: 0.439149
[1147]	valid_0's cross_entropy: 0.439146
[1148]	valid_0's cross_entropy: 0.439144
[1149]	valid_0's cross_entropy: 0.439148
[1150]	valid_0's cross_entropy: 0.439149
[1151]	valid_0's cross_entropy: 0.43915
[1152]	valid_0's cross_entropy: 0.439154
[1153]	valid_0's cross_entropy: 0.439147
[1154]	valid_0's cross_entropy: 0.439147
[1155]	valid_0's cross_entropy: 0.439147
[1156]	valid_0's cross_entropy: 0.439152
[1157]	valid_0's cross_entropy: 0.439154
[1158]	valid_0's cross_entropy: 0.439146
[1159]	valid_0's cross_entropy: 0.43915
[1160]	valid_0's c

[11]	valid_0's cross_entropy: 0.463665
[12]	valid_0's cross_entropy: 0.463421
[13]	valid_0's cross_entropy: 0.463177
[14]	valid_0's cross_entropy: 0.462943
[15]	valid_0's cross_entropy: 0.462708
[16]	valid_0's cross_entropy: 0.462472
[17]	valid_0's cross_entropy: 0.462247
[18]	valid_0's cross_entropy: 0.462007
[19]	valid_0's cross_entropy: 0.461803
[20]	valid_0's cross_entropy: 0.461581
[21]	valid_0's cross_entropy: 0.461357
[22]	valid_0's cross_entropy: 0.461156
[23]	valid_0's cross_entropy: 0.46096
[24]	valid_0's cross_entropy: 0.460764
[25]	valid_0's cross_entropy: 0.460558
[26]	valid_0's cross_entropy: 0.460353
[27]	valid_0's cross_entropy: 0.460162
[28]	valid_0's cross_entropy: 0.459992
[29]	valid_0's cross_entropy: 0.459805
[30]	valid_0's cross_entropy: 0.459626
[31]	valid_0's cross_entropy: 0.45944
[32]	valid_0's cross_entropy: 0.459265
[33]	valid_0's cross_entropy: 0.459079
[34]	valid_0's cross_entropy: 0.458897
[35]	valid_0's cross_entropy: 0.458721
[36]	valid_0's cross_entrop

KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier


# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.5, random_state=0)

# model 
clf = RandomForestClassifier(n_jobs=-1)

# Set the parameters by cross-validation

tuned_parameters = {
                    'n_estimators': [50, 100, 200]
#                     ,'criterion': ['gini', 'entropy']
#                     ,'max_depth': [2, 5]
#                     ,'max_features': ['log2', 'sqrt', 'int']
#                     ,'bootstrap': [True, False]
#                     ,'warm_start': [True, False]
                    }

scores = ['precision']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(clf, tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()


In [9]:
pred = model.predict(test)
# df_out = pd.DataFrame()
# df_out['user_id'] = test_data['user_id']
# df_out['predict_prob'] = pred
# df_out.head()


0.2128

In [None]:
np.sum([pred>0.207])/len(pred)

In [32]:
new_pred = []
threshold = 0.207
for index,x in enumerate(pred):
    if x>threshold:
        new_pred.append(1)
    else:
        new_pred.append(0)
new_pred = np.array(new_pred)

In [33]:
submit_data['loan_default'] = new_pred

ValueError: Length of values (29894) does not match length of index (30000)

In [81]:
submit_data.to_csv('lgb_stacking.csv',index=False)