In [45]:
import pandas as pd
import lightgbm as lgb
import statistics
from efficient_tuning import *
from sklearn.model_selection import KFold

In [2]:
train = pd.read_csv('./final/train_final.csv', engine='python')
test = pd.read_csv('./final/test_final.csv', engine='python')

In [3]:
data = pd.concat([train, test], axis=0, ignore_index=True)
data.shape

(100000, 146)

In [4]:
train.shape

(50000, 146)

## lgb默认参数

In [5]:
data.shape[0] == sum(data['loan_status'].value_counts())

True

In [6]:
X = data.drop(columns='loan_status').values
y = data['loan_status'].values

In [7]:
train_data = lgb.Dataset(X, label=y)

In [8]:
param = {'num_leaves': 64, 'objective': 'binary'}
param['metric'] = 'binary_error'
num_round = 10

In [None]:
lgb.cv(param, train_data, num_round, nfold=5)

In [None]:
"""{'binary_error-mean': [0.19985999999999998,
  0.19985999999999998,
  0.19985999999999998,
  0.19985999999999998,
  0.14916,
  0.12000999999999999,
  0.10207999999999999,
  0.09613999999999999,
  0.09107000000000001,
  0.08845],
 'binary_error-stdv': [1.99999999999978e-05,
  1.99999999999978e-05,
  1.99999999999978e-05,
  1.99999999999978e-05,
  0.0020941346661568845,
  0.0015242703172337918,
  0.0018993683160461554,
  0.0020594659501919437,
  0.002299695632034813,
  0.0020344532435030266]}
  """

## 自动调参选择最优参数

In [10]:
fitter = LGBFitter(label='loan_status')

In [11]:
kfold = KFold(n_splits=5)

In [None]:
# 耗时：1:45:23
fitter.search_k_fold(kfold, data)

In [17]:
fitter.opt_params

{'bagging_fraction': 0.9834654040956224,
 'boosting': 2,
 'device_type': 0,
 'drop_rate': 0.1997907495675753,
 'extra_tress': 1,
 'feature_fraction': 0.6838115928097803,
 'lambda_l1': 5.187010489820754,
 'lambda_l2': 5.837652960694227,
 'learning_rate': 0.07163325236711929,
 'metric': 0,
 'min_gain_to_split': 0.5254984036581041,
 'num_leaves': 0,
 'num_rounds': 0,
 'num_threads': 0,
 'objective': 0,
 'uniform_drop': 0}

In [5]:
# params = {'num_thread': 4, 'num_leaves': 12, 'metric': 'binary', 'objective': 'binary',
#                 'num_round': 2000, 'learning_rate': 0.02, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}

## 特征选择

### 全量特征baseline
效果好于lgb默认参数

In [12]:
params = {
    'num_thread': 4,
    'bagging_fraction': 0.9834654040956224,
    'boosting': 'goss',
    'device_type': 'cpu',
    'drop_rate': 0.1997907495675753,
    'extra_tress': 1,
    'feature_fraction': 0.6838115928097803,
    'lambda_l1': 5.187010489820754,
    'lambda_l2': 5.837652960694227,
    'learning_rate': 0.07163325236711929,
    'metric': 'binary_error',
    'min_gain_to_split': 0.5254984036581041,
    'num_leaves': 64,
    'num_round': 1000,
    'objective': 'binary',
    'uniform_drop': True,
}

In [18]:
%%time
*_, base_acc_result, base_models = fitter.train_k_fold(kfold, train, test, params=params)

The minimum is attained in round 51
Finished loading model, total used 970 iterations
The minimum is attained in round 50
Finished loading model, total used 952 iterations
The minimum is attained in round 32
Finished loading model, total used 964 iterations
The minimum is attained in round 230
Finished loading model, total used 959 iterations
The minimum is attained in round 157
Finished loading model, total used 956 iterations
CPU times: user 58.6 s, sys: 526 ms, total: 59.1 s
Wall time: 15.2 s


In [46]:
mean_base_acc_result = statistics.mean(base_acc_result)

In [48]:
mean_base_acc_result

0.08036000000000001

In [26]:
import sys
sys.getsizeof(best_base_models)

48

### 去除一个feature变量

In [37]:
columns = data.columns.to_list()
columns.remove('loan_status')

In [None]:
%%time
# 半小时左右
del_one_feature_base_acc_res = []
del_one_feature_mean_base_acc_res = []
for del_feature_name in columns:
    temp_train = train.drop(columns=del_feature_name)
    temp_test = test.drop(columns=del_feature_name)
    temp_fitter = LGBFitter(label='loan_status')
    temp_kfold = KFold(n_splits=5)
    *_, temp_base_acc_result, _ = temp_fitter.train_k_fold(
        temp_kfold, temp_train, temp_test, params=params)
    del_one_feature_base_acc_res.append(temp_base_acc_result)
    del_one_feature_mean_base_acc_res.append(statistics.mean(temp_base_acc_result))

In [59]:
sorted_result = sorted(list(zip(columns, del_one_feature_mean_base_acc_res)), key = lambda x: x[1])

In [64]:
gt_base_acc = list(filter(lambda x: x[1] < mean_base_acc_result, sorted_result))

In [76]:
gt_columns = [i[0] for i in gt_base_acc]

In [97]:
# 大于基准的结果排序
gt_base_acc

[('discrete_purpose_1_one_hot', 0.07976000000000001),
 ('discrete_purpose_5_one_hot', 0.07977999999999999),
 ('discrete_emp_length_8_one_hot', 0.07986),
 ('discrete_sub_grade_6_one_hot', 0.07987999999999999),
 ('discrete_grade_3_one_hot', 0.07988000000000002),
 ('discrete_addr_state_44_one_hot', 0.07992000000000002),
 ('continuous_annual_inc', 0.07996),
 ('discrete_emp_length_7_one_hot', 0.07996),
 ('discrete_purpose_10_one_hot', 0.07997999999999998),
 ('discrete_term_1_one_hot', 0.07999999999999999),
 ('discrete_term_2_one_hot', 0.07999999999999999),
 ('discrete_home_ownership_3_one_hot', 0.08),
 ('discrete_addr_state_25_one_hot', 0.08001999999999998),
 ('discrete_emp_length_9_one_hot', 0.08001999999999998),
 ('discrete_addr_state_19_one_hot', 0.08002000000000001),
 ('discrete_sub_grade_11_one_hot', 0.08002000000000001),
 ('discrete_addr_state_1_one_hot', 0.08004000000000003),
 ('continuous_annual_inc_joint', 0.08008),
 ('continuous_dti_joint', 0.08008),
 ('discrete_addr_state_8_one_h

### 将大于baseline的结果叠加组合删除

In [None]:
%%time
# 分别去除前2-15个变量查看效果
del_combina_acc_res = []
del_combina_mean_acc_res = []
for num in range(2, 16):
    temp_train = train.drop(columns=gt_columns[:num])
    temp_test = test.drop(columns=gt_columns[: num])
    temp_fitter = LGBFitter(label='loan_status')
    temp_kfold = KFold(n_splits=5)
    *_, temp_base_acc_result, _ = temp_fitter.train_k_fold(
        temp_kfold, temp_train, temp_test, params=params)
    del_combina_acc_res.append(temp_base_acc_result)
    del_combina_mean_acc_res.append(statistics.mean(temp_base_acc_result))

In [100]:
combina_nams__mean_acc_res = dict(zip(list(range(2, 16)), del_combina_mean_acc_res))
combina_nams__mean_acc_res

{2: 0.0803,
 3: 0.0806,
 4: 0.07962,
 5: 0.08028000000000002,
 6: 0.08018000000000003,
 7: 0.08008,
 8: 0.08042,
 9: 0.08032000000000002,
 10: 0.08027999999999999,
 11: 0.08079999999999998,
 12: 0.08115999999999998,
 13: 0.08080000000000001,
 14: 0.08035999999999999,
 15: 0.08066}

In [93]:
mean_base_acc_result - 0.07962

0.0007400000000000184

In [104]:
sorted(combina_nams__mean_acc_res.items(), key= lambda kv: kv[1])

[(4, 0.07962),
 (7, 0.08008),
 (6, 0.08018000000000003),
 (10, 0.08027999999999999),
 (5, 0.08028000000000002),
 (2, 0.0803),
 (9, 0.08032000000000002),
 (14, 0.08035999999999999),
 (8, 0.08042),
 (3, 0.0806),
 (15, 0.08066),
 (11, 0.08079999999999998),
 (13, 0.08080000000000001),
 (12, 0.08115999999999998)]

### 由于上面都是onehot之后的变量，尝试删除所有purpose变量看看效果

In [108]:
drop_purpose = [i for i in data.columns if 'discrete_purpose' in i]
temp_train = train.drop(columns=drop_purpose)
temp_test = test.drop(columns=drop_purpose)
temp_fitter = LGBFitter(label='loan_status')
temp_kfold = KFold(n_splits=5)
*_, temp_base_acc_result, _ = temp_fitter.train_k_fold(temp_kfold, temp_train, temp_test, params=params)

The minimum is attained in round 68
Finished loading model, total used 972 iterations
The minimum is attained in round 31
Finished loading model, total used 962 iterations
The minimum is attained in round 38
Finished loading model, total used 941 iterations
The minimum is attained in round 290
Finished loading model, total used 957 iterations
The minimum is attained in round 52
Finished loading model, total used 965 iterations


In [110]:
statistics.mean(temp_base_acc_result)

0.08026

### 变量关于loan_status的相关性系数过滤

In [111]:
loan_status_cor = data.corr()[['loan_status']]

In [155]:
loan_status_cor['corr_absolute_value'] = loan_status_cor['loan_status'].abs()
cor_del_feature = loan_status_cor[(loan_status_cor['corr_absolute_value'] < 0.01) | (
    loan_status_cor['corr_absolute_value'].isna())].index.to_list()
cor_del_feature

['continuous_mths_since_last_delinq',
 'continuous_mths_since_last_major_derog',
 'discrete_addr_state_1_one_hot',
 'discrete_addr_state_2_one_hot',
 'discrete_addr_state_3_one_hot',
 'discrete_addr_state_5_one_hot',
 'discrete_addr_state_7_one_hot',
 'discrete_addr_state_8_one_hot',
 'discrete_addr_state_9_one_hot',
 'discrete_addr_state_10_one_hot',
 'discrete_addr_state_11_one_hot',
 'discrete_addr_state_12_one_hot',
 'discrete_addr_state_13_one_hot',
 'discrete_addr_state_14_one_hot',
 'discrete_addr_state_15_one_hot',
 'discrete_addr_state_16_one_hot',
 'discrete_addr_state_17_one_hot',
 'discrete_addr_state_18_one_hot',
 'discrete_addr_state_20_one_hot',
 'discrete_addr_state_22_one_hot',
 'discrete_addr_state_24_one_hot',
 'discrete_addr_state_25_one_hot',
 'discrete_addr_state_26_one_hot',
 'discrete_addr_state_27_one_hot',
 'discrete_addr_state_28_one_hot',
 'discrete_addr_state_29_one_hot',
 'discrete_addr_state_30_one_hot',
 'discrete_addr_state_31_one_hot',
 'discrete_addr_

In [156]:
temp_train = train.drop(columns=cor_del_feature)
temp_test = test.drop(columns=cor_del_feature)
temp_fitter = LGBFitter(label='loan_status')
temp_kfold = KFold(n_splits=5)
*_, temp_base_acc_result, _ = temp_fitter.train_k_fold(temp_kfold, temp_train, temp_test, params=params)

The minimum is attained in round 44
Finished loading model, total used 955 iterations
The minimum is attained in round 47
Finished loading model, total used 943 iterations
The minimum is attained in round 78
Finished loading model, total used 943 iterations
The minimum is attained in round 337
Finished loading model, total used 939 iterations
The minimum is attained in round 170
Finished loading model, total used 951 iterations


In [157]:
statistics.mean(temp_base_acc_result)

0.07984