## 读取数据

In [1]:
# 加载依赖
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import GridSearchCV

import lightgbm as lgb

from tqdm import tqdm

In [2]:
trainData = pd.read_csv('./data/train_set.csv')
testData = pd.read_csv('./data/test_set.csv')
trainDataLen = trainData.shape[0]

allData = pd.concat([trainData, testData], sort=False)
allData.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1,43,management,married,tertiary,no,291,yes,no,unknown,9,may,150,2,-1,0,unknown,0.0
1,2,42,technician,divorced,primary,no,5076,yes,no,cellular,7,apr,99,1,251,2,other,0.0
2,3,47,admin.,married,secondary,no,104,yes,yes,cellular,14,jul,77,2,-1,0,unknown,0.0
3,4,28,management,single,secondary,no,-994,yes,yes,cellular,18,jul,174,2,-1,0,unknown,0.0
4,5,42,technician,divorced,secondary,no,2974,yes,no,unknown,21,may,187,5,-1,0,unknown,0.0


##  参数配置

## 数据分析

In [3]:
allData.describe()

Unnamed: 0,ID,age,balance,day,duration,campaign,pdays,previous,y
count,36169.0,36169.0,36169.0,36169.0,36169.0,36169.0,36169.0,36169.0,25317.0
mean,18085.0,40.966961,1368.237026,15.788742,257.574497,2.771545,40.160552,0.578147,0.116957
std,10441.23528,10.639679,3098.876172,8.314523,255.040668,3.114445,100.005886,2.365582,0.321375
min,1.0,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0,0.0
25%,9043.0,33.0,73.0,8.0,103.0,1.0,-1.0,0.0,0.0
50%,18085.0,39.0,449.0,16.0,181.0,2.0,-1.0,0.0,0.0
75%,27127.0,48.0,1437.0,21.0,318.0,3.0,-1.0,0.0,0.0
max,36169.0,95.0,102127.0,31.0,3881.0,58.0,871.0,275.0,1.0


In [4]:
def uniqueValueOfDF(data):
    dic = {}
    objects = data.columns[data.dtypes=='object']
    max_len = 0
    for obj in objects:
        dic[obj] = pd.Series(sorted(data[obj].unique()))
        max_len = max(max_len, len(dic[obj]))
    
    return pd.DataFrame(dic).fillna('-')

In [5]:
allColTypes = uniqueValueOfDF(allData)
allColTypes

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
0,admin.,divorced,primary,no,no,no,cellular,apr,failure
1,blue-collar,married,secondary,yes,yes,yes,telephone,aug,other
2,entrepreneur,single,tertiary,-,-,-,unknown,dec,success
3,housemaid,-,unknown,-,-,-,-,feb,unknown
4,management,-,-,-,-,-,-,jan,-
5,retired,-,-,-,-,-,-,jul,-
6,self-employed,-,-,-,-,-,-,jun,-
7,services,-,-,-,-,-,-,mar,-
8,student,-,-,-,-,-,-,may,-
9,technician,-,-,-,-,-,-,nov,-


In [6]:
# output data used to analyze late
train_use_col = list(set(allData.columns) - set(['ID','y']))

X1 = allData[train_use_col][:trainDataLen]
y1 = allData['y'][:trainDataLen]

X2 = allData[train_use_col][trainDataLen:]

## 特征融合

In [7]:
sparse_feature= ['campaign','contact','default','education','housing','job','loan','marital','month','poutcome']
dense_feature = list(set(train_use_col)-set(sparse_feature))

def get_new_columns(name,aggs):
    l=[]
    for k in aggs.keys():
        for agg in aggs[k]:
            if str(type(agg))=="<class 'function'>":
                l.append(name + '_' + k + '_' + 'other')
            else:
                l.append(name + '_' + k + '_' + agg)
    return l

for d in tqdm(train_use_col):
    aggs={}
    if d in sparse_feature:
        aggs[d]=['count','nunique']
    if d in dense_feature:
        aggs[d]=['mean','max','min','std']
    temp=allData.groupby(d).agg(aggs).reset_index()
    temp.columns=[d]+get_new_columns(d,aggs)
    allData=pd.merge(allData,temp,on=d,how='left')

  0%|                                                                                           | 0/16 [00:00<?, ?it/s]

{'duration': ['mean', 'max', 'min', 'std']}
{'campaign': ['count', 'nunique']}


 12%|██████████▍                                                                        | 2/16 [00:00<00:00, 14.09it/s]

{'education': ['count', 'nunique']}


 19%|███████████████▌                                                                   | 3/16 [00:00<00:01, 11.51it/s]

{'day': ['mean', 'max', 'min', 'std']}
{'balance': ['mean', 'max', 'min', 'std']}


 31%|█████████████████████████▉                                                         | 5/16 [00:00<00:00, 11.53it/s]

{'housing': ['count', 'nunique']}


 38%|███████████████████████████████▏                                                   | 6/16 [00:00<00:00, 10.18it/s]

{'age': ['mean', 'max', 'min', 'std']}


 44%|████████████████████████████████████▎                                              | 7/16 [00:00<00:00,  9.72it/s]

{'loan': ['count', 'nunique']}


 50%|█████████████████████████████████████████▌                                         | 8/16 [00:00<00:00,  8.77it/s]

{'contact': ['count', 'nunique']}


 56%|██████████████████████████████████████████████▋                                    | 9/16 [00:00<00:00,  7.86it/s]

{'month': ['count', 'nunique']}


 62%|███████████████████████████████████████████████████▎                              | 10/16 [00:01<00:00,  7.28it/s]

{'default': ['count', 'nunique']}


 69%|████████████████████████████████████████████████████████▍                         | 11/16 [00:01<00:00,  7.04it/s]

{'pdays': ['mean', 'max', 'min', 'std']}


 75%|█████████████████████████████████████████████████████████████▌                    | 12/16 [00:01<00:00,  6.84it/s]

{'previous': ['mean', 'max', 'min', 'std']}


 81%|██████████████████████████████████████████████████████████████████▋               | 13/16 [00:01<00:00,  6.50it/s]

{'job': ['count', 'nunique']}


 88%|███████████████████████████████████████████████████████████████████████▊          | 14/16 [00:01<00:00,  5.81it/s]

{'poutcome': ['count', 'nunique']}


 94%|████████████████████████████████████████████████████████████████████████████▉     | 15/16 [00:02<00:00,  5.35it/s]

{'marital': ['count', 'nunique']}


100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  4.96it/s]


In [8]:
for f in sparse_feature:
    allData['count_day_month_{}'.format(f)] = allData.groupby(['day','month',f])[f].transform('count')
    allData['count_month_{}'.format(f)] = allData.groupby(['month',f])[f].transform('count')
    allData['count_day_{}'.format(f)] = allData.groupby(['day',f])[f].transform('count')
    allData['count_day_month_{}/count_month_{}'.format(f,f)] = allData['count_month_{}'.format(f)]/allData['count_day_month_{}'.format(f)]

In [9]:
allData.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,...,count_day_marital,count_day_month_marital/count_month_marital,count_day_month_month,count_month_month,count_day_month,count_day_month_month/count_month_month,count_day_month_poutcome,count_month_poutcome,count_day_poutcome,count_day_month_poutcome/count_month_poutcome
0,1,43,management,married,tertiary,no,291,yes,no,unknown,...,783,35.466667,338,10957,338,32.41716,338,8949,1101,26.476331
1,2,42,technician,divorced,primary,no,5076,yes,no,cellular,...,170,24.555556,71,2365,71,33.309859,7,202,57,28.857143
2,3,47,admin.,married,secondary,no,104,yes,yes,cellular,...,939,21.138554,238,5569,238,23.39916,235,5363,1192,22.821277
3,4,28,management,single,secondary,no,-994,yes,yes,cellular,...,575,25.415094,206,5569,206,27.033981,206,5363,1425,26.033981
4,5,42,technician,divorced,secondary,no,2974,yes,no,unknown,...,173,53.291667,365,10957,365,30.019178,358,8949,1386,24.997207


## 编码

In [10]:
encodingFlag = True
if encodingFlag:
    for col in allData.columns[allData.dtypes == 'object']:
        le = LabelEncoder()
        le.fit(allData[col])

        allData[col] = le.transform(allData[col])
    allData.head()

    X1 = allData[train_use_col][:trainDataLen]
    y1 = allData['y'][:trainDataLen]

    X2 = allData[trainDataLen:]

## 数据归一化

In [1]:
# 对于基于树的模型来说，数据变换或者归一化没什么作用
dataScale = False
if dataScale:
    train_use_col = list(set(allData.columns)-set(['ID','y']))

    scaler = StandardScaler()
    scaler.fit(allData[train_use_col])
    allData[train_use_col] = scaler.transform(allData[train_use_col])
    allData[train_use_col].head()

In [12]:
if dataScale:
    X1 = allData[train_use_col][:trainDataLen]
    y1 = allData['y'][:trainDataLen]

    X2 = allData[trainDataLen:]

    delElements = X1.columns[X1.isna().any()].tolist()
    for ele in delElements:
        train_use_col.remove(ele)

    X1.head()

## 特征选择

In [13]:
y1_predict = np.zeros(len(y1))
len(train_use_col)

95

In [14]:
params1 = {
    'task':'train',
    'boosting_type':'gbdt',
    'objective':'binary',
    'metric':{'auc'},
    'verbose':0,
    'learning_rate':0.01,
    'is_unbalance':True,
    'num_leaves' : 30,
    'reg_alpha' : 0,
    'reg_lambda' : 0,
    'max_depth' : -1,
    'n_estimators' : 1500,
    'colsample_bytree':0.7,
    'subsample':0.95,
    'subsample_freq':1,
    'random_state':201907
}

n_splits = 10
kfold = KFold(n_splits, shuffle=True, random_state=88)
pred = 0
i = 0
fold_importance={}
for train_idx, val_idx in kfold.split(X1[train_use_col]):
    print("fold: ",i)
    i=i+1
    train_data = lgb.Dataset(X1.loc[train_idx][train_use_col], label = y1.loc[train_idx])
    val_data = lgb.Dataset(X1.loc[val_idx][train_use_col], label = y1.loc[val_idx])
    
    model = lgb.train(params1,
                  train_data,
                  num_boost_round=1000,
                  valid_sets=[train_data,val_data],
                  early_stopping_rounds=50,
                  # categorical_feature=['job','marital','education','default','housing','loan','contact','poutcome'],
                  verbose_eval = 300
                 )
    pred += model.predict(X2[train_use_col],num_iteratio=model.best_iteration)
    print(len(model.feature_importance()))
    fold_importance["importance_fold_{}".format(i)] = model.feature_importance()
    y1_predict[val_idx] = model.predict(X1.loc[val_idx,:][train_use_col], num_iteration=model.best_iteration)
    
pred = pred / n_splits
print("roc_auc_score:  ",roc_auc_score(y1, y1_predict))

fold:  0




Training until validation scores don't improve for 50 rounds.
[300]	training's auc: 0.956697	valid_1's auc: 0.940148
Early stopping, best iteration is:
[269]	training's auc: 0.954968	valid_1's auc: 0.940232
95
fold:  1
Training until validation scores don't improve for 50 rounds.
[300]	training's auc: 0.956269	valid_1's auc: 0.946148
[600]	training's auc: 0.969912	valid_1's auc: 0.949133
Early stopping, best iteration is:
[579]	training's auc: 0.969054	valid_1's auc: 0.9492
95
fold:  2
Training until validation scores don't improve for 50 rounds.
[300]	training's auc: 0.957209	valid_1's auc: 0.935612
[600]	training's auc: 0.970668	valid_1's auc: 0.937722
Early stopping, best iteration is:
[657]	training's auc: 0.972926	valid_1's auc: 0.937846
95
fold:  3
Training until validation scores don't improve for 50 rounds.
[300]	training's auc: 0.956635	valid_1's auc: 0.937445
[600]	training's auc: 0.97063	valid_1's auc: 0.940667
Early stopping, best iteration is:
[637]	training's auc: 0.97209

In [15]:
len(train_use_col)

95

In [16]:
fold_importance['sum'] = 0
for i in range(1,n_splits+1):
    fold_importance['sum'] += fold_importance['importance_fold_{}'.format(i)]
train_use_col = np.array(train_use_col)[(fold_importance['sum']!=0).tolist()].tolist()

In [17]:
len(train_use_col)
len((fold_importance['sum']).tolist())

95

## 模型搭建

In [18]:
y1_predict = np.zeros(len(y1))

In [19]:
params1 = {
    'task':'train',
    'boosting_type':'gbdt',
    'objective':'binary',
    'metric':{'auc'},
    'verbose':0,
    'learning_rate':0.01,
    'is_unbalance':True,
    'num_leaves' : 30,
    'reg_alpha' : 0,
    'reg_lambda' : 0,
    'max_depth' : -1,
    'n_estimators' : 1500,
    'colsample_bytree':0.7,
    'subsample':0.95,
    'subsample_freq':1,
    'random_state':201907
}
params2 = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'auc',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 666}
params3 = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'min_child_samples': 46,
    'min_child_weight': 0.01,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'num_leaves': 31,
    'max_depth': 5,
    'lambda_l2': 1,
    'lambda_l1': 0,
    'n_jobs': -1,
    'seed': 4590
}

n_splits = 10
kfold = KFold(n_splits, shuffle=True, random_state=88)
pred = 0
i = 0
fold_importance={}
for train_idx, val_idx in kfold.split(X1[train_use_col]):
    print("fold: ",i)
    i=i+1
    train_data = lgb.Dataset(X1.loc[train_idx][train_use_col], label = y1.loc[train_idx])
    val_data = lgb.Dataset(X1.loc[val_idx][train_use_col], label = y1.loc[val_idx])
    
    model = lgb.train(params1,
                  train_data,
                  num_boost_round=1000,
                  valid_sets=[train_data,val_data],
                  early_stopping_rounds=50,
                  # categorical_feature=['job','marital','education','default','housing','loan','contact','poutcome'],
                  verbose_eval = 300
                 )
    pred += model.predict(X2[train_use_col],num_iteratio=model.best_iteration)
    fold_importance["importance_fold_{}".format(i)] = model.feature_importance()
    y1_predict[val_idx] = model.predict(X1.loc[val_idx,:][train_use_col], num_iteration=model.best_iteration)
    
pred = pred / n_splits

print("roc_auc_score:  ",roc_auc_score(y1, y1_predict))

fold:  0




Training until validation scores don't improve for 50 rounds.
[300]	training's auc: 0.956697	valid_1's auc: 0.940148
Early stopping, best iteration is:
[269]	training's auc: 0.954968	valid_1's auc: 0.940232
fold:  1
Training until validation scores don't improve for 50 rounds.
[300]	training's auc: 0.956269	valid_1's auc: 0.946148
[600]	training's auc: 0.969912	valid_1's auc: 0.949133
Early stopping, best iteration is:
[579]	training's auc: 0.969054	valid_1's auc: 0.9492
fold:  2
Training until validation scores don't improve for 50 rounds.
[300]	training's auc: 0.957209	valid_1's auc: 0.935612
[600]	training's auc: 0.970668	valid_1's auc: 0.937722
Early stopping, best iteration is:
[657]	training's auc: 0.972926	valid_1's auc: 0.937846
fold:  3
Training until validation scores don't improve for 50 rounds.
[300]	training's auc: 0.956635	valid_1's auc: 0.937445
[600]	training's auc: 0.97063	valid_1's auc: 0.940667
Early stopping, best iteration is:
[637]	training's auc: 0.972095	valid_1

## 输出比赛结果

In [20]:
resultData = pd.DataFrame(pred, index=X2['ID'], columns=['pred'])
resultData.to_csv('result.csv')