## 读取数据

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import GridSearchCV

import lightgbm as lgb

from tqdm import tqdm

In [2]:
# Load the labelled training split and the unlabelled test split.
trainData = pd.read_csv('./data/train_set.csv')
testData = pd.read_csv('./data/test_set.csv')
# Number of labelled rows; later cells use it to slice allData back into train / test.
trainDataLen = len(trainData)
# Stack both splits so that encoding and scaling see the same columns.
# pd.concat returns a new frame, so modifying allData is not reflected in
# trainData or testData.
# ignore_index=True gives every row a unique 0..N-1 label; without it the test
# rows would reuse labels already taken by the training rows, which makes any
# label-based lookup on allData ambiguous.
allData = pd.concat([trainData, testData], sort=False, ignore_index=True)
allData.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1,43,management,married,tertiary,no,291,yes,no,unknown,9,may,150,2,-1,0,unknown,0.0
1,2,42,technician,divorced,primary,no,5076,yes,no,cellular,7,apr,99,1,251,2,other,0.0
2,3,47,admin.,married,secondary,no,104,yes,yes,cellular,14,jul,77,2,-1,0,unknown,0.0
3,4,28,management,single,secondary,no,-994,yes,yes,cellular,18,jul,174,2,-1,0,unknown,0.0
4,5,42,technician,divorced,secondary,no,2974,yes,no,unknown,21,may,187,5,-1,0,unknown,0.0


## 参数配置

In [3]:
# Whether to build the combined features (count and group-aggregate columns).
featureCombination = False
# Whether to standardize the feature columns (gates the StandardScaler cells).
dataScale = True
# Whether to run feature selection.
# NOTE(review): the name is misspelled ("Slection"); it is kept as-is because
# the feature-selection cell reads this exact name.
featureSlection = False

## 数据分析

In [4]:
# Summary statistics of the combined train + test frame.
allData.describe()

Unnamed: 0,ID,age,balance,day,duration,campaign,pdays,previous,y
count,36169.0,36169.0,36169.0,36169.0,36169.0,36169.0,36169.0,36169.0,25317.0
mean,18085.0,40.966961,1368.237026,15.788742,257.574497,2.771545,40.160552,0.578147,0.116957
std,10441.23528,10.639679,3098.876172,8.314523,255.040668,3.114445,100.005886,2.365582,0.321375
min,1.0,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0,0.0
25%,9043.0,33.0,73.0,8.0,103.0,1.0,-1.0,0.0,0.0
50%,18085.0,39.0,449.0,16.0,181.0,2.0,-1.0,0.0,0.0
75%,27127.0,48.0,1437.0,21.0,318.0,3.0,-1.0,0.0,0.0
max,36169.0,95.0,102127.0,31.0,3881.0,58.0,871.0,275.0,1.0


In [5]:
def objectInfoView(data):
    """Map every object-dtype column of ``data`` to its distinct values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    dict
        ``{column name: result of Series.unique()}`` for each column whose
        dtype compares equal to ``"object"``; empty when there is none.
    """
    is_object_column = data.dtypes == "object"
    return {name: data[name].unique() for name in data.columns[is_object_column]}

In [6]:
# Print each string-typed column together with the distinct values it takes.
allTypes = objectInfoView(allData)
for column_name, distinct_values in allTypes.items():
    print(column_name, distinct_values)

job ['management' 'technician' 'admin.' 'services' 'retired' 'student'
 'blue-collar' 'unknown' 'entrepreneur' 'housemaid' 'self-employed'
 'unemployed']
marital ['married' 'divorced' 'single']
education ['tertiary' 'primary' 'secondary' 'unknown']
default ['no' 'yes']
housing ['yes' 'no']
loan ['no' 'yes']
contact ['unknown' 'cellular' 'telephone']
month ['may' 'apr' 'jul' 'jun' 'nov' 'aug' 'jan' 'feb' 'dec' 'oct' 'sep' 'mar']
poutcome ['unknown' 'other' 'failure' 'success']


In [7]:
# Feature columns: every column except the row identifier and the target.
# Built in DataFrame column order rather than through set arithmetic, because
# set iteration order for strings is not guaranteed to be stable between
# interpreter runs and would reorder the feature matrix from run to run.
train_use_col = [col for col in allData.columns if col not in ('ID', 'y')]

# The first trainDataLen rows of allData are the labelled training rows.
X1 = allData[train_use_col].iloc[:trainDataLen]
y1 = allData['y'].iloc[:trainDataLen]

# The remaining rows are the test split; all columns are kept (ID is needed
# when the predictions are exported).
X2 = allData.iloc[trainDataLen:]

## 特征融合

In [8]:
def feature_count(data, features):
    """Append a column counting how often each row's ``features`` combination occurs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend; it is not modified in place.
    features : list of str
        Columns whose joint value is counted.

    Returns
    -------
    tuple
        ``(merged, feature_name)`` where ``merged`` is ``data`` left-merged with
        the per-combination counts and ``feature_name`` is the new column's
        name: ``"count"`` followed by ``"_<feature>"`` for every feature.
    """
    feature_name = '_'.join(['count', *features])
    group_sizes = data.groupby(features).size().reset_index(name=feature_name)
    merged = data.merge(group_sizes, how='left', on=features)
    return merged, feature_name

In [9]:
if featureCombination:
    # Candidate input columns: everything except the identifier and the target.
    feature = allData.columns.tolist()
    feature.remove('ID')
    feature.remove('y')
    # Columns used as group-by keys and summarised with count / nunique.
    sparse_feature = ['campaign', 'contact', 'default', 'education', 'housing',
                      'job', 'loan', 'marital', 'month', 'poutcome']
    # Remaining columns, summarised with mean / max / min / std.  Kept in
    # DataFrame column order: a set difference would leave the order
    # unspecified, and that order determines the order of the generated
    # aggregate columns.
    dense_feature = [col for col in feature if col not in sparse_feature]

In [10]:
if featureCombination:
    # Names of the count columns added to allData by the loop below.
    ll = []
    for f in ['campaign', 'contact', 'default', 'education', 'housing',
              'job', 'loan', 'marital', 'poutcome']:
        # Count rows sharing the same (month, day, f) combination.  The returned
        # column name is bound to a descriptive variable instead of `_`, which
        # IPython uses for the previous cell's output.
        allData, count_col = feature_count(allData, ['month', 'day', f])
        ll.append(count_col)
    # NOTE(review): an expression nested inside `if` is not auto-displayed by
    # IPython's default last-expression rule; wrap it in display() if the
    # preview is wanted.
    allData.head()

In [11]:
def get_new_columns(name, aggs):
    """Build flat column names for the result of grouping by ``name`` and applying ``aggs``.

    Parameters
    ----------
    name : str
        Prefix for every generated name (the group-by key in this notebook).
    aggs : dict
        Maps a column name to a list of aggregations; each aggregation is
        either a string such as ``'mean'`` or a callable.

    Returns
    -------
    list of str
        One ``"<name>_<column>_<agg>"`` entry per (column, aggregation) pair, in
        the iteration order of ``aggs``.  Callable aggregations have no usable
        string name here, so they are labelled ``"other"``.
    """
    new_columns = []
    for column, agg_list in aggs.items():
        for agg in agg_list:
            # callable() also covers builtins, partials and other non-`function`
            # callables, which cannot be concatenated into a string name.
            suffix = 'other' if callable(agg) else agg
            new_columns.append(name + '_' + column + '_' + suffix)
    return new_columns

In [12]:
if featureCombination:
    # For every sparse feature `d`: group by it, aggregate all the other
    # features, and join the per-group statistics back onto each row.
    for d in tqdm(sparse_feature):
        aggs = {
            **{s: ['count', 'nunique'] for s in sparse_feature},
            **{den: ['mean', 'max', 'min', 'std'] for den in dense_feature},
        }
        # The grouping key itself is not aggregated.
        del aggs[d]
        temp = allData.groupby(d).agg(aggs).reset_index()
        # Flatten the (column, aggregation) header into single-level names.
        temp.columns = [d] + get_new_columns(d, aggs)
        allData = allData.merge(temp, on=d, how='left')

In [13]:
# Preview the frame after feature combination.
# NOTE(review): an expression nested inside `if` is not auto-displayed by
# IPython's default last-expression rule; wrap it in display() if the preview
# is wanted.
if featureCombination:
    allData.head()

## 编码

In [14]:
encodingFlag = True
if encodingFlag:
    # Integer-encode every object-dtype column.  Each encoder is fitted on the
    # train and test rows together, so both splits share one label mapping.
    object_columns = allData.columns[allData.dtypes == 'object']
    for col in object_columns:
        le = LabelEncoder()
        allData[col] = le.fit_transform(allData[col])
    allData.head()

    # Re-derive the train / test views from the encoded frame.
    X1 = allData[train_use_col][:trainDataLen]
    y1 = allData['y'][:trainDataLen]

    X2 = allData[trainDataLen:]

## 数据归一化

In [15]:
if dataScale:
    # Recompute the feature columns (earlier cells may have added columns), in
    # DataFrame column order: a set difference would leave the order
    # unspecified and so vary the feature-matrix layout between runs.
    train_use_col = [col for col in allData.columns if col not in ('ID', 'y')]

    # Standardize every feature column; the scaler statistics are computed over
    # the train and test rows together.
    scaler = StandardScaler()
    allData[train_use_col] = scaler.fit_transform(allData[train_use_col])
    allData[train_use_col].head()

In [16]:
if dataScale:
    y1 = allData['y'].iloc[:trainDataLen]
    X2 = allData.iloc[trainDataLen:]

    # Features that contain NaN in at least one training row are dropped.
    train_rows = allData[train_use_col].iloc[:trainDataLen]
    delElements = train_rows.columns[train_rows.isna().any()].tolist()
    train_use_col = [col for col in train_use_col if col not in delElements]

    # Build X1 only after pruning, so the matrix used for training holds exactly
    # the columns in train_use_col that are later selected from X2 for
    # prediction.
    X1 = train_rows[train_use_col]

    X1.head()

## 特征选择

In [17]:
# Feature importance: keep the 10 features ranked highest by an extra-trees model.
if featureSlection:
    from sklearn.ensemble import ExtraTreesClassifier

    # Seeded so that the ranking, and therefore the selected columns, is
    # reproducible from run to run.
    model = ExtraTreesClassifier(random_state=88)
    model.fit(X1[train_use_col], y1)

    feat_importances = pd.Series(model.feature_importances_, index=train_use_col)
    # Computed once so that the plotted bars and the selected columns are
    # guaranteed to be the same ten features.
    top_features = feat_importances.nlargest(10)
    top_features.plot(kind='barh')
    plt.show()

    train_use_col = top_features.index.tolist()

    # Re-derive the train / test views restricted to the selected features.
    X1 = allData[train_use_col].iloc[:trainDataLen]
    y1 = allData['y'].iloc[:trainDataLen]

    X2 = allData.iloc[trainDataLen:]

## 模型搭建

In [18]:
# Out-of-fold prediction buffer: one slot per training row, filled with the
# validation-fold predictions during cross-validation.
y1_predict = np.zeros(len(y1))

In [19]:
# Three candidate LightGBM parameter sets.  Only params3 is passed to
# lgb.train by the training loop in this cell; params1 and params2 are kept
# for reference.
params1 = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'auc'},
    verbose=0,
    learning_rate=0.01,
    is_unbalance=True,
    num_leaves=30,
    reg_alpha=0,
    reg_lambda=0,
    max_depth=-1,
    n_estimators=1500,
    colsample_bytree=0.7,
    subsample=0.95,
    subsample_freq=1,
    random_state=201907,
)
# NOTE(review): min_data_in_leaf and min_child_samples are documented LightGBM
# aliases of each other and are given different values here; confirm which is
# intended before using this set.
params2 = dict(
    num_leaves=31,
    min_data_in_leaf=30,
    objective='binary',
    max_depth=-1,
    learning_rate=0.01,
    min_child_samples=20,
    boosting='gbdt',
    feature_fraction=0.9,
    bagging_freq=1,
    bagging_fraction=0.9,
    bagging_seed=11,
    metric='auc',
    lambda_l1=0.1,
    verbosity=-1,
    nthread=4,
    random_state=666,
)
params3 = dict(
    learning_rate=0.01,
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    min_child_samples=46,
    min_child_weight=0.01,
    feature_fraction=0.6,
    bagging_fraction=0.8,
    bagging_freq=2,
    num_leaves=31,
    max_depth=5,
    lambda_l2=1,
    lambda_l1=0,
    n_jobs=-1,
    seed=4590,
)
score = []
n_splits = 10
kfold = KFold(n_splits, shuffle=True, random_state=88)
# Predict on exactly the columns the models are trained on, so the test matrix
# can never drift from the training matrix.
feature_cols = X1.columns.tolist()
# Running sum of the per-fold test-set predictions; averaged after the loop.
pred = 0
for i, (train_idx, val_idx) in enumerate(kfold.split(X1)):
    print("fold: ", i)
    # KFold yields positional row numbers, so rows are selected with .iloc
    # rather than by index label.
    train_data = lgb.Dataset(X1.iloc[train_idx], label=y1.iloc[train_idx])
    val_data = lgb.Dataset(X1.iloc[val_idx], label=y1.iloc[val_idx])

    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
    # older LightGBM releases; newer releases expect callbacks instead - confirm
    # against the installed version.
    model = lgb.train(params3,
                      train_data,
                      num_boost_round=1000,
                      valid_sets=[train_data, val_data],
                      early_stopping_rounds=50,
                      # categorical_feature=['job','marital','education','default','housing','loan','contact','poutcome'],
                      verbose_eval=300
                      )
    # num_iteration limits prediction to the best iteration found by early stopping.
    pred += model.predict(X2[feature_cols], num_iteration=model.best_iteration)

    # Out-of-fold predictions for the rows held out in this fold.
    y1_predict[val_idx] = model.predict(X1.iloc[val_idx], num_iteration=model.best_iteration)

# Average the accumulated test predictions over the folds.
pred = pred / n_splits

print("roc_auc_score:  ", roc_auc_score(y1, y1_predict))

fold:  0
Training until validation scores don't improve for 50 rounds.
[300]	training's auc: 0.934569	valid_1's auc: 0.920968
Early stopping, best iteration is:
[311]	training's auc: 0.934999	valid_1's auc: 0.921294
fold:  1
Training until validation scores don't improve for 50 rounds.
[300]	training's auc: 0.932999	valid_1's auc: 0.933981
[600]	training's auc: 0.942314	valid_1's auc: 0.938275
[900]	training's auc: 0.950006	valid_1's auc: 0.941323
Did not meet early stopping. Best iteration is:
[1000]	training's auc: 0.951876	valid_1's auc: 0.941687
fold:  2
Training until validation scores don't improve for 50 rounds.
[300]	training's auc: 0.935803	valid_1's auc: 0.911116
Early stopping, best iteration is:
[311]	training's auc: 0.936214	valid_1's auc: 0.911351
fold:  3
Training until validation scores don't improve for 50 rounds.
[300]	training's auc: 0.934479	valid_1's auc: 0.92357
[600]	training's auc: 0.943164	valid_1's auc: 0.929432
[900]	training's auc: 0.950599	valid_1's auc: 0.

## Parameter Tuning

In [20]:
# Disabled hyper-parameter grid search; change the condition to run it.
# The grid below has 69,120 combinations, each fitted n_splits times.
if False:
    param_grid = {
        'learning_rate': [0.005, 0.01, 0.02],
        'n_estimators': list(range(10, 401, 10)),
        'num_leaves': [6, 8, 12, 16],
        'boosting_type': ['gbdt', 'goss', 'rf'],
        'objective': ['binary'],
        'random_state': [66],
        'colsample_bytree': [0.65, 0.66],
        'subsample': [0.7, 0.75],
        'reg_alpha': [0, 1, 1.2],
        'reg_lambda': [0, 1, 1.2, 1.4],
    }
    # No fixed num_boost_round here: it is an alias of n_estimators and would
    # compete with the n_estimators values searched by the grid.
    lgb_estimator = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
                                       learning_rate=0.01, metric='auc')
    # scoring='roc_auc' makes the search rank candidates by the same metric the
    # rest of the notebook reports; without it GridSearchCV falls back to the
    # estimator's default score.
    gsearch = GridSearchCV(estimator=lgb_estimator, param_grid=param_grid,
                           scoring='roc_auc', cv=n_splits)
    lgb_model = gsearch.fit(X=X1, y=y1)

    print(lgb_model.best_params_, lgb_model.best_score_)

## 输出比赛结果


In [21]:
# One averaged prediction per test row, indexed by the row's ID, written to result.csv.
resultData = pd.DataFrame({'pred': pred}, index=X2['ID'])
resultData.to_csv('result.csv')