**导入所需库**

In [1]:
import datetime
import pandas as pd
import numpy as np
import gensim
import warnings
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
warnings.filterwarnings('ignore')

**读取数据**

In [2]:
# Load the feature table from the cache directory and show its (rows, columns) shape.
data = pd.read_csv(filepath_or_buffer='../cache/features_add.csv')
print(data.shape)

(62767, 1509)


**特征选择**

剔除取值种类数（nunique）小于等于 1 的特征列

In [4]:
# Count the distinct non-null values of every column in one pass, then drop
# each feature column (never 'UID' or 'Tag') that has at most one distinct value.
distinct_counts = data.nunique()
constant_cols = [
    name for name in data.columns
    if name not in ('UID', 'Tag') and distinct_counts[name] <= 1
]
data = data.drop(columns=constant_cols)

In [5]:
def calIV(df, val, target):
    """Return the information value (IV) of column ``val`` against ``target``.

    Args:
        df: DataFrame holding both columns.
        val: Name of the feature column; each distinct value is one bucket.
        target: Name of the label column. The labels ``1`` and ``0`` must
            both occur, because they are looked up by label below.

    Returns:
        Sum over all buckets of ``(p1 - p0) * ln(p1 / p0)``, where ``p1`` /
        ``p0`` are the smoothed shares of target class 1 / 0 that fall into
        the bucket.
    """
    smoothing = 0.000001  # keeps empty cells away from log(0) and division by zero
    # Smoothed count of every (bucket, target class) combination.
    cell_counts = pd.crosstab(df[val], df[target]) + smoothing
    # Smoothed size of every target class.
    class_totals = df[target].value_counts() + smoothing
    # Divides each crosstab column by the total of the matching target class.
    shares = cell_counts / class_totals
    pos_share = shares[1]
    neg_share = shares[0]
    woe = np.log(pos_share / neg_share)
    return ((pos_share - neg_share) * woe).sum()

def select_feature(clf,x_train,y_train,x_valid):
    """Fit ``clf`` and reduce both matrices with ``SelectFromModel``.

    Missing values are replaced by the sentinel -99999 before fitting and
    before transforming. ``clf`` is fitted in place; the selector then wraps
    the already-fitted estimator (``prefit=True``) and uses ``threshold="mean"``.

    Args:
        clf: Unfitted estimator passed on to ``SelectFromModel``.
        x_train: Training features (must provide ``fillna``).
        y_train: Training labels.
        x_valid: Features of the second set to reduce (must provide ``fillna``).

    Returns:
        Tuple ``(x_train_selected, x_valid_selected)`` as produced by
        ``SelectFromModel.transform``.
    """
    sentinel = -99999
    train_filled = x_train.fillna(sentinel)
    clf.fit(train_filled, y_train)
    selector = SelectFromModel(clf, prefit=True, threshold="mean")

    train_selected = selector.transform(train_filled)
    valid_selected = selector.transform(x_valid.fillna(sentinel))
    return train_selected, valid_selected

In [6]:
# Disabled experiment (kept for reference): score every feature column with
# calIV against 'Tag', sort by IV descending and keep the first
# int(data.shape[1] * 0.8) columns plus 'UID' and 'Tag'.
# iv_features_importance = {'col': [],
#                           'iv_score': []}
# for col in data.columns:
#     if col not in ['UID', 'Tag']:
#         iv_features_importance['col'].append(col)
#         iv_features_importance['iv_score'].append(calIV(data, col, 'Tag'))
# iv_features_importance = pd.DataFrame(iv_features_importance)
# iv_features_importance = iv_features_importance.sort_values(by=['iv_score'], ascending=False)

# n = int(data.shape[1]*0.8)
# iv_important_features = list(iv_features_importance['col'][0:n].values)
# data = data[iv_important_features+['UID', 'Tag']]
# data.shape

In [7]:
# Rows whose 'Tag' equals 0.5 become the test frame; all other rows are training data.
train = data.loc[data['Tag'].ne(0.5)].copy()
test = data.loc[data['Tag'].eq(0.5)].copy()
y_train = train['Tag']
# Model inputs: every column except 'UID' and 'Tag'.
feats = [name for name in train.columns if name != 'UID' and name != 'Tag']

In [None]:
# Random-forest based feature selection.
# A fixed seed keeps the selected feature set reproducible between runs.
clf = RandomForestClassifier(random_state=666)
x_train, x_valid = select_feature(clf, train[feats], train['Tag'], test[feats])

# select_feature only hands back the reduced matrices, so recover the names of
# the kept columns from the (now fitted) forest using the same threshold.
kept_mask = SelectFromModel(clf, prefit=True, threshold="mean").get_support()
kept_feats = [name for name, keep in zip(feats, kept_mask) if keep]
assert x_train.shape[1] == len(kept_feats) == x_valid.shape[1], \
    "selected column names do not match the transformed matrices"

# Rebuild both frames with named feature columns and their original row index,
# and carry 'UID' / 'Tag' along: later cells read train['Tag'] and
# test[['UID', 'Tag']], which a bare DataFrame(x_train) would no longer have.
train = pd.concat(
    [train[['UID', 'Tag']],
     pd.DataFrame(x_train, columns=kept_feats, index=train.index)],
    axis=1)
test = pd.concat(
    [test[['UID', 'Tag']],
     pd.DataFrame(x_valid, columns=kept_feats, index=test.index)],
    axis=1)

In [8]:
# Refresh the feature list so it matches the columns of the current `train` frame.
feats = [name for name in train.columns if name != 'UID' and name != 'Tag']

**评测函数**

In [9]:
def tpr_weight_funtion(y_true,y_predict):
    """Weighted true-positive rate at three low false-positive-rate targets.

    Samples are ranked by predicted score (highest first). For each FPR target
    (0.001, 0.005, 0.01) the TPR is read at the ranked row whose cumulative FPR
    is closest to the target, and the three values are combined as
    ``0.4 * TPR@0.001 + 0.3 * TPR@0.005 + 0.3 * TPR@0.01``.

    When several rows are equally close to a target, ``idxmin`` returns the
    first one, i.e. the row with the highest score.

    Args:
        y_true: Iterable of labels; the labels ``1`` and ``0`` must both occur.
        y_predict: Iterable of scores, same length as ``y_true``.

    Returns:
        The weighted TPR score.

    Raises:
        KeyError: If label ``1`` or label ``0`` does not occur in ``y_true``.
    """
    ranked = pd.DataFrame({'prob': list(y_predict), 'y': list(y_true)})
    ranked = ranked.sort_values('prob', ascending=False)
    labels = ranked['y']

    # Class sizes, counted once and looked up by label.
    class_counts = labels.value_counts()
    pos_total = class_counts[1]
    neg_total = class_counts[0]

    # Running number of positives / negatives among the top-k ranked rows.
    cum_pos = labels.cumsum()
    cum_neg = np.arange(len(labels)) - cum_pos + 1
    tpr = cum_pos / pos_total
    fpr = cum_neg / neg_total

    score = 0.0
    for fpr_target, weight in ((0.001, 0.4), (0.005, 0.3), (0.01, 0.3)):
        # idxmin returns an index label, so the lookup is label-based as well.
        closest = (fpr - fpr_target).abs().idxmin()
        score += weight * tpr.loc[closest]
    return score

def eval_function(y_predict,dtrain):
    """Custom-metric wrapper around ``tpr_weight_funtion``.

    Args:
        y_predict: Predicted scores.
        dtrain: Object exposing ``get_label()`` that returns the true labels.

    Returns:
        Tuple ``('tpr', score, True)``. NOTE(review): the trailing ``True`` is
        presumably LightGBM's "higher is better" flag for ``feval`` — confirm.
    """
    # Delegate instead of duplicating the metric so both entry points stay in sync.
    return 'tpr', tpr_weight_funtion(dtrain.get_label(), y_predict), True

**LightGBM**

In [10]:
# Best parameters found by Bayesian hyper-parameter tuning
parameters = {'boosting_type': 'gbdt',
              'objective': 'binary',
              'learning_rate': 0.04156748636531865,
              'metric': 'binary_logloss',
              'num_leaves': 112,
              'max_depth': 6,
              'feature_fraction': 0.7,
              'bagging_fraction': 0.7713637401928072,
              'subsample_freq': 1,
              'seed': 666,
              'verbose': -1,
              'n_jobs': 10,
              'lambda_l2': 0.8441547680817145,
              'lambda_l1': 7.689961311475777,
#               'max_bin': 310
              }
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=666)
xx_submit = []      # per-fold predictions on the test rows
xx_tpr = []         # per-fold weighted TPR on the validation part
xx_auc = []         # NOTE: holds the per-fold binary log-loss despite its name
xx_iteration = []   # per-fold best iteration
oof_preds = np.zeros(train.shape[0])
fold_importances = []

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train[feats], train['Tag'])):
    # split() yields POSITIONAL indices, so every selection below uses .iloc;
    # label-based train['Tag'][valid_idx] would pick wrong rows (or raise)
    # whenever train's index is not 0..n-1.
    x_fold_train = train[feats].iloc[train_idx]
    y_fold_train = train['Tag'].iloc[train_idx]
    x_fold_valid = train[feats].iloc[valid_idx]
    y_fold_valid = train['Tag'].iloc[valid_idx]

    dtrain = lgb.Dataset(data=x_fold_train, label=y_fold_train)
    dvalid = lgb.Dataset(data=x_fold_valid, label=y_fold_valid)
    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
    # older LightGBM releases; newer releases expect callbacks instead — confirm
    # against the installed version before upgrading.
    clf = lgb.train(
        params=parameters,
        train_set=dtrain,
        num_boost_round=2000,
        valid_sets=[dvalid],
        early_stopping_rounds=100,
        verbose_eval=False,
#         feval=eval_function
    )
    # save feature's importance
    fold_importances.append(pd.DataFrame({
        "feature": feats,
        "importance": clf.feature_importance(),
        "fold": n_fold + 1,
    }))
    # Predict the validation part once and reuse it for the metric and the OOF vector.
    valid_preds = clf.predict(x_fold_valid, num_iteration=clf.best_iteration)
    fold_logloss = clf.best_score['valid_0']['binary_logloss']
    fold_tpr = tpr_weight_funtion(y_fold_valid, valid_preds)
    print('Fold%2d LOGLOSS: %.6f' % (n_fold + 1, fold_logloss), 'Fold%2d TPR: %.6f' % (n_fold + 1, fold_tpr))
    xx_auc.append(fold_logloss)
    xx_tpr.append(fold_tpr)
    xx_iteration.append(clf.best_iteration)
    xx_submit.append(clf.predict(test[feats], num_iteration=clf.best_iteration))
    oof_preds[valid_idx] = valid_preds

# Stack the per-fold importance tables once instead of growing a frame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)

print('特征个数:%d' % (len(feats)))
print('线下平均LOGLOSS:%.5f' % (np.mean(xx_auc)))
print('线下全集TPR:%.5f' % (tpr_weight_funtion(train['Tag'], oof_preds)))
print('线下平均TPR:%.5f' % (np.mean(xx_tpr)))
print('线下平均迭代次数:%d' % (np.mean(xx_iteration)))
print(xx_iteration)

**特征重要性**

In [15]:
def display_importances(feature_importance_df_):
    """Plot the 50 features with the highest mean importance and save the chart.

    Args:
        feature_importance_df_: DataFrame with at least the columns
            ``feature`` and ``importance`` (one row per feature and fold).

    Side effects:
        Creates a 15x10 matplotlib figure and writes it to
        ``../cache/lgbm_importances.png``.
    """
    # Rank features by their importance averaged over all rows of the feature.
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by = "importance", ascending = False)
    )
    top_features = mean_importance[:50].index
    is_top = feature_importance_df_.feature.isin(top_features)
    # Keep every per-fold row of the top features for the bar plot.
    plot_rows = feature_importance_df_.loc[is_top].sort_values(by = "importance", ascending = False)

    plt.figure(figsize = (15, 10))
    sns.barplot(x = "importance", y = "feature", data = plot_rows)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('../cache/lgbm_importances.png')
# Draw and save the importance chart, then persist the per-fold importances as CSV.
display_importances(feature_importance_df)
feature_importance_df.to_csv(path_or_buf='../cache/feature_importance_df.csv', index=False)

**提交结果**

In [17]:
# Average the per-fold test predictions; dividing by the real number of folds
# keeps this correct if n_splits is ever changed.
test_pred = sum(xx_submit) / len(xx_submit)

# Re-create the test rows from `data` FIRST (this guarantees 'UID' is present),
# and only THEN attach the averaged predictions. Assigning 'Tag' before the
# re-creation would throw the predictions away and submit the constant 0.5.
test = data[data['Tag']==0.5].copy()
assert len(test) == len(test_pred), "prediction count does not match the number of test rows"
test['Tag'] = list(test_pred)
submission = test[['UID', 'Tag']]
submission.to_csv("../submission/lgb_basesub.csv", index=False)