In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from tqdm import trange
from sklearn.metrics import mean_squared_error,roc_auc_score
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from scipy import sparse
import warnings
warnings.filterwarnings('ignore')

  from numpy.core.umath_tests import inner1d


In [2]:
# Print the maximum, minimum, mean and variance of a result vector.
def show(result):
    """Print summary statistics (max, min, mean, variance) of ``result``.

    Parameters
    ----------
    result : array-like of numbers
        Values to summarise. Converted with ``np.array``; inputs of any
        dimensionality are reduced over all elements.

    Raises
    ------
    ValueError
        If ``result`` contains no elements.
    """
    result = np.array(result)
    if result.size == 0:
        # Fail with a clear message instead of a reduction error.
        raise ValueError('show() needs at least one value, got an empty input')
    # NumPy reductions run in C and work for any number of dimensions; the
    # Python builtins iterate element by element and break on 2-D input.
    # np.mean may differ from a sequential Python sum in the last digits.
    print('max: ',str(np.max(result)))
    print('min: ',str(np.min(result)))
    print('mean: ',str(np.mean(result)))
    print('var: ',str(np.var(result)))
    print()

In [3]:
# Official loss evaluation: log loss with the positive-class term scaled by `deta`.
def logloss(y_true, y_pred, deta=1.85, eps=1e-15):
    """Return the mean weighted log loss of ``y_pred`` against ``y_true``.

    Per sample the loss is
    ``-y * log(p) * deta - (1 - y) * log(1 - p)``, where ``p`` is the
    prediction clipped to ``[eps, 1 - eps]``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted probabilities, same length as ``y_true``.
    deta : float, default 1.85
        Multiplier applied to the ``y * log(p)`` term.
    eps : float, default 1e-15
        Clipping margin that keeps both logarithms finite.

    Raises
    ------
    AssertionError
        If the inputs are empty or their lengths differ.
    """
    labels = np.array(y_true)
    probs = np.array(y_pred)
    n_samples = len(labels)
    assert n_samples and n_samples == len(probs)

    # Keep predictions strictly inside (0, 1) before taking logs.
    clipped = np.clip(probs, eps, 1 - eps)
    positive_term = - labels * np.log(clipped) * deta
    negative_term = (1 - labels) * np.log(1 - clipped)
    total = np.sum(positive_term - negative_term)
    return total / n_samples

In [4]:
def save_result2csv(ys_submit,submit,csv_name):
    """Save model outputs to a CSV with the columns ``customer_id`` and ``result``.

    Example: ``save_result2csv(y_submit, submit, './round1_diac2019_test.csv')``

    Parameters
    ----------
    ys_submit : array-like
        Model outputs, expected to line up row-for-row with ``submit``.
    submit : pandas.DataFrame
        Frame created when the data was read; must contain a
        ``customer_id`` column. It is not modified.
    csv_name : str
        Path of the CSV file to write.
    """
    # Distinct, non-null customer ids define the rows of the output file.
    all_customers = submit[['customer_id']].drop_duplicates(['customer_id']).dropna()
    # .assign returns a new frame, so no column is written into a slice of
    # `submit` (the pattern that triggers SettingWithCopyWarning).
    submits_df = submit[['customer_id']].assign(result=ys_submit)
    # NOTE(review): if `submit` repeats a customer_id, this left merge emits one
    # row per occurrence -- confirm ids are unique upstream.
    all_customers = pd.merge(all_customers,submits_df,on=['customer_id'],how='left')
    all_customers = all_customers.sort_values(['customer_id'])
    all_customers['customer_id'] = all_customers['customer_id'].astype('int64')
    # Missing predictions are written as 0.
    all_customers['result'] = all_customers['result'].fillna(0)
    all_customers.to_csv(csv_name,index=False)

In [6]:
# Read the training table and the submission table from the Feature directory.
print('Load data...')
train = pd.read_csv(
    '../Feature/train.csv',
    low_memory=False,
)
submit = pd.read_csv(
    '../Feature/submit12.csv',
    low_memory=False,
)

Load data...


In [7]:
# Separate the target from the model inputs without mutating `train`.
# Popping `label` made this cell fail with KeyError on a second run; reading
# the column and excluding it from `feature` gives the same `y` and `X` and
# is safe to re-run. `train` keeps its `label` column, so use `X` / `feature`
# (not `train` directly) as model input.
y = train['label'].copy()
feature = [x for x in train.columns if x not in ['customer_id', 'label']]
X = train[feature]

In [8]:
# Hold out 25% of the rows for validation, stratified on the label, fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Submission-side frames: the ids, then the same feature columns as `X`.
submit_df = submit[['customer_id']]
X_submit0 = submit[feature]
X_submit = X_submit0[feature]

In [9]:
def gbdt_lr_train(X_train,y_train,X_test,random_state=None):
    """Train a GBDT + logistic-regression stack and score ``X_test``.

    Steps:

    1. Fit a ``GradientBoostingClassifier`` on ``(X_train, y_train)``.
    2. Fit a ``LogisticRegression`` on the raw features as a baseline.
    3. Map every train and test row to its per-tree leaf index with
       ``gbdt.apply``, one-hot encode those indices, and fit a second
       ``LogisticRegression`` on the encoded training rows.

    Every AUC / logloss figure printed here is computed on ``X_train`` itself,
    so it measures fit on the training data, not held-out performance.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    X_test : array-like
        Rows to score, with the same columns as ``X_train``.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to the GBDT and both logistic regressions so a run can
        be reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Column 1 of ``predict_proba`` from the leaf-encoded logistic
        regression, one value per row of ``X_test``.
    """
    # Define the GBDT model.
    gbdt = GradientBoostingClassifier(n_estimators=30, max_depth=8, verbose=5,
                                      max_features=0.8, random_state=random_state)

    # Fit it.
    gbdt.fit(X_train, y_train)

    # Training-set AUC / logloss of the GBDT alone.
    y_pred_gbdt = gbdt.predict_proba(X_train)[:, 1]
    gbdt_auc = metrics.roc_auc_score(y_train, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)
    print('gbdt logloss: %.5f' % logloss(y_train,y_pred_gbdt))

    # Baseline: LR on the original features, also scored on the training set.
    lr_raw = LogisticRegression(random_state=random_state)
    lr_raw.fit(X_train, y_train)
    y_pred_lr = lr_raw.predict_proba(X_train)[:, 1]
    lr_auc = metrics.roc_auc_score(y_train, y_pred_lr)
    print('基于原有特征的LR AUC: %.5f' % lr_auc)
    print('基于原有特征的LR logloss: %.5f' % logloss(y_train,y_pred_lr))

    # Encode the original features as GBDT leaf indices. Index 0 on the last
    # axis keeps one leaf index per tree (assumes a binary target -- TODO confirm).
    X_train_leaves = gbdt.apply(X_train)[:,:,0]
    X_test_leaves = gbdt.apply(X_test)[:,:,0]

    # One-hot encode train and test leaves together so both share one column
    # layout; the first `train_rows` rows of the result are the training rows.
    train_rows = X_train_leaves.shape[0]

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # Define the LR model that consumes the leaf encoding.
    lr = LogisticRegression(n_jobs=-1,verbose=5,random_state=random_state)
    # Fit it on the encoded training rows.
    lr.fit(X_trans[:train_rows, :], y_train)
    # Training-set AUC / logloss of the stacked model.
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[:train_rows, :])[:, 1]
    gbdt_lr_auc1 = metrics.roc_auc_score(y_train, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)
    print('基于GBDT特征编码后的LR logloss: %.5f' % logloss(y_train,y_pred_gbdtlr1))

    # Score the test rows (everything after the training rows in X_trans).
    y_pred_res = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    return y_pred_res

In [110]:
# Fit on the full labelled set (X, y) and score the submission rows.
y_preprob = gbdt_lr_train(X_train=X, y_train=y, X_test=X_submit)

      Iter       Train Loss   Remaining Time 
         1           0.9718            7.74m
         2           0.9449            7.42m
         3           0.9217            7.16m
         4           0.9024            6.94m
         5           0.8861            6.71m
         6           0.8723            6.44m
         7           0.8608            6.18m
         8           0.8509            5.91m
         9           0.8423            5.65m
        10           0.8349            5.38m
        11           0.8283            5.12m
        12           0.8228            4.85m
        13           0.8180            4.57m
        14           0.8138            4.29m
        15           0.8097            4.03m
        16           0.8064            3.77m
        17           0.8032            3.50m
        18           0.8004            3.23m
        19           0.7980            2.96m
        20           0.7960            2.68m
        21           0.7940            2.41m
        2

In [113]:
# Print max / min / mean / variance of the predicted probabilities.
show(result=y_preprob)

max:  0.9993982919856202
min:  4.9135307938928126e-08
mean:  0.29932285237014233
var:  0.033054851061294195



In [114]:
# Write the per-customer predictions to the results directory.
save_result2csv(
    ys_submit=y_preprob,
    submit=submit,
    csv_name='../Res/res_gbdt_new12.csv',
)