In [9]:
# !/usr/bin/python3
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import log_loss
import warnings
warnings.filterwarnings("ignore")
import gc
from scipy import sparse
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

In [10]:
def preProcess():
    """Load the train/test CSVs, merge them into one frame and save it.

    Reads ``train.csv`` and ``test.csv`` from ``../data/``, drops the ``Id``
    column from both, marks every test row with ``Label = -1``, stacks the two
    frames (train rows first), replaces missing values with ``-1`` and writes
    the result to ``../data/data.csv``.

    Returns:
        pandas.DataFrame: the combined frame. Row labels are kept exactly as
        read from each CSV, so they are not unique across the train and test
        parts.
    """
    path = "../data/"
    print("读取数据...")
    df_train = pd.read_csv(path + "train.csv")
    df_test = pd.read_csv(path + "test.csv")
    print("读取结束...")

    # "Id" is only a row identifier and must not be used as a feature.
    df_train = df_train.drop(columns=["Id"])
    df_test = df_test.drop(columns=["Id"])

    # -1 is the sentinel that separates unlabeled test rows from training
    # rows once both are stacked in a single frame.
    df_test["Label"] = -1

    data = pd.concat([df_train, df_test])
    data = data.fillna(-1)
    data.to_csv(path + "data.csv", index=False)
    return data

In [11]:
def _one_hot_encode(frame, columns, verbose=False):
    """Return a new frame in which ``columns`` are replaced by indicator columns.

    The untouched columns keep their order; the indicator columns are appended
    after them, one group per encoded column and in the order of ``columns``,
    named ``<column>_<value>``. All pieces are concatenated once and ``frame``
    itself is never modified.
    """
    pieces = [frame.drop(columns=list(columns))]
    for col in columns:
        if verbose:
            print("this is feature:", col)
        pieces.append(pd.get_dummies(frame[col], prefix=col))
    return pd.concat(pieces, axis=1)


def gbdt_lr_predict(data, dis_feature, con_feature):
    """Train a GBDT + logistic-regression stack and write test predictions.

    Steps:
      1. One-hot encode the discrete features.
      2. Split rows into train (``Label != -1``) and test (``Label == -1``).
      3. Fit a LightGBM model and read, for every row, the index of the leaf
         it lands in for each tree.
      4. Min-max scale the continuous features and one-hot encode the leaf
         indices.
      5. Fit a logistic regression on the result, print train/validation
         log-loss, plot the validation ROC curve and write the predicted
         probabilities for the test rows to ``../submission/``.

    Args:
        data: DataFrame holding train and test rows together, with a
            ``Label`` column equal to ``-1`` on test rows. It is not modified.
        dis_feature: names of the discrete columns to one-hot encode.
        con_feature: names of the continuous columns to min-max scale.

    Returns:
        None. Results are printed, plotted and written to disk. The ``Id``
        column of the submission is re-read from ``../data/test.csv``.
    """
    # One-hot encode the discrete features (on a new frame, so the caller's
    # DataFrame keeps all of its columns).
    print("开始one-hot...")
    data = _one_hot_encode(data, dis_feature)
    print("one-hot结束")

    # .copy() / non-inplace drop: never mutate a slice of `data`.
    train = data[data["Label"] != -1].copy()
    target = train.pop("Label")
    test = data[data["Label"] == -1].drop(columns=["Label"])

    print("划分数据集...")
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=2018)

    print("开始训练gbdt...")
    gbm = lgb.LGBMRegressor(objective="binary",
                            subsample=0.8,
                            min_child_weight=0.5,
                            colsample_bytree=0.7,
                            num_leaves=100,
                            max_depth=12,
                            learning_rate=0.001,
                            n_estimators=10)
    gbm.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            eval_names=["train", "val"],
            eval_metric="binary_logloss")
    model = gbm.booster_

    print("训练得到叶子数...")
    # pred_leaf=True returns leaf indices instead of scores; one column of
    # leaf features is created per column of that result.
    gbdt_feats_train = model.predict(train, pred_leaf=True)
    gbdt_feats_test = model.predict(test, pred_leaf=True)
    gbdt_feats_name = ["gbdt_leaf_" + str(i) for i in range(gbdt_feats_train.shape[1])]
    # Reuse the row labels of train/test so the column-wise concat below pairs
    # every row with its own leaf indices whatever index `data` arrived with.
    df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train, columns=gbdt_feats_name, index=train.index)
    df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns=gbdt_feats_name, index=test.index)

    print("构造新的数据集...")
    train = pd.concat([train, df_train_gbdt_feats], axis=1)
    test = pd.concat([test, df_test_gbdt_feats], axis=1)
    train_len = train.shape[0]
    # Train and test are stacked again so scaling and leaf encoding produce
    # the same columns for both; they are split back by position afterwards.
    data = pd.concat([train, test])
    del train, test, df_train_gbdt_feats, df_test_gbdt_feats
    gc.collect()

    # Min-max scale the continuous features (each column independently).
    print("开始归一化...")
    scaler = MinMaxScaler()
    con_cols = list(con_feature)
    if con_cols:
        data[con_cols] = scaler.fit_transform(data[con_cols].values)
    print("归一化结束")

    # One-hot encode the leaf indices.
    print("开始one-hot...")
    data = _one_hot_encode(data, gbdt_feats_name, verbose=True)
    print("one-hot结束")

    train = data.iloc[:train_len]
    test = data.iloc[train_len:]
    del data
    gc.collect()

    # NOTE(review): this split uses test_size=0.3 while the GBDT above was
    # fitted on a test_size=0.2 split, so part of this validation set was seen
    # by the GBDT and val-logloss may be optimistic - confirm this is intended.
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.3, random_state=2018)
    print("开始训练lr...")
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    tr_logloss = log_loss(y_train, lr.predict_proba(x_train)[:, 1])
    print("tr-logloss:", tr_logloss)
    # Computed once and shared by the log-loss and the ROC curve.
    val_prob = lr.predict_proba(x_val)[:, 1]
    val_logloss = log_loss(y_val, val_prob)
    print("val-logloss:", val_logloss)

    fpr, tpr, threshold = roc_curve(y_val, val_prob)
    roc_auc = auc(fpr, tpr)
    lw = 2
    plt.plot(fpr, tpr, color="darkorange", lw=lw, label="ROC curve (area = %0.5f)" % roc_auc)
    plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("gbdt_lr" + "ROC")
    plt.legend(loc="lower right")
    plt.show()

    print("开始预测...")
    y_pred = lr.predict_proba(test)[:, 1]
    print("写入结果...")
    res = pd.read_csv("../data/test.csv")
    submission = pd.DataFrame({"Id": res["Id"], "Label": y_pred})
    submission.to_csv("../submission/submission_gbdt+lr_trlogloss_%s_vallogloss_%s.csv" % (tr_logloss, val_logloss), index=False)
    print("结束")
    

In [None]:
if __name__ == "__main__":
    # Feature names are generated rather than listed by hand:
    # I1..I13 are passed as the continuous features, C1..C26 as the discrete ones.
    con_feature = ["I" + str(idx) for idx in range(1, 14)]
    dis_feature = ["C" + str(idx) for idx in range(1, 27)]

    data = preProcess()
    gbdt_lr_predict(data, dis_feature, con_feature)

读取数据...
读取结束...
开始one-hot...
one-hot结束
划分数据集...
开始训练gbdt...
[1]	train's binary_logloss: 0.499298	val's binary_logloss: 0.565496
[2]	train's binary_logloss: 0.499056	val's binary_logloss: 0.56541
[3]	train's binary_logloss: 0.498834	val's binary_logloss: 0.565312
[4]	train's binary_logloss: 0.498627	val's binary_logloss: 0.565209
[5]	train's binary_logloss: 0.498389	val's binary_logloss: 0.565123
[6]	train's binary_logloss: 0.49817	val's binary_logloss: 0.565016
[7]	train's binary_logloss: 0.49798	val's binary_logloss: 0.564969
[8]	train's binary_logloss: 0.497767	val's binary_logloss: 0.564883
[9]	train's binary_logloss: 0.497526	val's binary_logloss: 0.564783
[10]	train's binary_logloss: 0.497306	val's binary_logloss: 0.564678
训练得到叶子数...
