本文主要以kaggle一次二分类竞赛数据为例，深入研究改进forest

LayerDTree是LayerForest的精简版，是对Forest进一步探索的基础

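- 数据地址：https://www.kaggle.com/c/porto-seguro-safe-driver-prediction
  - 注意：下文 Stage-1 的代码实际读取的是本地 `datasetes/adult/` 目录下的 `adult.data`、`adult.test` 和 `features` 三个文件，而不是上述竞赛页面的数据。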
- 数据特点：
  - 非常不均衡、缺失值严重、二分类
- 包含内容：
  1. 数据获取
  2. 模型应用
  3. 结果分析

# Stage-1：获取数据

In [4]:
import os
import numpy as np
import pandas as pd
import time
import os.path as osp

In [9]:
class FeatureParser(object):
    """Turn the raw text of one feature column into numbers.

    The description string decides the feature type: ``"C"`` means a numeric
    feature, anything else is read as a comma-separated list of category
    names. ``"?"`` is placed in front of that list as the missing-value
    category, so it receives id 0.
    """

    def __init__(self, desc):
        spec = desc.strip()
        if spec == "C":
            self.f_type = "number"
            return
        self.f_type = "categorical"
        # "?" (missing value) always comes first
        categories = ["?"]
        categories.extend(token.strip() for token in spec.split(","))
        self.name2id = {name: idx for idx, name in enumerate(categories)}

    def get_float(self, f_data):
        """Return the value as a single float.

        Numeric features are parsed directly; categorical features are
        mapped to their category id. An unknown category raises KeyError.
        """
        value = f_data.strip()
        if self.f_type != "number":
            value = self.name2id[value]
        return float(value)

    def get_data(self, f_data):
        """Return the value as a float (numeric) or a one-hot float32 vector."""
        value = f_data.strip()
        if self.f_type == "number":
            return float(value)
        onehot = np.zeros(len(self.name2id), dtype=np.float32)
        onehot[self.name2id[value]] = 1
        return onehot

    def get_fdim(self):
        """Return the feature dimension: 1, or the number of categories."""
        return 1 if self.f_type == "number" else len(self.name2id)

In [13]:
# Join the path one component at a time so os.path picks the separator for
# the current platform. A literal ".\\datasetes" prefix only resolves on
# Windows; on Windows this form yields exactly the same strings as before.
dataset_dir = osp.join(".", "datasetes", "adult")
train_data_path = osp.join(dataset_dir, "adult.data")
test_data_path = osp.join(dataset_dir, "adult.test")
feature_desc_path = osp.join(dataset_dir, "features")
train_data_path, test_data_path, feature_desc_path

('.\\datasetes\\adult\\adult.data',
 '.\\datasetes\\adult\\adult.test',
 '.\\datasetes\\adult\\features')

In [29]:
# One FeatureParser per line of the feature description file, in file order.
with open(feature_desc_path) as f:
    f_parsers = [FeatureParser(row) for row in f]

In [33]:
# Read the training file: keep lines that are non-blank and do not start with
# "|", and split each kept line into its comma-separated fields.
with open(train_data_path) as f:
    rows = []
    for line in f:
        if line.strip() and not line.startswith("|"):
            rows.append(line.strip().split(","))
n_datas = len(rows)

# 0: one float per feature (categories become ids); 1: one-hot categories.
cate_as_onehot = 0
if not cate_as_onehot:
    X = np.zeros((n_datas, 14), dtype=np.float32)
else:
    X_dim = np.sum([f_parser.get_fdim() for f_parser in f_parsers])
    X = np.zeros((n_datas, X_dim), dtype=np.float32)
y = np.zeros(n_datas, dtype=np.int32)

for i, row in enumerate(rows):
    assert len(row) == 15, "len(row) wrong, i={}".format(i)
    foffset = 0
    for j in range(14):
        parser = f_parsers[j]
        field = row[j].strip()
        if not cate_as_onehot:
            X[i, j] = parser.get_float(field)
            continue
        # One-hot mode: each feature fills the next `fdim` columns.
        fdim = parser.get_fdim()
        X[i, foffset:foffset + fdim] = parser.get_data(field)
        foffset += fdim
    # Last field is the label: a "<=50K" prefix maps to 0, anything else to 1.
    y[i] = int(not row[-1].strip().startswith("<=50K"))

print(X.shape, y.shape)
X_train = X
y_train = y

(32561, 14) (32561,)


In [40]:
# Read the test file: keep lines that are non-blank and do not start with
# "|", and split each kept line into its comma-separated fields.
with open(test_data_path) as f:
    rows = []
    for line in f:
        if line.strip() and not line.startswith("|"):
            rows.append(line.strip().split(","))
n_datas = len(rows)

# 0: one float per feature (categories become ids); 1: one-hot categories.
cate_as_onehot = 0
if not cate_as_onehot:
    X = np.zeros((n_datas, 14), dtype=np.float32)
else:
    X_dim = np.sum([f_parser.get_fdim() for f_parser in f_parsers])
    X = np.zeros((n_datas, X_dim), dtype=np.float32)
y = np.zeros(n_datas, dtype=np.int32)

for i, row in enumerate(rows):
    assert len(row) == 15, "len(row) wrong, i={}".format(i)
    foffset = 0
    for j in range(14):
        parser = f_parsers[j]
        field = row[j].strip()
        if not cate_as_onehot:
            X[i, j] = parser.get_float(field)
            continue
        # One-hot mode: each feature fills the next `fdim` columns.
        fdim = parser.get_fdim()
        X[i, foffset:foffset + fdim] = parser.get_data(field)
        foffset += fdim
    # Last field is the label: a "<=50K" prefix maps to 0, anything else to 1.
    y[i] = int(not row[-1].strip().startswith("<=50K"))

print(X.shape, y.shape)
X_sub = X
y_sub = y

(16281, 14) (16281,)


# Stage-2：模型应用

In [51]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedShuffleSplit
import ForestUtils
import time
import random
from sklearn import metrics

In [36]:
import EnhancedDTree
import importlib

In [116]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [137]:
# Re-execute the EnhancedDTree module so the current source on disk is used.
importlib.reload(EnhancedDTree)

<module 'EnhancedDTree' from 'C:\\github_workspace\\LayerForest\\EnhancedDTree.py'>

# 决策树算法

In [89]:
# Baseline model: a single decision tree with the library's default settings.
clf = DecisionTreeClassifier()
clf  # bare expression: the notebook displays the estimator's parameters

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [90]:
# Fit the tree on the training arrays built in Stage-1.
clf = clf.fit(X_train, y_train)

In [93]:
# Take column 1 of predict_proba for every row of X_sub and score it with
# ROC AUC against y_sub.
p_train = np.array([proba[1] for proba in clf.predict_proba(X_sub)])
print("data auc", metrics.roc_auc_score(y_sub, p_train))

data auc 0.74431797296


In [94]:
# Hard class predictions on X_sub. The printed value is accuracy_score, so
# the label says "acc" (the random-forest cell below uses the same label).
p_train = clf.predict(X_sub)
print("data acc", metrics.accuracy_score(y_sub, p_train))

data auc 0.813402125177


# 随机森林算法

In [117]:
# Random forest: 2000 trees with depth capped at 16, run with 4 parallel jobs
# and a fixed random_state; verbose=True makes fit/predict print progress.
rf = RandomForestClassifier(n_estimators=2000, max_depth=16, n_jobs=4, random_state=1024, verbose=True)
rf  # bare expression: the notebook displays the estimator's parameters

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=16, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=4,
            oob_score=False, random_state=1024, verbose=True,
            warm_start=False)

In [118]:
# Fit the forest on the training arrays built in Stage-1.
rf.fit(X_train, y_train)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    6.5s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    9.8s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:   14.0s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:   15.7s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=16, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=4,
            oob_score=False, random_state=1024, verbose=True,
            warm_start=False)

In [119]:
# Take column 1 of predict_proba for every row of X_sub and score it with
# ROC AUC against y_sub.
p_train = np.array([proba[1] for proba in rf.predict_proba(X_sub)])
print("data auc", metrics.roc_auc_score(y_sub, p_train))

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:    1.9s finished


data auc 0.917365234215


In [121]:
# Hard class predictions on X_sub, scored with plain accuracy against y_sub.
p_train = rf.predict(X_sub)
print("data acc", metrics.accuracy_score(y_sub, p_train))

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 2000 out of 2000 | elapsed:    1.9s finished


data acc 0.865303114059


# XGB算法

In [96]:
import xgboost as xgb

In [97]:
# Booster parameters handed to xgb.train. The number of boosting rounds is
# not a booster parameter: it is the num_boost_round argument of xgb.train,
# so it is deliberately not listed in this dict.
params = {
    'objective': 'binary:logistic',
    'silent': True,

    'max_depth': 4,
    'eta': 0.020,
    'gamma': 0.65,

    'colsample_bytree': 0.8,
    'subsample': 0.6,

    # Currently disabled:
    # 'min_child_weight': 10.0,
    # 'max_delta_step': 1.8,
}

In [109]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def ginic(actual, pred):
    """Return the unnormalised gini score of `pred` as a ranking of `actual`.

    `actual` is reordered by ascending `pred`; the sum of its running totals
    is divided by its grand total, offset by (n + 1) / 2 and divided by n.
    """
    labels = np.asarray(actual)  # accept Series or list as well as arrays
    count = len(labels)
    ranked = labels[np.argsort(pred)]
    cumulative_share = ranked.cumsum().sum() / ranked.sum()
    return (cumulative_share - (count + 1) / 2.0) / count

def gini_normalized(a, p):
    """Return ginic(a, p) divided by ginic(a, a), the score of ranking by the labels themselves."""
    # NOTE: p is used as given; a 2-D probability array is not reduced to a
    # single column here.
    score = ginic(a, p)
    reference = ginic(a, a)
    return score / reference

# Create an XGBoost-compatible metric from Gini
def gini_xgb(preds, dtrain):
    """Return ('gini', score) where score is the normalised gini of `preds` against dtrain's labels."""
    return 'gini', gini_normalized(dtrain.get_label(), preds)

In [111]:
# Wrap both splits as DMatrix objects. Every (matrix, name) pair in the watch
# list is evaluated during training and reported under that name.
d_train = xgb.DMatrix(X_train, y_train)
d_valid = xgb.DMatrix(X_sub, y_sub)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# NOTE(review): early stopping is driven by the 'valid' entry, which is built
# from X_sub / y_sub -- the same arrays the later cells score the model on.
mdl = xgb.train(
    params,
    d_train,
    num_boost_round=1600,
    evals=watchlist,
    early_stopping_rounds=100,
    feval=gini_xgb,
    maximize=True,
    verbose_eval=100,
)

[0]	train-gini:0.691894	valid-gini:0.700265
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.819327	valid-gini:0.816554
[200]	train-gini:0.836354	valid-gini:0.830442
[300]	train-gini:0.847753	valid-gini:0.839445
[400]	train-gini:0.856274	valid-gini:0.845215
[500]	train-gini:0.861858	valid-gini:0.848523
[600]	train-gini:0.866693	valid-gini:0.850783
[700]	train-gini:0.870262	valid-gini:0.851864
[800]	train-gini:0.87336	valid-gini:0.853076
[900]	train-gini:0.876152	valid-gini:0.853475
[1000]	train-gini:0.878787	valid-gini:0.854279
[1100]	train-gini:0.881256	valid-gini:0.854657
[1200]	train-gini:0.883566	valid-gini:0.854888
[1300]	train-gini:0.885647	valid-gini:0.855086
[1400]	train-gini:0.887623	valid-gini:0.855218
[1500]	train-gini:0.88951	valid-gini:0.85506
Stopping. Best iteration:
[1412]	train-gini:0.88786	valid-gini:0.855257



In [113]:
# Predict on X_sub (wrapped without labels) and score the outputs with ROC AUC
# against y_sub.
d_test = xgb.DMatrix(X_sub)
p_train = mdl.predict(d_test)
print("data auc", metrics.roc_auc_score(y_sub, p_train))

data auc 0.927533951378


In [114]:
# Threshold the predictions at 0.5: start from all-zero labels and set the
# positions whose prediction exceeds 0.5 to 1, then score accuracy.
test_y_acc = np.zeros(len(p_train), dtype=int)
test_y_acc_index = np.where(p_train > 0.5)[0]
test_y_acc[test_y_acc_index] = 1
metrics.accuracy_score(y_sub, test_y_acc)

0.87414777962041645

# layerDTree算法

In [241]:
# Re-execute the EnhancedDTree module so the current source on disk is used.
importlib.reload(EnhancedDTree)

<module 'EnhancedDTree' from 'C:\\github_workspace\\LayerForest\\EnhancedDTree.py'>

In [184]:
def gini(actual, pred, cmpcol=0, sortcol=1):
    """Return the unnormalised gini score of `pred` as a ranking of `actual`.

    Rows are ordered by descending `pred`, ties broken by original position.
    The sum of the running totals of `actual` in that order is divided by the
    grand total of `actual`, offset by (n + 1) / 2 and divided by n.

    Parameters:
        actual: sequence of true values.
        pred: sequence of predicted scores, same length as `actual`.
        cmpcol, sortcol: unused; kept so existing call sites keep working.

    Raises:
        AssertionError: if `actual` and `pred` differ in length.
    """
    assert len(actual) == len(pred)
    # Columns: actual, pred, original position. The builtin `float` replaces
    # the `np.float` alias, which no longer exists in current NumPy releases.
    table = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=float)
    # lexsort uses its last key as the primary one: descending pred first,
    # then ascending original position.
    table = table[np.lexsort((table[:, 2], -1 * table[:, 1]))]
    total_losses = table[:, 0].sum()
    gini_sum = table[:, 0].cumsum().sum() / total_losses

    gini_sum -= (len(actual) + 1) / 2.
    return gini_sum / len(actual)

def gini_normalized(a, p):
    """Return gini(a, p) divided by gini(a, a), the score of ranking by the labels themselves."""
    score = gini(a, p)
    reference = gini(a, a)
    return score / reference

def gini_metrix(a, p):
    """Return a (label, value) pair: the text "nor gini:" and the normalised gini of `p` against `a`."""
    value = gini_normalized(a, p)
    return "nor gini:", value

In [248]:
# Start from copies of the training arrays built in Stage-1.
X = X_train.copy()
y = y_train.copy()

X_test = X_sub.copy()
# One float score per test row, all 0.0 to begin with.
test_y = np.array(([0.0] * len(X_test)))
# One boolean per test row, all False to begin with; entries are switched on
# at the bottom of each loop iteration according to data_mask.
all_data_mask = np.array([False] * len(X_test))
real_y = y_sub.copy()

# Layered training on class-balanced data
X_train_np = X
y_train_np = y
maxlayer = 100
layer = 0

enhancedDTree = EnhancedDTree.EnhancedDTree()
counter = 0  # NOTE(review): never read in this cell
while 1:
    layer += 1
    print()
    print("layer:", layer)
    X = X_train_np
    y = y_train_np

    # Balance the classes: keep every positive row plus an equally sized
    # random sample of the negative rows (negative_index is shuffled in place).
    positive_mask = np.where(y == 1)[0]
    negative_index = np.where(y == 0)[0]
    random.shuffle(negative_index)
    negative_mask = negative_index[:len(positive_mask)]
    train_mask = np.hstack((positive_mask, negative_mask))
    train_data_x = X[train_mask]
    train_data_y = y[train_mask]
    # Negatives left out of this layer's sample; they are appended back below.
    guest_mask = negative_index[len(positive_mask):]
    guest_data_x = X[guest_mask]
    guest_data_y = y[guest_mask]

    print("train==1", train_data_y[train_data_y == 1].shape)
    print("train==0", train_data_y[train_data_y == 0].shape)
    # Train one layer. TrainModelLayer lives in EnhancedDTree and is not shown
    # here; presumably data_mask flags the still-open test rows this layer
    # settles and p_test holds its scores for those open rows -- TODO confirm.
    clf, data_mask, all_false_data_index, p_test  = \
        enhancedDTree.TrainModelLayer(train_data_x, train_data_y, X_test, all_data_mask, test_y, real_y, verbose=True, feval=gini_metrix)

    # The next layer trains on the arrays the EnhancedDTree object now exposes
    # as X_train_np / y_train_np ...
    X_train_np = enhancedDTree.X_train_np
    y_train_np = enhancedDTree.y_train_np

    # ... with the negatives that were held out of this layer added back.
    X_train_np = np.vstack((X_train_np, guest_data_x))
    y_train_np = np.hstack((y_train_np, guest_data_y))
    print("train==1", y_train_np[y_train_np == 1].shape)
    print("train==0", y_train_np[y_train_np == 0].shape)

#     time.sleep(1)
    # Stop when fewer than 1000 training rows remain, the layer limit is
    # exceeded, at most 10 positives remain, or data_mask leaves no row False.
    if X_train_np.shape[0] < 1000 or layer > maxlayer or y_train_np[y_train_np==1].shape[0] <= 10 \
        or len(p_test[~data_mask]) == 0:
        # Final layer: write this layer's p_test values for the rows where
        # data_mask is False into test_y at the matching positions.
        all_data_mask = all_false_data_index[~data_mask]
        test_y[all_data_mask] = p_test[~data_mask]
        print(len(p_test[~data_mask]))
        break

    # Among the entries of all_data_mask that are still False, switch on the
    # ones data_mask marks.
    all_data_mask[~all_data_mask] = data_mask


layer: 1
train==1 (7841,)
train==0 (7841,)
X_train.shape, y_train.shape:(13329, 14)(13329,)
X_valid.shape, y_valid.shape:(2353, 14)(2353,)
all data ('nor gini:', 0.82967020482218001)
train data ('nor gini:', 0.81905350287151635)
valid data ('nor gini:', 0.80213300273380383)
mean of all impurity:0.0
pass node id shape:(7,)
pass data shape:(2932, 14) (1589,)
not pass data shape:(12750, 14) (46525,)
all data shape:15682
mean of all impurity:0.273708010192
pass node id shape:(73,)
pass test data shape: 9199
not pass test data shape: 7082
pass test data ('nor gini:', 0.93871838660069928)
train==1 (6252,)
train==0 (23377,)

layer: 2
train==1 (6252,)
train==0 (6252,)
X_train.shape, y_train.shape:(10628, 14)(10628,)
X_valid.shape, y_valid.shape:(1876, 14)(1876,)
all data ('nor gini:', 0.76932105974328802)
train data ('nor gini:', 0.76121341160162481)
valid data ('nor gini:', 0.7434613408740639)
mean of all impurity:0.0
pass node id shape:(3,)
pass data shape:(916, 14) (165,)
not pass data sha

all data ('nor gini:', 0.69876561570493756)
train data ('nor gini:', 0.69366513424947529)
valid data ('nor gini:', 0.65840219056464955)
mean of all impurity:0.0
pass node id shape:(1,)
pass data shape:(129, 14) (1,)
not pass data shape:(12007, 14) (44155,)
all data shape:12136
mean of all impurity:0.338099159576
pass node id shape:(60,)
pass test data shape: 34
not pass test data shape: 3043
pass test data ('nor gini:', 0.40659340659340665)
train==1 (6067,)
train==0 (18598,)

layer: 17
train==1 (6067,)
train==0 (6067,)
X_train.shape, y_train.shape:(10313, 14)(10313,)
X_valid.shape, y_valid.shape:(1821, 14)(1821,)
all data ('nor gini:', 0.71028400540973025)
train data ('nor gini:', 0.70552600252761488)
valid data ('nor gini:', 0.65945887263121095)
mean of all impurity:0.0
pass node id shape:(2,)
pass data shape:(319, 14) (1,)
not pass data shape:(11815, 14) (43393,)
all data shape:12134
mean of all impurity:0.32301276096
pass node id shape:(59,)
pass test data shape: 3
not pass test dat

  


train==1 (6065,)
train==0 (6065,)
X_train.shape, y_train.shape:(10310, 14)(10310,)
X_valid.shape, y_valid.shape:(1820, 14)(1820,)
all data ('nor gini:', 0.69844948479953006)
train data ('nor gini:', 0.69007818725240155)
valid data ('nor gini:', 0.66401159280280153)
mean of all impurity:0.012421875
pass node id shape:(1,)
pass data shape:(187, 14) (4,)
not pass data shape:(11943, 14) (44090,)
all data shape:12130
mean of all impurity:0.331603397852
pass node id shape:(59,)
pass test data shape: 18
not pass test data shape: 2956
pass test data ('nor gini:', 0.5)
train==1 (6061,)
train==0 (17961,)

layer: 20
train==1 (6061,)
train==0 (6061,)
X_train.shape, y_train.shape:(10303, 14)(10303,)
X_valid.shape, y_valid.shape:(1819, 14)(1819,)
all data ('nor gini:', 0.69187647086061022)
train data ('nor gini:', 0.67813273609056202)
valid data ('nor gini:', 0.66895634618406863)
mean of all impurity:0.0
pass node id shape:(1,)
pass data shape:(130, 14) (2,)
not pass data shape:(11992, 14) (44349,)


X_train.shape, y_train.shape:(9987, 14)(9987,)
X_valid.shape, y_valid.shape:(1763, 14)(1763,)
all data ('nor gini:', 0.66504885830692617)
train data ('nor gini:', 0.65731078375564778)
valid data ('nor gini:', 0.62394310732238434)
mean of all impurity:0.0570934256055
pass node id shape:(1,)
pass data shape:(164, 14) (5,)
not pass data shape:(11586, 14) (42846,)
all data shape:11750
mean of all impurity:0.348416371164
pass node id shape:(60,)
pass test data shape: 0
not pass test data shape: 2093
pass test data ('nor gini:', nan)
train==1 (5870,)
train==0 (16092,)

layer: 34
train==1 (5870,)
train==0 (5870,)
X_train.shape, y_train.shape:(9979, 14)(9979,)
X_valid.shape, y_valid.shape:(1761, 14)(1761,)
all data ('nor gini:', 0.66192727726522127)
train data ('nor gini:', 0.65583586495500545)
valid data ('nor gini:', 0.62174440202249526)
mean of all impurity:0.0151506322475
pass node id shape:(1,)
pass data shape:(154, 14) (2,)
not pass data shape:(11586, 14) (42903,)
all data shape:11740
me

valid data ('nor gini:', 0.57669565217391328)
mean of all impurity:0.0470314994919
pass node id shape:(1,)
pass data shape:(195, 14) (6,)
not pass data shape:(11459, 14) (42753,)
all data shape:11654
mean of all impurity:0.373116010906
pass node id shape:(56,)
pass test data shape: 13
not pass test data shape: 1815
pass test data ('nor gini:', 0.0)
train==1 (5821,)
train==0 (14019,)

layer: 48
train==1 (5821,)
train==0 (5821,)
X_train.shape, y_train.shape:(9895, 14)(9895,)
X_valid.shape, y_valid.shape:(1747, 14)(1747,)
all data ('nor gini:', 0.61936818574856534)
train data ('nor gini:', 0.61024842309891481)
valid data ('nor gini:', 0.5791727937803568)
mean of all impurity:0.0312421264802
pass node id shape:(1,)
pass data shape:(149, 14) (5,)
not pass data shape:(11493, 14) (42895,)
all data shape:11642
mean of all impurity:0.363313626301
pass node id shape:(60,)
pass test data shape: 12
not pass test data shape: 1803
pass test data ('nor gini:', 0.0625)
train==1 (5816,)
train==0 (13875

all data ('nor gini:', 0.5553891515774203)
train data ('nor gini:', 0.54191872483436354)
valid data ('nor gini:', 0.50688667879258276)
mean of all impurity:0.1171875
pass node id shape:(1,)
pass data shape:(225, 14) (17,)
not pass data shape:(10451, 14) (39492,)
all data shape:10676
mean of all impurity:0.396039540668
pass node id shape:(49,)
pass test data shape: 72
not pass test data shape: 1350
pass test data ('nor gini:', 0.041871921182266042)
train==1 (5321,)
train==0 (12141,)

layer: 64
train==1 (5321,)
train==0 (5321,)
X_train.shape, y_train.shape:(9045, 14)(9045,)
X_valid.shape, y_valid.shape:(1597, 14)(1597,)
all data ('nor gini:', 0.55635263622865505)
train data ('nor gini:', 0.54829446585993291)
valid data ('nor gini:', 0.48945580471830391)
mean of all impurity:0.101893491124
pass node id shape:(1,)
pass data shape:(145, 14) (9,)
not pass data shape:(10497, 14) (39580,)
all data shape:10642
mean of all impurity:0.394667402385
pass node id shape:(47,)
pass test data shape: 6


train==1 (4901,)
train==0 (10509,)

layer: 78
train==1 (4901,)
train==0 (4901,)
X_train.shape, y_train.shape:(8331, 14)(8331,)
X_valid.shape, y_valid.shape:(1471, 14)(1471,)
all data ('nor gini:', 0.51141168904771517)
train data ('nor gini:', 0.50450943699611384)
valid data ('nor gini:', 0.45929458740017742)
mean of all impurity:0.122755607651
pass node id shape:(1,)
pass data shape:(170, 14) (16,)
not pass data shape:(9632, 14) (36547,)
all data shape:9802
mean of all impurity:0.412367111865
pass node id shape:(50,)
pass test data shape: 22
not pass test data shape: 875
pass test data ('nor gini:', 0.4583333333333332)
train==1 (4885,)
train==0 (10355,)

layer: 79
train==1 (4885,)
train==0 (4885,)
X_train.shape, y_train.shape:(8304, 14)(8304,)
X_valid.shape, y_valid.shape:(1466, 14)(1466,)
all data ('nor gini:', 0.51348294289644425)
train data ('nor gini:', 0.49880098176796178)
valid data ('nor gini:', 0.44576940901451539)
mean of all impurity:0.151819262782
pass node id shape:(1,)
pas

all data ('nor gini:', 0.45303635944154086)
train data ('nor gini:', 0.44276536063500593)
valid data ('nor gini:', 0.39041516012396699)
mean of all impurity:0.175322839099
pass node id shape:(1,)
pass data shape:(122, 14) (12,)
not pass data shape:(9260, 14) (35288,)
all data shape:9382
mean of all impurity:0.439561651492
pass node id shape:(38,)
pass test data shape: 24
not pass test data shape: 536
pass test data ('nor gini:', -0.022222222222222143)
train==1 (4679,)
train==0 (8418,)

layer: 94
train==1 (4679,)
train==0 (4679,)
X_train.shape, y_train.shape:(7954, 14)(7954,)
X_valid.shape, y_valid.shape:(1404, 14)(1404,)
all data ('nor gini:', 0.4468058594509553)
train data ('nor gini:', 0.4354507237333804)
valid data ('nor gini:', 0.36108067304648511)
mean of all impurity:0.211762688615
pass node id shape:(1,)
pass data shape:(127, 14) (110,)
not pass data shape:(9231, 14) (35037,)
all data shape:9358
mean of all impurity:0.426621891537
pass node id shape:(44,)
pass test data shape: 9

In [160]:
# Results of the layered model trained on balanced data (cell In [248] above)

In [249]:
# ROC AUC of the scores accumulated in test_y against the true labels y_sub.
metrics.roc_auc_score(y_sub, test_y)

0.89942379520673388

In [250]:
# Threshold the scores at 0.5: start from all-zero labels and set the
# positions whose score exceeds 0.5 to 1, then score accuracy.
test_y_acc = np.zeros(len(test_y), dtype=int)
test_y_acc_index = np.where(test_y > 0.5)[0]
test_y_acc[test_y_acc_index] = 1
metrics.accuracy_score(y_sub, test_y_acc)

0.82101836496529701

In [None]:
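# Results of the layered model trained on the original, imbalanced data
# (produced by cell In [253], which appears further down)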

In [254]:
# ROC AUC of the scores accumulated in test_y against the true labels y_sub.
metrics.roc_auc_score(y_sub, test_y)

0.90421133210426941

In [255]:
# Threshold the scores at 0.5: start from all-zero labels and set the
# positions whose score exceeds 0.5 to 1, then score accuracy.
test_y_acc = np.zeros(len(test_y), dtype=int)
test_y_acc_index = np.where(test_y > 0.5)[0]
test_y_acc[test_y_acc_index] = 1
metrics.accuracy_score(y_sub, test_y_acc)

0.86002088323813031

In [253]:
# Start from copies of the training arrays built in Stage-1.
X = X_train.copy()
y = y_train.copy()

X_test = X_sub.copy()
# One float score per test row, all 0.0 to begin with.
test_y = np.array(([0.0] * len(X_test)))
# One boolean per test row, all False to begin with; entries are switched on
# at the bottom of each loop iteration according to data_mask.
all_data_mask = np.array([False] * len(X_test))
real_y = y_sub.copy()

# Layered training on the original (imbalanced) data
X_train_np = X
y_train_np = y
maxlayer = 100
layer = 0

enhancedDTree = EnhancedDTree.EnhancedDTree()
counter = 0  # NOTE(review): never read in this cell
while 1:
    layer += 1
    print()
    print("layer:", layer)
    X = X_train_np
    y = y_train_np
    # Train one layer on all remaining rows (no class re-balancing here).
    # TrainModelLayer lives in EnhancedDTree and is not shown here; presumably
    # data_mask flags the still-open test rows this layer settles and p_test
    # holds its scores for those open rows -- TODO confirm.
    clf, data_mask, all_false_data_index, p_test  = \
        enhancedDTree.TrainModelLayer(X, y, X_test, all_data_mask, test_y, real_y, verbose=True, feval=gini_metrix)
    # The next layer trains on the arrays the EnhancedDTree object now exposes
    # as X_train_np / y_train_np.
    X_train_np = enhancedDTree.X_train_np
    y_train_np = enhancedDTree.y_train_np

    # Stop when fewer than 1000 training rows remain, the layer limit is
    # exceeded, or at most 10 positives remain.
    if X_train_np.shape[0] < 1000 or layer > maxlayer or y_train_np[y_train_np==1].shape[0] <= 10:
        # Final layer: write this layer's p_test values for the rows where
        # data_mask is False into test_y at the matching positions.
        all_data_mask = all_false_data_index[~data_mask]
        test_y[all_data_mask] = p_test[~data_mask]
        print(len(p_test[~data_mask]))
        break

    # Among the entries of all_data_mask that are still False, switch on the
    # ones data_mask marks.
    all_data_mask[~all_data_mask] = data_mask


layer: 1
X_train.shape, y_train.shape:(27676, 14)(27676,)
X_valid.shape, y_valid.shape:(4885, 14)(4885,)
all data ('nor gini:', 0.82519443890693234)
train data ('nor gini:', 0.82600761798654876)
valid data ('nor gini:', 0.82013964928111982)
mean of all impurity:0.0
pass node id shape:(12,)
pass data shape:(3317, 14) (1017,)
not pass data shape:(29244, 14) (101832,)
all data shape:32561
mean of all impurity:0.211628767054
pass node id shape:(142,)
pass test data shape: 9784
not pass test data shape: 6497
pass test data ('nor gini:', 0.88842402645976137)

layer: 2
X_train.shape, y_train.shape:(24857, 14)(24857,)
X_valid.shape, y_valid.shape:(4387, 14)(4387,)
all data ('nor gini:', 0.77777832972360772)
train data ('nor gini:', 0.78213332778434197)
valid data ('nor gini:', 0.75273019346565562)
mean of all impurity:0.0
pass node id shape:(7,)
pass data shape:(5951, 14) (3,)
not pass data shape:(23293, 14) (83752,)
all data shape:29244
mean of all impurity:0.23799863411
pass node id shape:(

X_train.shape, y_train.shape:(16807, 14)(16807,)
X_valid.shape, y_valid.shape:(2967, 14)(2967,)
all data ('nor gini:', 0.62419727645941947)
train data ('nor gini:', 0.63125323653993637)
valid data ('nor gini:', 0.58443541601061377)
mean of all impurity:0.0156240312481
pass node id shape:(1,)
pass data shape:(140, 14) (2,)
not pass data shape:(19634, 14) (71636,)
all data shape:19774
mean of all impurity:0.319238765938
pass node id shape:(86,)
pass test data shape: 71
not pass test data shape: 3905
pass test data ('nor gini:', -0.36470588235294132)

layer: 18
X_train.shape, y_train.shape:(16688, 14)(16688,)
X_valid.shape, y_valid.shape:(2946, 14)(2946,)
all data ('nor gini:', 0.61991618729263609)
train data ('nor gini:', 0.62632246289270821)
valid data ('nor gini:', 0.58278510403176231)
mean of all impurity:0.0
pass node id shape:(1,)
pass data shape:(117, 14) (1,)
not pass data shape:(19517, 14) (71311,)
all data shape:19634
mean of all impurity:0.317969035931
pass node id shape:(96,)


  



X_valid.shape, y_valid.shape:(2646, 14)(2646,)
all data ('nor gini:', 0.56515212083300959)
train data ('nor gini:', 0.56785523950590855)
valid data ('nor gini:', 0.54686166075835219)
mean of all impurity:0.0629255073039
pass node id shape:(1,)
pass data shape:(150, 14) (6,)
not pass data shape:(17485, 14) (64594,)
all data shape:17635
mean of all impurity:0.351165173049
pass node id shape:(75,)
pass test data shape: 21
not pass test data shape: 3317
pass test data ('nor gini:', 0.52941176470588225)

layer: 33
X_train.shape, y_train.shape:(14862, 14)(14862,)
X_valid.shape, y_valid.shape:(2623, 14)(2623,)
all data ('nor gini:', 0.56943940791057424)
train data ('nor gini:', 0.57265204628801747)
valid data ('nor gini:', 0.54736148107170057)
mean of all impurity:0.0521364795918
pass node id shape:(1,)
pass data shape:(129, 14) (5,)
not pass data shape:(17356, 14) (64233,)
all data shape:17485
mean of all impurity:0.341173817546
pass node id shape:(77,)
pass test data shape: 56
not pass tes

  if sys.path[0] == '':


all data ('nor gini:', 0.55874130099461161)
train data ('nor gini:', 0.56512687706074249)
valid data ('nor gini:', 0.51815654794378185)
mean of all impurity:0.0582
pass node id shape:(1,)
pass data shape:(113, 14) (3,)
not pass data shape:(17072, 14) (63239,)
all data shape:17185
mean of all impurity:0.343726761635
pass node id shape:(76,)
pass test data shape: 44
not pass test data shape: 3216
pass test data ('nor gini:', 0.19691119691119688)

layer: 36
X_train.shape, y_train.shape:(14511, 14)(14511,)
X_valid.shape, y_valid.shape:(2561, 14)(2561,)
all data ('nor gini:', 0.55419345210385018)
train data ('nor gini:', 0.55986297788054906)
valid data ('nor gini:', 0.52018323336002215)
mean of all impurity:0.0499671268902
pass node id shape:(1,)
pass data shape:(141, 14) (136,)
not pass data shape:(16931, 14) (62701,)
all data shape:17072
mean of all impurity:0.356693324456
pass node id shape:(84,)
pass test data shape: 62
not pass test data shape: 3154
pass test data ('nor gini:', 0.28846

all data ('nor gini:', 0.49246318728602551)
train data ('nor gini:', 0.5022578407335887)
valid data ('nor gini:', 0.43455076145472721)
mean of all impurity:0.0932333717801
pass node id shape:(1,)
pass data shape:(118, 14) (7,)
not pass data shape:(14983, 14) (55914,)
all data shape:15101
mean of all impurity:0.384343103027
pass node id shape:(69,)
pass test data shape: 4
not pass test data shape: 2654
pass test data ('nor gini:', 1.0)

layer: 51
X_train.shape, y_train.shape:(12735, 14)(12735,)
X_valid.shape, y_valid.shape:(2248, 14)(2248,)
all data ('nor gini:', 0.48843380228234123)
train data ('nor gini:', 0.49222754275532316)
valid data ('nor gini:', 0.46655989857030844)
mean of all impurity:0.105860773867
pass node id shape:(1,)
pass data shape:(126, 14) (9,)
not pass data shape:(14857, 14) (55538,)
all data shape:14983
mean of all impurity:0.389992624409
pass node id shape:(59,)
pass test data shape: 69
not pass test data shape: 2585
pass test data ('nor gini:', -0.0641891891891890

all data ('nor gini:', 0.41316813129863061)
train data ('nor gini:', 0.4272204892464293)
valid data ('nor gini:', 0.33550850399166948)
mean of all impurity:0.225651577503
pass node id shape:(1,)
pass data shape:(126, 14) (107,)
not pass data shape:(12399, 14) (47048,)
all data shape:12525
mean of all impurity:0.432023766575
pass node id shape:(54,)
pass test data shape: 75
not pass test data shape: 1474
pass test data ('nor gini:', 0.13489736070381236)

layer: 69
X_train.shape, y_train.shape:(10539, 14)(10539,)
X_valid.shape, y_valid.shape:(1860, 14)(1860,)
all data ('nor gini:', 0.40358328650723019)
train data ('nor gini:', 0.40872686107054879)
valid data ('nor gini:', 0.37878343056292152)
mean of all impurity:0.222414455978
pass node id shape:(1,)
pass data shape:(120, 14) (19,)
not pass data shape:(12279, 14) (46603,)
all data shape:12399
mean of all impurity:0.42175585963
pass node id shape:(46,)
pass test data shape: 18
not pass test data shape: 1456
pass test data ('nor gini:', 0

all data ('nor gini:', 0.33230205149887998)
train data ('nor gini:', 0.34092141153682165)
valid data ('nor gini:', 0.28638450528820253)
mean of all impurity:0.305460248995
pass node id shape:(1,)
pass data shape:(114, 14) (27,)
not pass data shape:(9691, 14) (37354,)
all data shape:9805
mean of all impurity:0.461856561785
pass node id shape:(39,)
pass test data shape: 4
not pass test data shape: 453
pass test data ('nor gini:', -1.0)

layer: 87
X_train.shape, y_train.shape:(8237, 14)(8237,)
X_valid.shape, y_valid.shape:(1454, 14)(1454,)
all data ('nor gini:', 0.33082109883262617)
train data ('nor gini:', 0.34836126219721925)
valid data ('nor gini:', 0.23295103501898179)
mean of all impurity:0.312174817898
pass node id shape:(1,)
pass data shape:(137, 14) (27,)
not pass data shape:(9554, 14) (36867,)
all data shape:9691
mean of all impurity:0.460206946138
pass node id shape:(43,)
pass test data shape: 21
not pass test data shape: 432
pass test data ('nor gini:', 0.12962962962962979)

la

### Todo list
- 树结构设计（完成）
- 通过gini对数据分割（完成）
- 全局测试集
- 输出结果集
- 打印信息增加pass data的比例
- 防止过拟合
- 对pass data的进一步处理
- 先进行数据均衡化是不是更快一些