# Demo of DeetForest VS XGBoost

GcForesr Link http://lamda.nju.edu.cn/code_gcForestCS.ashx

In [1]:
PATH_TO_GCFOREST_FOLDER="/home/yaantok1/Jira-1129 RS/"

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.one_hot import OneHotEncoder
from sklearn.metrics import roc_auc_score
import statsmodels.stats.api as sms
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

#Imports for gcForestCs
import argparse
import sys
import pickle
sys.path.insert(0, PATH_TO_GCFOREST_FOLDER+"gcForestCS/gcForestCS/lib")

from gcforest.gcforestCS import GCForestCS
from gcforest.gcforest import GCForest
from gcforest.utils.config_utils import load_json
from gcforest.utils.log_utils import get_logger
import scipy.io as sio

In [3]:
#Confir from fcForestCS Example
def get_config():
    config = {}
    ca_config = {}
    ca_config["random_state"] = 0
    ca_config["max_layers"] = 200
    ca_config["early_stopping_rounds"] = 4
    ca_config["n_classes"] = 2
    ca_config["estimators"] = []
    ca_config["estimators"].append({"n_folds": 3, "type": "RandomForestClassifier", "n_estimators": 500, "max_depth": None, "n_jobs": -1})
    ca_config["estimators"].append({"n_folds": 3, "type": "ExtraTreesClassifier", "n_estimators": 500, "max_depth": None, "n_jobs": -1})
    config["cascadeCS"] = ca_config
    return config

In [4]:
class FeaturesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, features):
        self.features=features
    
    def fit(self, x, y=None):
        return self

    def transform(self, df):
        return df.loc[:, map(str.lower, self.features)]

# Load and Preprocess Data

You can find this dataset at https://archive.ics.uci.edu/ml/datasets/adult

In [5]:
columns=['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','Salary']

In [6]:
cathegorial_features=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
numerical_features=['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

In [8]:
categorical = Pipeline([
    ('extractor', FeaturesExtractor(cathegorial_features)), 
    ('coder', CatBoostEncoder())])

numerical = Pipeline([
    ('exractor', FeaturesExtractor(numerical_features))])

feature_union = FeatureUnion([
    ('numerical', numerical), 
    ('categorical', categorical)
])

pl_transform = make_pipeline(
    feature_union
)

In [9]:
df=pd.read_csv('adult.data',names=columns,index_col=False)
X=df[df.columns[:-1]]
Y=df['Salary']
Y=Y.apply(lambda i: 0 if i ==' <=50K' else 1)
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2, random_state=42, stratify=Y)
x_train=x_train.reset_index(drop=True)
y_train=y_train.reset_index(drop=True)
x_test=x_test.reset_index(drop=True)
y_test=y_test.reset_index(drop=True)

In [10]:
pl_transform.fit(x_train,y_train)
x_train_transformed=pl_transform.transform(x_train)
x_test_transformed=pl_transform.transform(x_test)

## XGBoost

In [11]:
xg = xgb.XGBClassifier()
xg.fit(x_train_transformed,y_train);

In [12]:
roc_auc_score(y_test,xg.predict_proba(x_test_transformed)[:,1])

0.9256373681929799

In [64]:
roc_auc_scores=[]

kf = KFold(n_splits=5,random_state=1)

In [65]:
for train,test in kf.split(x_train):
    xg.fit(x_train_transformed[train,:],y_train.loc[train]);
    roc_auc_scores.append(roc_auc_score(y_train.loc[test],xg.predict_proba(x_train_transformed[test,:])[:,1]))

In [77]:
for train,test in KFold(n_splits=5,random_state=0).split(x_train):
    xg.fit(x_train_transformed[train,:],y_train.loc[train]);
    roc_auc_scores.append(roc_auc_score(y_train.loc[test],xg.predict_proba(x_train_transformed[test,:])[:,1]))

In [90]:
for train,test in KFold(n_splits=5,random_state=2).split(x_train):
    xg.fit(x_train_transformed[train,:],y_train.loc[train]);
    roc_auc_scores.append(roc_auc_score(y_train.loc[test],xg.predict_proba(x_train_transformed[test,:])[:,1]))

## gcForestCs

In [72]:
gcCS = GCForestCS(get_config())

In [73]:
roc_auc_scores_gcForest=[]

kf = KFold(n_splits=5,random_state=1)
i=0
for train,test in kf.split(x_train):
    gcCS.fit_transform(x_train_transformed[train,:],y_train.loc[train].values,
                      x_train_transformed[test,:],y_train.loc[test].values);
    roc_auc_scores_gcForest.append(roc_auc_score(y_train.loc[test],gcCS.predict_proba(x_train_transformed[test,:])[:,1]))
    print roc_auc_scores_gcForest

[ 2019-06-24 11:57:30,591][cascade_classifier_cs.fit_transform] X_groups_train.shape=[(20838, 14)],y_train.shape=(20838,),X_groups_test.shape=[(5210, 14)],y_test.shape=(5210,)
[ 2019-06-24 11:57:30,594][cascade_classifier_cs.fit_transform] group_dims=[14]
[ 2019-06-24 11:57:30,595][cascade_classifier_cs.fit_transform] group_starts=[0]
[ 2019-06-24 11:57:30,596][cascade_classifier_cs.fit_transform] group_ends=[14]
[ 2019-06-24 11:57:30,597][cascade_classifier_cs.fit_transform] X_train.shape=(20838, 14),X_test.shape=(5210, 14)
[ 2019-06-24 11:57:30,601][cascade_classifier_cs.fit_transform] [layer=0] look_indexs=[0], X_cur_train.shape=(20838, 14), X_cur_test.shape=(5210, 14)
[ 2019-06-24 11:57:30,603][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 11:57:33,981][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=85.49%
[ 2019-06-24 11:57:37,289][kfold_wrapper.log_eva

[ 2019-06-24 11:59:07,295][kfold_wrapper.log_eval_metrics] Accuracy(layer_2 - estimator_1 - 3_folds.train_cv.predict)=64.23%
[ 2019-06-24 11:59:07,297][kfold_wrapper.log_eval_metrics] Accuracy(layer_2 - estimator_1 - 3_folds.test.predict)=66.14%
[ 2019-06-24 11:59:07,299][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train part2 accuracy)=65.41%
[ 2019-06-24 11:59:07,300][cascade_classifier_cs.fit_transform] ------------------------layer_2 - train accuracy 86.1359055572
[ 2019-06-24 11:59:07,301][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test part2 accuracy)=66.08%
[ 2019-06-24 11:59:07,302][cascade_classifier_cs.fit_transform] ------------------------layer_2 - test accuracy 85.7965451056
[ 2019-06-24 11:59:07,304][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.654138992386 and accuracy part1 needs to get 0.884712997462 (3 times)
[ 2019-06-24 11:59:07,310][cascade_classifier_cs.decide_thresh] #instances = 6173, num_thresh 6034, CX threshold: 0.8

[ 2019-06-24 12:00:37,283][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.train_0.predict)=63.59%
[ 2019-06-24 12:00:44,743][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.train_1.predict)=63.33%
[ 2019-06-24 12:00:52,524][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.train_2.predict)=62.32%
[ 2019-06-24 12:00:53,407][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.train_cv.predict)=63.08%
[ 2019-06-24 12:00:53,410][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.test.predict)=66.35%
[ 2019-06-24 12:00:53,411][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 12:00:59,937][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=64.50%
[ 2019-06-24 12:01:07,044][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds

[ 2019-06-24 12:02:44,620][cascade_classifier_cs.transform] X_groups_test.shape=[(5210, 14)]
[ 2019-06-24 12:02:44,622][cascade_classifier_cs.transform] group_dims=[14]
[ 2019-06-24 12:02:44,623][cascade_classifier_cs.transform] X_test.shape=(5210, 14)
[ 2019-06-24 12:02:44,624][cascade_classifier_cs.transform] [layer=0], #instances=5210
[ 2019-06-24 12:02:44,625][cascade_classifier_cs.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(5210, 14)
[ 2019-06-24 12:02:47,391][cascade_classifier_cs.transform] [layer=1], #instances=1813
[ 2019-06-24 12:02:47,394][cascade_classifier_cs.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(1813, 18)
[ 2019-06-24 12:02:51,962][cascade_classifier_cs.transform] [layer=2], #instances=1639
[ 2019-06-24 12:02:51,966][cascade_classifier_cs.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(1639, 18)
[ 2019-06-24 12:02:56,810][cascade_classifier_cs.transform] [layer=3], #instances=1608
[ 2019-06-24 12:02:56,813][cascade_classifier_cs.trans

[0.9110249127078356]


[ 2019-06-24 12:03:05,012][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=85.33%
[ 2019-06-24 12:03:08,401][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_1.predict)=86.37%
[ 2019-06-24 12:03:11,891][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_2.predict)=85.69%
[ 2019-06-24 12:03:12,231][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_cv.predict)=85.80%
[ 2019-06-24 12:03:12,233][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.test.predict)=86.05%
[ 2019-06-24 12:03:12,248][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 12:03:15,080][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_0.predict)=84.77%
[ 2019-06-24 12:03:18,336][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_1.pre

[ 2019-06-24 12:04:42,926][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.653778276913 and accuracy part1 needs to get 0.884592758971 (3 times)
[ 2019-06-24 12:04:42,933][cascade_classifier_cs.decide_thresh] #instances = 6233, num_thresh 6220, CX threshold: 0.94054877758
[ 2019-06-24 12:04:42,934][cascade_classifier_cs.confidence_screening] In layer 3, train num is 6034, part decay is 0.977482585453
[ 2019-06-24 12:04:42,935][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.1part(13))=92.31%
[ 2019-06-24 12:04:42,937][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.2part(6220))=65.32%
[ 2019-06-24 12:04:42,938][cascade_classifier_cs.confidence_screening_test] In layer 3, test num is 1608, part decay is 0.981086028066
[ 2019-06-24 12:04:42,939][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.1part(0))=0.00%
[ 2019-06-24 12:04:42,940][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.2part(1633))=65.52%
[ 2019-06-24 12:0

[ 2019-06-24 12:06:31,162][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 12:06:37,116][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=65.34%
[ 2019-06-24 12:06:44,369][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_1.predict)=64.26%
[ 2019-06-24 12:06:51,365][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_2.predict)=64.54%
[ 2019-06-24 12:06:52,376][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_cv.predict)=64.71%
[ 2019-06-24 12:06:52,378][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.test.predict)=64.66%
[ 2019-06-24 12:06:52,379][cascade_classifier_cs.calc_accuracy] Accuracy(layer_5 - train part2 accuracy)=65.38%
[ 2019-06-24 12:06:52,380][cascade_classifier_cs.fit_transform] ------------------------layer_5 - train accurac

[ 2019-06-24 12:08:23,692][cascade_classifier_cs.confidence_screening_test] In layer 8, test num is 5210, part decay is 3.31847133758
[ 2019-06-24 12:08:23,693][cascade_classifier_cs.calc_accuracy] Accuracy(layer_7 - test.1part(19))=89.47%
[ 2019-06-24 12:08:23,694][cascade_classifier_cs.calc_accuracy] Accuracy(layer_7 - test.2part(1590))=63.77%
[ 2019-06-24 12:08:23,696][cascade_classifier_cs.fit_transform] [layer=8] look_indexs=[0], X_cur_train.shape=(5975, 18), X_cur_test.shape=(1590, 18)
[ 2019-06-24 12:08:23,697][cascade_classifier_cs.fit_transform] [layer=8] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 12:08:26,168][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_0.predict)=64.06%
[ 2019-06-24 12:08:29,154][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_1.predict)=63.60%
[ 2019-06-24 12:08:32,229][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_2.

[0.9110249127078356, 0.9096770594625583]


[ 2019-06-24 12:09:50,504][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=85.32%
[ 2019-06-24 12:09:54,283][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_1.predict)=85.66%
[ 2019-06-24 12:09:58,173][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_2.predict)=85.28%
[ 2019-06-24 12:09:58,659][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_cv.predict)=85.42%
[ 2019-06-24 12:09:58,661][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.test.predict)=86.78%
[ 2019-06-24 12:09:58,679][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 12:10:01,770][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_0.predict)=84.61%
[ 2019-06-24 12:10:05,195][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_1.pre

[ 2019-06-24 12:11:31,911][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.666768712689 and accuracy part1 needs to get 0.88892290423 (3 times)
[ 2019-06-24 12:11:31,918][cascade_classifier_cs.decide_thresh] #instances = 6533, num_thresh 6529, CX threshold: 0.951981723309
[ 2019-06-24 12:11:31,921][cascade_classifier_cs.confidence_screening] In layer 3, train num is 6034, part decay is 0.977482585453
[ 2019-06-24 12:11:31,922][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.1part(4))=100.00%
[ 2019-06-24 12:11:31,923][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.2part(6529))=66.66%
[ 2019-06-24 12:11:31,925][cascade_classifier_cs.confidence_screening_test] In layer 3, test num is 1608, part decay is 0.981086028066
[ 2019-06-24 12:11:31,926][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.1part(1))=100.00%
[ 2019-06-24 12:11:31,928][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.2part(1697))=67.41%
[ 2019-06-24 12

[ 2019-06-24 12:13:21,830][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 12:13:27,726][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=65.46%
[ 2019-06-24 12:13:34,983][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_1.predict)=66.34%
[ 2019-06-24 12:13:41,972][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_2.predict)=65.05%
[ 2019-06-24 12:13:42,817][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_cv.predict)=65.61%
[ 2019-06-24 12:13:42,819][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.test.predict)=65.34%
[ 2019-06-24 12:13:42,841][cascade_classifier_cs.calc_accuracy] Accuracy(layer_5 - train part2 accuracy)=66.04%
[ 2019-06-24 12:13:42,842][cascade_classifier_cs.fit_transform] ------------------------layer_5 - train accurac

[0.9110249127078356, 0.9096770594625583, 0.923903337042685]


[ 2019-06-24 12:13:52,732][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=84.91%
[ 2019-06-24 12:13:56,276][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_1.predict)=85.55%
[ 2019-06-24 12:13:59,805][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_2.predict)=85.83%
[ 2019-06-24 12:14:00,416][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_cv.predict)=85.43%
[ 2019-06-24 12:14:00,418][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.test.predict)=86.25%
[ 2019-06-24 12:14:00,434][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 12:14:03,365][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_0.predict)=84.80%
[ 2019-06-24 12:14:06,824][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_1.pre

[ 2019-06-24 12:15:34,763][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.677514334418 and accuracy part1 needs to get 0.892504778139 (3 times)
[ 2019-06-24 12:15:34,769][cascade_classifier_cs.decide_thresh] #instances = 6453, num_thresh 6405, CX threshold: 0.910060971975
[ 2019-06-24 12:15:34,772][cascade_classifier_cs.confidence_screening] In layer 3, train num is 6034, part decay is 0.977482585453
[ 2019-06-24 12:15:34,773][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.1part(51))=86.27%
[ 2019-06-24 12:15:34,774][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.2part(6402))=67.60%
[ 2019-06-24 12:15:34,776][cascade_classifier_cs.confidence_screening_test] In layer 3, test num is 1608, part decay is 0.981086028066
[ 2019-06-24 12:15:34,777][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.1part(7))=85.71%
[ 2019-06-24 12:15:34,778][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.2part(1661))=67.31%
[ 2019-06-24 12

[ 2019-06-24 12:17:25,262][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 12:17:31,646][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=66.03%
[ 2019-06-24 12:17:38,486][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_1.predict)=66.08%
[ 2019-06-24 12:17:46,054][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_2.predict)=65.96%
[ 2019-06-24 12:17:46,934][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_cv.predict)=66.02%
[ 2019-06-24 12:17:46,936][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.test.predict)=67.05%
[ 2019-06-24 12:17:46,937][cascade_classifier_cs.calc_accuracy] Accuracy(layer_5 - train part2 accuracy)=66.79%
[ 2019-06-24 12:17:46,938][cascade_classifier_cs.fit_transform] ------------------------layer_5 - train accurac

[ 2019-06-24 12:19:15,690][cascade_classifier_cs.confidence_screening_test] In layer 8, test num is 5210, part decay is 3.31847133758
[ 2019-06-24 12:19:15,691][cascade_classifier_cs.calc_accuracy] Accuracy(layer_7 - test.1part(2))=100.00%
[ 2019-06-24 12:19:15,693][cascade_classifier_cs.calc_accuracy] Accuracy(layer_7 - test.2part(1634))=66.40%
[ 2019-06-24 12:19:15,696][cascade_classifier_cs.fit_transform] [layer=8] look_indexs=[0], X_cur_train.shape=(6256, 18), X_cur_test.shape=(1634, 18)
[ 2019-06-24 12:19:15,698][cascade_classifier_cs.fit_transform] [layer=8] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 12:19:18,114][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_0.predict)=66.68%
[ 2019-06-24 12:19:20,837][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_1.predict)=65.61%
[ 2019-06-24 12:19:24,000][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_2.

[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525]


[ 2019-06-24 12:19:56,975][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=85.73%
[ 2019-06-24 12:20:00,575][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_1.predict)=86.01%
[ 2019-06-24 12:20:03,943][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_2.predict)=85.85%
[ 2019-06-24 12:20:04,306][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_cv.predict)=85.86%
[ 2019-06-24 12:20:04,308][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.test.predict)=85.56%
[ 2019-06-24 12:20:04,327][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 12:20:07,168][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_0.predict)=84.47%
[ 2019-06-24 12:20:10,404][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_1.pre

[ 2019-06-24 12:21:32,518][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.660448993067 and accuracy part1 needs to get 0.886816331022 (3 times)
[ 2019-06-24 12:21:32,524][cascade_classifier_cs.decide_thresh] #instances = 6058, num_thresh 6012, CX threshold: 0.913871943951
[ 2019-06-24 12:21:32,526][cascade_classifier_cs.confidence_screening] In layer 3, train num is 6034, part decay is 0.977482585453
[ 2019-06-24 12:21:32,527][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.1part(46))=89.13%
[ 2019-06-24 12:21:32,529][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.2part(6012))=65.87%
[ 2019-06-24 12:21:32,530][cascade_classifier_cs.confidence_screening_test] In layer 3, test num is 1608, part decay is 0.981086028066
[ 2019-06-24 12:21:32,531][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.1part(6))=83.33%
[ 2019-06-24 12:21:32,532][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.2part(1590))=66.73%
[ 2019-06-24 12

[ 2019-06-24 12:23:15,885][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 12:23:21,803][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=62.49%
[ 2019-06-24 12:23:28,508][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_1.predict)=63.13%
[ 2019-06-24 12:23:35,471][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_2.predict)=63.83%
[ 2019-06-24 12:23:36,335][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_cv.predict)=63.15%
[ 2019-06-24 12:23:36,338][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.test.predict)=66.96%
[ 2019-06-24 12:23:36,339][cascade_classifier_cs.calc_accuracy] Accuracy(layer_5 - train part2 accuracy)=64.62%
[ 2019-06-24 12:23:36,340][cascade_classifier_cs.fit_transform] ------------------------layer_5 - train accurac

[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961]


In [76]:
kf = KFold(n_splits=5,random_state=0)
i=0
for train,test in kf.split(x_train):
    gcCS.fit_transform(x_train_transformed[train,:],y_train.loc[train].values,
                      x_train_transformed[test,:],y_train.loc[test].values);
    roc_auc_scores_gcForest.append(roc_auc_score(y_train.loc[test],gcCS.predict_proba(x_train_transformed[test,:])[:,1]))
    print roc_auc_scores_gcForest

[ 2019-06-24 12:40:27,756][cascade_classifier_cs.fit_transform] X_groups_train.shape=[(20838, 14)],y_train.shape=(20838,),X_groups_test.shape=[(5210, 14)],y_test.shape=(5210,)
[ 2019-06-24 12:40:27,759][cascade_classifier_cs.fit_transform] group_dims=[14]
[ 2019-06-24 12:40:27,760][cascade_classifier_cs.fit_transform] group_starts=[0]
[ 2019-06-24 12:40:27,761][cascade_classifier_cs.fit_transform] group_ends=[14]
[ 2019-06-24 12:40:27,761][cascade_classifier_cs.fit_transform] X_train.shape=(20838, 14),X_test.shape=(5210, 14)
[ 2019-06-24 12:40:27,765][cascade_classifier_cs.fit_transform] [layer=0] look_indexs=[0], X_cur_train.shape=(20838, 14), X_cur_test.shape=(5210, 14)
[ 2019-06-24 12:40:27,766][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 12:40:30,854][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=85.49%
[ 2019-06-24 12:40:34,095][kfold_wrapper.log_eva

[ 2019-06-24 12:42:09,584][kfold_wrapper.log_eval_metrics] Accuracy(layer_2 - estimator_1 - 3_folds.train_cv.predict)=64.23%
[ 2019-06-24 12:42:09,586][kfold_wrapper.log_eval_metrics] Accuracy(layer_2 - estimator_1 - 3_folds.test.predict)=66.14%
[ 2019-06-24 12:42:09,612][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train part2 accuracy)=65.41%
[ 2019-06-24 12:42:09,613][cascade_classifier_cs.fit_transform] ------------------------layer_2 - train accuracy 86.1359055572
[ 2019-06-24 12:42:09,614][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test part2 accuracy)=66.08%
[ 2019-06-24 12:42:09,615][cascade_classifier_cs.fit_transform] ------------------------layer_2 - test accuracy 85.7965451056
[ 2019-06-24 12:42:09,616][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.654138992386 and accuracy part1 needs to get 0.884712997462 (3 times)
[ 2019-06-24 12:42:09,622][cascade_classifier_cs.decide_thresh] #instances = 6173, num_thresh 6034, CX threshold: 0.8

[ 2019-06-24 12:43:42,132][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.train_0.predict)=63.59%
[ 2019-06-24 12:43:49,588][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.train_1.predict)=63.33%
[ 2019-06-24 12:43:56,726][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.train_2.predict)=62.32%
[ 2019-06-24 12:43:57,610][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.train_cv.predict)=63.08%
[ 2019-06-24 12:43:57,612][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.test.predict)=66.35%
[ 2019-06-24 12:43:57,617][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 12:44:03,728][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=64.50%
[ 2019-06-24 12:44:10,668][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds

[ 2019-06-24 12:45:45,623][cascade_classifier_cs.transform] X_groups_test.shape=[(5210, 14)]
[ 2019-06-24 12:45:45,625][cascade_classifier_cs.transform] group_dims=[14]
[ 2019-06-24 12:45:45,625][cascade_classifier_cs.transform] X_test.shape=(5210, 14)
[ 2019-06-24 12:45:45,626][cascade_classifier_cs.transform] [layer=0], #instances=5210
[ 2019-06-24 12:45:45,628][cascade_classifier_cs.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(5210, 14)
[ 2019-06-24 12:45:48,276][cascade_classifier_cs.transform] [layer=1], #instances=1813
[ 2019-06-24 12:45:48,279][cascade_classifier_cs.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(1813, 18)
[ 2019-06-24 12:45:52,721][cascade_classifier_cs.transform] [layer=2], #instances=1639
[ 2019-06-24 12:45:52,723][cascade_classifier_cs.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(1639, 18)
[ 2019-06-24 12:45:57,568][cascade_classifier_cs.transform] [layer=3], #instances=1608
[ 2019-06-24 12:45:57,570][cascade_classifier_cs.trans

[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356]


[ 2019-06-24 12:46:05,462][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=85.33%
[ 2019-06-24 12:46:08,896][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_1.predict)=86.37%
[ 2019-06-24 12:46:12,553][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_2.predict)=85.69%
[ 2019-06-24 12:46:13,016][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_cv.predict)=85.80%
[ 2019-06-24 12:46:13,018][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.test.predict)=86.05%
[ 2019-06-24 12:46:13,032][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 12:46:15,772][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_0.predict)=84.77%
[ 2019-06-24 12:46:18,940][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_1.pre

[ 2019-06-24 12:47:44,492][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.653778276913 and accuracy part1 needs to get 0.884592758971 (3 times)
[ 2019-06-24 12:47:44,498][cascade_classifier_cs.decide_thresh] #instances = 6233, num_thresh 6220, CX threshold: 0.94054877758
[ 2019-06-24 12:47:44,500][cascade_classifier_cs.confidence_screening] In layer 3, train num is 6034, part decay is 0.977482585453
[ 2019-06-24 12:47:44,501][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.1part(13))=92.31%
[ 2019-06-24 12:47:44,502][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.2part(6220))=65.32%
[ 2019-06-24 12:47:44,504][cascade_classifier_cs.confidence_screening_test] In layer 3, test num is 1608, part decay is 0.981086028066
[ 2019-06-24 12:47:44,504][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.1part(0))=0.00%
[ 2019-06-24 12:47:44,505][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.2part(1633))=65.52%
[ 2019-06-24 12:4

[ 2019-06-24 12:49:32,791][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 12:49:39,021][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=65.34%
[ 2019-06-24 12:49:46,043][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_1.predict)=64.26%
[ 2019-06-24 12:49:53,311][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_2.predict)=64.54%
[ 2019-06-24 12:49:54,194][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_cv.predict)=64.71%
[ 2019-06-24 12:49:54,196][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.test.predict)=64.66%
[ 2019-06-24 12:49:54,197][cascade_classifier_cs.calc_accuracy] Accuracy(layer_5 - train part2 accuracy)=65.38%
[ 2019-06-24 12:49:54,198][cascade_classifier_cs.fit_transform] ------------------------layer_5 - train accurac

[ 2019-06-24 12:51:22,714][cascade_classifier_cs.confidence_screening_test] In layer 8, test num is 5210, part decay is 3.31847133758
[ 2019-06-24 12:51:22,715][cascade_classifier_cs.calc_accuracy] Accuracy(layer_7 - test.1part(19))=89.47%
[ 2019-06-24 12:51:22,716][cascade_classifier_cs.calc_accuracy] Accuracy(layer_7 - test.2part(1590))=63.77%
[ 2019-06-24 12:51:22,718][cascade_classifier_cs.fit_transform] [layer=8] look_indexs=[0], X_cur_train.shape=(5975, 18), X_cur_test.shape=(1590, 18)
[ 2019-06-24 12:51:22,719][cascade_classifier_cs.fit_transform] [layer=8] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 12:51:25,303][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_0.predict)=64.06%
[ 2019-06-24 12:51:28,252][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_1.predict)=63.60%
[ 2019-06-24 12:51:31,252][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_2.

[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583]


[ 2019-06-24 12:52:47,951][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=85.32%
[ 2019-06-24 12:52:51,287][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_1.predict)=85.66%
[ 2019-06-24 12:52:54,752][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_2.predict)=85.28%
[ 2019-06-24 12:52:55,213][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_cv.predict)=85.42%
[ 2019-06-24 12:52:55,215][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.test.predict)=86.78%
[ 2019-06-24 12:52:55,230][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 12:52:58,213][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_0.predict)=84.61%
[ 2019-06-24 12:53:01,475][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_1.pre

[ 2019-06-24 12:54:27,931][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.666768712689 and accuracy part1 needs to get 0.88892290423 (3 times)
[ 2019-06-24 12:54:27,938][cascade_classifier_cs.decide_thresh] #instances = 6533, num_thresh 6529, CX threshold: 0.951981723309
[ 2019-06-24 12:54:27,940][cascade_classifier_cs.confidence_screening] In layer 3, train num is 6034, part decay is 0.977482585453
[ 2019-06-24 12:54:27,941][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.1part(4))=100.00%
[ 2019-06-24 12:54:27,942][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.2part(6529))=66.66%
[ 2019-06-24 12:54:27,944][cascade_classifier_cs.confidence_screening_test] In layer 3, test num is 1608, part decay is 0.981086028066
[ 2019-06-24 12:54:27,945][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.1part(1))=100.00%
[ 2019-06-24 12:54:27,946][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.2part(1697))=67.41%
[ 2019-06-24 12

[ 2019-06-24 12:56:17,794][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 12:56:24,194][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=65.46%
[ 2019-06-24 12:56:31,482][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_1.predict)=66.34%
[ 2019-06-24 12:56:38,763][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_2.predict)=65.05%
[ 2019-06-24 12:56:39,774][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_cv.predict)=65.61%
[ 2019-06-24 12:56:39,777][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.test.predict)=65.34%
[ 2019-06-24 12:56:39,803][cascade_classifier_cs.calc_accuracy] Accuracy(layer_5 - train part2 accuracy)=66.04%
[ 2019-06-24 12:56:39,805][cascade_classifier_cs.fit_transform] ------------------------layer_5 - train accurac

[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583, 0.923903337042685]


[ 2019-06-24 12:56:50,621][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=84.91%
[ 2019-06-24 12:56:54,047][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_1.predict)=85.55%
[ 2019-06-24 12:56:57,437][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_2.predict)=85.83%
[ 2019-06-24 12:56:57,778][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_cv.predict)=85.43%
[ 2019-06-24 12:56:57,780][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.test.predict)=86.25%
[ 2019-06-24 12:56:57,792][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 12:57:00,669][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_0.predict)=84.80%
[ 2019-06-24 12:57:03,797][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_1.pre

[ 2019-06-24 12:58:28,087][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.677514334418 and accuracy part1 needs to get 0.892504778139 (3 times)
[ 2019-06-24 12:58:28,100][cascade_classifier_cs.decide_thresh] #instances = 6453, num_thresh 6405, CX threshold: 0.910060971975
[ 2019-06-24 12:58:28,103][cascade_classifier_cs.confidence_screening] In layer 3, train num is 6034, part decay is 0.977482585453
[ 2019-06-24 12:58:28,104][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.1part(51))=86.27%
[ 2019-06-24 12:58:28,106][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.2part(6402))=67.60%
[ 2019-06-24 12:58:28,108][cascade_classifier_cs.confidence_screening_test] In layer 3, test num is 1608, part decay is 0.981086028066
[ 2019-06-24 12:58:28,109][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.1part(7))=85.71%
[ 2019-06-24 12:58:28,110][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.2part(1661))=67.31%
[ 2019-06-24 12

[ 2019-06-24 13:00:16,685][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 13:00:22,994][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=66.03%
[ 2019-06-24 13:00:30,012][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_1.predict)=66.08%
[ 2019-06-24 13:00:37,074][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_2.predict)=65.96%
[ 2019-06-24 13:00:38,031][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_cv.predict)=66.02%
[ 2019-06-24 13:00:38,033][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.test.predict)=67.05%
[ 2019-06-24 13:00:38,034][cascade_classifier_cs.calc_accuracy] Accuracy(layer_5 - train part2 accuracy)=66.79%
[ 2019-06-24 13:00:38,035][cascade_classifier_cs.fit_transform] ------------------------layer_5 - train accurac

[ 2019-06-24 13:02:07,481][cascade_classifier_cs.confidence_screening_test] In layer 8, test num is 5210, part decay is 3.31847133758
[ 2019-06-24 13:02:07,482][cascade_classifier_cs.calc_accuracy] Accuracy(layer_7 - test.1part(2))=100.00%
[ 2019-06-24 13:02:07,484][cascade_classifier_cs.calc_accuracy] Accuracy(layer_7 - test.2part(1634))=66.40%
[ 2019-06-24 13:02:07,486][cascade_classifier_cs.fit_transform] [layer=8] look_indexs=[0], X_cur_train.shape=(6256, 18), X_cur_test.shape=(1634, 18)
[ 2019-06-24 13:02:07,487][cascade_classifier_cs.fit_transform] [layer=8] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 13:02:10,120][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_0.predict)=66.68%
[ 2019-06-24 13:02:13,156][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_1.predict)=65.61%
[ 2019-06-24 13:02:16,010][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_2.

[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525]


[ 2019-06-24 13:02:52,439][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=85.73%
[ 2019-06-24 13:02:55,782][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_1.predict)=86.01%
[ 2019-06-24 13:02:59,366][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_2.predict)=85.85%
[ 2019-06-24 13:02:59,820][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_cv.predict)=85.86%
[ 2019-06-24 13:02:59,821][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.test.predict)=85.56%
[ 2019-06-24 13:02:59,837][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 13:03:02,724][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_0.predict)=84.47%
[ 2019-06-24 13:03:06,064][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_1.pre

[ 2019-06-24 13:04:30,704][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.660448993067 and accuracy part1 needs to get 0.886816331022 (3 times)
[ 2019-06-24 13:04:30,715][cascade_classifier_cs.decide_thresh] #instances = 6058, num_thresh 6012, CX threshold: 0.913871943951
[ 2019-06-24 13:04:30,719][cascade_classifier_cs.confidence_screening] In layer 3, train num is 6034, part decay is 0.977482585453
[ 2019-06-24 13:04:30,720][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.1part(46))=89.13%
[ 2019-06-24 13:04:30,721][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.2part(6012))=65.87%
[ 2019-06-24 13:04:30,724][cascade_classifier_cs.confidence_screening_test] In layer 3, test num is 1608, part decay is 0.981086028066
[ 2019-06-24 13:04:30,725][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.1part(6))=83.33%
[ 2019-06-24 13:04:30,726][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.2part(1590))=66.73%
[ 2019-06-24 13

[ 2019-06-24 13:06:20,088][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 13:06:26,164][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=62.49%
[ 2019-06-24 13:06:33,040][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_1.predict)=63.13%
[ 2019-06-24 13:06:40,523][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_2.predict)=63.83%
[ 2019-06-24 13:06:41,432][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_cv.predict)=63.15%
[ 2019-06-24 13:06:41,434][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.test.predict)=66.96%
[ 2019-06-24 13:06:41,435][cascade_classifier_cs.calc_accuracy] Accuracy(layer_5 - train part2 accuracy)=64.62%
[ 2019-06-24 13:06:41,436][cascade_classifier_cs.fit_transform] ------------------------layer_5 - train accurac

[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961]


In [88]:
kf = KFold(n_splits=5,random_state=2)
i=0
for train,test in kf.split(x_train):
    gcCS.fit_transform(x_train_transformed[train,:],y_train.loc[train].values,
                      x_train_transformed[test,:],y_train.loc[test].values);
    roc_auc_scores_gcForest.append(roc_auc_score(y_train.loc[test],gcCS.predict_proba(x_train_transformed[test,:])[:,1]))
    print roc_auc_scores_gcForest

[ 2019-06-24 16:12:53,022][cascade_classifier_cs.fit_transform] X_groups_train.shape=[(20838, 14)],y_train.shape=(20838,),X_groups_test.shape=[(5210, 14)],y_test.shape=(5210,)
[ 2019-06-24 16:12:53,025][cascade_classifier_cs.fit_transform] group_dims=[14]
[ 2019-06-24 16:12:53,026][cascade_classifier_cs.fit_transform] group_starts=[0]
[ 2019-06-24 16:12:53,027][cascade_classifier_cs.fit_transform] group_ends=[14]
[ 2019-06-24 16:12:53,028][cascade_classifier_cs.fit_transform] X_train.shape=(20838, 14),X_test.shape=(5210, 14)
[ 2019-06-24 16:12:53,031][cascade_classifier_cs.fit_transform] [layer=0] look_indexs=[0], X_cur_train.shape=(20838, 14), X_cur_test.shape=(5210, 14)
[ 2019-06-24 16:12:53,032][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 16:12:56,411][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=85.49%
[ 2019-06-24 16:12:59,975][kfold_wrapper.log_eva

[ 2019-06-24 16:14:31,064][kfold_wrapper.log_eval_metrics] Accuracy(layer_2 - estimator_1 - 3_folds.train_cv.predict)=64.23%
[ 2019-06-24 16:14:31,066][kfold_wrapper.log_eval_metrics] Accuracy(layer_2 - estimator_1 - 3_folds.test.predict)=66.14%
[ 2019-06-24 16:14:31,098][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train part2 accuracy)=65.41%
[ 2019-06-24 16:14:31,100][cascade_classifier_cs.fit_transform] ------------------------layer_2 - train accuracy 86.1359055572
[ 2019-06-24 16:14:31,101][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test part2 accuracy)=66.08%
[ 2019-06-24 16:14:31,102][cascade_classifier_cs.fit_transform] ------------------------layer_2 - test accuracy 85.7965451056
[ 2019-06-24 16:14:31,103][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.654138992386 and accuracy part1 needs to get 0.884712997462 (3 times)
[ 2019-06-24 16:14:31,109][cascade_classifier_cs.decide_thresh] #instances = 6173, num_thresh 6034, CX threshold: 0.8

[ 2019-06-24 16:16:00,172][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.train_0.predict)=63.59%
[ 2019-06-24 16:16:07,101][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.train_1.predict)=63.33%
[ 2019-06-24 16:16:13,909][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.train_2.predict)=62.32%
[ 2019-06-24 16:16:14,558][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.train_cv.predict)=63.08%
[ 2019-06-24 16:16:14,560][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_0 - 3_folds.test.predict)=66.35%
[ 2019-06-24 16:16:14,561][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 16:16:20,469][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=64.50%
[ 2019-06-24 16:16:27,127][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds

[ 2019-06-24 16:18:00,182][cascade_classifier_cs.transform] X_groups_test.shape=[(5210, 14)]
[ 2019-06-24 16:18:00,184][cascade_classifier_cs.transform] group_dims=[14]
[ 2019-06-24 16:18:00,185][cascade_classifier_cs.transform] X_test.shape=(5210, 14)
[ 2019-06-24 16:18:00,186][cascade_classifier_cs.transform] [layer=0], #instances=5210
[ 2019-06-24 16:18:00,188][cascade_classifier_cs.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(5210, 14)
[ 2019-06-24 16:18:02,515][cascade_classifier_cs.transform] [layer=1], #instances=1813
[ 2019-06-24 16:18:02,518][cascade_classifier_cs.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(1813, 18)
[ 2019-06-24 16:18:06,655][cascade_classifier_cs.transform] [layer=2], #instances=1639
[ 2019-06-24 16:18:06,658][cascade_classifier_cs.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(1639, 18)
[ 2019-06-24 16:18:10,994][cascade_classifier_cs.transform] [layer=3], #instances=1608
[ 2019-06-24 16:18:10,997][cascade_classifier_cs.trans

[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356]


[ 2019-06-24 16:18:18,044][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=85.33%
[ 2019-06-24 16:18:21,246][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_1.predict)=86.37%
[ 2019-06-24 16:18:24,317][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_2.predict)=85.69%
[ 2019-06-24 16:18:24,667][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_cv.predict)=85.80%
[ 2019-06-24 16:18:24,669][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.test.predict)=86.05%
[ 2019-06-24 16:18:24,683][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 16:18:27,232][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_0.predict)=84.77%
[ 2019-06-24 16:18:30,341][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_1.pre

[ 2019-06-24 16:19:51,982][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.653778276913 and accuracy part1 needs to get 0.884592758971 (3 times)
[ 2019-06-24 16:19:51,988][cascade_classifier_cs.decide_thresh] #instances = 6233, num_thresh 6220, CX threshold: 0.94054877758
[ 2019-06-24 16:19:51,990][cascade_classifier_cs.confidence_screening] In layer 3, train num is 6034, part decay is 0.977482585453
[ 2019-06-24 16:19:51,991][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.1part(13))=92.31%
[ 2019-06-24 16:19:51,992][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.2part(6220))=65.32%
[ 2019-06-24 16:19:51,994][cascade_classifier_cs.confidence_screening_test] In layer 3, test num is 1608, part decay is 0.981086028066
[ 2019-06-24 16:19:51,994][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.1part(0))=0.00%
[ 2019-06-24 16:19:51,995][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.2part(1633))=65.52%
[ 2019-06-24 16:1

[ 2019-06-24 16:21:31,078][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 16:21:36,902][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=65.34%
[ 2019-06-24 16:21:43,804][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_1.predict)=64.26%
[ 2019-06-24 16:21:50,642][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_2.predict)=64.54%
[ 2019-06-24 16:21:51,377][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_cv.predict)=64.71%
[ 2019-06-24 16:21:51,379][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.test.predict)=64.66%
[ 2019-06-24 16:21:51,380][cascade_classifier_cs.calc_accuracy] Accuracy(layer_5 - train part2 accuracy)=65.38%
[ 2019-06-24 16:21:51,381][cascade_classifier_cs.fit_transform] ------------------------layer_5 - train accurac

[ 2019-06-24 16:23:16,813][cascade_classifier_cs.confidence_screening_test] In layer 8, test num is 5210, part decay is 3.31847133758
[ 2019-06-24 16:23:16,814][cascade_classifier_cs.calc_accuracy] Accuracy(layer_7 - test.1part(19))=89.47%
[ 2019-06-24 16:23:16,815][cascade_classifier_cs.calc_accuracy] Accuracy(layer_7 - test.2part(1590))=63.77%
[ 2019-06-24 16:23:16,817][cascade_classifier_cs.fit_transform] [layer=8] look_indexs=[0], X_cur_train.shape=(5975, 18), X_cur_test.shape=(1590, 18)
[ 2019-06-24 16:23:16,818][cascade_classifier_cs.fit_transform] [layer=8] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 16:23:19,312][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_0.predict)=64.06%
[ 2019-06-24 16:23:22,236][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_1.predict)=63.60%
[ 2019-06-24 16:23:25,068][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_2.

[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583]


[ 2019-06-24 16:24:41,539][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=85.32%
[ 2019-06-24 16:24:45,218][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_1.predict)=85.66%
[ 2019-06-24 16:24:48,836][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_2.predict)=85.28%
[ 2019-06-24 16:24:49,286][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_cv.predict)=85.42%
[ 2019-06-24 16:24:49,288][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.test.predict)=86.78%
[ 2019-06-24 16:24:49,300][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 16:24:52,134][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_0.predict)=84.61%
[ 2019-06-24 16:24:55,171][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_1.pre

[ 2019-06-24 16:26:15,633][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.666768712689 and accuracy part1 needs to get 0.88892290423 (3 times)
[ 2019-06-24 16:26:15,640][cascade_classifier_cs.decide_thresh] #instances = 6533, num_thresh 6529, CX threshold: 0.951981723309
[ 2019-06-24 16:26:15,642][cascade_classifier_cs.confidence_screening] In layer 3, train num is 6034, part decay is 0.977482585453
[ 2019-06-24 16:26:15,643][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.1part(4))=100.00%
[ 2019-06-24 16:26:15,644][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.2part(6529))=66.66%
[ 2019-06-24 16:26:15,646][cascade_classifier_cs.confidence_screening_test] In layer 3, test num is 1608, part decay is 0.981086028066
[ 2019-06-24 16:26:15,647][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.1part(1))=100.00%
[ 2019-06-24 16:26:15,648][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.2part(1697))=67.41%
[ 2019-06-24 16

[ 2019-06-24 16:28:03,775][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 16:28:09,889][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=65.46%
[ 2019-06-24 16:28:16,795][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_1.predict)=66.34%
[ 2019-06-24 16:28:24,058][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_2.predict)=65.05%
[ 2019-06-24 16:28:24,814][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_cv.predict)=65.61%
[ 2019-06-24 16:28:24,816][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.test.predict)=65.34%
[ 2019-06-24 16:28:24,839][cascade_classifier_cs.calc_accuracy] Accuracy(layer_5 - train part2 accuracy)=66.04%
[ 2019-06-24 16:28:24,840][cascade_classifier_cs.fit_transform] ------------------------layer_5 - train accurac

[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583, 0.923903337042685]


[ 2019-06-24 16:28:34,728][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=84.91%
[ 2019-06-24 16:28:37,859][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_1.predict)=85.55%
[ 2019-06-24 16:28:41,132][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_2.predict)=85.83%
[ 2019-06-24 16:28:41,699][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_cv.predict)=85.43%
[ 2019-06-24 16:28:41,701][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.test.predict)=86.25%
[ 2019-06-24 16:28:41,714][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 16:28:44,317][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_0.predict)=84.80%
[ 2019-06-24 16:28:47,340][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_1.pre

[ 2019-06-24 16:30:07,858][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.677514334418 and accuracy part1 needs to get 0.892504778139 (3 times)
[ 2019-06-24 16:30:07,864][cascade_classifier_cs.decide_thresh] #instances = 6453, num_thresh 6405, CX threshold: 0.910060971975
[ 2019-06-24 16:30:07,866][cascade_classifier_cs.confidence_screening] In layer 3, train num is 6034, part decay is 0.977482585453
[ 2019-06-24 16:30:07,867][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.1part(51))=86.27%
[ 2019-06-24 16:30:07,869][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.2part(6402))=67.60%
[ 2019-06-24 16:30:07,870][cascade_classifier_cs.confidence_screening_test] In layer 3, test num is 1608, part decay is 0.981086028066
[ 2019-06-24 16:30:07,871][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.1part(7))=85.71%
[ 2019-06-24 16:30:07,872][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.2part(1661))=67.31%
[ 2019-06-24 16

[ 2019-06-24 16:31:50,306][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 16:31:56,788][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=66.03%
[ 2019-06-24 16:32:03,551][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_1.predict)=66.08%
[ 2019-06-24 16:32:11,039][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_2.predict)=65.96%
[ 2019-06-24 16:32:11,909][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_cv.predict)=66.02%
[ 2019-06-24 16:32:11,911][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.test.predict)=67.05%
[ 2019-06-24 16:32:11,912][cascade_classifier_cs.calc_accuracy] Accuracy(layer_5 - train part2 accuracy)=66.79%
[ 2019-06-24 16:32:11,913][cascade_classifier_cs.fit_transform] ------------------------layer_5 - train accurac

[ 2019-06-24 16:33:35,922][cascade_classifier_cs.confidence_screening_test] In layer 8, test num is 5210, part decay is 3.31847133758
[ 2019-06-24 16:33:35,923][cascade_classifier_cs.calc_accuracy] Accuracy(layer_7 - test.1part(2))=100.00%
[ 2019-06-24 16:33:35,924][cascade_classifier_cs.calc_accuracy] Accuracy(layer_7 - test.2part(1634))=66.40%
[ 2019-06-24 16:33:35,926][cascade_classifier_cs.fit_transform] [layer=8] look_indexs=[0], X_cur_train.shape=(6256, 18), X_cur_test.shape=(1634, 18)
[ 2019-06-24 16:33:35,927][cascade_classifier_cs.fit_transform] [layer=8] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 16:33:38,260][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_0.predict)=66.68%
[ 2019-06-24 16:33:40,961][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_1.predict)=65.61%
[ 2019-06-24 16:33:43,979][kfold_wrapper.log_eval_metrics] Accuracy(layer_8 - estimator_0 - 3_folds.train_2.

[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525]


[ 2019-06-24 16:34:17,055][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_0.predict)=85.73%
[ 2019-06-24 16:34:20,608][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_1.predict)=86.01%
[ 2019-06-24 16:34:24,107][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_2.predict)=85.85%
[ 2019-06-24 16:34:24,566][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.train_cv.predict)=85.86%
[ 2019-06-24 16:34:24,568][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 3_folds.test.predict)=85.56%
[ 2019-06-24 16:34:24,581][cascade_classifier_cs.fit_transform] [layer=0] train_decay_level=1.0, # n_estimators before=500, # n_estimators now=500
[ 2019-06-24 16:34:27,291][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_0.predict)=84.47%
[ 2019-06-24 16:34:30,161][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_1 - 3_folds.train_1.pre

[ 2019-06-24 16:35:51,033][cascade_classifier_cs.getAutoThresh] Training accuracy now is 0.660448993067 and accuracy part1 needs to get 0.886816331022 (3 times)
[ 2019-06-24 16:35:51,038][cascade_classifier_cs.decide_thresh] #instances = 6058, num_thresh 6012, CX threshold: 0.913871943951
[ 2019-06-24 16:35:51,040][cascade_classifier_cs.confidence_screening] In layer 3, train num is 6034, part decay is 0.977482585453
[ 2019-06-24 16:35:51,041][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.1part(46))=89.13%
[ 2019-06-24 16:35:51,042][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - train.2part(6012))=65.87%
[ 2019-06-24 16:35:51,044][cascade_classifier_cs.confidence_screening_test] In layer 3, test num is 1608, part decay is 0.981086028066
[ 2019-06-24 16:35:51,045][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.1part(6))=83.33%
[ 2019-06-24 16:35:51,046][cascade_classifier_cs.calc_accuracy] Accuracy(layer_2 - test.2part(1590))=66.73%
[ 2019-06-24 16

[ 2019-06-24 16:37:37,196][cascade_classifier_cs.fit_transform] [layer=5] train_decay_level=0.364464645634, # n_estimators before=500, # n_estimators now=1371
[ 2019-06-24 16:37:43,579][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_0.predict)=62.49%
[ 2019-06-24 16:37:51,013][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_1.predict)=63.13%
[ 2019-06-24 16:37:58,118][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_2.predict)=63.83%
[ 2019-06-24 16:37:58,996][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.train_cv.predict)=63.15%
[ 2019-06-24 16:37:58,998][kfold_wrapper.log_eval_metrics] Accuracy(layer_5 - estimator_1 - 3_folds.test.predict)=66.96%
[ 2019-06-24 16:37:59,000][cascade_classifier_cs.calc_accuracy] Accuracy(layer_5 - train part2 accuracy)=64.62%
[ 2019-06-24 16:37:59,000][cascade_classifier_cs.fit_transform] ------------------------layer_5 - train accurac

[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961]


# Results

In [69]:
roc_auc_scores=[0.9214118036039015, 0.9220632940977193, 0.9277699827633614, 0.9170108247659934, 0.9173910672239793]

In [74]:
roc_auc_scores_gcForest=[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961]

In [75]:
print sms.DescrStatsW(roc_auc_scores).tconfint_mean()
print sms.DescrStatsW(roc_auc_scores_gcForest).tconfint_mean() 

(0.9157175679711728, 0.9265412210108092)
(0.9037808686586117, 0.9204497201027995)


In [89]:
roc_auc_scores_gcForest_N_CV=[0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961, 0.9110249127078356, 0.9096770594625583, 0.923903337042685, 0.9080834264338525, 0.9078877362565961]

In [91]:
print roc_auc_scores

[0.9214118036039015, 0.9220632940977193, 0.9277699827633614, 0.9170108247659934, 0.9173910672239793, 0.9214118036039015, 0.9220632940977193, 0.9277699827633614, 0.9170108247659934, 0.9173910672239793, 0.9214118036039015, 0.9220632940977193, 0.9277699827633614, 0.9170108247659934, 0.9173910672239793]


In [93]:
roc_auc_scores_N_CV=[0.9214118036039015, 0.9220632940977193, 0.9277699827633614, 0.9170108247659934, 0.9173910672239793, 0.9214118036039015, 0.9220632940977193, 0.9277699827633614, 0.9170108247659934, 0.9173910672239793, 0.9214118036039015, 0.9220632940977193, 0.9277699827633614, 0.9170108247659934, 0.9173910672239793]

In [94]:
print sms.DescrStatsW(roc_auc_scores_N_CV).tconfint_mean()
print sms.DescrStatsW(roc_auc_scores_gcForest_N_CV).tconfint_mean() 

(0.9188947684339486, 0.9233640205480336)
(0.9086738823507978, 0.9155567064106132)


Тест на нормальность распределения. $H_0\colon$  попарные разности распределены нормально.
$H_1\colon$  не нормально.

In [96]:
print "Shapiro-Wilk normality test, W-statistic: %f, p-value: %f" % stats.shapiro(np.array(roc_auc_scores_N_CV) - 
                                                                                  np.array(roc_auc_scores_gcForest_N_CV))

Shapiro-Wilk normality test, W-statistic: 0.821433, p-value: 0.007020


Критерий Стьюдента:
$H_0\colon$ средние значения одинаковы
$H_1\colon$ не одинаковы.

In [97]:
print "Rel. Test on Equal Means, W-statistic: %f, p-value: %f" % stats.ttest_rel(roc_auc_scores_N_CV, roc_auc_scores_gcForest_N_CV)

Rel. Test on Equal Means, W-statistic: 11.924683, p-value: 0.000000


In [98]:
print "95%% confidence interval: [%f, %f]" % DescrStatsW(np.array(roc_auc_scores_N_CV) - 
                                                                                  np.array(roc_auc_scores_gcForest_N_CV
                                                                                          )).tconfint_mean()

95% confidence interval: [0.007393, 0.010635]
