# DecisionTreeClassifier

Самая простая модель, "рабочая лошадка" для теста фич.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 2000) # since we have a lot of features
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import RepeatedStratifiedKFold

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("data"))

['test.csv', 'train.csv', 'sample_submission.csv']


In [2]:
''' Import data '''

DATA_DIR = 'data'
train_file = os.path.join(DATA_DIR, 'train.csv')
test_file = os.path.join(DATA_DIR, 'test.csv')
submission_file = os.path.join(DATA_DIR, 'sample_submission.csv')

train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

X = train.drop(columns=['y', 'sample_id'])
y = train.y
X_test = test.drop(columns=['sample_id'])

X_all = X.append(X_test)

In [3]:
''' Gather all preprocessing functions '''

from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import bisect
from scipy.stats import shapiro
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM


''' 1 - divide to training and holdout datasets '''

X_train, X_hold, y_train, y_hold = train_test_split(X, y, random_state=3, test_size=0.3)

''' 2 - define the same validation scheme for all models '''

skf = RepeatedStratifiedKFold(n_repeats=5, random_state=17)


''' Function 1 - delete all columns with NaN or Inf '''
columns_to_keep = X_all.replace([np.inf, -np.inf], np.nan).dropna(axis=1).columns

def del_nan(x):
    return x[columns_to_keep]
delete_nan = FunctionTransformer(del_nan, validate=False)


''' Function 2 - delete NaN-only columns '''

na_cols = X_all[X_all.isna().any()[lambda x: x].index].isna().agg(['sum', 'count']).T

all_na_cols = na_cols[na_cols['sum'] == na_cols['count']]

def drop_all_na(X):
    ''' Here we drop all columns which consist of NaN only '''
    idx = all_na_cols.index.values
    return X.drop(columns=idx)

drop_all_na_f = FunctionTransformer(drop_all_na, validate=False)


''' Function 3 - replace NaN '''

has_na_cols = na_cols.T.drop(columns=all_na_cols.index.values).T
has_na_cols['perc'] = has_na_cols['sum']/has_na_cols['count']

'''def replace_na(X, add_boolean=True, threshold=0.3):
    Here we apply two filling strategies
            for two types of columns with na:
            % na > threshold - fill with mean
            % na < threshold - fill with 0.
        Plus we add additional columns showing
            if the value was na before transformation.
    X_copy = X.copy()
    cols_less = has_na_cols[has_na_cols['perc'] < threshold].index.values
    cols_more = has_na_cols[has_na_cols['perc'] >= threshold].index.values
    X_less = X_copy[cols_less]
    X_less = X_less.fillna(X_less.mean())
    X_copy[cols_less] = X_less
    X_copy[cols_more] = X_more
    if add_boolean:
        was_na = X[has_na_cols.index.values].isna().astype('int64')
        was_na.columns = was_na.columns.map(lambda x: x + '_na')
        X_copy = pd.concat([X_copy, was_na], axis=1)
    return X_copy'''
    
# I should have checked columns for categorical features before

def replace_na(X, add_boolean=True):
    ''' Here we apply mean filling strategy.
        And add additional columns showing if the values
        was na before transformation.
    '''
    X_copy = X.copy()
    cols = has_na_cols.index.values
    X_c = X_copy[cols]
    X_c = X_c.fillna(X_all.replace([np.inf, -np.inf], np.nan).mean())
    X_copy[cols] = X_c
    if add_boolean:
        was_na = X[has_na_cols.index.values].isna().astype('int64')
        was_na.columns = was_na.columns.map(lambda x: x + '_na')
        X_copy = pd.concat([X_copy, was_na], axis=1)
    return X_copy

replace_na_f = FunctionTransformer(replace_na, validate=False)


''' Function 4 - replace inf '''

pos_inf_cols = X_all.fillna(0).replace([np.inf], 
                                       np.nan).isna().any()[lambda x: x].index
pos_inf_cols = X_all[pos_inf_cols].replace([np.inf], 
                                           np.nan).isna().agg(['sum', 'count']).T
neg_inf_cols = X_all.fillna(0).replace([-np.inf],
                                       np.nan).isna().any()[lambda x: x].index

def fill_inf(X, add_boolean=True, multiplier=2):
    '''
    Here we apply filling strategy for infinite values:
        we replace it with absolute maximum value * multiplier.
    Additionally, we new columns showing if the value
        was infinite before filling.
    '''
    X_copy = X.copy()
    X_pos = X_copy[pos_inf_cols.index.values]
    to_fill = X_pos.replace(np.inf, np.nan).abs().max() * multiplier + 1.0
    X_pos = X_pos.replace(np.inf, np.nan).fillna(to_fill)
    X_copy[pos_inf_cols.index.values] = X_pos
    if add_boolean:
        was_inf = X[pos_inf_cols.index.values].replace(np.inf, np.nan).isna().astype('int64')
        was_inf.columns = was_inf.columns.map(lambda x: x + '_inf')
        X_copy = pd.concat([X_copy, was_inf], axis=1)
    return X_copy

fill_inf_f = FunctionTransformer(fill_inf, validate=False)


''' First block of data cleaning is gathered in one function '''

def pre_proc(X):
    return fill_inf(replace_na(drop_all_na(X)))


''' Function and class 5 - delete zero-variance features '''

w = fill_inf(replace_na(drop_all_na(X_all))).var()
zero_var_cols = w[np.isclose(w, 0)]

def drop_zero_var(X):
    idx = zero_var_cols.index.values
    return X.drop(columns=idx)

drop_zero_var_f = FunctionTransformer(drop_zero_var)

class Drop_zero_var:
    def __init__(self):
        self.zero_var_cols = []
        self.is_fitted = False
        
    def fit(self, X, y=None):
        w = X.var()
        self.zero_var_cols = w[np.isclose(w, 0.0)].index.values
        self.is_fitted = True
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        return X_copy.drop(columns=self.zero_var_cols)
    
    def fit_transform(self, X, y=None):
        if not self.is_fitted:
            self.fit(X, y)
        return self.transform(X, y)

print(Drop_zero_var().fit_transform(pre_proc(X_train), y_train).shape)

''' Function and class 6 - drop highly correlated features '''

'''
# Here is the function
print(drop_zero_var(pre_proc(X_all)).shape)
corr_matrix = pre_proc(X_all).corr().abs()
print(corr_matrix.shape)

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
print(len(to_drop))

'''

class Drop_corr:
    def __init__(self, threshold=0.95):
        self.to_drop = []
        self.is_fitted = False
        self.threshold = threshold
        
    def fit(self, X, y=None):
        corr_matrix = X.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
        self.to_drop = [column for column in upper.columns if any(upper[column] > self.threshold)]
        self.is_fitted = True
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        return X_copy.drop(columns=self.to_drop)
    
    def fit_transform(self, X, y=None):
        if not self.is_fitted:
            self.fit(X, y)
        return self.transform(X, y)

print(Drop_corr().fit_transform(pre_proc(X_train), y_train).shape)
    
''' Class 7 - encode categorical labels '''

class LE_df:
    def __init__(self, threshold=5):
        self.le_dict = {}
        self.threshold = threshold
        
    def fit(self, X, y=None):
        X_copy = X.copy()
        counts = X_copy.apply(lambda x: x.value_counts().shape[0])
        categorical = X_copy[counts[counts <= self.threshold].index]
        for col in categorical.columns:
            le = LabelEncoder()
            le.fit(categorical[col])
            self.le_dict[col] = le
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        for col in self.le_dict.keys():
            X_copy[col] = X_copy[col].map(lambda s: -100500 if s not in \
                                          self.le_dict[col].classes_ else s)
            if np.any(X_copy[col]==-100500):
                le_classes = self.le_dict[col].classes_.tolist()
                le_classes.append(-100500)
                self.le_dict[col].classes_ = np.asarray(le_classes)
            X_copy[col] = self.le_dict[col].transform(X_copy[col])
        return X_copy
    
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

print(LE_df().fit_transform(pre_proc(X_train), y_train).shape)
    
''' Class 8 - target mean encode categorical labels '''

class LOOTME_df:
    ''' Class performs leave-one-out target mean encoding '''
    def __init__(self, threshold=5):
        self.threshold = threshold
        self.tme_dict = {}
        self.is_fitted = False
        
    def fit(self, X, y):
        X_copy = X.copy()
        y_copy = y.copy()
        counts = X_copy.apply(lambda x: x.value_counts().shape[0])
        categorical = X_copy[counts[counts <= self.threshold].index]
        for col in categorical.columns:
            df = pd.concat((X_copy[col], y_copy), axis=1)
            df.columns = ['col', 'y']
            tme = {}
            for value in df['col'].unique():
                y_val = df[df['col'] == value]['y'].values
                col_val = df[df['col'] == value]['col'].values
                loo_mean = np.mean((np.full_like(col_val, y_val.sum()) - y_val) / y_val.shape[0])
                tme[value] = loo_mean
            self.tme_dict[col] = tme
        self.is_fitted = True
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        for col in self.tme_dict.keys():
            X_copy[col] = X_copy[col].map(lambda s: -100500 if s not in self.tme_dict[col].keys() else s)
            self.tme_dict[col][-100500] = 0.0
            X_copy[col] = X_copy[col].map(self.tme_dict[col])
        return X_copy
        
    
    def fit_transform(self, X, y):
        if not self.is_fitted:
            self.fit(X, y)
        return self.transform(X, y)
    
print(LOOTME_df().fit_transform(pre_proc(X_train), y_train).shape)


''' Class 9 - One Hot Encoding of categorical features '''

class OneHot_df:
    def __init__(self, threshold=5, drop_collinear=False):
        self.threshold = threshold
        self.drop_collinear = drop_collinear
        self.categorical = []
        self.ohe = None
        self.column_names = None
        self.is_fitted = False
        
    def fit(self, X, y=None):
        X_copy = X.copy()
        counts = X_copy.apply(lambda x: x.value_counts().shape[0])
        X_cat = X_copy[counts[(counts <= self.threshold) & (counts > 1)].index]
        ohe = OneHotEncoder(handle_unknown='ignore')
        ohe.fit(X_cat)
        self.categorical = X_cat.columns
        self.ohe = ohe
        self.is_fitted = True
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        X_cat = X_copy.copy()[self.categorical]
        X_cat_tr = self.ohe.transform(X_cat).toarray()
        if not self.column_names:
            column_names = []
            for i, col in enumerate(self.categorical):
                column_names.extend([col + '_' + str(s) for s in self.ohe.categories_[i]])
            self.column_names = column_names
        X_cat_tr = pd.DataFrame(X_cat_tr, columns=self.column_names, index=X_copy.index)
        X_other = X_copy.drop(columns=self.categorical)
        X_copy = pd.concat([X_other, X_cat_tr], axis=1)
        if self.drop_collinear:
            to_drop = pd.Series(self.column_names, 
        index=self.column_names).apply(lambda x: x.endswith('_0'))[lambda x: x].index
            X_copy.drop(columns=to_drop)
        return X_copy
    
    def fit_transform(self, X, y=None):
        if not self.is_fitted:
            self.fit(X, y)
        return self.transform(X, y)

print(OneHot_df().fit(pre_proc(X_train)).transform(pre_proc(X_hold)).shape)

''' Class 10 - find best normalizing transformation '''

def get_shapiro_p(X):
    _, p = shapiro(X)
    return p

class Find_Trans:
    def __init__(self, threshold=10):
        self.threshold = threshold
        self.pvals = []
        self.numerical = []
        self.is_fitted = False
        self.mms = MinMaxScaler()
        self.trans_dict = {'no': lambda x: x,
                           'x^2': lambda x: np.power(x, 2), 
                           'x^3': lambda x: np.power(x, 3), 
                           'log(x)': lambda x: np.log(x), 
                           #'exp(x)': lambda x: np.exp(x/10), 
                           'sqrt(x)': lambda x: np.sqrt(x)}
    
    def fit(self, X, y=None):
        X_copy = X.copy()
        counts = X_copy.apply(lambda x: x.value_counts().shape[0])
        self.numerical = counts[counts > self.threshold].index.values
        X_numer = X_copy[self.numerical]
        idx = X_numer.index
        X_numer = pd.DataFrame(self.mms.fit_transform(X_numer) + 0.01,
                               columns=self.numerical,
                               index=idx)
        pvals = pd.DataFrame(X_numer.apply(get_shapiro_p), columns=['no'])
        for name in self.trans_dict.keys():
            if name not in pvals.columns:
                pvals[name] = X_numer.apply(self.trans_dict[name]).apply(get_shapiro_p)
        self.pvals = pvals
        self.is_fitted = True
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        X_numer = X_copy[self.numerical]
        idx = X_numer.index
        X_numer = pd.DataFrame(self.mms.transform(X_numer) + 0.01,
                               columns=self.numerical,
                               index=idx)
        best_transform = self.pvals.idxmax(axis=1)
        best_transform = best_transform.map(self.trans_dict).to_dict()
        X_numer = X_numer.apply(best_transform)
        X_numer = X_numer.fillna(X_numer.min()-1)
        for col in self.numerical:
            X_copy[col] = X_numer[col]
        return X_copy
    
    def fit_transform(self, X, y=None):
        if not self.is_fitted:
            self.fit(X, y)
        return self.transform(X, y)

print(Find_Trans().fit(pre_proc(X_train), y_train).transform(pre_proc(X_hold)).shape)


''' Class 11 - find and drop outliers '''

class DetectOut_df:
    def __init__(self, method='IsolationForest', out_fraction=0.05, 
                 drop=True, random_state=17,
                 kernel='rbf', shrink=True):
        self.method = method
        self.out_fraction = out_fraction
        self.drop = drop
        self.is_fitted = False
        self.random_state = random_state
        self.kernel = kernel
        self.shrink = shrink
        self.model = None
        self.train_set = []
        
    def fit(self, X, y=None):
        if self.method == 'IsolationForest':
            model = IsolationForest(random_state=self.random_state,
                        n_estimators=100,
                        max_features=0.9,
                        contamination=self.out_fraction, #
                        n_jobs=-1,
                        behaviour='new')
        if self.method == 'OneClassSVM':
            model = OneClassSVM(kernel=self.kernel,
                     nu=self.out_fraction,
                     shrinking=self.shrink,
                     gamma='scale'
                    )
        model.fit(X, y)
        self.model = model
        self.is_fitted = True
        self.train_set = X.copy()
        return self
    
    def transform(self, X, y):
        ''' We shouldn't predict any outliers for test set '''
        X_copy = X.copy()
        X_copy['outlier'] = 1
        X_copy['y'] = y.copy()
        if self.train_set.equals(X):
            X_copy['outlier'] = self.model.predict(X)
        if self.drop:
            X_copy = X_copy[X_copy['outlier'] == 1]
            X_copy = X_copy.drop(columns=['outlier'])
        y_copy = X_copy['y']
        X_copy = X_copy.drop(columns=['y'])
        return X_copy, y_copy
    
    def fit_transform(self, X, y):
        if not self.is_fitted:
            self.fit(X, y)
        return self.transform(X, y)
    
print(DetectOut_df().fit_transform(pre_proc(X_train), y_train)[0].shape)

(766, 1777)
(766, 932)
(766, 1908)
(766, 1908)
(329, 2518)
(329, 1908)
(727, 1908)


## Набор преобразований 1 - удаление всех колонок с Nan и Inf

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

tree = DecisionTreeClassifier(random_state=17)
pipe_tree = Pipeline([('drop_na', delete_nan),
                     ('tree', tree)])
pipe_tree.fit(X_train, y_train)

scores = cross_val_score(pipe_tree, X_train, y_train, cv=skf, scoring='roc_auc')

print('Train ROC AUC: %.4f' % roc_auc_score(y_train, pipe_tree.predict_proba(X_train)[:,1]))
print('Cross-validation ROC AUC: mean %.4f, std %.4f' % (scores.mean(), scores.std()))
print('Holdout ROC AUC: %.4f' % roc_auc_score(y_hold, pipe_tree.predict_proba(X_hold)[:,1]))

Train ROC AUC: 1.0000
Cross-validation ROC AUC: mean 0.6916, std 0.0330
Holdout ROC AUC: 0.6646


In [5]:
%%time

n_iter = 50
random_state = 17

from hyperopt import fmin, Trials, hp, tpe

def tree_roc_cv(params, random_state=random_state, cv=skf, X=X_train, y=y_train):
    # the function gest a set of variable parameters in "param"
    params = {'min_samples_leaf': int(params['min_samples_leaf']), 
              'max_depth': int(params['max_depth']), 
              'max_features': int(params['max_features'])}
    
    # we use this params to create a new LGBM Regressor
    model = Pipeline([('drop_na', delete_nan),
                      ('tree', DecisionTreeClassifier(random_state=random_state, 
                                                      **params))])
    
    # and then conduct the cross validation with the same folds as before
    score = -cross_val_score(model, X, y, cv=cv, scoring="roc_auc", n_jobs=-1).mean()

    return score

# possible values of parameters
space={'min_samples_leaf': hp.quniform('min_samples_leaf', 2, 15, 1),
       'max_depth' : hp.quniform('max_depth', 2, 20, 1),
       'max_features': hp.quniform('max_features', 10, 1150, 10)
      }

# trials will contain logging information
trials = Trials()


best=fmin(fn=tree_roc_cv, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=n_iter, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.RandomState(random_state) # fixing random state for the reproducibility
         )

# computing the score on the test set
model = Pipeline([('drop_na', delete_nan),
                      ('tree', DecisionTreeClassifier(random_state=random_state, 
                               min_samples_leaf=int(best['min_samples_leaf']),
                               max_depth=int(best['max_depth']),
                               max_features=int(best['max_features'])))])
model.fit(X_train, y_train)
tpe_test_score = roc_auc_score(y_hold, model.predict_proba(X_hold)[:, 1])

print("Best ROC-AUC {:.3f} params {}".format( tree_roc_cv(best), best))
# best parameters: min_samples_leaf=15, max_depth=4, max_features=1000

Best ROC-AUC -0.786 params {'max_depth': 4.0, 'max_features': 1000.0, 'min_samples_leaf': 15.0}
CPU times: user 12.4 s, sys: 1.28 s, total: 13.7 s
Wall time: 1min 54s


In [6]:
model.fit(X, y)

submission = pd.read_csv(submission_file)
submission['y'] = model.predict_proba(X_test)
submission.to_csv('baseline_delete_nan_tree.csv', index=False)

## Обработка NaN, inf, удаление фич с нулевой дисперсией

In [7]:
''' Проверим качество простого дерева на новых фичах '''

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

tree = DecisionTreeClassifier(random_state=17)

def pre_proc(X):
    return drop_zero_var(fill_inf(replace_na(drop_all_na(X))))

pipe_tree = Pipeline([('pre_proc', FunctionTransformer(pre_proc, validate=False)),
                      ('tree', tree)])
pipe_tree.fit(X_train, y_train)

scores = cross_val_score(pipe_tree, X_train, y_train, cv=skf, scoring='roc_auc')

print('Train ROC AUC: %.4f' % roc_auc_score(y_train, pipe_tree.predict_proba(X_train)[:,1]))
print('Cross-validation ROC AUC: mean %.4f, std %.4f' % (scores.mean(), scores.std()))
print('Holdout ROC AUC: %.4f' % roc_auc_score(y_hold, pipe_tree.predict_proba(X_hold)[:,1]))

Train ROC AUC: 1.0000
Cross-validation ROC AUC: mean 0.6975, std 0.0378
Holdout ROC AUC: 0.7007


In [8]:
''' Проверим качество простого дерева на новых фичах без генерации фич-флагов '''

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

tree = DecisionTreeClassifier(random_state=17)

def pre_proc(X):
    return drop_zero_var(fill_inf(replace_na(drop_all_na(X), 
                                             add_boolean=False), 
                                  add_boolean=False))

pipe_tree = Pipeline([('pre_proc', FunctionTransformer(pre_proc, validate=False)),
                      ('tree', tree)])
pipe_tree.fit(X_train, y_train)

scores = cross_val_score(pipe_tree, X_train, y_train, cv=skf, scoring='roc_auc')

print('Train ROC AUC: %.4f' % roc_auc_score(y_train, pipe_tree.predict_proba(X_train)[:,1]))
print('Cross-validation ROC AUC: mean %.4f, std %.4f' % (scores.mean(), scores.std()))
print('Holdout ROC AUC: %.4f' % roc_auc_score(y_hold, pipe_tree.predict_proba(X_hold)[:,1]))

Train ROC AUC: 1.0000
Cross-validation ROC AUC: mean 0.7060, std 0.0326
Holdout ROC AUC: 0.7304


## Преобразование категориальных фич

In [9]:
''' Проверим качество простого дерева на новых фичах '''

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

CORR_THRESHOLD = 0.95
CAT_THRESHOLD = 10 # <- and less unique values make feature categorical

def pre_proc(X):
    return fill_inf(replace_na(drop_all_na(X)))

tree = DecisionTreeClassifier(random_state=17)

pipe_tree = Pipeline([('pre_proc', FunctionTransformer(pre_proc, validate=False)),
                      ('drop_zero_var', Drop_zero_var()),
                      ('drop_corr', Drop_corr(threshold=CORR_THRESHOLD)),
                      #('lootme', LOOTME_df(threshold=CAT_THRESHOLD)),
                      ('tree', tree)])
pipe_tree.fit(X_train, y_train)

scores = cross_val_score(pipe_tree, X_train, y_train, cv=skf, scoring='roc_auc')

print('Train ROC AUC: %.4f' % roc_auc_score(y_train, pipe_tree.predict_proba(X_train)[:,1]))
print('Cross-validation ROC AUC: mean %.4f, std %.4f' % (scores.mean(), scores.std()))
print('Holdout ROC AUC: %.4f' % roc_auc_score(y_hold, pipe_tree.predict_proba(X_hold)[:,1]))

Train ROC AUC: 1.0000
Cross-validation ROC AUC: mean 0.7071, std 0.0300
Holdout ROC AUC: 0.7261


In [10]:
''' Добавим Leave-one-out encoding '''

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

CORR_THRESHOLD = 0.95
CAT_THRESHOLD = 10 # <- and less unique values make feature categorical

def pre_proc(X):
    return fill_inf(replace_na(drop_all_na(X)))

tree = DecisionTreeClassifier(random_state=17)

pipe_tree = Pipeline([('pre_proc', FunctionTransformer(pre_proc, validate=False)),
                      ('drop_zero_var', Drop_zero_var()),
                      ('drop_corr', Drop_corr(threshold=CORR_THRESHOLD)),
                      ('lootme', LOOTME_df(threshold=CAT_THRESHOLD)),
                      ('tree', tree)])
pipe_tree.fit(X_train, y_train)

scores = cross_val_score(pipe_tree, X_train, y_train, cv=skf, scoring='roc_auc')

print('Train ROC AUC: %.4f' % roc_auc_score(y_train, pipe_tree.predict_proba(X_train)[:,1]))
print('Cross-validation ROC AUC: mean %.4f, std %.4f' % (scores.mean(), scores.std()))
print('Holdout ROC AUC: %.4f' % roc_auc_score(y_hold, pipe_tree.predict_proba(X_hold)[:,1]))

Train ROC AUC: 1.0000
Cross-validation ROC AUC: mean 0.6965, std 0.0317
Holdout ROC AUC: 0.7083


In [11]:
''' Попробуем хорошее дерево и LOO-TME энкодинг '''

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

CORR_THRESHOLD = 0.94
CAT_THRESHOLD = 5 # <- and less unique values make feature categorical

def pre_proc(X):
    return fill_inf(replace_na(drop_all_na(X)))

tree = DecisionTreeClassifier(max_depth=4,
                              max_features=0.9,
                              min_samples_leaf=15,
                              random_state=17)

pipe_tree = Pipeline([('pre_proc', FunctionTransformer(pre_proc, validate=False)),
                      ('drop_zero_var', Drop_zero_var()),
                      ('drop_corr', Drop_corr(threshold=CORR_THRESHOLD)),
                      ('lootme', LOOTME_df(threshold=CAT_THRESHOLD)),
                      ('tree', tree)])
pipe_tree.fit(X_train, y_train)

scores = cross_val_score(pipe_tree, X_train, y_train, cv=skf, scoring='roc_auc')

print('Train ROC AUC: %.4f' % roc_auc_score(y_train, pipe_tree.predict_proba(X_train)[:,1]))
print('Cross-validation ROC AUC: mean %.4f, std %.4f' % (scores.mean(), scores.std()))
print('Holdout ROC AUC: %.4f' % roc_auc_score(y_hold, pipe_tree.predict_proba(X_hold)[:,1]))

Train ROC AUC: 0.8817
Cross-validation ROC AUC: mean 0.7789, std 0.0431
Holdout ROC AUC: 0.8015


### Drop correlated features threshold - 0.94

CORR_THRESHOLD = [0.95, 0.97, 0.93, 94]

CV_ROC_AUC = [0.7818, 0.7781, 0.7888, 0.7878]

HOLD_ROC_AUC = [0.7922, 0.7875, 0.7607, 0.7981]

### LOOTME category threshold - 5

CAT_THRESHOLD = [10, 0, 5, 15, 20, 8, 3]

CV_ROC_AUC = [0.7878, 0.7727, 0.7789, 0.7879, 0.7878, 0.7890, 0.7727]

HOLD_ROC_AUC = [0.7981, 0.7734, 0.8015, 0.7981, 0.7981, 0.7981, 0.7734]


Неплохие фичи.

## Трансформация фич - лучшая модель

In [12]:
# Лучшая модель этого класса - 0.85277 на тесте

''' Попробуем трансформацию фич на хорошем дереве '''

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

CORR_THRESHOLD = 0.94
CAT_THRESHOLD = 5 # <- and less unique values make feature categorical

def pre_proc(X):
    return fill_inf(replace_na(drop_all_na(X)))

tree = DecisionTreeClassifier(max_depth=4,
                              max_features=0.9,
                              min_samples_leaf=15,
                              random_state=17)

pipe_tree = Pipeline([('pre_proc1', FunctionTransformer(pre_proc, validate=False)),
                      ('drop_zero_var', Drop_zero_var()),
                      ('drop_corr', Drop_corr(threshold=CORR_THRESHOLD)),
                      ('lootme', LOOTME_df(threshold=CAT_THRESHOLD)),
                      ('transform', Find_Trans(threshold=CAT_THRESHOLD)),
                      ('tree', tree)])
pipe_tree.fit(X_train, y_train)

scores = cross_val_score(pipe_tree, X_train, y_train, cv=skf, scoring='roc_auc')

print('Train ROC AUC: %.4f' % roc_auc_score(y_train, pipe_tree.predict_proba(X_train)[:,1]))
print('Cross-validation ROC AUC: mean %.4f, std %.4f' % (scores.mean(), scores.std()))
print('Holdout ROC AUC: %.4f' % roc_auc_score(y_hold, pipe_tree.predict_proba(X_hold)[:,1]))

Train ROC AUC: 0.8817
Cross-validation ROC AUC: mean 0.7787, std 0.0428
Holdout ROC AUC: 0.8015


In [13]:
pipe_tree.fit(X, y)

submission = pd.read_csv(submission_file)
submission['y'] = pipe_tree.predict_proba(X_test)[:,1]
submission.to_csv('submission_tree_transform.csv', index=False)

In [14]:
''' Добавляем стандартизацию фич (всех) '''

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

CORR_THRESHOLD = 0.94
CAT_THRESHOLD = 5 # <- and less unique values make feature categorical

def pre_proc(X):
    return fill_inf(replace_na(drop_all_na(X)))

tree = DecisionTreeClassifier(max_depth=4,
                              max_features=0.9,
                              min_samples_leaf=15,
                              random_state=17)

pipe_tree = Pipeline([('pre_proc1', FunctionTransformer(pre_proc, validate=False)),
                      ('drop_zero_var', Drop_zero_var()),
                      ('drop_corr', Drop_corr(threshold=CORR_THRESHOLD)),
                      ('lootme', LOOTME_df(threshold=CAT_THRESHOLD)),
                      ('transform', Find_Trans(threshold=CAT_THRESHOLD)),
                      ('sc', StandardScaler()),
                      ('tree', tree)])
pipe_tree.fit(X_train, y_train)

scores = cross_val_score(pipe_tree, X_train, y_train, cv=skf, scoring='roc_auc')

print('Train ROC AUC: %.4f' % roc_auc_score(y_train, pipe_tree.predict_proba(X_train)[:,1]))
print('Cross-validation ROC AUC: mean %.4f, std %.4f' % (scores.mean(), scores.std()))
print('Holdout ROC AUC: %.4f' % roc_auc_score(y_hold, pipe_tree.predict_proba(X_hold)[:,1]))

Train ROC AUC: 0.8817
Cross-validation ROC AUC: mean 0.7787, std 0.0428
Holdout ROC AUC: 0.8015


## Удаление выбросов

In [15]:
''' Our best tree without outliers '''

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

CORR_THRESHOLD = 0.94
CAT_THRESHOLD = 5 # <- and less unique values make feature categorical
OUT_FRACTION = 0.05

def pre_proc(X):
    return fill_inf(replace_na(drop_all_na(X)))

tree = DecisionTreeClassifier(max_depth=4,
                              max_features=0.9,
                              min_samples_leaf=15,
                              random_state=17)

pipe_pre = Pipeline([('pre_proc1', FunctionTransformer(pre_proc, validate=False)),
                      ('drop_zero_var', Drop_zero_var()),
                      ('drop_corr', Drop_corr(threshold=CORR_THRESHOLD)),
                      ('lootme', LOOTME_df(threshold=CAT_THRESHOLD)),
                      ('transform', Find_Trans(threshold=CAT_THRESHOLD))])

X_train_pre = pipe_pre.fit_transform(X_train, y_train)
X_train_pre, y_train_pre = DetectOut_df(out_fraction=OUT_FRACTION).fit_transform(X_train_pre,
                                                                                 y_train)
X_hold_pre = pipe_pre.transform(X_hold)


tree.fit(X_train_pre, y_train_pre)

scores = cross_val_score(tree, X_train_pre, y_train_pre, cv=skf, scoring='roc_auc')

print('Train ROC AUC: %.4f' % roc_auc_score(y_train_pre, tree.predict_proba(X_train_pre)[:,1]))
print('Cross-validation ROC AUC: mean %.4f, std %.4f' % (scores.mean(), scores.std()))
print('Holdout ROC AUC: %.4f' % roc_auc_score(y_hold, tree.predict_proba(X_hold_pre)[:,1]))

Train ROC AUC: 0.8764
Cross-validation ROC AUC: mean 0.7703, std 0.0438
Holdout ROC AUC: 0.7397


Лучшая простая модель этого класса выдала 0.85777 на тесте, 0.7787 на кросс-валидации и 0.8015 на отложенной выборке. Не вошла в финальный блендинг.