In [3]:
import pandas as pd
import numpy as np
import gc

# use the creditcard default dataset
df = pd.read_excel(r'https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls', header=1)

In [8]:
X = df.drop(['ID', 'default payment next month'], axis=1)
y = df['default payment next month']
y.mean()

0.2212

## Baseline  
- plain random forest with equal weight for each decision tree in the forest

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [14]:
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=1024)

rf = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=1024)
baseline_result = cross_validate(rf, X, y, scoring='roc_auc', cv=cv, return_train_score=True)
pd.DataFrame(baseline_result).mean()

fit_time       1.940685
score_time     0.064801
test_score     0.775301
train_score    0.792621
dtype: float64

## Experiment 1  
- The assumption is that each tree is better at predicting samples in a small subspace, so during inference time the output of each tree is weighted based on whether the instance falls into the subspace that it's good at.

In [83]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_auc_score
from tqdm import trange

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, stratify=y, random_state=1024)

In [158]:
class SpecializedForestClassifier(RandomForestClassifier):
    
    def __init__(self,
                 n_neighbours=200,
                 metric='auc',
                 weight_fn='square',
                 n_estimators='warn',
                 criterion="gini",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features="auto",
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.,
                 min_impurity_split=None,
                 bootstrap=True,
                 oob_score=False,
                 n_jobs=None,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None):
        super().__init__(
                n_estimators=n_estimators,
                 criterion=criterion,
                 max_depth=max_depth,
                 min_samples_split=min_samples_split,
                 min_samples_leaf=min_samples_leaf,
                 min_weight_fraction_leaf=min_weight_fraction_leaf,
                 max_features=max_features,
                 max_leaf_nodes=max_leaf_nodes,
                 min_impurity_decrease=min_impurity_decrease,
                 min_impurity_split=min_impurity_split,
                 bootstrap=bootstrap,
                 oob_score=oob_score,
                 n_jobs=n_jobs,
                 random_state=random_state,
                 verbose=verbose,
                 warm_start=warm_start,
                 class_weight=class_weight
            )
        self.n_neighbours = n_neighbours
        self.metric = metric
        self.weight_fn = weight_fn
        
    def fit(self, X, y, sample_weight=None):
        super().fit(X, y, sample_weight)
        # convert to array
        self.X_train = X.values if hasattr(X, 'values') else X
        self.y_train = y.values if hasattr(y, 'values') else y
        self.nbrs = NearestNeighbors(n_neighbors=self.n_neighbours, metric='l2')
        self.nbrs.fit(X, y)
        return self
            
    def predict_proba(self, X):
        
        if self.metric in ('auc', 'roc_auc'):
            metric_fn = roc_auc_score
        elif isinstance(self.metric, str):
            raise
        else:
            metric_fn = self.metric
        
        if self.weight_fn == 'square':
            weight_fn = np.square
        elif self.weight_fn == 'sqrt':
            weight_fn = np.sqrt
        elif self.weight_fn in ('exp', 'exponential'):
            weight_fn = np.exp
        elif isinstance(self.weight_fn, str):
            raise ValueError('{} is not supported.'.format(self.weight_fn))
        else:
            weight_fn = np.vectorize(self.weight_fn)

        # get the prediction for each base classifier
        # prediction.shape = [n_sample, n_classifier]
        prediction = np.empty((len(X), len(self.estimators_)))
        for i, est in enumerate(self.estimators_):
            prediction[:, i] = est.predict_proba(X)[:, 1]
        
        # calculate the benchmark indices for X, shape = [n_sample, n_neighbours]
        dist, benchmark_indices = self.nbrs.kneighbors(X)
        
        metrics = np.empty((len(X), len(self.estimators_)))
        for sample_idx in trange(len(X)):
            X_benchmark = self.X_train[benchmark_indices[sample_idx]]
            y_benchmark = self.y_train[benchmark_indices[sample_idx]]
            for est_idx, est in enumerate(self.estimators_):
                pred = est.predict_proba(X_benchmark)[:, 1]            
                metrics[sample_idx, est_idx] = metric_fn(y_benchmark, pred)
        
        # the metric rank for each sample in ascending order
        metric_rank = metrics.argsort()
        metric_rank = metric_rank.astype('float') + 1e-10
        
        # the weight for each prediction based on the rank of the metrics
        pred_weight = weight_fn(metric_rank)
        pred_weight = pred_weight / pred_weight.sum(axis=1, keepdims=True)
        prediction = prediction * pred_weight
        prediction = prediction.mean(axis=1)
        return np.array([1-prediction, prediction]).T

In [None]:
S = SpecializedForestClassifier(n_neighbours=500, n_estimators=100, max_depth=5)
exp1_res = cross_validate(S, X, y, scoring='roc_auc', cv=cv, return_train_score=True)
pd.DataFrame(exp1_res).mean()

100%|██████████| 9000/9000 [13:44<00:00, 10.91it/s]
 12%|█▏        | 2591/21000 [1:05:58<29:34, 10.37it/s]      

In [155]:
roc_auc_score(y_test, pred[:, 1])

0.759849268920873

In [23]:
n_neighbours = 100

nbrs = NearestNeighbors(n_neighbors=n_neighbours,
                       metric='l2')

nbrs.fit(X_train, y_train)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='l2',
                 metric_params=None, n_jobs=None, n_neighbors=100, p=2,
                 radius=1.0)

In [33]:
dist, branchmark_indices = nbrs.kneighbors(X_test[:10])

# get the benchmark dataset to evaluate the performance for each base classifier


In [46]:
X_train.values[indices[0]]

array([[600000,      2,      2, ...,   2186,   2310,   7511],
       [620000,      2,      1, ...,   2856,   4197,    920],
       [600000,      1,      1, ...,    601,      0,    492],
       ...,
       [500000,      1,      1, ...,   2010,      0,   4642],
       [500000,      2,      2, ...,  10000,  10000,  25304],
       [500000,      1,      1, ...,  24010,  15000,  20000]])

In [39]:
X_train[indices[0].tolist()]

KeyError: '[ 2466 18299 20636 17493   999 11182 16268  4650 19504  4708   505  8602\n 19616  8887 14879 19168  6621  6576  3612  7353  6131  6898  9427  8732\n  2451  1704 10540  8110 12487 12826 13969  5399 17846  9108 17445  2255\n 11788  4443  2694 11597 15066  2314  9979  1944 14902 20374 19665 20506\n  7629  8135  3822 16404 18472 14703 10987 18607 13805 17578 14696 11606\n  3795  2984 15418 11331  8094  2499 15891  4048 16068  7136 13706  2809\n 10812 17730  2478 14898 18290 16686  2838 20630  2841  2632  4017  9727\n  8874 19244 12847  1231  9838  3251  7656 15165  9562 17913 17495  7742\n 16927  8221  1105  1605] not in index'