In [1]:
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np
import gc

data = fetch_openml(name='satimage')
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# treat 4 as the target class
y = (y=='4.').astype(int)
y.mean()

del data
gc.collect()

0.09720062208398134

0

In [8]:
df = pd.read_excel(r'https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls', header=1)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


## Baseline performance

In [198]:
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [199]:
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=1024)

rf = RandomForestClassifier(n_estimators=150, max_depth=5, random_state=1024)
baseline_result = cross_validate(rf, X, y, scoring='roc_auc', cv=cv, return_train_score=True)
pd.DataFrame(baseline_result).mean()

fit_time       0.755876
score_time     0.046805
test_score     0.943349
train_score    0.963619
dtype: float64

## With weight

In [200]:
rf = RandomJungleClassifier(n_estimators=150, max_depth=5, random_state=1024)
result = cross_validate(rf, X, y, scoring='roc_auc', cv=cv, return_train_score=True)
pd.DataFrame(result).mean()

fit_time       3.296730
score_time     1.272327
test_score     0.943775
train_score    0.964059
dtype: float64

In [189]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import safe_indexing

def exponential_auc_score(auc):
    return np.exp((auc-0.5) * 10)

def absolute_cut_auc_score(auc):
    return 1

class RandomJungleClassifier(RandomForestClassifier):
    
    def fit(self, X, y, sample_weight=None):
        super().fit(X, y, sample_weight)
        self.cip = [get_interval_performance(clf, n_bins=20) for clf in self.estimators_]
        return self
    
    def predict_proba(self, X):
        total_weight = np.zeros(X.shape[0])
        weighted_prediction = np.zeros(X.shape[0])

        for i, tree in enumerate(self.estimators_):
            prediction = tree.predict_proba(X)[:, 1]
            weight = np.array([get_weight_given_prediction(self.cip[i], p, exponential_auc_score) for p in prediction])

            total_weight += weight
            weighted_prediction += prediction * weight

        prediction = np.zeros((X.shape[0], 2))
        prediction[:, 1] = weighted_prediction / total_weight
        prediction[:, 0] = 1 - prediction[:, 1]
        return prediction

In [156]:
from sklearn.utils import safe_indexing

def get_interval_performance(clf, n_bins=10, score='roc_auc'):
    # cutoffs = np.linspace(0, 1, num=n_bins)
    # use qcut
    interval_performance = dict()

    prediction = clf.predict_proba(X)[:, 1]
    
    # add epilson to the last cutoff so 1 falls in the range
    _, cutoffs = pd.qcut(prediction, q=n_bins, retbins=True, duplicates='drop')
    cutoffs[-1] += 1e-10
    
    for i in range(len(cutoffs) - 1):
        idx = np.where((cutoffs[i] <= prediction) & (prediction < cutoffs[i+1]))[0]
        try:
            interval_performance[(cutoffs[i], cutoffs[i+1])] = \
                roc_auc_score(safe_indexing(y, idx), safe_indexing(prediction, idx))
        except:
            continue
    return interval_performance

def get_weight_given_prediction(cip, pred, score_to_weight=None, default=0):
    """ Given the clssifier interval performance and a single prediction score, 
        return the weight for the classifier. Default is used is the prediction
        does not fall in any of the intervals.
    """
    if score_to_weight is None:
        score_to_weight = lambda x: x
        
    for itv, score in cip.items():
        if itv[0] <= pred < itv[1]:
            return score_to_weight(score)
    else:
        return default
    
def exponential_auc_score(auc):
    return np.exp((auc - 0.5) * 3)

def predict(rf, cip, X, score_to_weight=None):
    total_weight = np.zeros(X.shape[0])
    weighted_prediction = np.zeros(X.shape[0])

    for i, tree in enumerate(rf.estimators_):
        prediction = tree.predict_proba(X)[:, 1]
        weight = np.array([get_weight_given_prediction(cip[i], p, score_to_weight) for p in prediction])

        total_weight += weight
        weighted_prediction += prediction * weight

    return weighted_prediction / total_weight