In [None]:
!pip install requests black nb_black hpbandster > /dev/null
%load_ext nb_black

In [None]:
# Standard-library and third-party imports used throughout the notebook.
import os
from pathlib import Path

from requests import get
import pandas as pd
import numpy as np

# Fix the state of NumPy's legacy global random generator.
np.random.seed(0)

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import logging

# Root logger configured at WARN level.
logging.basicConfig(level=logging.WARN)

from sklearn.model_selection import StratifiedShuffleSplit
# Project helpers: setup_seed() is called in the next cell; SEED is used as the
# random_state of the train/validation split further down.
from xplainet.random_utils import setup_seed, SEED

In [None]:
# Project-level seeding helper imported from xplainet.random_utils.
# NOTE(review): presumably seeds the relevant RNGs with SEED — confirm in xplainet.
setup_seed()

In [None]:
from xplainet.input_utils import preproc_dataset
from xplainet.model import build_model
from xplainet.tuner.bohb_tuner import BOHBTuner
from xplainet.tuner.xplainet_worker import XplaiNetWorker

# Utilities

In [None]:
def download(url, out, force=False, verify=True):
    """
    Download ``url`` to the local path ``out`` unless the file is already there.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    out : pathlib.Path
        Destination file; missing parent directories are created.
    force : bool
        If True, an existing file at ``out`` is removed and downloaded again.
    verify : bool
        Forwarded to ``requests.get`` as its ``verify`` argument.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx / 5xx).
    """
    out.parent.mkdir(parents=True, exist_ok=True)
    if force and out.exists():
        print(f"Removing file at {str(out)}")
        out.unlink()

    if out.exists():
        print("File already exists.")
        return
    print(f"Downloading {url} at {str(out)} ...")
    # Write to a sibling temporary file and rename it only once the transfer
    # succeeded: a failed request must not leave an empty or partial `out`
    # behind, because the next call would then report "File already exists.".
    partial = out.with_name(out.name + ".part")
    try:
        # stream=True lets iter_content pull the body chunk by chunk
        with get(url, verify=verify, stream=True) as response:
            # fail loudly instead of saving an error page as the dataset
            response.raise_for_status()
            # open in binary mode
            with partial.open(mode="wb") as file:
                for chunk in response.iter_content(100000):
                    file.write(chunk)
        partial.replace(out)
    finally:
        # only true when something above failed after the file was created
        if partial.exists():
            partial.unlink()

In [None]:
UNKNOWN_VALUE = ["Unkn0wnV@lue"]


class SafeLabelEncoder(LabelEncoder):
    """
    Safe label encoder, encoding every unknown value as Unkn0wnV@lue.

    All values are compared as strings, both when fitting and when
    transforming.
    """

    def fit(self, y):
        """
        Fit the label encoder on the string form of the values, plus the
        placeholder used for unknown values.

        Parameters
        ----------
        y : array-like
            the values to fit

        Returns
        -------
        SafeLabelEncoder
            itself, fitted
        """
        return super().fit(np.concatenate((np.asarray(y).astype("str"), UNKNOWN_VALUE)))

    def fit_transform(self, y):
        """
        Fit the encoder, then transform the input data and return it.

        Parameters
        ----------
        y : array-like
            the values to fit and encode

        Returns
        -------
        numpy array
            the encoded data
        """
        # Use this class's transform so y gets the same string cast as in fit;
        # the parent transform would compare the raw values against the string
        # classes and reject non-string input as unseen labels.
        return self.fit(y).transform(y)

    def transform(self, y):
        """
        Transform the input data and return it; values that were not seen
        during fit get the code of the unknown placeholder.

        Parameters
        ----------
        y : array-like
            the values to encode

        Returns
        -------
        numpy array
            the encoded data
        """
        as_str = np.asarray(y).astype("str")
        return super().transform(
            np.where(np.isin(as_str, self.classes_), as_str, UNKNOWN_VALUE)
        )



# Download census-income dataset

In [None]:
dataset_name = "census-income"

# Remote locations of the train and test files.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
url_test = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

# Local copies are stored in ./data/ below the current working directory.
out = Path.cwd() / "data" / f"{dataset_name}.csv"
out_test = Path.cwd() / "data" / f"{dataset_name}_test.csv"

# Files that are already on disk are not fetched again.
download(url, out, force=False)
download(url_test, out_test, force=False)

# Load data and split

In [None]:
# Column names applied to both CSV files when they are read; the last entry is
# the label column.
cols = (
    "age workclass fnlwgt education education-num marital-status occupation "
    "relationship race sex capital-gain capital-loss hours-per-week "
    "native-country target"
).split()

In [None]:
target = "target"

# Column names are supplied explicitly, so no line of either file is treated
# as a header.
# NOTE(review): skiprows=2 drops the first two physical lines of the test
# file — confirm that this matches the length of the file's preamble.
train = pd.read_csv(out, header=None, names=cols)
test = pd.read_csv(out_test, header=None, names=cols, skiprows=2)

# Trim whitespace around the labels; the test labels additionally carry a "."
# that is stripped so both files use the same label strings.
train[target] = train[target].str.strip()
test[target] = test[target].str.strip().str.strip(".")

In [None]:
# Feature columns: every column except the label and the "Set" split marker.
# Built in the DataFrame's own column order; a set difference would yield an
# ordering that is not guaranteed to be the same from one session to the next.
used_columns = [col for col in train.columns if col not in (target, "Set")]
used_columns

In [None]:
# Add a "Set" column marking every row of the training file as "train" or
# "valid". Skipped when the column already exists, so re-running the cell
# keeps the existing split.
if "Set" not in train.columns:
    print("Building tailored column")
    # One stratified 90/10 split on the label; split() yields positional
    # row indices.
    train_index, valid_index = next(
        StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=SEED).split(
            range(train[target].shape[0]), train[target].values
        )
    )
    train["Set"] = "train"
    # Single positional assignment on the frame itself. Chained indexing
    # (train["Set"][...] = ...) may write to a temporary copy and leave
    # `train` unchanged.
    train.iloc[valid_index, train.columns.get_loc("Set")] = "valid"

In [None]:
# Row labels of each split, selected through the "Set" marker column.
# The test data lives in its own DataFrame, so no test index is needed here.
train_indices = train.loc[train["Set"] == "train"].index
valid_indices = train.loc[train["Set"] == "valid"].index

# Simple preprocessing

In [None]:
# Preprocess the rows of the training split with the project helper.
# NOTE(review): presumably ["Set"] lists columns to leave out of the features
# and `params` is the fitted preprocessing state reused for the other splits —
# confirm in xplainet.input_utils.
input_train, params = preproc_dataset(train.loc[train_indices], target, ["Set"])
params

In [None]:
# Same helper for the validation split and the test file, this time passing the
# `params` obtained from the training split; the second return value is dropped.
# NOTE(review): `test` has no "Set" column — presumably preproc_dataset
# tolerates a missing column in that list; confirm.
input_valid, _ = preproc_dataset(train.loc[valid_indices], target, ["Set"], params)
input_test, _ = preproc_dataset(test, target, ["Set"], params)

In [None]:
# Encoder that maps the string class labels to integer codes.
target_encoder = LabelEncoder()

In [None]:
# Encode the string labels as integers. The encoder is fitted once, on the
# labels of the training file, and that same mapping is applied to the test
# labels: re-fitting on the test labels could assign different codes whenever
# the two files do not contain exactly the same classes.
# The encoded labels are kept in their own array instead of overwriting
# train[target], so running the cell again gives the same result.
y_train_full = target_encoder.fit_transform(train[target].values.reshape(-1))
y_train = y_train_full[train_indices]
y_valid = y_train_full[valid_indices]
y_test = target_encoder.transform(test[target].values)

In [None]:
# Inspect `params` once more before starting the hyper-parameter search.
params

# BOHB

## XplaiNet Worker

In [None]:
# Tuner built around the XplaiNetWorker class (the class itself is passed, not
# an instance).
# NOTE(review): presumably runs a BOHB hyper-parameter search — confirm in
# xplainet.tuner.bohb_tuner.
tuner = BOHBTuner(XplaiNetWorker)

In [None]:
# Search settings passed to tuner.fit in the next cell.
# NOTE(review): the unit of the budgets is defined by the worker — confirm.
min_budget, max_budget = 10, 50
n_iter = 10

In [None]:
%%time
result = tuner.fit(
    input_train, y_train, input_valid, y_valid, input_test, y_test, params=params, n_iter=n_iter, min_budget=min_budget, max_budget=max_budget
)
result

In [None]:
%matplotlib inline
# Show the tuner's own summary of the search.
# NOTE(review): presumably draws matplotlib figures, hence the inline backend — confirm.
tuner.describe_results()

In [None]:
# Entry stored under the "best_params" key of the search result.
result["best_params"]

In [None]:
from xgboost import XGBClassifier

# NOTE(review): X_train / X_valid are not defined in any cell visible in this
# notebook (the preprocessed inputs are named input_train / input_valid), so on
# a fresh "Restart & Run All" this cell would raise NameError unless they are
# defined elsewhere. Also, result["best_params"] comes from a tuner built on
# XplaiNetWorker — confirm that these are valid XGBClassifier keyword arguments.
# The classifier is capped at 10000 estimators and fitted with the validation
# split as eval_set and early_stopping_rounds=40.
clf = XGBClassifier(**result["best_params"], n_estimators=10000)
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=40)

In [None]:
# ROC AUC on the training split, scored with the second column of predict_proba.
# NOTE(review): X_train is not defined in the visible cells — confirm its origin.
roc_auc_score(y_score=clf.predict_proba(X_train)[:, 1], y_true=y_train)

In [None]:
# ROC AUC on the validation split, scored with the second column of predict_proba.
# NOTE(review): X_valid is not defined in the visible cells — confirm its origin.
roc_auc_score(y_score=clf.predict_proba(X_valid)[:, 1], y_true=y_valid)

In [None]:
# ROC AUC on the test file, scored with the second column of predict_proba.
# NOTE(review): X_test is not defined in the visible cells — confirm its origin.
roc_auc_score(y_score=clf.predict_proba(X_test)[:, 1], y_true=y_test)