In [6]:
import pickle
import random

import matplotlib.pyplot as plt
import numpy
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

### Loading Data

In [5]:
def _load_pickle(path):
    # Deserialize a single pickled object; the file is closed as soon as it is read.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# NOTE(review): unpickling can run arbitrary code, so these paths should only
# ever point at trusted files.
x_train = _load_pickle("../Data/whale_data/X_train.pkl")
x_test = _load_pickle("../Data/whale_data/X_test.pkl")
y_train = _load_pickle("../Data/whale_data/y_train.pkl")
y_test = _load_pickle("../Data/whale_data/y_test.pkl")

### Setting Parameters for XGBoost

In [7]:
# Booster configuration handed to xgboost for every bootstrap model.
param = dict(
    max_depth=3,                   # maximum depth of each tree
    eta=0.3,                       # step size shrinkage
    # 0 -> messages are not silenced.
    # NOTE(review): newer XGBoost releases use `verbosity` instead of `silent`;
    # confirm against the installed version.
    silent=0,
    objective="binary:logistic",   # binary classification
    nthread=7,                     # number of threads
    eval_metric="logloss",         # evaluation criterion
)

In [9]:
# Materialize the (name, value) pairs as a real list. In Python 3 `param.items()`
# is a live dict view: it has no copy/append/item assignment and it would
# silently follow later edits to `param`.
plst = list(param.items())

### Computing the score ranges

In [15]:
def calc_stats(margin_scores):
    """
    Turn margin scores into a (mean - 3*std, mean + 3*std) band per row.

    input: 2-D array of xgboost margin scores -> obtained from bootstrap_pred
           (statistics are taken along axis 1, i.e. across each row)
    ops:
        min_scr = mean - (3 * std)
        max_scr = mean + (3 * std)
        both rounded to 2 decimals
    returns: two numpy arrays -> min_scr, max_scr
    """
    centre = np.mean(margin_scores, axis=1)
    half_width = 3 * np.std(margin_scores, axis=1)

    lower = np.round(centre - half_width, 2)
    upper = np.round(centre + half_width, 2)
    return lower, upper

In [16]:
# test_case
margin_scores = np.array([[-0.22, -0.19, -0.17, -0.13], [-0.1, -0.05, 0.02, 0.10], [0.03, 0.11, 0.12, 0.15]])
min_score, max_score = calc_stats(margin_scores)

assert isinstance(min_score, np.ndarray), 'Incorrect Return type'
assert isinstance(max_score, np.ndarray), 'Incorrect Return type'

# The expected values are rounded to 2 decimals, so a tiny absolute tolerance
# still rejects any wrong answer while not depending on exact float bit
# patterns; assert_allclose also fails on a shape mismatch.
np.testing.assert_allclose(
    min_score, np.array([-0.28, -0.23, -0.03]), rtol=0, atol=1e-8, err_msg="Incorrect return value"
)

np.testing.assert_allclose(
    max_score, np.array([-0.08, 0.22, 0.24]), rtol=0, atol=1e-8, err_msg="Incorrect return value"
)

### Calculating Predictions

In [None]:
def f(x, y):
    """
    Merge two values into one label: when x equals y return that shared value
    (but never less than -1), otherwise return 0.
    """
    return max(x, -1) if x == y else 0

def predict(min_scr, max_scr):
    """
    Label each example from the signs of its score-range bounds.

    input: min_scr and max_scr -> array-likes of matching (broadcast-compatible) shape
    returns: numpy array of labels with the same shape as the inputs
        values of min_scr & max_scr less than 0 -> predict -1 (Cuvier)
        value of min_scr less than 0 & value of max_scr greater than 0 -> predict 0 (unsure)
        values of min_scr & max_scr greater than 0 -> predict 1 (Gervais)
        any other combination (a bound equal to 0, NaN, ...) -> predict 0
    """
    lower_sign = np.sign(min_scr)
    upper_sign = np.sign(max_scr)
    # Same sign on both bounds -> the whole range lies on one side of zero and
    # that sign is the label; otherwise the range straddles zero -> 0 (unsure).
    return np.where(lower_sign == upper_sign, lower_sign, 0)

In [17]:
# test_case
max_s = np.array([-0.49, -0.39, -0.33, -0.25, -0.2, -0.11, -0.04, 0.1, 0.3, 0.51])
min_s = np.array([-0.78, -0.68, -0.6, -0.53, -0.47, -0.42, -0.32, -0.21, -0.07, 0.22])
pred = predict(min_s, max_s)
true_pred = np.array([-1, -1, -1, -1, -1, -1, -1, 0, 0, 1])

assert isinstance(pred, np.ndarray), 'Incorrect return type'
# array_equal requires identical shapes, so a result that only matches after
# broadcasting (e.g. shape (1, 10)) is rejected as well.
assert np.array_equal(pred, true_pred), 'Incorrect return value'

### Calculating Scores

In [31]:
def bootstrap_pred(x_train, x_test, y_train, y_test, n_bootstrap, min_r, max_r,
                   bootstrap_size, num_round=100, plst=plst, seed=None):
    """
    Estimate a score range for every test example from bootstrapped XGBoost models.

    Each of the n_bootstrap rounds draws bootstrap_size training rows with
    replacement, trains a booster on them and computes raw margin scores for
    x_test, divided by their own spread (max - min). Per test example the
    scores are then sorted across rounds, only positions
    [int(n_bootstrap * min_r), int(n_bootstrap * max_r)) are kept (dropping the
    most extreme rounds on both ends), and calc_stats converts what is left to
    (mean - 3*std, mean + 3*std).

    input:
        x_train, y_train -> training features / labels, indexable by an integer array
        x_test, y_test   -> features / labels of the examples to score
        n_bootstrap      -> number of bootstrap rounds
        min_r, max_r     -> lower / upper fraction of the sorted rounds to keep
        bootstrap_size   -> number of rows drawn per round
        num_round        -> boosting rounds per model
        plst             -> xgboost parameters passed to xgb.train
        seed             -> optional seed for the row sampling; None keeps using
                            NumPy's global random state
    returns: two numpy arrays -> min_scr, max_scr (one value per test example)

    note: if int(n_bootstrap * min_r) >= int(n_bootstrap * max_r) the kept slice
          is empty and the returned statistics are NaN.
    """
    l_bound = int(n_bootstrap * min_r)
    u_bound = int(n_bootstrap * max_r)

    # A private RandomState makes the sampling repeatable; without a seed the
    # draws come from the global generator exactly as before.
    sampler = np.random if seed is None else np.random.RandomState(seed)

    # The test matrix is identical in every round, so build it only once.
    d_test = xgb.DMatrix(x_test, label=y_test)

    margin_scores = []
    for _ in range(n_bootstrap):
        index = sampler.randint(len(x_train), size=bootstrap_size)
        d_train = xgb.DMatrix(x_train[index], label=y_train[index])

        evallist = [(d_train, "train"), (d_test, "eval")]

        bst = xgb.train(plst, d_train, num_round, evallist, verbose_eval=False)
        # No early stopping is used, so the default prediction already uses
        # every tree; the deprecated ntree_limit / best_ntree_limit pair is
        # unnecessary and unavailable in recent XGBoost releases.
        y_pred = bst.predict(d_test, output_margin=True)

        # Scale by the spread of this round's scores; skip the division when
        # all scores coincide to avoid producing inf / NaN.
        score_range = np.max(y_pred) - np.min(y_pred)
        if score_range > 0:
            y_pred = y_pred / score_range
        margin_scores.append(np.array(y_pred))

    # Rows -> test examples, columns -> bootstrap rounds.
    margin_scores = np.array(margin_scores).T
    # Sort each example's scores first so the slice removes the lowest and
    # highest rounds rather than an arbitrary subset of them.
    scores_filter = np.sort(margin_scores, axis=1)[:, l_bound:u_bound]

    return calc_stats(scores_filter)

In [32]:
def process(x_train, x_test, y_train, y_test, n_bootstrap=100, min_r=0.1, max_r=0.9):
    """
    Run the bootstrap scoring and turn the resulting ranges into labels.

    input:
        x_train, y_train -> training features / labels
        x_test, y_test   -> features / labels of the examples to score
        n_bootstrap      -> number of bootstrap rounds
        min_r, max_r     -> trim fractions forwarded to bootstrap_pred
                            (defaults 0.1 and 0.9, the values previously hard-coded)
    returns: min_scr, max_scr, pred
        min_scr, max_scr -> score range per test example from bootstrap_pred
        pred             -> label per test example from predict(min_scr, max_scr)

    Every bootstrap sample has as many rows as x_train.
    """
    min_scr, max_scr = bootstrap_pred(x_train, x_test, y_train, y_test, n_bootstrap=n_bootstrap,
                                      min_r=min_r, max_r=max_r, bootstrap_size=len(x_train))
    pred = predict(min_scr, max_scr)
    return min_scr, max_scr, pred

### Test

In [33]:
# Pick the stored subset of test examples and cast their labels to integers.
sample_indices = np.load("../Data/whale_data/vis_indices.npy")
x_test_sample, y_test_sample = (
    x_test[sample_indices],
    np.array(y_test[sample_indices], dtype=int),
)

In [34]:
# Stored reference arrays that the final range checks compare against.
mid_point, avg_length = (
    np.load("../Data/whale_data/vis_midpt.npy"),
    np.load("../Data/whale_data/vis_avg_length.npy"),
)

In [35]:
# Score the sampled test examples and record the width of each score range.
min_scr, max_scr, pred = process(
    x_train=x_train,
    x_test=x_test_sample,
    y_train=y_train,
    y_test=y_test_sample,
)
length = max_scr - min_scr

In [42]:
# test_case
# Each condition must hold for at least 70% of the sampled examples.
assert np.count_nonzero(min_scr <= mid_point) >= 0.7 * len(sample_indices), \
    "Incorrect range (mean - 3*std) to (mean + 3*std)"

assert np.count_nonzero(max_scr >= mid_point) >= 0.7 * len(sample_indices), \
    "Incorrect range (mean - 3*std) to (mean + 3*std)"

assert np.count_nonzero(length < 2*avg_length) >= 0.7 * len(sample_indices), \
    "Incorrect length of range (mean - 3*std) to (mean + 3*std)"