In [13]:
import sys
sys.path.append('../')
from datasets import TwentyNewsGroups, PathologyReports
from feature_extraction.feature_extraction import BERT, BOW_DimReduction
# from classifiers.bayesian.BayesianLinearClassifier import BayesianLinearClassifier
# from classifiers.bayesian.BayesianNeuralNet import BayesianNeuralNet
from classifiers.bayesian.NeuralNet_BBB import BayesianNeuralNet_BBB
from classifiers.deterministic.LogisticClassifier import LogisticClassifier
from classifiers.deterministic.NeuralNetwork import NeuralNet
from design import ActiveLearningLoop
import numpy as np
# from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
import sklearn.metrics as skm


In [14]:
def dataset_split(dataset, train_perc, test_perc):
    """Split ``dataset`` into stratified train and test index sets.

    For every distinct label in ``dataset.y`` the samples carrying that label
    are randomly permuted (global NumPy RNG); the first
    ``int(train_perc * n_class)`` go to the train set and the following
    ``int(test_perc * n_class)`` go to the test set.  Samples beyond those two
    slices are not assigned to any split, and the unlabelled index array that
    is handed to ``dataset.data_split`` is always empty.

    Parameters
    ----------
    dataset : object
        Must expose a label array ``y`` and a method
        ``data_split(ids_train, ids_unlab, ids_test)``.
    train_perc : float
        Per-class fraction of samples assigned to the train set.
    test_perc : float
        Per-class fraction of samples assigned to the test set.

    Returns
    -------
    The same ``dataset`` object, after ``data_split`` has been called on it.
    """
    # Seed each list with an empty int array so that concatenation also works
    # (and yields an integer dtype) when there are no classes at all.
    train_parts = [np.array([], dtype=int)]
    test_parts = [np.array([], dtype=int)]

    for label in np.unique(dataset.y):
        # row indices of every sample whose label equals `label`
        members = np.where(dataset.y == label)[0]
        order = np.random.permutation(members.size)

        n_test = int(test_perc * members.size)
        n_train = int(train_perc * members.size)
        test_stop = n_train + n_test

        # fancy indexing already returns fresh arrays
        train_parts.append(members[order[:n_train]])
        test_parts.append(members[order[n_train:test_stop]])

    ids_train = np.concatenate(train_parts)
    ids_test = np.concatenate(test_parts)
    ids_unlab = np.array([], dtype=int)

    # mix the classes together inside each split
    np.random.shuffle(ids_train)
    np.random.shuffle(ids_test)

    # register the index sets on the dataset object
    dataset.data_split(ids_train, ids_unlab, ids_test)
    return dataset

In [15]:
def calibration_curve(y_true, y_prob, n_bins=10, verbose=True):
    """Compute a reliability curve and the expected calibration error (ECE).

    ``y_prob`` is partitioned into ``n_bins`` equal-width bins spanning
    ``[0, 1 + 1e-8]``.  The tiny offset on the upper edge makes a probability
    of exactly 1.0 fall into the last regular bin instead of an overflow bin.
    For every non-empty bin the mean of ``y_true`` and the mean of ``y_prob``
    are computed; the ECE is the bin-size-weighted mean absolute difference
    between the two, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Ground-truth targets; flattened before use.  Used as weights when
        averaging per bin, so 0/1 labels give the fraction of positives.
    y_prob : array-like
        Predicted probabilities, same number of elements as ``y_true``;
        flattened before use.
    n_bins : int, default 10
        Number of equal-width bins.
    verbose : bool, default True
        When true, print the per-bin counts, residuals and weights.

    Returns
    -------
    prob_true : ndarray
        Mean of ``y_true`` in each non-empty bin.
    prob_pred : ndarray
        Mean of ``y_prob`` in each non-empty bin.
    ece : float
        Expected calibration error in percent; NaN when there are no samples.
    """
    # Accept lists/tuples as well as arrays of any shape.
    y_true = np.asarray(y_true).ravel()
    y_prob = np.asarray(y_prob).ravel()

    bins = np.linspace(0., 1. + 1e-8, n_bins + 1)
    # digitize returns 1-based bin numbers for in-range values
    binids = np.digitize(y_prob, bins) - 1
    bin_sums = np.bincount(binids, weights=y_prob, minlength=len(bins))
    bin_true = np.bincount(binids, weights=y_true, minlength=len(bins))
    bin_total = np.bincount(binids, minlength=len(bins))

    nonzero = bin_total != 0
    prob_true = bin_true[nonzero] / bin_total[nonzero]
    prob_pred = bin_sums[nonzero] / bin_total[nonzero]

    if not nonzero.any():
        # No samples at all: the curves are empty and the ECE is undefined.
        # (np.average would otherwise raise ZeroDivisionError here.)
        return prob_true, prob_pred, float('nan')

    residual = np.absolute(prob_true - prob_pred)
    weights = bin_total[nonzero] / bin_total.sum()
    if verbose:
        print('Bin total: {}'.format(bin_total))
        print('Residual: {}'.format(residual))
        print('Weights: {}'.format(weights))

    ece = 100 * np.average(residual, weights=weights)
    return prob_true, prob_pred, ece


In [17]:
# Calibration experiment: fit each classifier on the train split, then draw
# its reliability curve on the test split with ECE and accuracy in the legend.

# Seed the global NumPy RNG so that the random split done by dataset_split
# is the same on every Restart & Run All.
SEED = 0
np.random.seed(SEED)

# Feature extractor (alternatives kept for reference):
# dataset = TwentyNewsGroups('BOW_DimReduction')
#fe = BOW_DimReduction(features_dim=100, projection='PCA', remove_stop_words=True)
# fe = BOW_TopicModel(nb_topics=30)
fe = BERT(sentence_len=30)

dataset = TwentyNewsGroups(fe)
# dataset = PathologyReports('GTKum', fe)

dataset.prepare()
dataset = dataset_split(dataset, 0.6, 0.4)

clf = [LogisticClassifier(), NeuralNet(), BayesianNeuralNet_BBB()]

for c in clf:
    c.fit(dataset.train['x'], dataset.train['y'])
    yhat = c.predict(dataset.test['x'])
    # Column 1 of yhat is used as the predicted probability of the positive class.
    fp, mpv, ece = calibration_curve(dataset.test['y'], yhat[:, 1], n_bins=10)
    # Hard labels for the accuracy: pick the highest-scoring column.
    # (Casting the probabilities with astype(int) truncates toward zero, so
    # e.g. 0.97 would be scored as class 0.)
    # NOTE(review): assumes column k of yhat is the score for label k — confirm
    # against the classifiers' predict().
    y_pred = np.argmax(yhat, axis=1)
    acc = skm.accuracy_score(dataset.test['y'], y_pred)
    plt.plot(mpv, fp, label='%s (ECE: %.2f | ACC: %.2f)' % (str(c), ece, acc))

# Diagonal = perfectly calibrated reference.
plt.plot([0, 1], [0, 1], 'r:')
plt.ylabel("Fraction of positives")
plt.xlabel("Mean predicted value")
plt.legend()



                               

Feature already extracted. Loading it ...
Bin total: [173   6   3   2   5  10   6   6   9 205   0]
Residual: [0.00508656 0.03316985 0.08599864 0.1481725  0.05042    0.04240459
 0.01726146 0.09475154 0.0312896  0.00078605]
Weights: [0.40705882 0.01411765 0.00705882 0.00470588 0.01176471 0.02352941
 0.01411765 0.01411765 0.02117647 0.48235294]




TypeError: only size-1 arrays can be converted to Python scalars