In [22]:
import numpy as np

from sklearn import svm
from sklearn.metrics import accuracy_score
from scipy.stats import mannwhitneyu, wilcoxon, binom

def classification_accuracy_weighted(a, b):
    '''
    Computes best accuracy of classifying between scalar variables using a separating line
    Compensates for disbalances in samples - aims to recover separability of equal classes

    :param a: 1D numpy array of floats
    :param b: 1D numpy array of floats
    :return: scalar measure of accuracy in percent
    '''

    aEff = a[~np.isnan(a)]
    bEff = b[~np.isnan(b)]

    Na = len(aEff)
    Nb = len(bEff)
    assert (Na > 0) and (Nb > 0)

    x = np.hstack([aEff, bEff])[:, None]
    y = [0] * Na + [1] * Nb
    w = [Nb] * Na + [Na] * Nb
    clf = svm.SVC()
    clf.fit(x, y, sample_weight=w)
    yHat = clf.predict(x)
    return accuracy_score(y, yHat)

def classification_accuracy_weighted_handmade(a, b, alternative='greater'):
    if alternative == 'greater':
        nX = len(a)
        nY = len(b)
        
        aEff = a[a < np.max(b)]
        acc = [(np.sum(a >= t) / nX + np.sum(b < t) / nY) / 2 for t in aEff]
        return np.max(acc)
    if alternative == 'lesser':
        return classification_accuracy_weighted_handmade(b, a, alternative='greater')
    elif alternative == 'two-sided':
        acc1 = classification_accuracy_weighted_handmade(a, b, alternative='greater')
        acc2 = classification_accuracy_weighted_handmade(b, a, alternative='greater')
        return max(acc1, acc2)

In [17]:
x = np.random.normal(0, 1, 1000)
y = np.random.normal(1, 1, 1000)

In [18]:
%%timeit
classification_accuracy_weighted(x, y)

3.29 s ± 149 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
%%timeit
classification_accuracy_weighted_handmade(y, x)

18.4 ms ± 1.13 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
