## Results Evaluation

This notebook summarizes and evaluates the results of all experiments.

In [1]:
import warnings

import numpy as np

# First task: Generate a summary table for latex, containing the results for all experiments
experiments = ['MiniPalindrome', 'Sorting', 'Cystic', 'Leukemia']
classifiers = ['linear', 'RBF', 'ST', 'SST', 'PT', 'Rec', 'TES']


def load_folds(exp, name):
    """Load 'results/<exp>_<name>.csv' as a 2-D (folds x classifiers) array.

    ndmin=2 keeps a file with a single fold (one row) two-dimensional, so that
    the fold count is always the first axis.
    """
    return np.loadtxt('results/%s_%s.csv' % (exp.lower(), name), ndmin=2)


# print the header
print('\\begin{tabular}{lccccc}')
print(' & unattacked & \\multicolumn{2}{c}{random} & \\multicolumn{2}{c}{backtracing}\\\\')
print('classifier & accuracy & success rate & dist.\\ ratio & success rate & dist.\\ ratio\\\\')

# iterate over all experiments
for exp in experiments:
    # print header
    print('\\cmidrule(lr){1-1} \\cmidrule(lr){2-2} \\cmidrule(lr){3-4} \\cmidrule(lr){5-6}')
    print('& \\multicolumn{5}{c}{%s} \\\\' % exp)
    print('\\cmidrule(r){1-1} \\cmidrule(lr){2-2} \\cmidrule(lr){3-4} \\cmidrule(lr){5-6}')
    # load data for this experiment; the table is indexed as
    # (classifier, column, fold) with the columns
    # 0: unattacked accuracy,
    # 1: success rate of random attacks (1 - accuracy under attack),
    # 2: distance ratio of random attacks,
    # 3: success rate of backtracing attacks (1 - accuracy under attack),
    # 4: distance ratio of backtracing attacks
    accs = load_folds(exp, 'accs')
    n_folds = accs.shape[0]
    table = np.zeros((len(classifiers), 5, n_folds))
    table[:, 0, :] = accs.T
    table[:, 1, :] = 1. - load_folds(exp, 'accs_random_adversarial').T
    table[:, 2, :] = load_folds(exp, 'd_random_adversarial').T
    table[:, 3, :] = 1. - load_folds(exp, 'accs_adversarial').T
    table[:, 4, :] = load_folds(exp, 'd_adversarial').T
    # aggregate over folds, ignoring NaN entries. Entries which are NaN in
    # all folds yield NaN and are printed as 'n.a.' below, so the
    # RuntimeWarning numpy raises for such all-NaN slices is silenced here.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=RuntimeWarning)
        means = np.nanmean(table, axis=2)
        stds  = np.nanstd(table, axis=2)
    # for each classifier, mark which of the two attacks is better: the one
    # with the higher mean success rate and the one with the lower mean
    # distance ratio. Comparisons involving NaN are False, such that ties and
    # NaN fall back to random (success rate) and backtracing (distance ratio),
    # respectively.
    # NOTE: the builtin bool is used because the np.bool alias was removed
    # from numpy.
    best = np.zeros((len(classifiers), 5), dtype=bool)
    backtracing_succeeds_more = means[:, 1] < means[:, 3]
    best[:, 3] = backtracing_succeeds_more
    best[:, 1] = np.logical_not(backtracing_succeeds_more)
    random_is_closer = means[:, 2] < means[:, 4]
    best[:, 2] = random_is_closer
    best[:, 4] = np.logical_not(random_is_closer)
    # iterate over all classifiers
    for c in range(len(classifiers)):
        # prepare a latex string containing the results for the current classifier
        c_str = classifiers[c]
        for col in range(5):
            if np.isnan(means[c, col]):
                c_str += ' & n.a.\\ '
            elif best[c, col]:
                c_str += ' & $\\bm{%0.2f \\pm %0.2f}$' % (means[c, col], stds[c, col])
            else:
                c_str += ' & $%0.2f \\pm %0.2f$' % (means[c, col], stds[c, col])
        c_str += ' \\\\'

        # print the results for the current classifier as a new row in the table
        print(c_str)
# print footer
print('\\end{tabular}')

\begin{tabular}{lccccc}
 & unattacked & \multicolumn{2}{c}{random} & \multicolumn{2}{c}{backtracing}\\
classifier & accuracy & success rate & dist.\ ratio & success rate & dist.\ ratio\\
\cmidrule(lr){1-1} \cmidrule(lr){2-2} \cmidrule(lr){3-4} \cmidrule(lr){5-6}
& \multicolumn{5}{c}{MiniPalindrome} \\
\cmidrule(r){1-1} \cmidrule(lr){2-2} \cmidrule(lr){3-4} \cmidrule(lr){5-6}
linear & $0.96 \pm 0.06$ & $0.09 \pm 0.09$ & $\bm{0.24 \pm 0.07}$ & $\bm{0.52 \pm 0.15}$ & $2.68 \pm 3.54$ \\
RBF & $1.00 \pm 0.00$ & $0.06 \pm 0.06$ & $\bm{0.27 \pm 0.21}$ & $\bm{0.52 \pm 0.17}$ & $1.44 \pm 0.51$ \\
ST & $0.88 \pm 0.07$ & $\bm{0.86 \pm 0.08}$ & $\bm{0.29 \pm 0.05}$ & $0.72 \pm 0.10$ & $0.93 \pm 0.15$ \\
SST & $0.96 \pm 0.06$ & $\bm{0.78 \pm 0.15}$ & $\bm{0.36 \pm 0.08}$ & $0.54 \pm 0.11$ & $1.91 \pm 1.19$ \\
PT & $0.96 \pm 0.06$ & $\bm{0.80 \pm 0.07}$ & $\bm{0.35 \pm 0.10}$ & $0.54 \pm 0.11$ & $1.91 \pm 1.19$ \\
Rec & $0.85 \pm 0.13$ & $0.72 \pm 0.14$ & $\bm{0.17 \pm 0.05}$ & $\bm{0.79 \pm 0.08}$ 

  keepdims=keepdims)


In [19]:
# In a next step, we perform a statistical evaluation. For each classifier separately, we evaluate
# whether random adversarial edits or tree edit distance edits achieve higher success rates and lower
# distance ratios using a one-sided wilcoxon signed rank test
from scipy.stats import wilcoxon


def load_all_folds(name):
    """Load 'results/<experiment>_<name>.csv' for every experiment and stack
    the results along the first (fold) axis.

    ndmin=2 keeps files with a single fold (one row) two-dimensional, so that
    the concatenation always happens along the fold axis.
    """
    return np.concatenate([
        np.loadtxt('results/%s_%s.csv' % (exp.lower(), name), ndmin=2)
        for exp in experiments
    ])


def compare_paired(ted, rnd, n_tests):
    """Compare paired per-fold values of tree edit distance attacks (ted) and
    random attacks (rnd) with a wilcoxon signed rank test.

    Pairs in which either value is NaN are dropped before testing.

    Returns a tuple (higher, p), where higher is True if ted >= rnd holds in at
    least half of the remaining pairs, and p is the Bonferroni-corrected,
    one-sided p value, capped at 1.
    """
    # drop a pair if EITHER side is NaN; otherwise NaN would be passed on to
    # the test
    valid = np.logical_not(np.logical_or(np.isnan(ted), np.isnan(rnd)))
    ted = ted[valid]
    rnd = rnd[valid]
    higher = np.mean(ted >= rnd) >= 0.5
    _, p = wilcoxon(ted, rnd)
    # multiply the p value by the number of tests to perform a Bonferroni correction
    p *= n_tests
    # divide the p value by 2 again for a one-sided test
    # NOTE(review): halving the two-sided p value assumes that the tested
    # direction agrees with the observed one; here the direction is chosen
    # after the fact via the sign criterion above - confirm this is intended.
    p /= 2
    # a corrected p value is still a probability, so cap it at 1
    # (np.minimum propagates NaN, unlike the builtin min)
    return higher, np.minimum(p, 1.)


# iterate over all experiments and aggregate the folds
folds_succ_random = 1. - load_all_folds('accs_random_adversarial')
folds_succ_ted = 1. - load_all_folds('accs_adversarial')
folds_d_random = load_all_folds('d_random_adversarial')
folds_d_ted = load_all_folds('d_adversarial')

# two tests (success rate and distance ratio) are performed per classifier
n_tests = len(classifiers) * 2

# iterate over all classifiers
for c in range(len(classifiers)):
    # first test whether tree edit distance attacks have (in the median)
    # higher success rates, then whether they have higher distance ratios
    # for the current classifier
    for quantity, ted, rnd in (('success rates', folds_succ_ted, folds_succ_random),
                               ('distance ratios', folds_d_ted, folds_d_random)):
        higher, p = compare_paired(ted[:, c], rnd[:, c], n_tests)
        # print it
        direction = 'higher' if higher else 'lower'
        print('tree edit distance attacks have %s %s for classifier %s: p = %g' % (direction, quantity, classifiers[c], p))
    

tree edit distance attacks have higher success rates for classifier linear: p = 3.73894e-06
tree edit distance attacks have higher distance ratios for classifier linear: p = 0.475224
tree edit distance attacks have higher success rates for classifier RBF: p = 7.69864e-06
tree edit distance attacks have higher distance ratios for classifier RBF: p = 0.0538006
tree edit distance attacks have higher success rates for classifier ST: p = 0.0138818
tree edit distance attacks have higher distance ratios for classifier ST: p = 0.00687078
tree edit distance attacks have higher success rates for classifier SST: p = 0.0303675
tree edit distance attacks have higher distance ratios for classifier SST: p = 0.00687078
tree edit distance attacks have higher success rates for classifier PT: p = 0.0945178
tree edit distance attacks have higher distance ratios for classifier PT: p = 0.00687078
tree edit distance attacks have lower success rates for classifier Rec: p = 0.00180204
tree edit distance attack