# Results

This notebook focuses only on exploiting the results: it relies on the raw data produced during the experiments, so that the reported tables can be reproduced.

Set the system name

In [None]:
# Name of the studied system: it selects datasets/<name>.csv and the results/<name>/ folder read below.
system_name = "Apache"

Load the dataset

In [None]:
import pandas as pd
import json
import numpy

def get_dataset(name):
    """Read the CSV dataset of system `name` from the `datasets/` folder.

    For "Linux", the column names listed in `datasets/Linux_options.json`
    are loaded as int8 to save a lot of memory; every other system is read
    with pandas' default dtype inference.

    Parameters
    ----------
    name : str
        System name; the file read is `datasets/<name>.csv`.

    Returns
    -------
    pandas.DataFrame
        The content of the CSV file.
    """
    if name != "Linux":
        return pd.read_csv("datasets/{}.csv".format(name))

    with open("datasets/Linux_options.json", "r") as options_file:
        option_names = json.load(options_file)
    # One int8 dtype entry per listed option column.
    int8_columns = dict.fromkeys(option_names, numpy.int8)
    return pd.read_csv("datasets/Linux.csv", dtype=int8_columns)
    
# Dataset of the selected system; its row count (df.shape[0]) is multiplied by each
# training_size value further down to print absolute training-set sizes in the table.
df = get_dataset(system_name)

Load the results

In [None]:
def _read_results(kind):
    """Read results/<system_name>/<kind>.csv into a DataFrame."""
    return pd.read_csv("results/%s/%s.csv" % (system_name, kind))


# One frame per strategy, each with and without the "_fs" variant.
df_classification = _read_results("classification")
df_classification_fs = _read_results("classification_fs")
df_regression = _read_results("regression")
df_regression_fs = _read_results("regression_fs")
df_spec_regression = _read_results("spec_regression")
df_spec_regression_fs = _read_results("spec_regression_fs")

In [None]:
# Display the raw classification results as a sanity check of the loaded data.
df_classification

## Reproduce the tables

In [None]:
df_all = pd.concat([df_classification, df_classification_fs, df_regression, df_regression_fs, df_spec_regression, df_spec_regression_fs])

# training_size row of the pivoted table that the summary line is computed from.
SUMMARY_TRAINING_SIZE = 0.7


def _summary_cell(results):
    """Build one LaTeX cell of the summary row for a results frame.

    The frame is reduced to the maximum `mean` per (threshold, training_size),
    pivoted so thresholds become columns, and the row for
    SUMMARY_TRAINING_SIZE is summarised as "<mean>% (+/- <std>)", both
    expressed in percent with one decimal.
    """
    best_mean = results.groupby(["threshold", "training_size"])["mean"].max().unstack("threshold")
    row = best_mean.loc[SUMMARY_TRAINING_SIZE]
    return "{:.1f}\\%".format(row.mean() * 100) + r" ($\pm${:.1f})".format(row.std() * 100)


# Column order: all strategies pooled, then each strategy without / with feature selection.
# NOTE(review): the regression column used to be printed without the trailing "\%",
# unlike every other column; it now goes through the same formatter.
s = " & ".join(
    [system_name]
    + [
        _summary_cell(results)
        for results in (
            df_all,
            df_classification,
            df_classification_fs,
            df_regression,
            df_regression_fs,
            df_spec_regression,
            df_spec_regression_fs,
        )
    ]
)

print(s)

In [None]:
def _best_mean_table(results):
    """Pivot a results frame into a training_size x threshold table of the maximum `mean`."""
    return results.groupby(["threshold", "training_size"])["mean"].max().unstack("threshold")


def _format_cell(score, score_fs, overall_best):
    """Format one table cell.

    Parameters
    ----------
    score : float
        Score of the strategy without feature selection.
    score_fs : float
        Score of the same strategy with feature selection.
    overall_best : float
        Best score over all strategies for the same (training_size, threshold).

    Returns
    -------
    str
        " & <best of both, in percent> (<signed, coloured difference>)"; the
        score is set in bold when it equals `overall_best`.
    """
    diff = score_fs - score
    shown = score_fs if score_fs > score else score
    is_best = shown == overall_best

    # A single rule for every section: differences below one point are gray,
    # larger ones are green (gain) or red (loss).
    if abs(diff) < 0.01:
        colour = "gray"
    elif diff > 0:
        colour = "ForestGreen"
    else:
        colour = "red"
    # The sign always follows the real difference, so a small loss is no longer
    # printed with "+" and a zero difference is no longer printed as "-0.0".
    sign = "+" if diff >= 0 else "-"

    return " & {}{:.1f}{} ({}{:0.1f}{})".format(
        "\\textbf{" if is_best else "",
        shown * 100,
        "}" if is_best else "",
        "\\textcolor{" + colour + "}{" + sign,
        abs(diff * 100),
        "}",
    )


def _print_section(title, results, results_fs, overall_best, n_rows):
    """Print the header and one row per training size for one strategy.

    `results` / `results_fs` are the frames without / with feature selection,
    `overall_best` is the pivoted table over all strategies and `n_rows` is the
    dataset row count used to turn a training_size value into an absolute size.
    """
    print("\\hline \\hline")
    # The header row spans the five threshold columns and is terminated with "\\":
    # a trailing "&" would open a 7th column in the 6-column tabular.
    print("& \\multicolumn{5}{c|}{\\textbf{" + title + "}} \\\\")
    print("\\hline")

    # Computed once per section instead of once per cell.
    table = _best_mean_table(results)
    table_fs = _best_mean_table(results_fs)

    for training_size, scores in table.iterrows():
        line = "{:.0f}".format(int(training_size * n_rows))
        # Series.items() replaces iteritems(), which was removed in pandas 2.0.
        for threshold, score in scores.items():
            line += _format_cell(
                score,
                table_fs.loc[training_size, threshold],
                overall_best.loc[training_size, threshold],
            )
        line += " \\\\"
        print(line)


best_overall = _best_mean_table(df_all)

print("\\begin{table*}")
print("\\begin{tabular}{ |l|ccccc| }")
print("\\hline")
print("\\multirow{2}{*}{Training set size} & \\multicolumn{4}{c}{\\hspace{2cm}Acceptable configurations} & \\\\")

# "\\%" instead of "\%": same output, without the invalid-escape-sequence warning.
print("  &  10\\% & 20\\% & 50\\% & 80\\% & 90\\% \\\\")

for title, results, results_fs in (
    ("Classification", df_classification, df_classification_fs),
    ("Regression", df_regression, df_regression_fs),
    ("Specialized Regression", df_spec_regression, df_spec_regression_fs),
):
    _print_section(title, results, results_fs, best_overall, df.shape[0])

print("\\hline")
print("\\end{tabular}")
print("\\caption{Decision tree classification accuracy on performance specialization for", system_name, 
      "on three strategies. Bold represents the best result among other strategies including feature selection, the value in brackets is the difference made by feature selection\\label{tab:" + system_name.lower() + "}}")
print("\\end{table*}")