In [14]:
# Do all necessary imports here
import itertools as it
import statistics

from bisect import bisect_left
from pathlib import Path
from typing import List, Tuple, Optional, Dict

import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylatex
import scipy.stats as ss
import seaborn as sns
import glob

from pandas import Categorical

Useful Functions

In [2]:
def vd_a(treatment: List[float], control: List[float]) -> Tuple[float, str]:
    """Compute Vargha and Delaney A index

    A. Vargha and H. D. Delaney.  A critique and improvement of the CL common language
    effect size statistics of McGraw and Wong.  Journal of Educational and Behavioral
    Statistics, 25(2):101-132, 2000.

    The formula to compute A has been transformed to minimise accuracy errors, see
    https://mtorchiano.wordpress.com/2014/05/19/effect-size-of-r-precision/

    :param treatment: a list of numbers
    :param control: a list of numbers
    :return: the value estimate and the magnitude
    """
    m = len(treatment)
    n = len(control)

    #if m != n:
    #    raise ValueError("Parameter lists must have equal lengths")

    r = ss.rankdata(treatment + control)
    r1 = sum(r[0:m])

    # Compute the measure
    # A = (r1/m - (m+1)/2)/n  # formula (14) in Vargha and Delaney, 2000
    A = (2 * r1 - m * (m + 1)) / (2 * n * m)  # equivalent formula with better accuracy

    levels = [0.147, 0.33, 0.474]
    magnitudes = ["negligible", "small", "medium", "large"]
    scaled_A = (A - 0.5) * 2

    magnitude = magnitudes[bisect_left(levels, abs(scaled_A))]
    estimate = A

    return estimate, magnitude

In [12]:
os.chdir("/home/l_pc1-l/ba/own_stuff/ba-thesis/evaluation/results/opt_const/data")
extension = 'csv'
all_filenames = [i for i in glob.glob('./**/*.{}'.format(extension), recursive=True)]

#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv
combined_csv.to_csv( "results.csv", index=False, encoding='utf-8-sig')

In [17]:
os.getcwd()

'/home/l_pc1-l/ba/own_stuff/ba-thesis/evaluation/results/opt_const/data'

In [16]:
# The names of the columns we are interested in
cut = "TargetModule"
project_name = "ProjectName"
config = "ConfigurationId"
coverage = "Coverage"
seconds = [f"CoverageTimeline_T{i}" for i in range(1, 601)]
use_cols = [cut, project_name, config, coverage] + seconds

sns.set(style="whitegrid")

# How often every CUT was executed
runs = 1
os.getcwd()
# Adjust this path on your system if you want to rerun this sheet!
PAPER_EXPORT_PATH = Path("/home/l_pc1-l/ba/own_stuff/ba-thesis/evaluation")

results = pd.read_csv(Path("../..") / "results" / "opt_const" / "results.csv")
project_information = pd.read_csv(Path("../..") / "results" / "opt_const" / "projects.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../../results/opt_const/results.csv'

In [6]:
number_cuts = len(set(results[cut]))
print(f"We tested {number_cuts} unique classes, each being executed {runs} times per configuration")

We tested 105 unique classes, each being executed 1 times per configuration


In [10]:
config_names = list(set(results[config]))
config_names.sort()
print("We used {} configurations, namely:\n - {}".format(
    len(config_names), "\n - ".join(config_names)
))

TypeError: '<' not supported between instances of 'str' and 'float'

In [8]:
results.groupby(config)[coverage].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
ConfigurationId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Seed_Prob_0_0,616.0,0.638686,0.321325,0.037736,0.306644,0.7,1.0,1.0
Seed_Prob_0_1,598.0,0.641886,0.323161,0.037736,0.306818,0.7,1.0,1.0
Seed_Prob_0_2,604.0,0.635096,0.325065,0.037736,0.3,0.666667,1.0,1.0
Seed_Prob_0_3,622.0,0.635337,0.320596,0.037736,0.306818,0.7,0.990385,1.0
Seed_Prob_0_4,616.0,0.635396,0.322214,0.037736,0.306644,0.666667,1.0,1.0
Seed_Prob_0_5,622.0,0.636023,0.320389,0.037736,0.310345,0.692308,0.990385,1.0
Seed_Prob_0_6,622.0,0.641024,0.319144,0.037736,0.310345,0.7,0.960284,1.0
Seed_Prob_0_7,604.0,0.633696,0.322716,0.037736,0.306122,0.666667,1.0,1.0
Seed_Prob_0_8,616.0,0.639235,0.322474,0.037736,0.309463,0.7,1.0,1.0
Seed_Prob_0_9,622.0,0.64128,0.32045,0.037736,0.310345,0.7,1.0,1.0


In [9]:
table = pylatex.Table(position="H")
tabular = pylatex.Tabular('c c', booktabs=True)
tabular.add_row([
    pylatex.NoEscape(r"P\textsubscript{Constant}"),
    "Avg. Branch Coverage"
])



raw_table_data = results.groupby(config, as_index=False).agg(
    {
        coverage: "mean"
    }
)

for _, row in raw_table_data.iterrows():
    tabular.add_row([
        row[config][-3] + "." + row[config][-1] ,
        "{:.4f}".format(row[coverage])        
    ])

tabular.add_hline()
table.append(pylatex.NoEscape(r'\centering'))
table.append(tabular)
table.add_caption("Table showing the different values for seeding " +
                  "constants and the corresponding achieved coverage.")
label = pylatex.Label("tabconstvalues")
table.append(label)

with open ("../../../Thesis/chapters/evaluation_tables/opt_const_table.tex", "w") as file:
    file.write(table.dumps())
print(table.dumps())

\begin{table}[H]%
\centering%
\begin{tabular}{@{}c c@{}}%
\toprule%
P\textsubscript{Constant}&Avg. Branch Coverage\\%
0.0&0.6387\\%
0.1&0.6419\\%
0.2&0.6351\\%
0.3&0.6353\\%
0.4&0.6354\\%
0.5&0.6360\\%
0.6&0.6410\\%
0.7&0.6337\\%
0.8&0.6392\\%
0.9&0.6413\\%
\midrule\bottomrule%
%
\end{tabular}%
\caption{Table showing the different values for seeding constants and the corresponding achieved coverage.}%
\label{tabconstvalues}%
\end{table}
