In [1]:
import math
from pathlib import Path
import subprocess
import numpy as np
import csv
import pandas
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
# this is so we can render big dendrograms
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import LabelEncoder
from IPython.display import clear_output
import time

In [2]:
def execute_verbose(project):
    """Run the PIT mutation-coverage Maven goal for ``project`` and keep its stdout.

    The command is executed with ``projects/<project>`` as working directory.
    Its captured standard output is written to
    ``verboseOutputs/<project>-pitestOutput.txt`` and also returned.

    Args:
        project: Name of the project directory under ``projects/``.

    Returns:
        list[str]: The captured standard output split on newline characters.
    """
    maven_command = [
        "mvn",
        "test-compile",
        "-Dverbose=true",
        "-Drat.skip=true",
        "-Dfeatures=+characteristics",
        "org.pitest:pitest-maven:mutationCoverage",
    ]
    completed = subprocess.run(
        maven_command,
        capture_output=True,
        cwd="projects/" + project,
        text=True,
    )
    captured_stdout = completed.stdout

    # Persist the raw log so later runs can re-read it instead of re-running Maven.
    with open("verboseOutputs/" + project + "-pitestOutput.txt", "w") as log_file:
        log_file.write(captured_stdout)

    return captured_stdout.split("\n")

def read_verbose(project):
    """Load the saved PIT verbose log of ``project`` as a list of lines.

    Reads ``verboseOutputs/<project>-pitestOutput.txt``. Each line keeps its
    trailing newline, and one extra ``"filler"`` entry is appended after the
    file content.

    Args:
        project: Project name used to build the log file name.

    Returns:
        list[str]: The file's lines followed by ``"filler"``.
    """
    log_path = "verboseOutputs/" + project + "-pitestOutput.txt"
    with open(log_path) as log_file:
        log_lines = list(log_file)
    log_lines += ["filler"]
    return log_lines

def export_clusters(labels, csv_data, export_dir):
    """Write the mutant-id to cluster-id assignment to ``clusters.csv``.

    Row ``i`` of the result pairs ``csv_data["id"][i]`` with ``labels[i]``.
    The table is written, without the index, to
    ``<export_dir>/clustering/clusters.csv``; the ``clustering`` directory is
    created when it does not exist yet.

    Args:
        labels: Indexable sequence of cluster labels, one per mutant.
        csv_data: Mapping or DataFrame whose ``"id"`` entry can be indexed with
            ``0 .. len(labels) - 1``.
        export_dir: Directory path as a string.

    Returns:
        pandas.DataFrame: Frame with the columns ``id`` and ``cluster_id``.
    """
    row_count = len(labels)
    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # appending row by row copies the whole frame on every iteration.
    df = pandas.DataFrame(
        {
            "id": [csv_data["id"][i] for i in range(row_count)],
            "cluster_id": [labels[i] for i in range(row_count)],
        },
        columns=["id", "cluster_id"],
    )

    target = Path(export_dir + "/clustering/clusters.csv")
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(target, sep=",", index=False)
    return df

def execute_mutantsV2(project, labels, data, seed):
    """Sum the logged run times of one sampled mutant per cluster (streaming).

    For every distinct cluster label one member is drawn with
    ``DataFrame.sample(random_state=seed)``. Its id is searched in
    ``verboseOutputs/<project>-pitestOutput.txt`` and the numbers found on the
    following ``"replaced"`` / ``"processed"`` lines are added up, scaled by
    0.001 (presumably milliseconds to seconds -- confirm against the log
    format). A mutant whose record is cut short by the next
    ``MutationDetails`` header, or by the end of the file, is charged a flat 5
    instead of a ``"processed"`` time. A mutant whose id never occurs in the
    log contributes nothing.

    The log is rewound before every lookup, so the result does not depend on
    the order in which clusters are visited.

    Args:
        project: Project name used to build the log file name.
        labels: Indexable sequence of cluster labels, one per mutant.
        data: Mapping or DataFrame whose ``"id"`` entry can be indexed with
            ``0 .. len(labels) - 1`` and holds the mutant id strings.
        seed: Random state passed to ``DataFrame.sample``.

    Returns:
        float: Sum of the charged times minus the wall-clock time spent
        opening and scanning the log.
    """
    # Flat charge for a mutant without a "processed" line; the original
    # comment describes this case as a timeout.
    timeout_penalty = 5
    record_header = "MutationDetails [id=MutationIdentifier [location=Location"

    includeTimings = []
    excludeTimings = []
    start_time = time.time()
    with open("verboseOutputs/" + project + "-pitestOutput.txt", "r") as pitestOutputFile:
        excludeTimings.append(time.time() - start_time)

        # Build the frame in one go (DataFrame.append no longer exists in pandas 2.x).
        row_count = len(labels)
        df = pandas.DataFrame(
            {
                "id": [data["id"][i] for i in range(row_count)],
                "cluster_id": [labels[i] for i in range(row_count)],
            },
            columns=["id", "cluster_id"],
        )

        for cluster_id in df["cluster_id"].unique():
            members = df[df["cluster_id"] == cluster_id]
            mutantID = members.sample(random_state=seed).iloc[0]["id"]
            start_time = time.time()
            # From character 45 onwards single spaces become ", " -- presumably
            # to match how the id is printed in the PIT log; TODO confirm.
            mutantID = mutantID[:45] + mutantID[45:].replace(" ", ", ")

            # Rewind: clusters are not visited in log order, so continuing from
            # the previous position would miss every mutant located earlier.
            pitestOutputFile.seek(0)
            found = False
            record_closed = False
            for line in pitestOutputFile:
                if not found:
                    if mutantID in line:
                        found = True
                    continue
                if record_header in line:
                    # Next mutant starts before a "processed" line was seen.
                    includeTimings.append(timeout_penalty)
                    record_closed = True
                    break
                elif "replaced" in line:
                    includeTimings.append(int(line.split(" ")[-2]) * 0.001)
                elif "processed" in line:
                    includeTimings.append(int(line.split(" ")[-2]) * 0.001)
                    record_closed = True
                    break

            # Found but never closed: it was the last record in the log and
            # has no "processed" line. Not found at all: nothing is charged.
            if found and not record_closed:
                includeTimings.append(timeout_penalty)
            excludeTimings.append(time.time() - start_time)

    return sum(includeTimings) - sum(excludeTimings)

def execute_mutants(project, labels, data, seed):
    """Sum the logged run times of one sampled mutant per cluster.

    For every distinct cluster label one member is drawn with
    ``DataFrame.sample(random_state=seed)``. The first log line containing its
    id is located in the lines returned by ``read_verbose(project)``, and the
    numbers found on the following ``"replaced"`` / ``"processed"`` lines are
    added up, scaled by 0.001 (presumably milliseconds to seconds -- confirm
    against the log format). A mutant whose record is cut short by the next
    ``MutationDetails`` header, or by the end of the log, is charged a flat 5
    instead of a ``"processed"`` time. A mutant whose id never occurs in the
    log contributes nothing.

    Args:
        project: Project name forwarded to ``read_verbose``.
        labels: Indexable sequence of cluster labels, one per mutant.
        data: Mapping or DataFrame whose ``"id"`` entry can be indexed with
            ``0 .. len(labels) - 1`` and holds the mutant id strings.
        seed: Random state passed to ``DataFrame.sample``.

    Returns:
        float: Sum of the charged times minus the wall-clock time spent
        loading and scanning the log.
    """
    # Flat charge for a mutant without a "processed" line.
    # NOTE(review): looks like a timeout penalty in seconds -- confirm.
    timeout_penalty = 5
    record_header = "MutationDetails [id=MutationIdentifier [location=Location"

    includeTimings = []
    excludeTimings = []
    start_time = time.time()
    # A plain list is enough here; wrapping the lines in a numpy array would
    # allocate a fixed-width string array sized to the longest log line.
    pitestOutput = read_verbose(project)
    excludeTimings.append(time.time() - start_time)

    # Build the frame in one go (DataFrame.append no longer exists in pandas 2.x).
    row_count = len(labels)
    df = pandas.DataFrame(
        {
            "id": [data["id"][i] for i in range(row_count)],
            "cluster_id": [labels[i] for i in range(row_count)],
        },
        columns=["id", "cluster_id"],
    )

    for cluster_id in df["cluster_id"].unique():
        members = df[df["cluster_id"] == cluster_id]
        mutantID = members.sample(random_state=seed).iloc[0]["id"]
        start_time = time.time()
        # From character 45 onwards single spaces become ", " -- presumably to
        # match how the id is printed in the PIT log; TODO confirm.
        mutantID = mutantID[:45] + mutantID[45:].replace(" ", ", ")

        # Only the first occurrence is needed, so stop scanning once it is found.
        startIndex = next(
            (i for i, line in enumerate(pitestOutput) if mutantID in line),
            None,
        )
        if startIndex is None:
            # Id absent from the log: nothing is charged for this cluster.
            excludeTimings.append(time.time() - start_time)
            continue

        record_closed = False
        for position in range(startIndex + 1, len(pitestOutput)):
            line = pitestOutput[position]
            if record_header in line:
                # Next mutant starts before a "processed" line was seen.
                includeTimings.append(timeout_penalty)
                record_closed = True
                break
            elif "replaced" in line:
                includeTimings.append(int(line.split(" ")[-2]) * 0.001)
            elif "processed" in line:
                includeTimings.append(int(line.split(" ")[-2]) * 0.001)
                record_closed = True
                break
        if not record_closed:
            # Last record in the log and no "processed" line.
            includeTimings.append(timeout_penalty)
        excludeTimings.append(time.time() - start_time)

    return sum(includeTimings) - sum(excludeTimings)

def clusteredTestingSimulation(project, cur_seed, reduction, timingFile):
    """Cluster a project's mutants and record the simulated testing time.

    Steps:
      1. Read ``characteristics.csv`` from ``projects/<project>/target/...``,
         falling back to ``projects/<project>/processor/target/...`` when the
         first file does not exist.
      2. Label-encode the string columns and run Ward agglomerative clustering
         with ``ceil(len(data) * reduction)`` clusters.
      3. Pass the cluster labels to ``execute_mutants``.
      4. Write ``reduction, simulationTime, resultTime, total`` to
         ``timingFile`` (no trailing newline).

    Args:
        project: Name of the project directory under ``projects/``.
        cur_seed: Seed forwarded to ``execute_mutants``.
        reduction: Factor applied to the number of mutants to obtain the
            number of clusters.
        timingFile: Writable text file object receiving the timing record.

    Raises:
        FileNotFoundError: If the characteristics file is in neither location.
    """
    start_time = time.time()
    csv_path = "projects/" + project
    columns = ["id", "mutOperator", "opcode", "returnType",
               "localVarsCount", "isInTryCatch", "isInFinalBlock",
               "className", "methodName", "blockNumber", "lineNumber",
               "numTests"]
    # The file's own header row is skipped and replaced by `columns`.
    read_options = {"names": columns, "skiprows": 1}
    try:
        data = pandas.read_csv(csv_path + "/target/pit-reports/clustering/characteristics.csv",
                               **read_options)
    except FileNotFoundError:
        data = pandas.read_csv(csv_path + "/processor/target/pit-reports/clustering/characteristics.csv",
                               **read_options)

    # Take an explicit copy: assigning columns on a bare `data[[...]]`
    # selection is a chained assignment and triggers SettingWithCopyWarning.
    data = data[columns].copy()

    # define ordinal encoding
    encoder = LabelEncoder()
    # Transform each column.. do id last since we need to inverse that:
    # the encoder is re-fitted per column, so afterwards it only remembers the
    # mapping of the last one.
    for col in ["mutOperator", "returnType", "className", "methodName", "id"]:
        data[col] = encoder.fit_transform(data[col])

    # NOTE(review): the label-encoded "id" column is part of the matrix passed
    # to fit(); confirm that using it as a clustering feature is intended.
    clustering = AgglomerativeClustering(distance_threshold=None,
                                         n_clusters=int(math.ceil(len(data) * reduction)),
                                         linkage="ward",
                                         compute_distances=True)
    clusters = clustering.fit(data)

    # unlabel id so we can recognize the mutants
    data["id"] = encoder.inverse_transform(data["id"])
    resultTime = execute_mutants(project, clusters.labels_, data, cur_seed)
    simulationTime = time.time() - start_time
    record = (reduction, simulationTime, resultTime, simulationTime + resultTime)
    timingFile.write(", ".join(str(value) for value in record))


In [3]:
# skipped = ["zxing", "commons-lang", "jodatime", "jfreechart", ]
# projects = ["google-auto-service", "google-auto-common", "scribejava-core", "google-auto-factory", "commons-csv",
#                 "commons-cli", "google-auto-value", "gson", "commons-io","commons-text", "commonc-codec", ]
# projects1 = ["commons-text", "commonc-codec", ]
# Seeds available for the per-cluster mutant sampling.
seeds = [
    66304, 16389, 14706, 91254, 49890, 86054,
    55284, 77324, 36147, 13506, 73920, 80157,
    43981, 75358, 33399, 56134, 13388, 81617,
    90957, 52113, 20428, 26482, 56340, 31018,
    32067, 13067, 8339, 49008, 125894, 68282,
]

# NOTE(review): presumably one recorded time per entry of projects1 -- units
# and origin are not stated; confirm before relying on it.
#timesProjects1 = [212, 3467, 1276, 8993, 11280, 34, 624, 443]

# Full list of project directory names.
projects1 = [
    "commons-cli",
    "commons-text",
    "commons-codec",
    "commons-io",
    "google-auto-value",
    "google-auto-service",
    "google-auto-factory",
    "google-auto-common",
]

# Subset of project directory names.
projects = [
    "google-auto-service",
    "google-auto-factory",
    "google-auto-common",
]

# Cluster-count factors: each value is multiplied by the number of mutants.
reductions = [0.1, 0.25, 0.5, 0.75]

In [4]:
full_time = 0

# For every project, run the clustering simulation once per reduction factor
# (always with the first seed) and write one line per factor to
# timings/<project>-timings.
for project in projects:
    print(project)
    # `with` guarantees the timings file is flushed and closed even when a
    # simulation raises or the run is interrupted part-way through.
    with open("timings/" + project + "-timings", "w") as file:
        for reduction in reductions:
            clusteredTestingSimulation(project, seeds[0], reduction, file)
            file.write("\n")

# directory = "expirements_results"
# seeds = [
#     66304, 16389, 14706, 91254, 49890, 86054, 55284, 77324, 36147, 13506, 73920, 80157, 43981, 75358, 33399, 56134,
#     13388, 81617, 90957, 52113, 20428, 26482, 56340, 31018, 32067, 13067, 8339, 49008, 125894, 68282, ]
# for project in projects:
#     results_df = pandas.DataFrame(columns=["seed", "reduction", "score", "acc_avg", "acc_min", "acc_max", ])
#     for seed in seeds:
#         print(str(seed))
#         results_df = do_exp1_full(directory, project, seed, results_df, True)

#     results_df.to_csv(directory + "/full" + "/results_exp_" + project + ".csv", sep=",",
#                         index=False, )

google-auto-service
google-auto-factory
google-auto-common


KeyboardInterrupt: 