In [1]:
import math
from pathlib import Path
import subprocess
import numpy as np
import pickle
from os import path
import os
from random import sample
import csv
import pandas as pd
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
# this is so we can render a big dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MeanShift
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from IPython.display import clear_output
import time
import warnings

In [2]:
def compile_tests(project):
    """Run ``mvn test-compile`` inside ``projects/<project>``.

    Returns the ``subprocess.CompletedProcess`` with captured text output.
    """
    command = ["mvn", "test-compile"]
    workingDir = "projects/" + project
    return subprocess.run(command, capture_output=True, cwd=workingDir, text=True)

def execute_normal(project):
    """Run the PIT ``mutationCoverage`` goal inside ``projects/<project>``.

    The RAT check is skipped via ``-Drat.skip=true``. Returns the
    ``subprocess.CompletedProcess`` with captured text output.
    """
    command = [
        "mvn",
        "-Drat.skip=true",
        "org.pitest:pitest-maven:mutationCoverage",
    ]
    workingDir = "projects/" + project
    return subprocess.run(command, capture_output=True, cwd=workingDir, text=True)

def execute_verbose(project, addition=""):
    """Run PIT verbosely with the ``+cluster`` feature and keep its stdout.

    The captured stdout is written to
    ``verboseOutputs/<project><addition>-pitestOutput.txt`` and also returned
    as a list of lines (split on ``"\\n"``).
    """
    command = [
        "mvn",
        "-Dverbose=true",
        "-Drat.skip=true",
        "-Dfeatures=+cluster",
        "org.pitest:pitest-maven:mutationCoverage",
    ]
    completed = subprocess.run(command, capture_output=True, cwd="projects/" + project, text=True)
    logPath = "verboseOutputs/" + project + addition + "-pitestOutput.txt"
    with open(logPath, "w") as logFile:
        logFile.write(completed.stdout)
    return completed.stdout.split("\n")

def characteristics_extraction(project):
    """Run PIT with the ``+characteristics`` feature inside ``projects/<project>``.

    Returns the ``subprocess.CompletedProcess`` with captured text output.
    """
    command = [
        "mvn",
        "-Drat.skip=true",
        # The feature flag and the goal must be separate arguments; without the
        # comma Python concatenates the two literals and Maven receives no goal.
        "-Dfeatures=+characteristics",
        "org.pitest:pitest-maven:mutationCoverage",
    ]
    pitestOutput = subprocess.run(command, capture_output=True, cwd="projects/" + project, text=True)
    return pitestOutput

def execute_cluster(project):
    """Run PIT with the ``+cluster`` feature inside ``projects/<project>``.

    Returns the ``subprocess.CompletedProcess`` with captured text output.
    """
    command = [
        "mvn",
        "-Drat.skip=true",
        "-Dfeatures=+cluster",
        "org.pitest:pitest-maven:mutationCoverage",
    ]
    workingDir = "projects/" + project
    return subprocess.run(command, capture_output=True, cwd=workingDir, text=True)

def read_verbose(project):
    """Read back ``verboseOutputs/<project>-pitestOutput.txt``.

    Returns the file's lines (line endings kept) followed by one extra
    ``"filler"`` entry appended at the end.
    """
    logPath = "verboseOutputs/" + project + "-pitestOutput.txt"
    with open(logPath) as logFile:
        lines = list(logFile)
    lines.append("filler")
    return lines

def export_clusters(labels, csv_data, export_dir):
    """Write the mutant-id -> cluster-id assignment to ``clusters.csv``.

    Parameters:
        labels: sequence of cluster labels; ``labels[i]`` belongs to the
            mutant ``csv_data["id"][i]``.
        csv_data: mapping/DataFrame whose ``"id"`` entry is indexable by
            ``0 .. len(labels) - 1``.
        export_dir: directory under which ``clustering/clusters.csv`` is
            written; the ``clustering`` sub-directory is created if missing.

    Returns:
        The DataFrame with columns ``id`` and ``cluster_id`` that was written.
    """
    ids = csv_data["id"]
    # Build all rows first and create the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = [{"id": ids[i], "cluster_id": labels[i]} for i in range(len(labels))]
    df = pd.DataFrame(records, columns=["id", "cluster_id"])

    clusterDir = export_dir + "/clustering"
    os.makedirs(clusterDir, exist_ok=True)
    df.to_csv(clusterDir + "/clusters.csv", sep=",", index=False)
    return df

# Source: tutorialspoint.com/file-searching-using-python
def find_files(filename, search_path):
    """Return the paths of every file named ``filename`` below ``search_path``.

    The tree is walked top-down with ``os.walk``; each hit is reported as
    ``os.path.join(directory, filename)``. An empty list means no match.
    """
    return [
        os.path.join(directory, filename)
        for directory, _, entries in os.walk(search_path)
        if filename in entries
    ]

# Columns that get replaced by sampled kill rates before classification.  The
# order matches the per-column sampling quotas unpacked from ``parameters``
# (perOperator, perOpcode, perReturn, perTryCatch, perLocCluster, perVarsCluster).
_FEATURE_COLUMNS = ("mutOperator", "opcode", "returnType", "isInTryCatch",
                    "localityCluster_id", "varsCluster_id")


def _load_project_frames(project):
    """Load ``characteristics.csv`` and its sibling ``killed.csv`` for a project.

    Returns ``(data, results)`` where ``results`` has the columns ``id`` and
    ``killed``. Prints a "Could not find ..." message and returns ``None``
    when either file is missing.
    """
    charPaths = find_files("characteristics.csv", "projects/" + project)
    if not charPaths:
        print("Could not find characteristics.csv for project: " + project)
        return None
    charPath = charPaths[0]
    data = pd.read_csv(charPath,
                       names=["id", "mutOperator", "opcode", "returnType",
                              "localVarsCount", "isInTryCatch", "isInFinalBlock",
                              "className", "methodName", "blockNumber", "lineNumber"],
                       skiprows=1)
    # killed.csv lives next to characteristics.csv; use os.path so the lookup
    # does not depend on "/" being the path separator.
    killedPath = path.join(path.dirname(charPath), "killed.csv")
    if not path.exists(killedPath):
        print("Could not find killed.csv for project: " + project)
        return None
    results = pd.read_csv(killedPath,
                          names=["id", "killed", "numTests"],
                          skiprows=1)
    return data, results.drop(columns=["numTests"])


def _cluster_features(data, results, localityReduction, n_localVarsClusters):
    """Attach locality and local-variable cluster ids and join the kill results.

    Returns a frame with the columns ``id``, ``mutOperator``, ``opcode``,
    ``returnType``, ``isInTryCatch``, ``localityCluster_id``,
    ``varsCluster_id`` and ``killed`` (inner join with ``results`` on ``id``).
    """
    # Work on explicit copies so the column assignments below never write
    # into a view of ``data``.
    locality = data[["className", "methodName", "lineNumber"]].copy()
    encoder = LabelEncoder()
    # To-do: choose a better number than 100000
    locality["className"] = encoder.fit_transform(locality["className"]) * 100000
    # To-do: choose a better number than 1000
    locality["methodName"] = encoder.fit_transform(locality["methodName"]) * 1000

    localityClustering = AgglomerativeClustering(distance_threshold=None,
                                                 n_clusters=int(math.ceil(len(data) * localityReduction)),
                                                 linkage="ward",
                                                 compute_distances=False)
    localityClustering.fit(locality)

    varsClustering = AgglomerativeClustering(distance_threshold=None,
                                             n_clusters=n_localVarsClusters,
                                             linkage="ward",
                                             compute_distances=False)
    varsClustering.fit(data[["localVarsCount"]])

    training = data[["id", "mutOperator", "opcode", "returnType", "isInTryCatch"]].copy()
    training["localityCluster_id"] = localityClustering.labels_
    training["varsCluster_id"] = varsClustering.labels_
    return training.merge(results, how="inner", on="id")


def _sample_mutant_ids(training, quotas):
    """Draw up to ``quota`` random mutant ids from every group of every feature.

    ``quotas`` is aligned with ``_FEATURE_COLUMNS``. Ids may appear more than
    once in the returned list because a mutant belongs to one group per column.
    """
    picked = []
    for column, quota in zip(_FEATURE_COLUMNS, quotas):
        # Group on the column itself (the opcode loop used to filter on a
        # stale mutOperator value and therefore never sampled per opcode).
        for _, groupIds in training.groupby(column, sort=False)["id"]:
            ids = groupIds.tolist()
            picked.extend(sample(ids, min(len(ids), quota)))
    return picked


def _encode_kill_rates(reference, target):
    """Replace each feature value in ``target`` by its kill rate in ``reference``.

    The kill rate of a value is the fraction of ``reference`` rows carrying
    that value whose ``killed`` column equals 1. Rates are always looked up
    from the untouched ``reference`` columns, so an already written rate such
    as 0.0 or 1.0 can never be mistaken for a raw key (opcode, cluster id or
    the 0/1 try/catch flag). Values absent from ``reference`` become NaN.
    Returns a new frame; neither argument is modified.
    """
    wasKilled = reference["killed"] == 1
    encoded = target.copy()
    for column in _FEATURE_COLUMNS:
        rates = wasKilled.groupby(reference[column]).mean()
        encoded[column] = target[column].map(rates)
    return encoded


def trainAlgorithm(projects, classificationAlgo=LogisticRegression, classAlgoParameters=[], parameters=[0.05, 5, 3, 3, 3, 3, 2, 2, 0.4]):
    """Train a classifier that predicts whether a mutant is killed.

    For every project the mutants are clustered, repeatedly sampled, and each
    sample is encoded as kill rates per feature value; all samples of all
    projects form the training set.

    Parameters:
        projects: project directory names below ``projects/``. Projects whose
            ``characteristics.csv`` or ``killed.csv`` is missing are skipped.
        classificationAlgo: estimator class; instantiated as
            ``classificationAlgo(*classAlgoParameters)`` and fitted with
            ``fit(X, y)``.
        classAlgoParameters: positional arguments for the estimator.
        parameters: nine values, in order: ``localityReduction`` (number of
            locality clusters = ceil(mutants * localityReduction)),
            ``n_localVarsClusters``, the sampling quotas ``perOperator``,
            ``perOpcode``, ``perReturn``, ``perTryCatch``, ``perLocCluster``,
            ``perVarsCluster``, and ``samplingStop`` (sampling rounds repeat
            until the number of drawn ids exceeds ``samplingStop * mutants``).

    Returns:
        The fitted classifier.

    Raises:
        ValueError: if no project yielded any training data.
    """
    (localityReduction, n_localVarsClusters, perOperator, perOpcode, perReturn,
     perTryCatch, perLocCluster, perVarsCluster, samplingStop) = parameters
    quotas = (perOperator, perOpcode, perReturn, perTryCatch, perLocCluster, perVarsCluster)

    classificationTraining = []
    start_time = time.time()  # keeps the timing print below valid for an empty project list
    for project in projects:
        print("Starting training project: " + project)
        start_time = time.time()
        frames = _load_project_frames(project)
        if frames is None:
            continue
        data, results = frames
        training = _cluster_features(data, results, localityReduction, n_localVarsClusters)
        if training.empty:
            # Nothing can be sampled, so the stop condition below would never be met.
            print("No mutants with known results for project: " + project)
            continue

        print(time.time() - start_time)
        print("Starting creation of classification training set for project: " + project)
        start_time = time.time()

        sampleBudget = samplingStop * len(training)
        drawnSoFar = 0  # counts drawn ids including duplicates, across rounds
        mutantsSampleDfs = []
        i = 0
        while True:
            print("iteration = ", i)
            i += 1
            print(sampleBudget, drawnSoFar)
            mutantsSample = _sample_mutant_ids(training, quotas)
            drawnSoFar += len(mutantsSample)

            trainingSample = training[training["id"].isin(mutantsSample)]
            mutantsSampleDfs.append(_encode_kill_rates(trainingSample, trainingSample))
            if sampleBudget < drawnSoFar:
                break

        classificationTraining.append(pd.concat(mutantsSampleDfs).drop(columns=["id"]))

    print("Done with getting all classification training data.")
    print(time.time() - start_time)

    if not classificationTraining:
        raise ValueError("No training data could be built for projects: " + ", ".join(projects))

    classificationTrainingDf = pd.concat(classificationTraining)
    X_train = classificationTrainingDf.drop(columns=["killed"]).values.tolist()
    y_train = classificationTrainingDf["killed"].tolist()

    print("Starting training of algorithm.")
    start_time = time.time()
    classifier = classificationAlgo(*classAlgoParameters)
    classifier.fit(X_train, y_train)

    print(time.time() - start_time)
    return classifier

def own_predict(classifier, project, parameters=[0.05, 5, 3, 3, 3, 3, 2, 2, 0.01]):
    """Predict the kill status of every mutant of ``project``.

    One sample of mutants is drawn; the kill rates observed in that sample
    encode the features of all mutants, which are then passed to
    ``classifier.predict``.

    Parameters:
        classifier: fitted estimator exposing ``predict``.
        project: project directory name below ``projects/``.
        parameters: same nine values as for ``trainAlgorithm``; the last one
            (``samplingStop``) is not used here.

    Returns:
        The result of ``classifier.predict`` (one entry per mutant that has a
        row in both CSV files), or ``1`` when a required CSV file is missing.
    """
    (localityReduction, n_localVarsClusters, perOperator, perOpcode, perReturn,
     perTryCatch, perLocCluster, perVarsCluster, samplingStop) = parameters
    quotas = (perOperator, perOpcode, perReturn, perTryCatch, perLocCluster, perVarsCluster)

    print("Starting prediction project: " + project)
    start_time = time.time()
    # Normally one would need to execute the sampled mutants during prediction
    # instead of reading their results from killed.csv.
    frames = _load_project_frames(project)
    if frames is None:
        return 1
    data, results = frames
    training = _cluster_features(data, results, localityReduction, n_localVarsClusters)

    mutantsSample = _sample_mutant_ids(training, quotas)
    trainingSample = training[training["id"].isin(mutantsSample)]
    print("reduction = " + str(1 / len(training) * len(trainingSample)))

    # Kill rates come from the sample only and are applied to every mutant.
    features = _encode_kill_rates(trainingSample, training).drop(columns=["id", "killed"])
    predictResults = classifier.predict(features)
    print("Prediction for project " + project + " took: " + str(time.time() - start_time) + " seconds.")
    return predictResults

def precisionCalc(project, predictions):
    """Compare predictions with the recorded kill results of ``project``.

    Writes ``projects/<project>/predictions.csv`` (columns ``id``,
    ``prediction``, ``killed``) and prints the fraction of rows whose
    prediction equals ``killed``. Returns ``1`` after printing a message when
    ``characteristics.csv`` or ``killed.csv`` cannot be found; otherwise
    returns ``None``.
    """
    projectDir = "projects/" + project
    matches = find_files("characteristics.csv", projectDir)
    if not matches:
        print("Could not find characteristics.csv for project: " + project)
        return 1

    characteristicsFile = matches[0]
    columnNames = ["id", "mutOperator", "opcode", "returnType",
                   "localVarsCount", "isInTryCatch", "isInFinalBlock",
                   "className", "methodName", "blockNumber", "lineNumber"]
    data = pd.read_csv(characteristicsFile, names=columnNames, skiprows=1)

    # Normally 1 would need to execute during prediction instead of checking the file for results.
    killedPath = characteristicsFile.rpartition("/")[0] + "/killed.csv"
    if not path.exists(killedPath):
        print("Could not find killed.csv for project: " + project)
        return 1
    results = pd.read_csv(killedPath, names=["id", "killed", "numTests"], skiprows=1)
    results = results.drop(columns=["numTests"])

    data["prediction"] = predictions
    merged = data[["id", "prediction"]].merge(results, how="inner", on="id")
    merged.to_csv(projectDir + "/predictions.csv", sep=",", index=False)

    actual = merged["killed"].tolist()
    predicted = merged["prediction"].tolist()
    total = len(actual)
    agreeing = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)
    print(1 / total * agreeing)
    

In [3]:
# Earlier project selections, kept for reference:
# skipped = ["zxing", "commons-lang", "jodatime", "jfreechart", ]
# projects = ["google-auto-service", "google-auto-common", "scribejava-core", "google-auto-factory", "commons-csv",
#                 "commons-cli", "google-auto-value", "gson", "commons-io","commons-text", "commonc-codec", ]
# projects1 = ["commons-text", "commonc-codec", ]

# Thirty integer seeds (not referenced by the code visible in this section).
seeds = [
    66304, 16389, 14706, 91254, 49890, 86054,
    55284, 77324, 36147, 13506, 73920, 80157,
    43981, 75358, 33399, 56134, 13388, 81617,
    90957, 52113, 20428, 26482, 56340, 31018,
    32067, 13067, 8339, 49008, 125894, 68282,
]

# Projects whose sampled mutants are used to fit the classifier.
trainingProjects = [
    "google-auto-service",
    "scribejava-core",
    "commons-cli",
    "google-auto-value",
]

# Projects the fitted classifier is evaluated on.
projects = [
    "google-auto-factory",
    "google-auto-common",
    "commons-csv",
]

# Projects currently left out of both training and prediction.
unused = [
    "gson",
    "commons-io",
    "commons-text",
    "commons-codec",
]

In [4]:
# Silence every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')
print("Starting training")
# Fit the classifier on the projects listed in trainingProjects.
classifier = trainAlgorithm(trainingProjects)
# Persist the fitted classifier to classifier.pkl.
with open("classifier.pkl", 'wb') as outp:  # Overwrites any existing file.
    pickle.dump(classifier, outp, pickle.HIGHEST_PROTOCOL)
print("Starting predictions")
for project in projects:
    # own_predict returns the classifier's predictions, or 1 when a required CSV file is missing.
    results = own_predict(classifier, project)
    print("Results for project: " + project)
    # Writes projects/<project>/predictions.csv and prints the share of predictions matching killed.csv.
    precisionCalc(project, results)

Starting training
Starting training project: google-auto-service
0.033617496490478516
Starting creation of classification training set for project: google-auto-service
iteration =  0
190.0 0
Starting training project: scribejava-core
0.9533677101135254
Starting creation of classification training set for project: scribejava-core
iteration =  0
2298.4 0
iteration =  1
2298.4 767
iteration =  2
2298.4 1534
Starting training project: commons-cli
1.2661919593811035
Starting creation of classification training set for project: commons-cli
iteration =  0
2876.0 0
iteration =  1
2876.0 999
iteration =  2
2876.0 1998
Starting training project: google-auto-value
7.69359564781189
Starting creation of classification training set for project: google-auto-value
iteration =  0
6698.400000000001 0
iteration =  1
6698.400000000001 1899
iteration =  2
6698.400000000001 3798
iteration =  3
6698.400000000001 5697
Done with getting all classification training data.
4.3780357837677
Starting training of alg