This file was made to calculate the impact that more rigorous pre-processing and center selection have on the precision of Rasjaad Basarat's Clustered Mutation Testing algorithm.

More in-depth discussion is in Adam Abdalla's thesis.

In [None]:
import math
import subprocess
import numpy as np
from os import path
import os
import csv
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import time
import warnings
import pickle
from hyperopt import hp, fmin, tpe

In [None]:
# Source: tutorialspoint.com/file-searching-using-python
def find_files(filename: str, search_path: str) -> list:
    """Collect the path of every file named ``filename`` below ``search_path``.

    The directory tree is traversed with ``os.walk`` and one path is produced
    for each directory that directly contains a file with that exact name.

    Args:
        filename: Exact file name to look for (no wildcard matching).
        search_path: Directory at which the traversal starts.

    Returns:
        The matching paths in traversal order; an empty list when nothing
        matches.
    """
    return [
        path.join(directory, filename)
        for directory, _subdirs, entries in os.walk(search_path)
        if filename in entries
    ]


# Gets characteristics and result status data from project file
# created by pitest clustering plugin.
def getProjectDfs(project: str) -> tuple[pd.DataFrame, pd.DataFrame] | int:
    """Load the mutant characteristics and kill results of one project.

    Both CSV files are searched for anywhere below ``projects/<project>``;
    when several files match, the first one found is used.

    Args:
        project: Name of the project directory inside ``projects/``.

    Returns:
        ``(data, results)`` where ``data`` holds the mutant characteristics
        and ``results`` the ``id`` and ``killed`` columns, or ``-1`` (after
        printing a message) when either CSV file cannot be found.
    """
    project_dir = "projects/" + project

    char_matches = find_files("characteristics.csv", project_dir)
    if not char_matches:
        print("Could not find characteristics.csv for project: " + project)
        return -1

    char_columns = [
        "id", "mutOperator", "opcode", "returnType",
        "localVarsCount", "isInTryCatch", "isInFinalBlock",
        "className", "methodName", "blockNumber", "lineNumber",
    ]
    # The first row of the file is skipped (presumably its own header) and
    # replaced by the column names above.
    data = pd.read_csv(char_matches[0], names=char_columns, skiprows=1)

    # Normally one would need to execute the mutants during prediction
    # instead of reading their results from a file.
    killed_matches = find_files("killed.csv", project_dir)
    if not killed_matches:
        print("Could not find killed.csv for project: " + project)
        return -1

    results = pd.read_csv(
        killed_matches[0],
        names=["id", "killed", "numTests"],
        skiprows=1,
    )
    return data, results.drop(columns=["numTests"])


# Source: https://stackoverflow.com/a/29651514
def normalize(df: pd.DataFrame) -> pd.DataFrame:
    """Min-max scale every column of ``df`` to the range [0, 1].

    A column whose values are all identical has no spread; dividing by its
    zero range would fill it with NaN (and NaN input makes the clustering
    fail), so such a column is set to 0.0 instead.

    Args:
        df: Frame with numeric columns. It is not modified.

    Returns:
        A copy of ``df`` in which each column is ``(x - min) / (max - min)``,
        or all 0.0 for a constant column.
    """
    result = df.copy()

    for feature_name in df.columns:
        column = df[feature_name]
        min_value = column.min()
        value_range = column.max() - min_value

        if value_range == 0:
            # Constant column: carries no information, avoid 0/0 -> NaN.
            result[feature_name] = 0.0
        else:
            result[feature_name] = (column - min_value) / value_range

    return result


# Transforms the given dataframe for better clustering as described in Adam's thesis.
def preprocessing(data: pd.DataFrame, weights: list) -> pd.DataFrame:
    """Build the weighted feature matrix used for clustering (see Adam's thesis).

    Side effect: a ``locality`` column is added to ``data`` in place.

    Args:
        data: Mutant characteristics. ``className`` and ``methodName`` are
            expected to be numeric already, since they enter the locality
            arithmetic below.
        weights: Eight weights, in the order id, mutOperator, opcode,
            returnType, localVarsCount, isInTryCatch, blockNumber, locality.

    Returns:
        A new frame holding the six normalized and weighted numeric columns,
        followed by the weighted one-hot columns of ``mutOperator`` and of
        ``returnType``.
    """
    idW, mutOpW, opcodeW, retTypeW, locVarsCountW, tryCatchW, bNumberW, localityW = weights

    # Locality ranks class first, then method, then line: the class term is
    # scaled by 10^5 and the method term by 10^3 so that they dominate the
    # line number.
    class_term = data["className"] * 100000
    method_term = data["methodName"] * 1000
    data["locality"] = class_term + method_term + data["lineNumber"]

    # Numeric columns paired with their weight; insertion order fixes both
    # the column order and the order of the multipliers.
    column_weights = {
        "id": idW,
        "opcode": opcodeW,
        "localVarsCount": locVarsCountW,
        "isInTryCatch": tryCatchW,
        "blockNumber": bNumberW,
        "locality": localityW,
    }

    # Normalizing removes the original ranges. That *may* be undesirable for
    # some columns, e.g. locality matters relatively less in a large project.
    scaled = normalize(data[list(column_weights)])
    weighted = scaled.mul(list(column_weights.values()))

    # Categorical characteristics are one-hot encoded and weighted as well.
    mut_operator_columns = pd.get_dummies(data["mutOperator"]) * mutOpW
    return_type_columns = pd.get_dummies(data["returnType"]) * retTypeW

    return weighted.join(mut_operator_columns).join(return_type_columns)


# Does not truly execute mutants, just calculates the precision doing so would cause.
def execute_mutants(distances: np.ndarray, data: pd.DataFrame, centerSelection: int, preprocessing: int, seed: int) -> float:
    """Simulate clustered mutant execution and return the resulting precision.

    No mutant is actually executed: every mutant receives the known
    ``killed`` status of its cluster's representative as prediction, and the
    share of predictions that match the real status is returned.

    Args:
        distances: Matrix with one row per mutant and one column per cluster
            center, holding the mutant-to-center distances. Rows must be in
            the same order as the rows of ``data``.
        data: Frame with at least the columns ``id`` and ``killed``.
        centerSelection: If truthy, a cluster is represented by the mutant
            closest to its center (searched among all mutants, not only the
            cluster's members). Otherwise a random member is drawn.
        preprocessing: Unused; kept for interface compatibility.
        seed: ``random_state`` for the random draw; ignored when
            ``centerSelection`` is truthy.

    Returns:
        Fraction of mutants whose predicted status equals the real status.
    """
    df = data[["id", "killed"]].copy()

    # A mutant belongs to the cluster whose center is closest to it.
    df["cluster_id"] = np.argmin(distances, axis=1)

    for cluster_id in range(distances.shape[1]):
        members = df["cluster_id"] == cluster_id
        if not members.any():
            # No mutant was assigned to this cluster: there is nothing to
            # predict, and sampling from an empty frame would raise.
            continue

        if centerSelection:
            # Representative = mutant with the smallest distance in this
            # cluster's column.
            killed = df.iloc[np.argmin(distances[:, cluster_id])]["killed"]
        else:
            # Representative = random member of the cluster.
            killed = df[members].sample(random_state=seed).iloc[0]["killed"]

        # Every member inherits the status of the representative.
        df.loc[members, "prediction"] = killed

    actual_and_predicted = zip(df["killed"].tolist(), df["prediction"].tolist())
    correct = sum(1 for actual, predicted in actual_and_predicted if actual == predicted)

    # A single division avoids the rounding error of 1 / total * correct.
    return correct / len(df)


# Calculates the precision for each reduction and for each seed if center selection is off.
def calcPrecicions(data: pd.DataFrame, reductions: list, encoder: LabelEncoder, newData: pd.DataFrame, centerSelection: int, preprocess: int, seeds: list) -> list:
    """Compute the simulated precision for every reduction factor.

    For each reduction the feature matrix is clustered with KMeans into
    ``ceil(len(data) * reduction)`` clusters and the precision is obtained
    from ``execute_mutants``.

    Args:
        data: Mutant frame whose ``id`` column is label-encoded.
        reductions: Factors applied to the number of mutants to obtain the
            number of clusters.
        encoder: Encoder whose last fit was on the ``id`` column, so that it
            can restore the original ids.
        newData: Feature matrix that is clustered.
        centerSelection: Passed on to ``execute_mutants``; also decides the
            shape of the result (see Returns).
        preprocess: Passed on to ``execute_mutants``.
        seeds: Seeds for the random representative draw.

    Returns:
        One entry per reduction: a single precision when ``centerSelection``
        is truthy, otherwise a list with one precision per seed.
    """
    precisions = []

    for reduction in reductions:
        cluster_count = int(math.ceil(len(data) * reduction))

        # N x M matrix of distances to the cluster centers,
        # with N = number of mutants and M = number of clusters.
        distances = KMeans(n_clusters=cluster_count, n_init=1).fit_transform(newData)

        # Restore the original ids on a copy, so that the encoded ids in
        # `data` remain available for the next reduction.
        labelled = data.copy()
        labelled["id"] = encoder.inverse_transform(labelled["id"])

        if centerSelection:
            # The seed is not used with center selection; the whole list is
            # simply passed through.
            precisions.append(
                execute_mutants(distances, labelled, centerSelection, preprocess, seeds)
            )
        else:
            precisions.append([
                execute_mutants(distances, labelled, centerSelection, preprocess, seed)
                for seed in seeds
            ])

    return precisions


def clusteredTestingSimulation(project: str, seeds: list, reductions: list, parameters: list, centerSelection: int = 0, preprocess: int = 0) -> list | int:
    """Run the clustered mutation testing simulation for one project.

    Args:
        project: Name of the project directory inside ``projects/``.
        seeds: Seeds for the random cluster representative draw.
        reductions: Factors applied to the number of mutants to obtain the
            number of clusters.
        parameters: The eight feature weights handed to ``preprocessing``;
            only used when ``preprocess`` is truthy.
        centerSelection: If truthy, the mutant closest to a cluster center
            represents the cluster instead of a random member.
        preprocess: If truthy, Adam's rigorous preprocessing is applied;
            otherwise the original preprocessing (label-encode everything).

    Returns:
        The precisions computed by ``calcPrecicions`` (one entry per
        reduction), or ``-1`` when the project data could not be loaded.
    """
    # Gets characteristics and result status data from the project files.
    dataframes = getProjectDfs(project)
    if dataframes == -1:
        return -1

    data, results = dataframes
    data = data.merge(results, how="inner", on="id")

    # With the rigorous preprocessing, mutOperator and returnType stay
    # categorical because they are one-hot encoded later on.
    if preprocess:
        encoded_columns = ["className", "methodName", "id"]
    else:
        encoded_columns = ["mutOperator", "returnType", "className", "methodName", "id"]

    # One encoder is re-fitted per column. "id" must come last: the encoder
    # only remembers its latest fit, and the ids are inverse-transformed
    # afterwards.
    encoder = LabelEncoder()
    for col in encoded_columns:
        data[col] = encoder.fit_transform(data[col])

    if preprocess:
        newData = preprocessing(data, parameters)
    else:
        # The result status must not be part of the clustered features.
        newData = data.drop(columns=["killed"])

    return calcPrecicions(data, reductions, encoder, newData, centerSelection, preprocess, seeds)


Parameter cell down below. Note that these are not the only parameters you might want to change.

In [None]:
# Random states for drawing a random cluster representative;
# one precision value is computed per seed.
seeds = [
    66304, 16389, 14706, 91254, 49890, 86054,
    55284, 77324, 36147, 13506, 73920, 80157,
    43981, 75358, 33399, 56134, 13388, 81617,
    90957, 52113, 20428, 26482, 56340, 31018,
    32067, 13067, 8339, 49008, 125894, 68282,
]

# Directory names below "projects/" that are evaluated.
projects = [
    "google-auto-common",
    "scribejava-core",
    "google-auto-factory",
    "commons-csv",
    "commons-cli",
    "google-auto-value",
    "gson",
    "commons-io",
    "commons-text",
    "commons-codec",
]

# Number of clusters = ceil(number of mutants * reduction).
reductions = [0.5, 0.25, 0.1]

In [None]:
# Silences every warning for the rest of the session to keep the cell output
# readable. NOTE(review): this also hides warnings that may matter (e.g.
# from pandas or scikit-learn); consider narrowing the filter.
warnings.filterwarnings(action="ignore")

The cell below calculates the precisions for every method, for every reduction, for every seed and for every project, and prints them.

In [None]:
# Result containers; not filled by this cell.
csImprovements, ppImprovements, csppImprovements = [], [], []

for project in projects:
    # Parameters used in Adam's thesis, picked from an incomplete hyperopt run.
    parameters = [20491, 13, 44, 4584, 10188, 14214, 2443, 15598]

    # Baseline: original preprocessing, random cluster representative.
    originalPrecisions = clusteredTestingSimulation(project, seeds, reductions, parameters)
    if originalPrecisions == -1:
        # The project data could not be loaded; skip this project.
        continue

    # The three variants: center selection only, preprocessing only, both.
    csPrecisions = clusteredTestingSimulation(
        project, seeds, reductions, parameters, centerSelection=1)
    ppPrecisions = clusteredTestingSimulation(
        project, seeds, reductions, parameters, centerSelection=0, preprocess=1)
    csppPrecisions = clusteredTestingSimulation(
        project, seeds, reductions, parameters, centerSelection=1, preprocess=1)

    print(
        f"Project = {project}",
        f"Original precisions = {originalPrecisions}",
        f"Precisions with center selection = {csPrecisions}",
        f"Precisions with preprocessing = {ppPrecisions}",
        f"Precisions with preprocessing and center selection = {csppPrecisions}",
        sep="\n",
    )

We used this cell for our hyperopt runs. The variables you might want to change here are those inside the space, max_evals and the reductions (in the parameter cell above).
You may also want to work with fewer seeds/projects.

In [None]:
# def objective(args):
#     parameters = [args["idW"], args["mutOpW"], args["opcodeW"], args["retTypeW"], args["locVarsCountW"], args["tryCatchW"], args["bNumberW"], args["localityW"]]
#     print(parameters)
#     precisions = []
#
#     for project in projects:
#         meanPrec = np.mean(np.concatenate(clusteredTestingSimulation(project, seeds, reductions, parameters, preprocess=1)).flat)
#         if meanPrec != -1:
#             precisions.append(meanPrec)

#     # Using scipy.stats.hmean (harmonic mean; NumPy has no hmean) here instead could lead to more consistent performance across projects.
#     print("Score = " + str(100 - np.mean(precisions)))
#     return 100 - np.mean(precisions)

# space = {"idW": hp.randint("idW", 5001, 15001),
#          "mutOpW": hp.randint("mutOpW", 1, 201),
#          "opcodeW": hp.randint("opcodeW", 1, 10001),
#          "retTypeW": hp.randint("retTypeW", 1, 10001),
#          "locVarsCountW": hp.randint("locVarsCountW", 8001, 15001),
#          "tryCatchW":hp.randint("tryCatchW", 8001, 15001),
#          "bNumberW":hp.randint("bNumberW", 1, 10001),
#          "localityW": hp.randint("localityW", 5001, 10001)}

# best = fmin(objective, space, algo=tpe.suggest, max_evals=1)

# with open("best.pkl", 'wb') as outp:  # Overwrites any existing file.
#     pickle.dump(best, outp, pickle.HIGHEST_PROTOCOL)

# print(best)