This file calculates the precision of Contextual Predictive Mutation Testing (CPMT) in its original form.

More in-depth discussion is in Adam Abdalla's thesis.

In [None]:
import math
import numpy as np
from numpy.random import randint
import pickle
from os import path
import os
from random import sample, seed
import csv
import pandas as pd
import kmeans1d
from hyperopt import hp, fmin, tpe
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
import time
import warnings

In [None]:
# Source: tutorialspoint.com/file-searching-using-python
def find_files(filename, search_path):
    """Return the path of every file named ``filename`` below ``search_path``.

    The directory tree is walked top-down with ``os.walk``. Each directory
    whose file list contains ``filename`` contributes one joined path, in
    walk order. The result is an empty list when nothing matches.
    """
    return [
        os.path.join(directory, filename)
        for directory, _, names in os.walk(search_path)
        if filename in names
    ]


# Gets characteristics and result status data from project file
# created by pitest clustering plugin.
def getProjectDfs(project: str) -> tuple[pd.DataFrame, pd.DataFrame] | int:
    """Load the characteristics and kill results of one project.

    Both CSV files are searched for anywhere below ``projects/<project>``;
    the first match of each is read, skipping its first row and applying
    the fixed column names below.

    Returns:
        ``(data, results)`` where ``data`` holds the mutant characteristics
        and ``results`` holds the ``id`` and ``killed`` columns, or ``-1``
        (after printing a message) when either file cannot be found.
    """
    projectDir = "projects/" + project

    characteristicsMatches = find_files("characteristics.csv", projectDir)
    if not characteristicsMatches:
        print("Could not find characteristics.csv for project: " + project)
        return -1

    characteristicColumns = [
        "id", "mutOperator", "opcode", "returnType",
        "localVarsCount", "isInTryCatch", "isInFinalBlock",
        "className", "methodName", "blockNumber", "lineNumber",
    ]
    data = pd.read_csv(characteristicsMatches[0], names=characteristicColumns, skiprows=1)

    killedMatches = find_files("killed.csv", projectDir)
    if not killedMatches:
        print("Could not find killed.csv for project: " + project)
        return -1

    killedFrame = pd.read_csv(killedMatches[0], names=["id", "killed", "numTests"], skiprows=1)
    # The number of tests is not used downstream, only the killed flag.
    results = killedFrame.drop(columns=["numTests"])

    return data, results


# Uses label encoding and clustering to change numerical data to categorical.
# Also merges data with results.
def dfToCategorical(data: pd.DataFrame, results: pd.DataFrame, parameters: list) -> pd.DataFrame:
    """Attach cluster-based categorical columns to the mutants and merge in the results.

    Three 1-D k-means clusterings are computed over all rows of ``data``:

    * ``idCluster_id``: clusters of the label-encoded mutant ``id``;
    * ``localityCluster_id``: clusters of a single integer locality key built
      from the encoded class name, encoded method name and line number;
    * ``varsCluster_id``: clusters of ``localVarsCount``.

    Args:
        data: Mutant characteristics; must contain ``id``, ``mutOperator``,
            ``opcode``, ``returnType``, ``localVarsCount``, ``isInTryCatch``,
            ``className``, ``methodName`` and ``lineNumber``.
        results: Frame with at least ``id`` and ``killed``.
        parameters: ``[idReduction, localityReduction, n_localVarsClusters]``.
            The two reductions are fractions of ``len(data)`` that set the
            number of id and locality clusters (rounded up).

    Returns:
        A new frame with ``id``, ``mutOperator``, ``opcode``, ``returnType``,
        ``isInTryCatch`` and the three cluster columns, inner-joined with
        ``results`` on ``id``. ``data`` itself is not modified.
    """
    idReduction, localityReduction, n_localVarsClusters = parameters
    nMutants = len(data)

    def encoded(column: str) -> np.ndarray:
        # Integer label codes of one column as a flat int64 vector.
        return np.asarray(LabelEncoder().fit_transform(data[column]), dtype="int64")

    # kmeans1d expects a flat sequence of numbers, so plain 1-D lists are passed
    # instead of (n, 1) arrays whose rows would have to be coerced to scalars.
    idClustering = kmeans1d.cluster(
        encoded("id").tolist(), int(math.ceil(nMutants * idReduction)))[0]

    # Categorical locality variable creation: class, method and line are packed
    # into one integer so that a single 1-D clustering groups nearby mutants.
    localityKey = (encoded("className") * 100000
                   + encoded("methodName") * 1000
                   + data["lineNumber"].to_numpy(dtype="int64"))
    localityClustering = kmeans1d.cluster(
        localityKey.tolist(), int(math.ceil(nMutants * localityReduction)))[0]

    varsClustering = kmeans1d.cluster(
        data["localVarsCount"].to_numpy(dtype="int64").tolist(), n_localVarsClusters)[0]

    # Explicit copy: the new columns are written to an independent frame, not
    # to a slice of ``data``.
    training = data[["id", "mutOperator", "opcode", "returnType", "isInTryCatch"]].copy()
    training["idCluster_id"] = idClustering
    training["localityCluster_id"] = localityClustering
    training["varsCluster_id"] = varsClustering
    training = training.merge(results, how="inner", on="id")

    return training


# Selects the sample of mutants that would be executed by CPMT in practice.
# Does so by selecting a random, parameter-specified number of mutants per characteristic.
def sampleSelector(fullDF: pd.DataFrame, parameters: list) -> list:
    """Pick the mutant ids that CPMT would actually execute.

    For every distinct value of every characteristic, up to a
    parameter-specified number of mutants having that value is drawn at random
    (module-level ``random.sample``, so the global seed applies).

    Args:
        fullDF: Frame with ``id`` plus the columns ``idCluster_id``,
            ``mutOperator``, ``opcode``, ``returnType``, ``isInTryCatch``,
            ``localityCluster_id`` and ``varsCluster_id``.
        parameters: ``[perOperator, perOpcode, perReturn, perTryCatch,
            perIDCluster, perLocCluster, perVarsCluster]``.

    Returns:
        The sampled ids. Draws for different characteristics are independent,
        so the same id can appear more than once.
    """
    perOperator, perOpcode, perReturn, perTryCatch, perIDCluster, perLocCluster, perVarsCluster = parameters

    # (column, values to sample for, mutants per value). ``None`` means "every
    # distinct value in first-appearance order". The order of this plan fixes
    # the order in which random numbers are consumed.
    samplingPlan = [
        ("idCluster_id", None, perIDCluster),
        ("mutOperator", None, perOperator),
        ("opcode", None, perOpcode),
        ("returnType", None, perReturn),
        ("isInTryCatch", (0, 1), perTryCatch),
        ("localityCluster_id", None, perLocCluster),
        ("varsCluster_id", None, perVarsCluster),
    ]

    mutantsSample = []
    for column, values, perValue in samplingPlan:
        if values is None:
            values = list(dict.fromkeys(fullDF[column].tolist()))
        for value in values:
            # Each column is filtered on its own value (the opcode draw used to
            # filter on the last mutation operator instead).
            candidates = fullDF.loc[fullDF[column] == value, "id"].tolist()
            mutantsSample.extend(sample(candidates, min(len(candidates), perValue)))

    return mutantsSample


# For every possible value of every characteristic, takes all mutants with that value in the sampleDF.
# Calculates the percentage of those mutants which are killed, aka pk-score.
# Overwrites the value in the fullDF with the pk-score. Returns new fullDF.
def pkScoreTransformer(fullDF: pd.DataFrame, sampleDF: pd.DataFrame) -> pd.DataFrame:
    """Replace every characteristic value in ``fullDF`` by its pk-score.

    The pk-score of a value is the fraction of the mutants in ``sampleDF``
    having that value whose ``killed`` flag equals 1.

    Args:
        fullDF: Frame whose characteristic columns are overwritten in place.
        sampleDF: The "executed" mutants; needs the same characteristic
            columns plus ``killed``.

    Returns:
        ``fullDF`` (the same object) with the seven characteristic columns
        holding pk-scores. A value that does not occur in ``sampleDF`` has no
        pk-score and is left unchanged.
    """
    characteristics = ["idCluster_id", "mutOperator", "opcode", "returnType",
                       "isInTryCatch", "localityCluster_id", "varsCluster_id"]
    sampleKilled = sampleDF["killed"] == 1

    for column in characteristics:
        # Mean of a boolean series = share of killed mutants per value.
        pkScores = sampleKilled.groupby(sampleDF[column]).mean()
        rawValues = fullDF[column]
        # All rows are mapped from the untouched raw values in one step, so a
        # freshly written score (e.g. 1.0) can never be mistaken for a raw
        # value (e.g. cluster id 1) and overwritten a second time.
        scored = rawValues.map(pkScores)
        # Values absent from the sample map to NaN; keep their raw value
        # instead of dividing by an empty group.
        fullDF[column] = scored.where(scored.notna(), rawValues)

    return fullDF


# Takes a list of projects, transforms their characteristic data based on parameters
# into a single CPMT-ready dataframe. Trains the given classifier on the dataframe.
def trainAlgorithm(projects: list, classifier: any, parameters: list) -> any:
    """Build the CPMT training set from ``projects`` and fit ``classifier`` on it.

    Args:
        projects: Project names; a project whose data files cannot be loaded
            is skipped.
        classifier: Object with a scikit-learn style ``fit(X, y)`` method.
        parameters: The first three entries go to ``dfToCategorical``, the
            last entry is ``samplingStop`` and everything in between goes to
            ``sampleSelector``.

    Returns:
        The fitted ``classifier`` (same object).

    Raises:
        ValueError: If none of the projects yields training data.
    """
    samplingStop = parameters[-1]
    categoricalParameters = parameters[:3]
    samplingParameters = parameters[3:-1]

    classificationTraining = []
    # Started before the loop so the reported time covers all projects and the
    # variable exists even when every project is skipped.
    start_time = time.time()
    for project in projects:
        print("Starting training project: " + project)
        dataframes = getProjectDfs(project)
        if dataframes == -1:
            continue
        data, results = dataframes

        training = dfToCategorical(data, results, categoricalParameters)
        print("Starting creation of classification training set for project: " + project)

        # For every project, a CPMT-ready dataframe is inserted
        # multiple times to produce subsampling. samplingStop controls how many:
        # sampling repeats until the total number of sampled mutants exceeds
        # samplingStop * (number of mutants in the project).
        sampledBudget = samplingStop * len(training)
        sampledSoFar = 0
        mutantsSampleDfs = []
        iteration = 0
        while True:
            print("iteration = ", iteration)
            iteration += 1
            print(sampledBudget, sampledSoFar)

            mutantsSample = sampleSelector(training, samplingParameters)
            sampledSoFar += len(mutantsSample)

            trainingSample = training[training["id"].isin(mutantsSample)]
            mutantsSampleDfs.append(pkScoreTransformer(training.copy(), trainingSample))

            # Basing it on the total sampled count means lower reductions -> more subsampling.
            # Might want to experiment with different subsampling if classifier gets more complex.
            # An empty sample can never reach the budget, so stop instead of looping forever.
            if not mutantsSample or sampledBudget < sampledSoFar:
                break

        # All dataframes of 1 project get combined.
        classificationTraining.append(pd.concat(mutantsSampleDfs).drop(columns=["id"]))

    if not classificationTraining:
        raise ValueError("No training data could be loaded for projects: " + str(projects))

    print("Done with getting all classification training data.")
    print(time.time() - start_time)

    # Dataframes of all projects get combined.
    classificationTrainingDf = pd.concat(classificationTraining)
    X_train = classificationTrainingDf.drop(columns=["killed"]).values.tolist()
    y_train = classificationTrainingDf["killed"].tolist()

    print("Starting training of classifier.")
    start_time = time.time()
    classifier.fit(X_train, y_train)

    print(time.time() - start_time)
    return classifier


# Predicts the result of every mutant in the given project using CPMT.
def own_predict(classifier: any, project: str, parameters: list, useSampled: bool=False) -> tuple[np.ndarray, float]:
    """Predict the kill status of every mutant of ``project`` with CPMT.

    Args:
        classifier: Fitted object with a scikit-learn style ``predict``.
        project: Project name, resolved by ``getProjectDfs``.
        parameters: Same layout as for ``trainAlgorithm``: the first three
            entries go to ``dfToCategorical``, the last entry is skipped and
            the ones in between go to ``sampleSelector``.
        useSampled: When true, the predictions of the sampled ("executed")
            mutants are overwritten with their known results.

    Returns:
        ``(predictions, reduction)`` where ``reduction`` is the number of
        sampled ids (repeats included) divided by the number of mutants, or
        ``-1`` when the project data cannot be loaded.
    """
    print("Starting prediction project: " + project)
    start_time = time.time()
    dataframes = getProjectDfs(project)
    if dataframes == -1:
        return -1
    data, results = dataframes

    # Transformation from raw characteristic data to CPMT-ready data.
    fullDF = dfToCategorical(data, results, parameters[:3])
    sampleMutantsIDs = sampleSelector(fullDF, parameters[3:-1])
    sampleDF = fullDF[fullDF["id"].isin(sampleMutantsIDs)]
    reduction = len(sampleMutantsIDs)/len(fullDF)
    print("reduction = " + str(reduction))
    fullDF = pkScoreTransformer(fullDF, sampleDF)

    # Saves the results of the "executed" mutants to overwrite the predictions.
    # Boolean position masks are used so the overwrite does not depend on the
    # frame's index labels matching array positions.
    if useSampled:
        executedMask = fullDF["id"].isin(sampleMutantsIDs).to_numpy()
        killedMask = executedMask & (fullDF["killed"] == 1).to_numpy()
        survivedMask = executedMask & (fullDF["killed"] == 0).to_numpy()
    features = fullDF.drop(columns=["id", "killed"])

    # Same plain-list representation that trainAlgorithm passes to fit().
    predictResults = classifier.predict(features.values.tolist())

    # For the executed mutants, CPMT would not need to predict their result.
    if useSampled:
        predictResults[killedMask] = 1
        predictResults[survivedMask] = 0
    print("Prediction for project " + project + " took: " + str(time.time() - start_time) + " seconds.")
    return predictResults, reduction


# Calculates the precision: percentage of correct predictions in decimals.
def precisionCalc(project: str, predictions: np.ndarray) -> float:
    """Score ``predictions`` against the recorded results of ``project``.

    The predictions are attached to the project's characteristics rows,
    joined with the results on ``id`` and written to
    ``projects/<project>/predictions.csv``. The "precision" reported here is
    the fraction of rows whose prediction equals the ``killed`` flag.

    Returns:
        That fraction, or ``-1`` when the project data cannot be loaded.
    """
    loaded = getProjectDfs(project)
    if loaded == -1:
        return -1
    characteristics, outcomes = loaded

    characteristics["prediction"] = predictions
    merged = characteristics[["id", "prediction"]].merge(outcomes, how="inner", on="id")
    merged.to_csv(f"projects/{project}/predictions.csv", sep=",", index=False)

    # Count the rows where prediction and recorded outcome agree.
    agreeing = int((merged["killed"] == merged["prediction"]).sum())
    precision = agreeing / len(merged)
    print(f"Precision = {precision}")
    return precision


Parameter cell down below. Note that these are not the only parameters you might want to change.

In [None]:
# Random seeds; tryOut() reseeds Python's `random` module with each one and
# repeats the whole experiment per seed.
seeds = [
     90957, 52113, 20428, 26482, 56340, 31018, 32067, 13067, 8339, 49008, 125894, 68282, 66304, 16389, 14706, 91254, 49890,
    86054, 55284, 77324, 36147, 13506, 73920, 80157, 43981, 75358, 33399, 56134,
    13388, 81617, ]
# trainingProjects = ["google-auto-service", "scribejava-core", "commons-cli",
#                     "google-auto-value","gson", "commons-io", "commons-codec"]
# Project directory names, looked up under "projects/<name>" by getProjectDfs().
projects = ["commons-text", "commons-codec", "google-auto-common", "scribejava-core", "google-auto-factory", "commons-csv",
                "commons-cli", "google-auto-value", "gson", "commons-io", ]
# NOTE(review): this list has 9 entries, while trainAlgorithm()/own_predict()
# slice their parameter list as 3 + 7 + 1 values; tryOut() builds its own list,
# so this global looks unused by the active cells - confirm before relying on it.
parameters = [0.05, 5, 3, 3, 3, 3, 2, 2, 2]
# NOTE(review): tryOut() constructs its own LinearSVC, so this instance also
# appears unused by the active cells - confirm.
classifier = SGDClassifier()

In [None]:
# Suppresses every warning issued from here on.
# NOTE(review): this also hides library diagnostics (e.g. from pandas or
# scikit-learn) that may point at real problems - consider a narrower filter.
warnings.filterwarnings('ignore')

We used this cell for our hyperopt runs. The variables you might want to change here are those inside the space, max_evals, and the reductions (in the cell above).
You may also want to work with fewer seeds/projects.

In [None]:
# def objective(args):
#     parameters = [args["idReduction"], args["localityReduction"], args["n_localVarsClusters"], args["perOperator"], args["perOpcode"], args["perReturn"],
#                   args["perTryCatch"], args["perIDCluster"], args["perLocCluster"], args["perVarsCluster"], args["samplingStop"]]
#     print(args)
#     precisions = []
#     reductions = []
#     for i in seeds:
#         seed(i)
#         classAlgo = LinearSVC(dual=False, tol=args["tol"], C=args["c"])
#         classifier = trainAlgorithm(trainingProjects, classAlgo, parameters=parameters)
#         for project in projects:
#             results, reduction = own_predict(classifier, project, parameters=parameters)
#             precisions.append(precisionCalc(project, results))
#             reductions.append(reduction)
# 
#     # If a reduction passes one of the if-statements,
#     # we negate it with the corresponding Cluster Accuracy
#     # reported by Mouissie to make it a little more fair.
#     # remove reduction if-statements that you don't want.
#     if np.mean(reductions) < 0.11:
#         return 0.7982 - np.mean(precisions)
#     elif np.mean(reductions) < 0.26:
#         return 0.8389 - np.mean(precisions)
#     elif np.mean(reductions) < 0.51:
#         return 0.8640 - np.mean(precisions)
#     else:
#         return 1.1 - np.mean(precisions)

# space = {"idReduction": hp.uniform("idReduction", 0, 0.3),
#          "localityReduction": hp.uniform("localityReduction", 0, 0.1),
#          "n_localVarsClusters": hp.randint("n_localVarsClusters", 1, 11),
#          "perOperator": hp.randint("perOperator", 1, 16),
#          "perOpcode": hp.randint("perOpcode", 1, 41),
#          "perReturn": hp.randint("perReturn", 1, 31),
#          "perTryCatch": hp.randint("perTryCatch", 1, 31),
#          "perIDCluster":hp.randint("perIDCluster", 1, 3),
#          "perLocCluster":hp.randint("perLocCluster", 1, 11),
#          "perVarsCluster": hp.randint("perVarsCluster", 1, 31),
#          "samplingStop": hp.uniform("samplingStop", 0, 1),
#          "tol": hp.uniform("tol", 0.00001, 0.01),
#          "c": hp.uniform("c", 0.05, 1)}

# best = fmin(objective, space, algo=tpe.suggest, max_evals=30)

# with open("best.pkl", 'wb') as outp:  # Overwrites any existing file.
#     pickle.dump(best, outp, pickle.HIGHEST_PROTOCOL)

# print(best)

Passing the printed output of fmin to the tryOut function lets you run CPMT with those parameters.

Below is the cell with 3-Projects training CPMT experiments.

In [None]:
# def tryOut(args):
#     parameters = [args["idReduction"], args["localityReduction"], args["n_localVarsClusters"], args["perOperator"], args["perOpcode"], args["perReturn"],
#                   args["perTryCatch"], args["perIDCluster"], args["perLocCluster"], args["perVarsCluster"], args["samplingStop"]]
#     classAlgo = LinearSVC(dual=False, tol=args["tol"], C=args["c"])
#     precisions = []
#     reductions = []
#     timings = []
#     for pickedSeed in seeds:
#         seed(pickedSeed)
#         print("---------Starting seed: " + str(pickedSeed) + "---------")
#
#         # Every project gets its own training.
#         for i in range(len(projects)):
#             # predictProject is not used for training.
#             predictProject = projects.pop(i)
#             classifier = trainAlgorithm(sample(projects, 3), classAlgo, parameters)
#             start_time = time.time()
#             results, reduction = own_predict(classifier, predictProject, parameters)
#             timings.append(time.time() - start_time)
#             precisions.append(precisionCalc(predictProject, results))
#             reductions.append(reduction)
#             projects.insert(i, predictProject)
#
#     print("RESULTS:")
#     for i in range(len(projects)):
#         print("Average reduction for project: " + projects[i] + ":")
#         print(np.mean(reductions[i:len(reductions):len(seeds)]))
#         print("Average precision for project: " + projects[i] + ":")
#         print(np.mean(precisions[i:len(reductions):len(seeds)]))
#
#     performance = [np.mean(reductions), np.mean(precisions), np.std(precisions), max(precisions), min(precisions)]
#     print("timings: " + str(timings))    
#     print("performance: " + str(performance))

# tryOut({'c': 0.2063328083683676, 'idReduction': 0.01574087592571406, 'localityReduction': 0.019430303075051083, 'n_localVarsClusters': 4, 'perIDCluster': 1, 'perLocCluster': 3, 'perOpcode': 21, 'perOperator': 2, 'perReturn': 27, 'perTryCatch': 14, 'perVarsCluster': 25, 'samplingStop': 0.12912822558556847, 'tol': 0.003700890655623175})

Below is the cell with original CPMT experiments.

In [None]:
def tryOut(args):
    """Run the leave-one-project-out CPMT experiment for one hyperparameter set.

    For every seed and every project, a classifier is trained on all other
    projects and used to predict the held-out project. Per-project and
    overall reduction/precision figures are printed; nothing is returned.

    Args:
        args: Mapping with the keys ``idReduction``, ``localityReduction``,
            ``n_localVarsClusters``, ``perOperator``, ``perOpcode``,
            ``perReturn``, ``perTryCatch``, ``perIDCluster``,
            ``perLocCluster``, ``perVarsCluster``, ``samplingStop`` (CPMT
            parameters) and ``tol``, ``c`` (LinearSVC settings).
    """
    parameters = [args["idReduction"], args["localityReduction"], args["n_localVarsClusters"], args["perOperator"], args["perOpcode"], args["perReturn"],
                  args["perTryCatch"], args["perIDCluster"], args["perLocCluster"], args["perVarsCluster"], args["samplingStop"]]
    classAlgo = LinearSVC(dual=False, tol=args["tol"], C=args["c"])
    precisions = []
    reductions = []
    timings = []
    # Results are also kept per project, so the per-project averages do not
    # depend on the order in which the flat lists were filled.
    projectReductions = {project: [] for project in projects}
    projectPrecisions = {project: [] for project in projects}

    for pickedSeed in seeds:
        seed(pickedSeed)
        print("---------Starting seed: " + str(pickedSeed) + "---------")

        # Every project gets its own training.
        for i, predictProject in enumerate(projects):
            # predictProject is not used for training. A new list is built so
            # the global project list is never modified, even on an error.
            trainingProjects = projects[:i] + projects[i + 1:]
            classifier = trainAlgorithm(trainingProjects, classAlgo, parameters)
            start_time = time.time()
            prediction = own_predict(classifier, predictProject, parameters)
            elapsed = time.time() - start_time
            if prediction == -1:
                # Project data could not be loaded (already reported); skip it.
                continue
            results, reduction = prediction
            precision = precisionCalc(predictProject, results)

            timings.append(elapsed)
            precisions.append(precision)
            reductions.append(reduction)
            projectPrecisions[predictProject].append(precision)
            projectReductions[predictProject].append(reduction)

    print("RESULTS:")
    for project in projects:
        if not projectPrecisions[project]:
            continue
        print("Average reduction for project: " + project + ":")
        print(np.mean(projectReductions[project]))
        print("Average precision for project: " + project + ":")
        print(np.mean(projectPrecisions[project]))

    if not precisions:
        print("No project could be predicted.")
        return

    performance = [np.mean(reductions), np.mean(precisions), np.std(precisions), max(precisions), min(precisions)]
    print("timings: " + str(timings))
    print("performance: " + str(performance))

# Runs the experiment with one fixed hyperparameter set.
# NOTE(review): presumably the values printed by an earlier fmin run - confirm.
tryOut({'c': 0.2063328083683676, 'idReduction': 0.01574087592571406, 'localityReduction': 0.019430303075051083, 'n_localVarsClusters': 4, 'perIDCluster': 1, 'perLocCluster': 3, 'perOpcode': 21, 'perOperator': 2, 'perReturn': 27, 'perTryCatch': 14, 'perVarsCluster': 25, 'samplingStop': 0.12912822558556847, 'tol': 0.003700890655623175})