This is the same implementation of CPMT as cpmtClusterSelection, except now the influence a mutant has on the pk-score is dependent on its similarity to the mutant the pk-score is for. (See pkScoreTransformer())

This implementation was not discussed in Adam's thesis as it is way too time complex (Commons-text's prediction takes 16k+ seconds), because it requires a comparison between every mutant and every tested mutant for every corresponding feature.

The similarity is currently calculated based the distance between mutants in all dimensions(1 dimension = 1 feature). Creating a dimensionality-reduced feature set would allow you to calculate the distance with a simple subtraction. That should create some speedup.

Still, it would likely require some new innovation before being viable.

In [1]:
import math
import numpy as np
from numpy.random import randint
from numpy import dot
from numpy.linalg import norm
import pickle
from os import path
import os
from random import sample, seed
import csv
import pandas as pd
from sklearn.cluster import KMeans
import kmeans1d
from hyperopt import hp, fmin, tpe
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
import time
import warnings
import traceback

In [2]:
# Source: tutorialspoint.com/file-searching-using-python
def find_files(filename, search_path):
   result = []

    # Walking top-down from the root
   for root, _, files in os.walk(search_path):
      if filename in files:
         result.append(os.path.join(root, filename))
   return result


# Gets characteristics and result status data from project file
# created by pitest clustering plugin.
def getProjectDfs(project: str) -> tuple[pd.DataFrame, pd.DataFrame] | int:
    csv_path = "projects/" + project
    charPath = find_files("characteristics.csv", csv_path)

    if charPath:
        charPath = charPath[0]
        data = pd.read_csv(charPath,
                            names=["id", "mutOperator", "opcode", "returnType",
                                    "localVarsCount", "isInTryCatch", "isInFinalBlock",
                                    "className", "methodName", "blockNumber", "lineNumber"],
                            skiprows=1)
        killedPath = find_files("killed.csv", csv_path)

        if killedPath:
            killedPath = killedPath[0]
            results = pd.read_csv(killedPath,
                                    names=["id", "killed", "numTests"],
                                    skiprows=1)
            results = results.drop(columns=["numTests"])
        else:
            print("Could not find killed.csv for project: " + project)
            return -1
    else:
        print("Could not find characteristics.csv for project: " + project)
        return -1
    
    return data, results


# Uses label encoding and clustering to change numerical data to categorical.
# Also merges data with results.
def dfToCategorical(data: pd.DataFrame, results: pd.DataFrame, parameters: list) -> pd.DataFrame:
    idReduction, localityReduction, n_localVarsClusters = parameters
    # define ordinal encoding
    encoder = LabelEncoder()
    newData = data[["id", "mutOperator", "opcode", "returnType", "localVarsCount",
                    "isInTryCatch", "className", "methodName", "lineNumber"]]
    for col in ["mutOperator", "returnType", "className", "methodName", "id"]:
        newData[col] = encoder.fit_transform(newData[col])

    idClustering = kmeans1d.cluster(np.asarray(newData[["id"]], dtype="int64"), int(math.ceil(len(data) * idReduction)))[0]

    # Categorical locality variable creation.
    newData["className"] = newData["className"].apply(lambda x: x*100000)
    newData["methodName"] = newData["methodName"].apply(lambda x: x*1000)
    localityClustering = kmeans1d.cluster(np.asarray(newData[["className"]], dtype="int64") +
                                          np.asarray(newData[["methodName"]], dtype="int64") +
                                          np.asarray(newData[["lineNumber"]], dtype="int64"), int(math.ceil(len(data) * localityReduction)))[0]

    varsClustering = kmeans1d.cluster(np.asarray(newData[["localVarsCount"]], dtype="int64"), n_localVarsClusters)[0]

    training = data[["id", "mutOperator", "opcode", "returnType", "isInTryCatch"]]
    training["idCluster_id"] = idClustering
    training["localityCluster_id"] = localityClustering
    training["varsCluster_id"] = varsClustering
    training = training.merge(results, how="inner", on="id")

    return training


# Source: https://stackoverflow.com/a/29651514
def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result


# Transforms the given dataframe for better clustering as described in Adam's thesis.
# Single characteristic clustering is already done by dfToCategorical.
def preprocessing(data: pd.DataFrame, parameters: list) -> pd.DataFrame:
    idW, mutOpW, opcodeW, retTypeW, locVarsCountW, tryCatchW, localityW = parameters
    normalizedData = normalize(data[["opcode", "varsCluster_id", "isInTryCatch", "localityCluster_id", "idCluster_id"]])
    weightedData = normalizedData.mul([opcodeW, locVarsCountW, tryCatchW, localityW, idW])
    newData = weightedData.join(pd.get_dummies(data["mutOperator"]) * mutOpW)
    newData = newData.join(pd.get_dummies(data["returnType"]) * retTypeW)
    return newData


# Clusters mutants and selects the mutants closest to the center of each cluster.
# May want to change the selection method, considering center selection performed badly in CMT.
def sampleSelector(data: pd.DataFrame, parameters: list, encoder: LabelEncoder, reduction: float) -> list:
    newData = preprocessing(data, parameters)
    clustering = KMeans(n_clusters=int(math.ceil(len(data) * reduction)), n_init=1)
    clusters = clustering.fit_transform(newData)
    clusterCenterIndices = np.argmin(clusters, axis=0)
    mutants = data.loc[clusterCenterIndices]["id"]
    return mutants, clusters


# Calculates pk-scores, but the influence each mutant of the tested mutants has on the pk-score of a mutant
# is weighted by its co-similarity to that mutant.
def  pkScoreTransformer(fullDF: pd.DataFrame, sampleDF: pd.DataFrame, default: float, distances: np.ndarray) -> pd.DataFrame:
    for charName in list(fullDF)[1:-1]:
        for charVal in list(dict.fromkeys(fullDF[charName].tolist())):
            try:
                indices = fullDF.index[fullDF["id"].isin(sampleDF[sampleDF[charName] == charVal]["id"])].tolist()
                for mutant in fullDF.index[fullDF["id"].isin(fullDF[fullDF[charName] == charVal]["id"])].tolist():
                    killed = 0
                    total = 0

                    for mutantCompare in indices:
                        # Distances[mutant] is an array where each value is the distance from a cluster centroid.
                        # There should be no difference between using the distances relative to cluster centroid
                        # and absolute distances (features).
                        mutantDistance = distances[mutant]
                        mutantCompareDistance = distances[mutantCompare]
                        cos_sim = dot(mutantDistance, mutantCompareDistance)/(norm(mutantDistance)*norm(mutantCompareDistance))
                        if fullDF.loc[mutantCompare, "killed"] == 1:
                            killed += cos_sim
                        total += cos_sim

                    fullDF.loc[fullDF["id"] == fullDF.loc[mutant, "id"], charName] = killed / total
            except ZeroDivisionError:
                fullDF.loc[fullDF[charName] == charVal, charName] = default

    return fullDF


# Takes a list of projects, transforms their characteristic data based on parameters
# into a single CPMT-ready dataframe. Trains the given classifier on the dataframe.
def trainAlgorithm(projects: list, classifier: any, parameters: list=[0.05, 0.05, 5, 0.01], reduction: float=0.1, prepParams: list=[13998, 8, 598, 4240, 9505, 15477, 14723]) -> any:
    samplingStop = parameters[-1]
    classificationTraining = []
    for project in projects:
        print("Starting training project: " + project)
        dataframes = getProjectDfs(project)
        if dataframes == -1:
            continue
        data, results = dataframes

        training = dfToCategorical(data, results, parameters[:3])
        print("Starting creation of classification training set for project: " + project)
        start_time = time.time()

        # For every project, a CPMT-ready dataframe is inserted
        # multiple times to produce subsampling. samplingStop controls how many.
        allMutants = []
        mutantsSampleDfs = []
        i = 0
        encoder = LabelEncoder()
        while True:
            print("iteration = ", i)
            i += 1
            print(samplingStop * len(training["id"].tolist()), len(allMutants))
            mutantsSample, distances = sampleSelector(training, prepParams, encoder, reduction)
            allMutants.extend(mutantsSample)
            trainingSample = training[training["id"].isin(mutantsSample)]

            # Default pk-score for if a possible characteristics value has no mutants in selection.
            # Might want to change this by multiplying each killed mutant by the number of mutants
            # in the cluster they represent and dividing by all mutants.
            defaultPK = len(trainingSample[trainingSample["killed"] == 1]) / len(trainingSample)
            trainingInsert = training.copy()
            trainingInsert = pkScoreTransformer(trainingInsert, trainingSample, defaultPK, distances)

            mutantsSampleDfs.append(trainingInsert)

            # Basing it on "allMutants" means lower reductions -> more subsampling.
            # Might want to experiment with different subsampling if classifier gets more complex. 
            if samplingStop * len(training["id"].tolist()) < len(allMutants):
                break

        # All dataframes of 1 project get combined.
        classificationTraining.append(pd.concat(mutantsSampleDfs).drop(columns=["id"]))

    print("Done with getting all classification training data.")
    print(time.time() - start_time)

    # Dataframes of all projects get combined.
    classificationTrainingDf = pd.concat(classificationTraining)
    X_train = classificationTrainingDf.drop(columns=["killed"]).values.tolist()
    y_train = classificationTrainingDf["killed"].tolist()

    print("Starting training of classifier.")
    start_time = time.time()
    classifier.fit(X_train, y_train)

    print(time.time() - start_time)
    return classifier


# Predicts the result of every mutant in the given project using CPMT.
def own_predict(classifier: any, project: str, parameters: list=[0.05, 0.05, 5, 0.01], reduction: float=0.1, prepParams: list=[13998, 8, 598, 4240, 9505, 15477, 14723], useSampled: bool=True) -> tuple[np.ndarray, float]:
    samplingStop = parameters[-1]
    print("Starting prediction project: " + project)
    start_time = time.time()
    dataframes = getProjectDfs(project)
    if dataframes == -1:
        return -1
    data, results = dataframes

    # Transformation from raw characteristic data to CPMT-ready data.
    fullDF = dfToCategorical(data, results, parameters[:3])
    sampleMutantsIDs, distances = sampleSelector(fullDF, prepParams, LabelEncoder(), reduction)
    sampleDF = fullDF[fullDF["id"].isin(sampleMutantsIDs)]
    reduction = len(sampleMutantsIDs)/len(fullDF)
    print("reduction = " + str(reduction))
    # Default pk-score for if a possible characteristics value has no mutants in selection.
    defaultPK = len(sampleDF[sampleDF["killed"] == 1]) / len(sampleDF)

    trainingInsert = pkScoreTransformer(fullDF, sampleDF, defaultPK, distances)

    # Saves the results of the "executed" mutants to overwrite the predictions.
    if useSampled:
        killedIndices = fullDF.index[(fullDF["killed"] == 1) & (fullDF["id"].isin(sampleMutantsIDs))].tolist()
        survivedIndices = fullDF.index[(fullDF["killed"] == 0) & (fullDF["id"].isin(sampleMutantsIDs))].tolist()
    fullDF = fullDF.drop(columns=["id", "killed"])

    predictResults = classifier.predict(fullDF)

    # For the executed mutants, CPMT would not need to predict their result.
    if useSampled:
        predictResults[killedIndices] = 1
        predictResults[survivedIndices] = 0
    print("Prediction for project " + project + " took: " + str(time.time() - start_time) + " seconds.")
    return predictResults, reduction


# Calculates the precision: percentage of correct predictions in decimals.
def precisionCalc(project: str, predictions: np.ndarray) -> float:
    dataframes = getProjectDfs(project)
    if dataframes == -1:
        return -1
    data, results = dataframes
    data["prediction"] = predictions
    newData = data[["id", "prediction"]]
    merged = newData.merge(results, how="inner", on="id")
    merged.to_csv("projects/" + project + "/predictions.csv", sep=",", index=False)
    correctList = [1 if i == j else 0 for i, j in zip(merged["killed"].tolist(), merged["prediction"].tolist())]
    precision = len([1 for val in correctList if val == 1 ])/len(correctList)
    print("Precision = " + str(precision))
    return precision

In [3]:
seeds = [
    66304, 16389, 14706, 91254, 49890, 86054, 55284, 77324, 36147, 13506, 73920, 80157, 43981, 75358, 33399, 56134,
    13388, 81617, 90957, 52113, 20428, 26482, 56340, 31018, 32067, 13067, 8339, 49008, 125894, 68282, ]
trainingProjects = ["google-auto-service", "scribejava-core",
                     "commons-cli", "google-auto-value", "gson", "commons-io", "commons-codec"]
projects = [ "google-auto-factory", "google-auto-common", "commons-csv", "commons-text"]
parameters = [0.05, 5, 3, 3, 3, 3, 2, 2, 2]
# classifier = SGDClassifier()
reduction = [0.1]

In [4]:
warnings.filterwarnings('ignore')

In [5]:
def tryOut(args):
    parameters = [args["localityReduction"], args["n_localVarsClusters"], args["perOperator"], args["perOpcode"], args["perReturn"],
                  args["perTryCatch"], args["perLocCluster"], args["perVarsCluster"], args["samplingStop"]]
    classAlgo = LinearSVC(dual=False, tol=args["tol"], C=args["c"])
    try:
        classifier = trainAlgorithm(trainingProjects[:3], classAlgo)
    except Exception:
        traceback.print_exc()
    precisions = []
    reductions = []
    for project in projects:
        results, reduction = own_predict(classifier, project)
        precisions.append(precisionCalc(project, results))
        reductions.append(reduction)
    performance = [np.mean(reductions), np.mean(precisions)]
    print(performance)

tryOut({'c': 0.498087050393805, 'localityReduction': 0.01582020546979156, 'n_localVarsClusters': 2, 'perLocCluster': 1, 'perOpcode': 27, 'perOperator': 10, 'perReturn': 17, 'perTryCatch': 4, 'perVarsCluster': 22, 'samplingStop': 0.1537291955327681, 'tol': 0.005117505124731138})

Starting training project: google-auto-service
Starting creation of classification training set for project: google-auto-service
iteration =  0
4.75 0
Starting training project: scribejava-core
Starting creation of classification training set for project: scribejava-core
iteration =  0
57.46 0
Starting training project: commons-cli
Starting creation of classification training set for project: commons-cli
iteration =  0
71.9 0
Done with getting all classification training data.
330.33147859573364
Starting training of classifier.
0.025006771087646484
Starting prediction project: google-auto-factory
reduction = 0.10013717421124829
Prediction for project google-auto-factory took: 206.50364136695862 seconds.
Precision = 0.8335048010973937
Starting prediction project: google-auto-common
reduction = 0.10001916075876605
Prediction for project google-auto-common took: 155.86904740333557 seconds.
Precision = 0.7834834259436674
Starting prediction project: commons-csv
reduction = 0.10005792064871