# Joular and CK data analysis

In [1]:
import requests
import plotly.express as px
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

BASE_URL = "http://localhost:8080/api/v1"

# Exploration of data

- **First objective**: aggregate all the joular data for each method and keep the methods having 30 values
- **Second objective**: remove all the aberrant values (outliers), i.e. the values that are too high or too low compared to the other values
- **Third objective**: sort all the data by value, in descending order
- **Fourth objective**: for a few methods (those having the highest values), compare the distribution of the values with the CK analysis of this method

## Aggregate the Joular values and get the methods with 30 values

In [41]:
def getEndpoint(endpoint, params=None):
    """Send a GET request to the API and return the decoded JSON body.

    Args:
        endpoint: Path appended to ``BASE_URL`` (e.g. ``"/joular/aggregates/"``).
        params: Optional query-string parameters forwarded to ``requests.get``.

    Returns:
        The response body as parsed by ``Response.json()``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status code.
    """
    URL = BASE_URL + endpoint
    r = requests.get(url=URL, params=params)
    # Surface the HTTP status explicitly: without this check an error response
    # whose body is not JSON only shows up as an opaque JSONDecodeError.
    r.raise_for_status()
    return r.json()

def getAllJoularDataForMethodsHaving30Values(sha=""):
    """Collect every page of the Joular aggregates endpoint into one list.

    Args:
        sha: Commit SHA. When non-empty, the ``by-commit/<sha>`` variant of the
            endpoint is queried; otherwise the unfiltered endpoint is used.

    Returns:
        A list containing the ``"content"`` items of every page, in page order.
    """
    endpoint = "/joular/aggregates/"
    if sha != "":
        endpoint = endpoint + "by-commit/" + sha

    collected = []
    # The first request carries no explicit page parameter.
    response = getEndpoint(endpoint)
    collected.extend(response["content"])

    # Keep requesting the following page until the API flags the last one.
    while not response["last"]:
        nextPage = response["number"] + 1
        print("Calling page ", nextPage)
        response = getEndpoint(endpoint, {"page": nextPage})
        collected.extend(response["content"])

    return collected

def getSeveralCkDataForOneMethod(commitSha: str, className: str, methodSignature: str, ckMetrics: list[str]):
    """Query the CK endpoint for a set of metrics of one method at one commit.

    Args:
        commitSha: Commit SHA appended to the endpoint path.
        className: Sent as the ``className`` query parameter.
        methodSignature: Sent as the ``methodSignature`` query parameter.
        ckMetrics: Metric names, sent as the ``names`` query parameter.

    Returns:
        Whatever ``getEndpoint`` returns for this request (the decoded JSON).
    """
    query = dict(className=className, methodSignature=methodSignature, names=ckMetrics)
    path = "/ck-entities/by-commit-and-ast-elem/" + commitSha
    return getEndpoint(endpoint=path, params=query)

def createBoxplotJoular(repositoryName, methodSignature, allValues):
    """Show a Plotly box plot of the Joular values of one method.

    Args:
        repositoryName: Repository name, used in the plot title.
        methodSignature: Method signature, used in the plot title.
        allValues: Values plotted on the y axis.
    """
    valuesFrame = pd.DataFrame({"allValues": allValues})
    plotTitle = "Joular values for " + methodSignature + " of repository " + repositoryName
    # Only the points flagged as outliers by Plotly are drawn individually.
    boxplot = px.box(valuesFrame, y="allValues", title=plotTitle, points="outliers")
    boxplot.show()

In [3]:
def removeOutliersByZScore(data, threshold=3):
    """Keep only the values whose absolute z-score is below ``threshold``.

    Args:
        data: Numeric values. Lists and tuples are converted to a float NumPy
            array; other inputs (NumPy arrays, pandas Series) are used as given.
        threshold: Values with ``|z| >= threshold`` are discarded.

    Returns:
        The filtered values, in the same container type as the (converted)
        input. Empty or constant input is returned unchanged.
    """
    # Plain Python sequences do not support boolean-mask indexing.
    if isinstance(data, (list, tuple)):
        data = np.asarray(data, dtype=float)

    if len(data) == 0:
        return data

    # With zero spread the z-scores are NaN and every comparison below would be
    # False, silently discarding all values; a constant sample has no outlier.
    if np.max(data) == np.min(data):
        return data

    zScores = np.abs(stats.zscore(data))
    return data[zScores < threshold]

In [4]:
def removeOutliersByIQR(allValues):
    """Drop the values lying outside the 1.5 * IQR fences.

    Args:
        allValues: Values used to build a one-column DataFrame.

    Returns:
        A DataFrame with a single ``"allValues"`` column containing only the
        rows within ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``; the original row
        index is preserved.
    """
    frame = pd.DataFrame({"allValues": allValues})

    lowerQuartile = frame.quantile(0.25)
    upperQuartile = frame.quantile(0.75)
    spread = upperQuartile - lowerQuartile

    lowerFence = lowerQuartile - 1.5 * spread
    upperFence = upperQuartile + 1.5 * spread

    # A row is an outlier when its value falls strictly outside either fence.
    isOutlier = ((frame < lowerFence) | (frame > upperFence)).any(axis=1)
    return frame[~isOutlier]

### Data for all repositories

In [9]:
allData = getAllJoularDataForMethodsHaving30Values()
# Print only the record count: dumping every record floods the cell output.
print("Methods fetched : ", len(allData))

# Sort into its own list. Previously the sorted result was stored in `dataRepo`
# and immediately overwritten with [], so the filtering loop never ran.
sortedData = sorted(allData, key=lambda x: x["allValues"], reverse=True)

# Keep only the methods that still have 30 values once the IQR filter is applied.
dataRepo = []
for methodData in sortedData:
    allValuesAfterIQR = removeOutliersByIQR(methodData["allValues"])
    if len(allValuesAfterIQR) == 30:
        dataRepo.append(methodData)
print(len(dataRepo))

# One row per individual Joular value, labelled with its repository and class.
dataAllRepositories = []
for doc in dataRepo:
    dataAllRepositories.extend({"Repository": doc["commit"]["repository"]["name"], "Values":val, "Class": doc["measurableElement"]["className"]} for val in doc["allValues"])
ckDataFrame = pd.DataFrame(dataAllRepositories)

figRepository = px.violin(ckDataFrame, y="Values", x="Repository", points="all")
figRepository.show()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

### Aggregate by repository and comparison with CK values

In [49]:
def sortData(data):
    """Return a new list of the entries ordered by ``"allValues"``, descending.

    Args:
        data: Iterable of mappings that each have an ``"allValues"`` key.

    Returns:
        A new sorted list; ``data`` itself is left untouched.
    """
    def joularValuesOf(entry):
        return entry["allValues"]

    ordered = list(data)
    ordered.sort(key=joularValuesOf, reverse=True)
    return ordered

def removeOutliers(data):
    """Keep the entries whose values still number 30 after the IQR filter.

    Args:
        data: Sequence of mappings that each have an ``"allValues"`` key.

    Returns:
        A list of the entries for which ``removeOutliersByIQR`` leaves exactly
        30 rows. The sizes before and after filtering are printed.
    """
    print("Len with outliers : ", len(data))
    kept = [
        entry
        for entry in data
        if len(removeOutliersByIQR(entry["allValues"])) == 30
    ]
    print("Len without outliers : ", len(kept))
    return kept

def getTheFirstHighestValues(data, end=5):
    """Return the leading slice ``data[:end]``.

    Args:
        data: Any object supporting slicing.
        end: Exclusive stop index of the slice (default 5).
    """
    leading = slice(None, end)
    return data[leading]

def createViolinJoular(data, granularity="Repository", points="all"):
    """Show a Plotly violin plot of the Joular values.

    Args:
        data: Iterable of entries providing ``commit.repository.name``,
            ``measurableElement.className`` and ``allValues``.
        granularity: Column used for the x axis (``"Repository"`` by default).
        points: Forwarded to ``px.violin`` as its ``points`` argument.
    """
    # Flatten to one row per individual value.
    rows = [
        {"Repository": entry["commit"]["repository"]["name"], "Values": value, "Class": entry["measurableElement"]["className"]}
        for entry in data
        for value in entry["allValues"]
    ]
    valuesFrame = pd.DataFrame(rows)

    violin = px.violin(valuesFrame, y="Values", x=granularity, points=points)
    violin.show()

def createBarChart(dataframe, xAxisName, yAxisName, title):
    """Show a Plotly bar chart built from ``dataframe``.

    Args:
        dataframe: Data passed to ``px.bar``.
        xAxisName: Column name used for the x axis.
        yAxisName: Column name used for the y axis.
        title: Chart title.
    """
    barChart = px.bar(data_frame=dataframe, x=xAxisName, y=yAxisName, title=title)
    barChart.show()

def prepareCkDataFrame(ckValues):
    """Build a two-column (``Metric``, ``Value``) DataFrame from CK records.

    Args:
        ckValues: Iterable of mappings with ``"name"`` and ``"value"`` keys.

    Returns:
        A DataFrame with one row per distinct metric name; when a name occurs
        several times, the last value seen is kept.
    """
    valueByMetric = {}
    for entry in ckValues:
        metricName = entry["name"]
        metricValue = entry["value"]
        valueByMetric[metricName] = metricValue

    rows = [(metricName, metricValue) for metricName, metricValue in valueByMetric.items()]
    return pd.DataFrame(rows, columns=["Metric", "Value"])

def createBoxplot(dataframe, title):
    """Show a Plotly box plot of the ``"value"`` column with every point drawn.

    Args:
        dataframe: Data passed to ``px.box``; must contain a ``"value"`` column.
        title: Plot title.
    """
    boxplot = px.box(data_frame=dataframe, y="value", title=title, points="all")
    boxplot.show()

def prepareJoularDataFrame(joularValues):
    """Wrap the Joular values in a DataFrame with a single ``"value"`` column.

    Args:
        joularValues: Values placed in the ``"value"`` column.
    """
    columns = {"value": joularValues}
    return pd.DataFrame(columns)

In [50]:
allCommitSha = ["066f4cf207359e06d30911a553dedd054aef595c", "5c9d8989f968d0ee3a942b411ef7fe121ed94609", "12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "3ed1f1a064a10e53adc2ad8c0b46a4b2c148ee21", "59e5152722198526c6ffe5361de7d1a6a87275c7"]
allowedCkMetrics = ["cbo", "loc", "wmc"]

for sha in allCommitSha:
    dataRepo = getAllJoularDataForMethodsHaving30Values(sha)
    sortedDataRepo = sortData(dataRepo)
    dataRepoWithoutOutliers = removeOutliers(sortedDataRepo)
    createViolinJoular(dataRepoWithoutOutliers)
    # Slice rather than index with range(5): a commit with fewer than five
    # remaining methods would otherwise raise an IndexError.
    for methodData in dataRepoWithoutOutliers[:5]:
        # Resolve the identifiers first. They used to be assigned after the
        # Joular box plot, so its title used an undefined name (fresh kernel)
        # or the previous method's name.
        className = methodData["measurableElement"]["className"]
        methodSignature = methodData["measurableElement"]["methodSignature"]
        chartLabel = methodSignature + " (" + className + ")"

        # JOULAR DATA
        joularData = methodData["allValues"]
        joularDataFrame = prepareJoularDataFrame(joularData)
        createBoxplot(joularDataFrame, "Joular values for " + chartLabel)

        # CK DATA
        ckValues = getSeveralCkDataForOneMethod(commitSha=sha, className=className, methodSignature=methodSignature, ckMetrics=allowedCkMetrics)
        ckDataFrame = prepareCkDataFrame(ckValues)
        createBarChart(ckDataFrame, "Metric", "Value", "Ck metrics for " + chartLabel)


Calling page  1
Len with outliers :  197
Len without outliers :  93


Len with outliers :  76
Len without outliers :  32


Len with outliers :  73
Len without outliers :  15


Calling page  1
Len with outliers :  177
Len without outliers :  79


Len with outliers :  15
Len without outliers :  8


# Test 

In [2]:
def getJoularDataFromOneMethodFromOneCommit(commitSha, className, methodSignature):
    """Fetch the Joular entities of one method at one commit.

    Args:
        commitSha: Commit SHA appended to the endpoint path.
        className: Sent as the ``className`` query parameter.
        methodSignature: Sent as the ``methodSignature`` query parameter.

    Returns:
        The decoded JSON body of the response.
    """
    query = dict(className=className, methodSignature=methodSignature)
    target = BASE_URL + "/joular-entities/by-commit-and-ast-elem/" + commitSha
    response = requests.get(url=target, params=query)
    return response.json()

In [3]:
def getCkDataFromOneCommit(commitSha, astElem, className, methodSignature=None):
    """Fetch the CK entities of one AST element at one commit.

    Args:
        commitSha: Commit SHA appended to the endpoint path.
        astElem: Sent as the ``astElem`` query parameter.
        className: Sent as the ``className`` query parameter.
        methodSignature: Optional; sent as the ``methodSignature`` query
            parameter only when it is not ``None``.

    Returns:
        The decoded JSON body of the response.
    """
    URL = BASE_URL + "/ck-entities/by-commit-and-ast-elem/" + commitSha
    PARAMS = {"astElem": astElem, "className": className}
    # Identity test against None (PEP 8) instead of `!= None`; the parameters
    # common to both cases are now built once.
    if methodSignature is not None:
        PARAMS["methodSignature"] = methodSignature
    r = requests.get(url=URL, params=PARAMS)
    return r.json()

In [4]:
def createJoularBoxplot(commitSha, className, methodSignature):
    """Fetch the Joular values of one method and show them as a box plot.

    Only strictly positive values from the response's ``"content"`` items are
    plotted.

    Args:
        commitSha: Commit SHA identifying the measurement run.
        className: Fully qualified class name of the method.
        methodSignature: Method signature; also used in the plot title.
    """
    response = getJoularDataFromOneMethodFromOneCommit(commitSha, className, methodSignature)

    positiveValues = []
    for entry in response["content"]:
        if entry["value"] > 0:
            positiveValues.append(entry["value"])

    valuesFrame = pd.DataFrame({"value": positiveValues})
    boxplot = px.box(valuesFrame, y="value", title="Joular values for " + methodSignature, points='all')
    boxplot.show()

In [5]:
def createCkBarChart(commitSha, astElem, className, methodSignature):
    """Fetch the CK metrics of one element and show a subset as a bar chart.

    Args:
        commitSha: Commit SHA identifying the analysed revision.
        astElem: AST element kind forwarded to the CK endpoint.
        className: Fully qualified class name.
        methodSignature: Method signature; also used in the chart title.
    """
    NAMES = ["wmc", "cbo", "cboModified", "loc", "fanin", "fanout"]
    response = getCkDataFromOneCommit(commitSha, astElem, className, methodSignature)

    # Keep only the metrics listed in NAMES; a repeated name keeps its last value.
    selected = {}
    for entry in response["content"]:
        if entry["name"] in NAMES:
            selected[entry["name"]] = entry["value"]

    metricsFrame = pd.DataFrame(list(selected.items()), columns=["Metric", "Value"])
    barChart = px.bar(metricsFrame, x="Metric", y="Value", title="Ck metrics for " + methodSignature)
    barChart.show()

In [6]:
def createBoxplotAndBarChart(commitSha, astElem, className, methodSignature):
    """Show the Joular box plot, then the CK bar chart, for one method.

    Args:
        commitSha: Commit SHA passed to both plotting helpers.
        astElem: AST element kind, used by the CK bar chart only.
        className: Fully qualified class name.
        methodSignature: Method signature.
    """
    createJoularBoxplot(
        commitSha=commitSha,
        className=className,
        methodSignature=methodSignature,
    )
    createCkBarChart(
        commitSha=commitSha,
        astElem=astElem,
        className=className,
        methodSignature=methodSignature,
    )

In [None]:
# Each tuple is (commitSha, astElem, className, methodSignature); the methods
# are plotted one after the other, in the order listed.
methodsToPlot = [
    ("3ed1f1a064a10e53adc2ad8c0b46a4b2c148ee21", "method", "org.springframework.boot.web.servlet.server.StaticResourceJars", "isResourcesJar/1[java.util.jar.JarFile]"),
    ("12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "method", "org.hibernate.engine.jdbc.internal.ResultSetReturnImpl", "executeUpdate/2[java.sql.PreparedStatement,java.lang.String]"),
    ("12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "method", "org.hibernate.bytecode.internal.bytebuddy.ByteBuddyState", "make/2[TypePool,DynamicType.Builder<?>]"),
    ("12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "method", "org.hibernate.boot.registry.classloading.internal.AggregatedServiceLoader$ClassPathAndModulePathAggregatedServiceLoader", "hasNextIgnoringServiceConfigurationError/1[java.util.Iterator<?>]"),
    ("3ed1f1a064a10e53adc2ad8c0b46a4b2c148ee21", "method", "org.springframework.boot.web.embedded.tomcat.TomcatWebServer", "initialize/0"),
    ("12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "method", "org.hibernate.testing.junit4.FailureExpectedHandler", "evaluate/0"),
]

for methodToPlot in methodsToPlot:
    createBoxplotAndBarChart(*methodToPlot)

In [None]:
# Joular box plot and CK bar chart for ModifiedClassPathClassLoader.loadClass.
createBoxplotAndBarChart(
    commitSha="3ed1f1a064a10e53adc2ad8c0b46a4b2c148ee21",
    astElem="method",
    className="org.springframework.boot.testsupport.classpath.ModifiedClassPathClassLoader",
    methodSignature="loadClass/1[java.lang.String]",
)

In [None]:
# Joular box plot and CK bar chart for AggregatedClassLoader.getResources.
createBoxplotAndBarChart(
    commitSha="12442bd8c7cde6e7c006a6277eeb8e81ad0c2219",
    astElem="method",
    className="org.hibernate.boot.registry.classloading.internal.AggregatedClassLoader",
    methodSignature="getResources/1[java.lang.String]",
)

### Data with aberrant negative values

The method *verifyCollection* has the value -530894.

In [12]:
# Joular box plot and CK bar chart for PhantomReferenceLeakDetector.verifyCollection.
createBoxplotAndBarChart(
    commitSha="12442bd8c7cde6e7c006a6277eeb8e81ad0c2219",
    astElem="method",
    className="org.hibernate.orm.test.bootstrap.registry.classloading.PhantomReferenceLeakDetector",
    methodSignature="verifyCollection/3[java.lang.ref.ReferenceQueue<T>,int,int]",
)

The method *accept* below has the value -217371.5

In [13]:
# Joular box plot and CK bar chart for GenerationTargetToDatabase.accept.
createBoxplotAndBarChart(
    commitSha="12442bd8c7cde6e7c006a6277eeb8e81ad0c2219",
    astElem="method",
    className="org.hibernate.tool.schema.internal.exec.GenerationTargetToDatabase",
    methodSignature="accept/1[java.lang.String]",
)