# Joular and CK data analysis

In [66]:
import requests
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from scipy import stats
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
import statistics

BASE_URL = "http://localhost:8080/api/v1"

# Exploration of data

- **First objective**: aggregate all the joular data for each method and keep the methods having 30 values
- **Second objective**: remove all the aberrant values like the values that are too high or too low compared to the other values
- **Third objective**: sort all the data in function of their values in descending order
- **Fourth objective**: for a few methods (those having the highest values), compare the distribution of the values with the CK analysis of this method

## Aggregate the joular values and getting the methods with 30 values

In [102]:
def getEndpoint(endpoint, params=None):
    URL = BASE_URL + endpoint
    r = requests.get(url=URL, params=params)
    return r.json()

def getAllJoularDataForMethodsHaving30Values(sha=""):
    allResults = []
    endpoint = "/joular/aggregates"
    if (sha != ""):
        endpoint += "/by-commit/" + sha
    result = getEndpoint(endpoint)
    allResults.extend(result["content"])
    while(not result["last"]):
        page = result["number"] + 1
        result = getEndpoint(endpoint, {"page": page})
        allResults.extend(result["content"])
    return allResults

def getSeveralCkDataForOneMethod(commitSha: str, className: str, methodSignature: str, ckMetrics: list[str]):
    endpoint = "/ck-entities/by-commit-and-ast-elem/" + commitSha
    params = {
        "className": className,
        "methodSignature": methodSignature,
        "names": ckMetrics
    }
    return getEndpoint(endpoint=endpoint, params=params)

def createBoxplotJoular(repositoryName, methodSignature, allValues):
    df = pd.DataFrame({"allValues":allValues})
    fig = px.box(df, y="allValues", title="Joular values for " + methodSignature + " of repository " +repositoryName, points="outliers")
    fig.show()

In [68]:
def removeOutliersByZScore(data, threshold=3):
    zScores = np.abs(stats.zscore(data))
    #zScores = np.abs((data - np.mean(data)) / np.std(data))
    """boolScore = zScores < threshold
    for i in range(len(data)):
        print(str(data[i]) + "   " + str(zScores[i]) + "  " + str(boolScore[i]))"""
    return data[zScores < threshold]

In [69]:
def removeOutliersByIQR(allValues):
    df = pd.DataFrame({"allValues":allValues})
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    """print("Q1 = ", Q1["allValues"])
    print("Q3 = ", Q3["allValues"])
    print("IQR = ", IQR["allValues"])
    print("Lower limit = ", Q1["allValues"] - 1.5 * IQR["allValues"])
    print("Upper limit = ", Q3["allValues"] + 1.5 * IQR["allValues"])"""
    return df[~((df < (Q1 - 1.5 * IQR))|(df > (Q3 + 1.5 * IQR))).any(axis=1)]

### Data for all repositories

In [70]:
allData = getAllJoularDataForMethodsHaving30Values()
print(allData)
joularDataRepo = sorted(allData, key=lambda x: x["allValues"], reverse=True)
joularDataRepo = []
for methodData in joularDataRepo:
    repositoryName = methodData["commit"]["repository"]["name"]
    methodSignature = methodData["measurableElement"]["methodSignature"]
    allValues = methodData["allValues"]
    allValuesAfterIQR = removeOutliersByIQR(allValues)
    #print(len(allValuesAfterIQR))
    if (len(allValuesAfterIQR) == 30):
        joularDataRepo.append(methodData)

    #createBoxplotJoular(repositoryName, methodSignature, allValues)
    #createBoxplotJoular(repositoryName, methodSignature, allValuesWithoutOutliers)
    #createBoxplotJoular(repositoryName, methodSignature, allValuesAfterIQR["allValues"])
print(len(joularDataRepo))

dataAllRepositories = []
for doc in joularDataRepo:
    dataAllRepositories.extend({"Repository": row["commit"]["repository"]["name"], "Values":val, "Class": row["measurableElement"]["className"]} for val in row["allValues"])
ckDataFrame = pd.DataFrame(dataAllRepositories)

#figRepository = px.violin(ckDataFrame, y="Values", x="Repository", points="all")
#figRepository.show()

0


### Aggregate by repository and comparison with CK values

In [71]:
def sortData(data):
    return sorted(data, key=lambda x: x["allValues"], reverse=True)

def removeOutliers(data):
    print("Len with outliers : ", len(data))
    only30Values = []
    for methodData in data:
        allValues = methodData["allValues"]
        allValuesAfterIQR = removeOutliersByIQR(allValues)
        #print(len(allValuesAfterIQR))
        if (len(allValuesAfterIQR) == 30):
            only30Values.append(methodData)
    print("Len without outliers (only 30 values) : ", len(only30Values))
    return only30Values

def getTheFirstHighestValues(data, end=5):
    return data[:end]

def createViolinJoular(data, granularity="Repository", points="all"):
    dataAllRepositories = []
    for doc in data:
        dataAllRepositories.extend({"Repository": doc["commit"]["repository"]["name"], "Values":val, "Class": doc["measurableElement"]["className"]} for val in doc["allValues"])
    df = pd.DataFrame(dataAllRepositories)

    figRepository = px.violin(df, y="Values", x=granularity, points=points)
    figRepository.show()

def createMultipleBoxplot(dataOneRepo):
    dataframePreparation = []
    for method in dataOneRepo:
        dataframePreparation.append({"Method": method["measurableElement"]["methodSignature"], "Values":method["allValues"], "Class":method["measurableElement"]["className"]})
    df = pd.DataFrame(dataframePreparation)
    print(df)
    fig = px.box(df, x="Method", y="Values", points="outliers")
    fig.show()
        


def createBarChart(dataframe, xAxisName, yAxisName, title):
    fig = px.bar(dataframe, x=xAxisName, y=yAxisName, title=title)
    fig.update_layout(yaxis_range=[0, 40])
    return fig

def prepareCkDataFrame(ckValues):
    values = {item["name"]:item["value"] for item in ckValues}
    return pd.DataFrame(list(values.items()), columns=["Metric", "Value"])

def createBoxplot(dataframe, title):
    return px.box(dataframe, y="value", title=title, points="all")

def prepareJoularDataFrame(joularValues):
    return pd.DataFrame({"value": joularValues})

def createSubplot(fig1, fig2, title):
    fig1Traces = []
    fig2Traces = []
    for trace in range(len(fig1["data"])):
        fig1Traces.append(fig1["data"][trace])
    for trace in range(len(fig2["data"])):
        fig2Traces.append(fig2["data"][trace])

    
    figure = make_subplots(rows=1, cols=2, subplot_titles=("Joular values", "Ck values"))
    for traces in fig1Traces:
        figure.append_trace(traces, row=1, col=1)
    for traces in fig2Traces:
        figure.append_trace(traces, row=1, col=2)

    figure.update_layout(title_text=title)
    figure.show()

## Methods to get specific fields from data from db

In [100]:
MEASURABLE_ELEMENT = "measurableElement"
CLASS_NAME = "className"
METHOD_SIGNATURE = "methodSignature"
ALL_VALUES = "allValues"

In [101]:
# Joular data
def getClassName(method):
    return method[MEASURABLE_ELEMENT][CLASS_NAME]

def getMethodSignature(method):
    return method[MEASURABLE_ELEMENT][METHOD_SIGNATURE]

def getAllJoularValues(method):
    return method[ALL_VALUES]

def getMeanOfAllJoularValues(method):
    return statistics.mean(getAllJoularValues(method))


# CK data
def getMetricName(ckData):
    return ckData["name"]

def getMetricValue(ckData):
    return ckData["value"]

In [74]:
def getNameBar(row):
    split = row["Method"].split("[")
    nameLegend = row["Method"].split("[")[0]
    if len(split) == 2:
        nameLegend += "[" + row["Method"].split("[")[1][:5]
    return nameLegend + " " + row["Class"].split(".")[-1][:10]

## Creation of the figures

Const used as key for the joular and ck dataframe

In [75]:
METHOD_DF = "Method"
CLASS_DF = "Class"
JOULAR_VALUES_DF = "JoularValues"
MEAN_JOULAR_VALUES_DF = "MeanJoularValue"

In [104]:
def createCkDict(methodData, mergedData, sha, allowedCkMetrics):
    className = getClassName(methodData)
    methodSignature = getMethodSignature(methodData)

    ckDataMethod = getSeveralCkDataForOneMethod(commitSha=sha, className=className, methodSignature=methodSignature, ckMetrics=allowedCkMetrics)
    
    for metricData in ckDataMethod:
        mergedData[getMetricName(metricData)].append(getMetricValue(metricData))
    return mergedData

In [77]:
def createJoularDict(methodData, mergedData):
    mergedData[CLASS_DF].append(getClassName(methodData))
    mergedData[METHOD_DF].append(getMethodSignature(methodData))
    mergedData[JOULAR_VALUES_DF].append(getAllJoularValues(methodData))
    mergedData[MEAN_JOULAR_VALUES_DF].append(getMeanOfAllJoularValues(methodData))
    return mergedData

In [78]:
def initDataframe(allowedCkMetrics):
    dataframe = {
        CLASS_DF : [],
        METHOD_DF : [],
        JOULAR_VALUES_DF : [],
        MEAN_JOULAR_VALUES_DF: []
    }
    for ckMetric in allowedCkMetrics:
        dataframe[ckMetric] = []
    return dataframe

In [92]:
def createJoularAndCkDataframe(dataOneRepo, sha, allowedCkMetrics):
    mergedData = initDataframe(allowedCkMetrics)
    for method in dataOneRepo:
        mergedData = createJoularDict(method, mergedData)
        mergedData = createCkDict(method, mergedData, sha, allowedCkMetrics)
    print("mergedData = ", mergedData)
    return pd.DataFrame(mergedData)

In [80]:
def oldCreateJoularAndCkDataframe(dataOneRepo, sha, allowedCkMetrics):
    dataframe = []
    for methodData in dataOneRepo:
        allMethodData = {}
        className = methodData["measurableElement"]["className"]
        methodSignature = methodData["measurableElement"]["methodSignature"]

        # Joular values
        allMethodData["Method"] = methodSignature
        allMethodData["Class"] = className
        allMethodData["JoularValues"] = methodData["allValues"]
        allMethodData["MeanJoularValue"] = getMeanOfAllJoularValues(methodData["allValues"])

        # Ck values
        ckDataMethod = getSeveralCkDataForOneMethod(commitSha=sha, className=className, methodSignature=methodSignature, ckMetrics=allowedCkMetrics)
        for metric in ckDataMethod:
            metricName = metric["name"]
            metricValue = metric["value"]
            allMethodData[metricName] = metricValue
        dataframe.append(allMethodData)
    return pd.DataFrame(dataframe)


In [81]:
def createBoxplotBarchartFigure(df, ckMetric, repository):    
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    
    for index,row in df.iterrows():
        if index == 0 and repository == "spoon": continue
        showLegend = False
        if index == 1:
            showLegend = True
        nameBar = getNameBar(row)

        # Boxplot
        traceBoxplot = go.Box(y=row["JoularValues"], name=nameBar, marker=dict(color='rgb(252,141,98)'), legendgroup="boxplots", showlegend=showLegend)
        fig.add_trace(traceBoxplot, secondary_y=True)

        # Bar chart
        xBarChart = [nameBar]
        yBarChart = [row[ckMetric]]
        traceBarchart = go.Bar(x=xBarChart, y=yBarChart, name=ckMetric, marker=dict(color='rgb(141,160,203)'), legendgroup="barcharts", showlegend=showLegend)
        fig.add_trace(traceBarchart)

    # Trendline for the bar chart
    overallTrendline = np.polyfit(np.arange(len(df)), df[ckMetric], 1)
    overallSlope = overallTrendline[0]
    overallTrendlineValues = np.polyval(overallTrendline, np.arange(len(df)))
    traceTrendline = go.Scatter(x=[f"{getNameBar(row)}" for _, row in df.iterrows()], y=overallTrendlineValues, name=f'Trendline for {ckMetric} (Slope: {overallSlope:.2f})', mode="lines", line=dict(color='red'))
    #fig.add_trace(traceTrendline)
        
    fig['layout'].update(legend=dict(traceorder='normal'))
    fig['layout'].update(title= "Comparison of joular data and '" + ckMetric + "' metric for repository " + repository, height=750)
    fig.show()

In [82]:
def createScatterPlot(df, ckMetric):
    #print(df.head)
    for index, row in df.iterrows():
        fig = go.Scatter(x=ckMetric, y=[row[ckMetric]])
    fig.show()

In [108]:
#allCommitSha = ["066f4cf207359e06d30911a553dedd054aef595c", "5c9d8989f968d0ee3a942b411ef7fe121ed94609", "12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "3ed1f1a064a10e53adc2ad8c0b46a4b2c148ee21", "59e5152722198526c6ffe5361de7d1a6a87275c7"]
allCommitSha = ["066f4cf207359e06d30911a553dedd054aef595c"]
allowedCkMetrics = ["cbo"]

for sha in allCommitSha:
    joularDataRepo = getAllJoularDataForMethodsHaving30Values(sha)
    sortedJoularDataRepo = sortData(joularDataRepo)
    joularDataRepoWithoutOutliers = removeOutliers(sortedJoularDataRepo)
    joularDataRepoWithoutOutliers = getTheFirstHighestValues(joularDataRepoWithoutOutliers, 50)
    repoName = joularDataRepoWithoutOutliers[0]["commit"]["repository"]["name"]
    print(joularDataRepoWithoutOutliers[0])
    #df = oldCreateJoularAndCkDataframe(joularDataRepoWithoutOutliers, sha, allowedCkMetrics)
    df = createJoularAndCkDataframe(joularDataRepoWithoutOutliers, sha, allowedCkMetrics)

    """for ckMetric in allowedCkMetrics:
        #createBoxplotBarchartFigure(df, ckMetric, repoName)
        createScatterPlot(df, ckMetric)"""

Len with outliers :  197
Len without outliers (only 30 values) :  93
{'allValues': [2109.9353, 2580.8672, 1992.2736, 2085.4592, 2587.01, 2118.5796, 2087.088, 2488.8872, 2529.0112, 2514.089, 2108.82, 2643.9514, 2247.597, 2099.9507, 2037.3049, 2129.1768, 2202.229, 2109.487, 2261.734, 2128.6135, 2747.6675, 2144.3489, 2146.559, 2512.0806, 2193.447, 2201.8904, 2769.0137, 2007.5638, 2310.417, 2193.6514], 'commit': {'sha': '066f4cf207359e06d30911a553dedd054aef595c', 'repository': {'name': 'spoon', 'owner': 'INRIA'}}, 'measurableElement': {'astElem': 'method', 'filePath': '/home/student/j/m/jmaquoi/sentinel/open-source-repositories/spoon/src/main/java/spoon/reflect/visitor/EarlyTerminatingScanner.java', 'className': 'spoon.reflect.visitor.EarlyTerminatingScanner', 'methodSignature': 'scan/2[spoon.reflect.path.CtRole,java.util.Collection<? extends spoon.reflect.declaration.CtElement>]', 'variableName': None, 'classType': None}}
mergedData =  {'Class': ['spoon.reflect.visitor.EarlyTerminatingSca

# Test 

In [31]:
def getJoularDataFromOneMethodFromOneCommit(commitSha, className, methodSignature):
    URL = BASE_URL + "/joular-entities/by-commit-and-ast-elem/" + commitSha
    PARAMS = {"className":className, "methodSignature":methodSignature}
    r = requests.get(url = URL, params = PARAMS)
    return r.json()

In [32]:
def getCkDataFromOneCommit(commitSha, astElem, className, methodSignature=None):
    URL = BASE_URL + "/ck-entities/by-commit-and-ast-elem/" + commitSha
    if methodSignature != None:
        PARAMS = {"astElem": astElem, "className":className, "methodSignature":methodSignature}
    else:
        PARAMS = {"astElem": astElem, "className":className}
    r = requests.get(url = URL, params = PARAMS)
    return r.json()

In [33]:
def createJoularBoxplot(commitSha, className, methodSignature):
    data = getJoularDataFromOneMethodFromOneCommit(commitSha, className, methodSignature)
    values = [item["value"] for item in data["content"] if item["value"] > 0]
    df = pd.DataFrame({"value":values})
    fig = px.box(df, y="value", title="Joular values for " + methodSignature, points='all')
    fig.show()

In [34]:
def createCkBarChart(commitSha, astElem, className, methodSignature):
    NAMES = ["wmc", "cbo", "cboModified", "loc", "fanin", "fanout"]
    data = getCkDataFromOneCommit(commitSha, astElem, className, methodSignature)
    values = {item["name"]:item["value"] for item in data["content"] if item["name"] in NAMES}
    df = pd.DataFrame(list(values.items()), columns=["Metric", "Value"])
    fig = px.bar(df, x="Metric", y="Value", title="Ck metrics for " + methodSignature)
    fig.show()

In [35]:
def createBoxplotAndBarChart(commitSha, astElem, className, methodSignature):
    createJoularBoxplot(commitSha, className, methodSignature)
    createCkBarChart(commitSha, astElem, className, methodSignature)

In [36]:
createBoxplotAndBarChart("3ed1f1a064a10e53adc2ad8c0b46a4b2c148ee21", "method", "org.springframework.boot.web.servlet.server.StaticResourceJars", "isResourcesJar/1[java.util.jar.JarFile]")

createBoxplotAndBarChart("12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "method", "org.hibernate.engine.jdbc.internal.ResultSetReturnImpl", "executeUpdate/2[java.sql.PreparedStatement,java.lang.String]")

createBoxplotAndBarChart("12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "method", "org.hibernate.bytecode.internal.bytebuddy.ByteBuddyState", "make/2[TypePool,DynamicType.Builder<?>]")

createBoxplotAndBarChart("12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "method", "org.hibernate.boot.registry.classloading.internal.AggregatedServiceLoader$ClassPathAndModulePathAggregatedServiceLoader", "hasNextIgnoringServiceConfigurationError/1[java.util.Iterator<?>]")

createBoxplotAndBarChart("3ed1f1a064a10e53adc2ad8c0b46a4b2c148ee21", "method", "org.springframework.boot.web.embedded.tomcat.TomcatWebServer", "initialize/0")

createBoxplotAndBarChart("12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "method", "org.hibernate.testing.junit4.FailureExpectedHandler", "evaluate/0")

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
createBoxplotAndBarChart("3ed1f1a064a10e53adc2ad8c0b46a4b2c148ee21", "method", "org.springframework.boot.testsupport.classpath.ModifiedClassPathClassLoader", "loadClass/1[java.lang.String]")

In [None]:
createBoxplotAndBarChart("12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "method", "org.hibernate.boot.registry.classloading.internal.AggregatedClassLoader", "getResources/1[java.lang.String]")

### Data with aberrant negative values

The method *verifyCollection* has the value -530894.

In [12]:
createBoxplotAndBarChart("12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "method", "org.hibernate.orm.test.bootstrap.registry.classloading.PhantomReferenceLeakDetector", "verifyCollection/3[java.lang.ref.ReferenceQueue<T>,int,int]")

The method *accept* below has the value -217371.5

In [13]:
createBoxplotAndBarChart("12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "method", "org.hibernate.tool.schema.internal.exec.GenerationTargetToDatabase", "accept/1[java.lang.String]")