# Joular and CK data analysis

In [65]:
import requests
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from scipy import stats
import plotly.graph_objects as go
import statistics
import re
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

BASE_URL = "http://localhost:8080/api/v1"

# Exploration of data

- **First objective**: aggregate all the joular data for each method and keep the methods having 30 values
- **Second objective**: remove all the aberrant values like the values that are too high or too low compared to the other values
- **Third objective**: sort all the data in function of their values in descending order
- **Fourth objective**: for a few methods (those having the highest values), compare the distribution of the values with the CK analysis of this method

## Aggregate the joular values and getting the methods with 30 values

In [66]:
def getEndpoint(endpoint, params=None):
    URL = BASE_URL + endpoint
    r = requests.get(url=URL, params=params)
    return r.json()

def getAllJoularDataForMethodsHavingAtLeast25Values(sha=""):
    allResults = []
    endpoint = "/joular/aggregates"
    if (sha != ""):
        endpoint += "/by-commit/" + sha
    result = getEndpoint(endpoint)
    allResults.extend(result["content"])
    while(not result["last"]):
        page = result["number"] + 1
        result = getEndpoint(endpoint, {"page": page})
        allResults.extend(result["content"])
    return allResults

def getSeveralCkDataForOneMethod(commitSha: str, className: str, methodName: str, ckMetrics: list[str]):
    endpoint = "/ck-entities/by-commit-and-ast-elem/" + commitSha
    params = {
        "className": className,
        "methodName": methodName,
        "names": ckMetrics
    }
    return getEndpoint(endpoint=endpoint, params=params)

def createBoxplotJoular(repositoryName, methodName, allValues):
    df = pd.DataFrame({"allValues":allValues})
    fig = px.box(df, y="allValues", title="Joular values for " + methodName + " of repository " +repositoryName, points="outliers")
    fig.show()

In [67]:
def removeOutliersByZScore(data, threshold=3):
    zScores = np.abs(stats.zscore(data))
    #zScores = np.abs((data - np.mean(data)) / np.std(data))
    """boolScore = zScores < threshold
    for i in range(len(data)):
        print(str(data[i]) + "   " + str(zScores[i]) + "  " + str(boolScore[i]))"""
    return data[zScores < threshold]

In [68]:
def removeOutliersByIQR(allValues):
    df = pd.DataFrame({"allValues":allValues})
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    """print("Q1 = ", Q1["allValues"])
    print("Q3 = ", Q3["allValues"])
    print("IQR = ", IQR["allValues"])
    print("Lower limit = ", Q1["allValues"] - 1.5 * IQR["allValues"])
    print("Upper limit = ", Q3["allValues"] + 1.5 * IQR["allValues"])"""
    return df[~((df < (Q1 - 1.5 * IQR))|(df > (Q3 + 1.5 * IQR))).any(axis=1)]

### Data for all repositories

In [69]:
allData = getAllJoularDataForMethodsHavingAtLeast25Values()
print(allData)
joularDataRepo = sorted(allData, key=lambda x: x["allValues"], reverse=True)
joularDataRepo = []
for methodData in joularDataRepo:
    repositoryName = methodData["commit"]["repository"]["name"]
    methodName = methodData["measurableElement"]["methodSignature"]
    allValues = methodData["allValues"]
    allValuesAfterStd = removeOutliersByIQR(allValues)
    #print(len(allValuesAfterIQR))
    if (len(allValuesAfterStd) == 30):
        joularDataRepo.append(methodData)

    #createBoxplotJoular(repositoryName, methodSignature, allValues)
    #createBoxplotJoular(repositoryName, methodSignature, allValuesWithoutOutliers)
    #createBoxplotJoular(repositoryName, methodSignature, allValuesAfterIQR["allValues"])
print(len(joularDataRepo))

dataAllRepositories = []
for doc in joularDataRepo:
    dataAllRepositories.extend({"Repository": row["commit"]["repository"]["name"], "Values":val, "Class": row["measurableElement"]["className"]} for val in row["allValues"])
ckDataFrame = pd.DataFrame(dataAllRepositories)

#figRepository = px.violin(ckDataFrame, y="Values", x="Repository", points="all")
#figRepository.show()

0


### Aggregate by repository and comparison with CK values

In [70]:
def sortData(data):
    return sorted(data, key=lambda x: x["allValues"], reverse=True)

def getTheFirstHighestValues(data, end=5):
    return data[:end]

def createViolinJoular(data, granularity="Repository", points="all"):
    dataAllRepositories = []
    for doc in data:
        dataAllRepositories.extend({"Repository": doc["commit"]["repository"]["name"], "Values":val, "Class": doc["measurableElement"]["className"]} for val in doc["allValues"])
    df = pd.DataFrame(dataAllRepositories)

    figRepository = px.violin(df, y="Values", x=granularity, points=points)
    figRepository.show()

def createMultipleBoxplot(dataOneRepo):
    dataframePreparation = []
    for method in dataOneRepo:
        dataframePreparation.append({"Method": method["measurableElement"]["methodSignature"], "Values":method["allValues"], "Class":method["measurableElement"]["className"]})
    df = pd.DataFrame(dataframePreparation)
    print(df)
    fig = px.box(df, x="Method", y="Values", points="outliers")
    fig.show()
        


def createBarChart(dataframe, xAxisName, yAxisName, title):
    fig = px.bar(dataframe, x=xAxisName, y=yAxisName, title=title)
    fig.update_layout(yaxis_range=[0, 40])
    return fig

def prepareCkDataFrame(ckValues):
    values = {item["name"]:item["value"] for item in ckValues}
    return pd.DataFrame(list(values.items()), columns=["Metric", "Value"])

def createBoxplot(dataframe, title):
    return px.box(dataframe, y="value", title=title, points="all")

def prepareJoularDataFrame(joularValues):
    return pd.DataFrame({"value": joularValues})

def createSubplot(fig1, fig2, title):
    fig1Traces = []
    fig2Traces = []
    for trace in range(len(fig1["data"])):
        fig1Traces.append(fig1["data"][trace])
    for trace in range(len(fig2["data"])):
        fig2Traces.append(fig2["data"][trace])

    
    figure = make_subplots(rows=1, cols=2, subplot_titles=("Joular values", "Ck values"))
    for traces in fig1Traces:
        figure.append_trace(traces, row=1, col=1)
    for traces in fig2Traces:
        figure.append_trace(traces, row=1, col=2)

    figure.update_layout(title_text=title)
    figure.show()

## Methods to get specific fields from data from db

In [71]:
MEASURABLE_ELEMENT = "measurableElement"
CLASS_NAME = "className"
METHOD_NAME = "methodName"
ALL_VALUES = "allValues"
COMMIT = "commit"
REPOSITORY = "repository"
REPO_NAME = "name"

In [72]:
# Repo data
def getRepositoryName(method):
    return method[COMMIT][REPOSITORY][REPO_NAME]

# Joular data
def getClassName(method):
    return method[MEASURABLE_ELEMENT][CLASS_NAME]

def getMethodName(method):
    return method[MEASURABLE_ELEMENT][METHOD_NAME]

def getAllJoularValues(method):
    return method[ALL_VALUES]

def getMeanOfAllJoularValues(method):
    return statistics.mean(getAllJoularValues(method))


# CK data
def getMetricName(ckData):
    return ckData["name"]

def getMetricValue(ckData):
    return ckData["value"]

## Creation of the figures

Const used as key for the joular and ck dataframe

In [73]:
METHOD_DF = "Method"
CLASS_DF = "Class"
JOULAR_VALUES_DF = "JoularValues"
MEAN_JOULAR_VALUES_DF = "MeanJoularValue"
REPO_DF = "Repository"

In [74]:
def matchOnlyArgsNames(matchedArgs):
    argsNames = []
    if len(matchedArgs) > 0:
        for matchedArg in matchedArgs[0].split(','):
            argName = re.sub(r'<.*?>', '', matchedArg).split('.')[-1]
            argsNames.append(argName)
    return argsNames

In [75]:
def simplifyMethodName(methodName):
    matches = re.findall(r'\[([^\]]+)', methodName)
    if len(matches) == 0:
        return methodName.split('/')[0] + '/'
    simplifiedArgs = matchOnlyArgsNames(matches)
    simplifiedName = methodName.split('/')[0] + '/' + str(len(simplifiedArgs)) + '[' + ','.join(simplifiedArgs) + ']'
    return simplifiedName

In [76]:
def getNamesXAxis(row):
    nameLegend = simplifyMethodName(row["Method"])
    return nameLegend #+ " " + row["Class"].split(".")[-1][:20]

In [77]:
def createBoxplotBarchartFigure(df, ckMetric, repository):    
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    for index, row in df.iterrows():
        showLegend = False
        if index == 1:
            showLegend = True
        nameBar = getNamesXAxis(row)

        # Box plot
        boxTrace = go.Box(
            #x=[getNamesXAxis(row)],
            y=row[JOULAR_VALUES_DF],
            name=nameBar,
            marker=dict(color='rgb(252,141,98)'),
            legendgroup="boxplots",
            showlegend=showLegend
        )
        fig.add_trace(boxTrace, secondary_y=True)

        # Bar chart
        xBarChart = [nameBar]
        yBarChart = [row[ckMetric]]
        traceBarchart = go.Bar(x=xBarChart, y=yBarChart, name=ckMetric, marker=dict(color='rgb(141,160,203)'), legendgroup="barcharts", showlegend=showLegend)
        fig.add_trace(traceBarchart)

    # Trendline for the bar chart
    overallTrendline = np.polyfit(np.arange(len(df)), df[ckMetric], 1)
    overallSlope = overallTrendline[0]
    overallTrendlineValues = np.polyval(overallTrendline, np.arange(len(df)))
    traceTrendline = go.Scatter(x=[f"{getNamesXAxis(row)}" for _, row in df.iterrows()], y=overallTrendlineValues, name=f'Trendline for {ckMetric} (Slope: {overallSlope:.2f})', mode="lines", line=dict(color='red'))
    #fig.add_trace(traceTrendline)
        
    fig['layout'].update(legend=dict(traceorder='normal'))
    fig['layout'].update(title= "Energy consumption of 3 methods for project Spoon", height=500)
    fig.update_yaxes(title_text="Energy consumption (J)")
    fig.show()

In [78]:
def createScatterPlot(df, ckMetric, repoName):
    fig = go.Figure()
    scatterTrace = go.Scatter(
        x=df[ckMetric],
        y=df[MEAN_JOULAR_VALUES_DF],
        mode="markers",
        text=[f"{getNamesXAxis(row)}" for _, row in df.iterrows()]
    )
    fig.add_trace(scatterTrace)

    # Update layout
    fig.update_layout(
        title=f"{ckMetric} and mean of Joular values, for each method of repository \"{repoName}\"",
        xaxis_title=ckMetric,
        yaxis_title="Mean of Joular Values"
    )

    fig.show()

In [79]:
def preprocessData(df):
    df['constructor'] = df['constructor'].astype(int)
    df['hasJavaDoc'] = df['hasJavaDoc'].astype(int)
    numericColumns = df.select_dtypes(include=['number']).columns
    X = df[numericColumns]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

In [80]:
def createTsneResult(df, perplexity):
    X_scaled = preprocessData(df)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    return tsne.fit_transform(X_scaled)

In [81]:
def createTsneFigure(df, coloredFeature, perplexity=30):
    X_tsne = createTsneResult(df, perplexity)
    hover_data = {'Method': df[METHOD_DF], 'Class': df[CLASS_DF], 'Repository': df[REPO_DF]}

    fig = px.scatter(
        X_tsne, x=0, y=1,
        hover_data=hover_data,
        color=df[coloredFeature], labels={'color':coloredFeature}
    )
    fig.show()

In [82]:
def createKmeans(df, numClusters, perplexity=30):
    normalizedData = preprocessData(df)
    kmeans = KMeans(n_clusters=numClusters, random_state=42)
    df['cluster'] = kmeans.fit_predict(normalizedData)
    tsneResults = createTsneResult(df, perplexity)
    print(df)

    fig = px.scatter(
        x=tsneResults[:, 0],
        y=tsneResults[:, 1],
        color=df['cluster'].astype(str),
        title='t-SNE Visualization with K-means Clustering for repo ' + df[REPO_DF],
        labels={'color':'Cluster'},
        hover_data={'cluster': True}
    )
    fig.show()


## Creation of the dataframe containing all data for all repositories

### Utils functions

In [83]:
def exportDataframeToFile(df, path):
    df.to_csv(path + "/allData25Values.csv", index=False)

In [84]:
def addRepoDataframeToAllDataframe(allDf, dfToAdd):
    frames = [allDf, dfToAdd]
    return pd.concat(frames)

In [85]:
def getTheFirstHighestValues(data, end=5):
    return data[:end]

#### Init of dataframe

In [86]:
def createCkDict(methodData, mergedData, sha, allowedCkMetrics):
    className = getClassName(methodData)
    methodName = getMethodName(methodData)

    ckDataMethod = getSeveralCkDataForOneMethod(commitSha=sha, className=className, methodName=methodName, ckMetrics=allowedCkMetrics)
    
    for metricData in ckDataMethod:
        mergedData[getMetricName(metricData)].append(getMetricValue(metricData))
    return mergedData

In [87]:
def createJoularDict(methodData, mergedData):
    mergedData[REPO_DF].append(getRepositoryName(methodData))
    mergedData[CLASS_DF].append(getClassName(methodData))
    #mergedData[METHOD_DF].append(simplifyMethodName(getMethodName(methodData)))
    mergedData[METHOD_DF].append(getMethodName(methodData))
    mergedData[JOULAR_VALUES_DF].append(getAllJoularValues(methodData))
    mergedData[MEAN_JOULAR_VALUES_DF].append(getMeanOfAllJoularValues(methodData))
    return mergedData

In [88]:
def initDataframe(allowedCkMetrics):
    dataframe = {
        REPO_DF: [],
        CLASS_DF : [],
        METHOD_DF : [],
        JOULAR_VALUES_DF : [],
        MEAN_JOULAR_VALUES_DF: []
    }
    for ckMetric in allowedCkMetrics:
        dataframe[ckMetric] = []
    return dataframe

#### Outliers

In [89]:
def removeOutliersByStd(allValues):
    mean = np.mean(allValues)
    stdDev = np.std(allValues)
    """print(mean, stdDev, "\n")
    for x in allValues:
        print(x, np.abs(mean - x), 3*stdDev)"""
    return [x for x in allValues if (np.abs(mean - x) <= 3 * stdDev)]

In [90]:
def removeOutliers(data):
    print("Len with outliers : ", len(data))
    only25ValuesAndMore = []
    for methodData in data:
        allValues = methodData["allValues"]
        allValuesAfterStd = removeOutliersByStd(allValues)
        if (len(allValuesAfterStd) >= 25):
            methodData["allValues"] = allValuesAfterStd
            only25ValuesAndMore.append(methodData)
    print("Len without outliers (25+ values) : ", len(only25ValuesAndMore))
    return only25ValuesAndMore

### Dataframe for one repo

In [91]:
def createJoularAndCkDataframeForOneRepo(dataOneRepo, sha, allowedCkMetrics):
    mergedData = initDataframe(allowedCkMetrics)
    for method in dataOneRepo:
        mergedData = createJoularDict(method, mergedData)
        mergedData = createCkDict(method, mergedData, sha, allowedCkMetrics)
    return pd.DataFrame(mergedData)

In [92]:
def createOneRepoDataframe(sha, allowedCkMetrics):
    joularDataRepo = getAllJoularDataForMethodsHavingAtLeast25Values(sha)
    sortedJoularDataRepo = sortData(joularDataRepo)
    joularDataRepoWithoutOutliers = removeOutliers(sortedJoularDataRepo)
    df = createJoularAndCkDataframeForOneRepo(joularDataRepoWithoutOutliers, sha, allowedCkMetrics)
    return df

### One dataframe for all repos

In [93]:
def createAllRepoDataframe(allCommitSha, allowedCkMetrics):
    allRepoDataDf = pd.DataFrame()
    for sha in allCommitSha:
        df = createOneRepoDataframe(sha, allowedCkMetrics)
        allRepoDataDf = addRepoDataframeToAllDataframe(allRepoDataDf, df)
    return allRepoDataDf

In [94]:
def createScatterPlotForEachRepo(allRepoDataDf, allowedCkMetrics):
    repositories = allRepoDataDf[REPO_DF].unique()
    for repo in repositories:
        repoDf = allRepoDataDf[allRepoDataDf[REPO_DF] == repo]
        for metric in allowedCkMetrics:
            createScatterPlot(repoDf, metric, repo)

In [95]:
def createBoxplotBarchartForEachRepo(allRepoDataDf, allowedCkMetrics):
    repositories = allRepoDataDf[REPO_DF].unique()
    for repo in repositories:
        repoDf = allRepoDataDf[allRepoDataDf[REPO_DF] == repo]
        for metric in allowedCkMetrics:
            createBoxplotBarchartFigure(repoDf, metric, repo)

## Called functions

In [96]:
allCommitSha = ["066f4cf207359e06d30911a553dedd054aef595c", "5c9d8989f968d0ee3a942b411ef7fe121ed94609", "12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "3ed1f1a064a10e53adc2ad8c0b46a4b2c148ee21", "59e5152722198526c6ffe5361de7d1a6a87275c7"]
#allCommitSha = ["066f4cf207359e06d30911a553dedd054aef595c"]
#allowedCkMetrics = ["cbo", "rfc", "loc"]
allowedCkMetrics = ["constructor", "cbo", "cboModified", "fanin", "fanout", "wmc", "rfc", "loc", "returnsQty", "variablesQty", "parametersQty", "methodsInvokedQty", "methodsInvokedLocalQty", "methodsInvokedIndirectLocalQty", "loopQty", "comparisonsQty", "tryCatchQty", "parenthesizedExpsQty", "stringLiteralsQty", "numbersQty", "assignmentsQty", "mathOperationsQty", "maxNestedBlocksQty", "anonymousClassesQty", "innerClassesQty", "lambdasQty", "uniqueWordsQty", "modifiers", "logStatementsQty", "hasJavaDoc"]

allRepoDf = createAllRepoDataframe(allCommitSha, allowedCkMetrics)
print(len(allRepoDf))
exportDataframeToFile(allRepoDf, "/home/jerome/Documents/Assistant/Recherche/joular-scripts")

Len with outliers :  393
Len without outliers (25+ values) :  390
Len with outliers :  188
Len without outliers (25+ values) :  186
Len with outliers :  292
Len without outliers (25+ values) :  274
Len with outliers :  392
Len without outliers (25+ values) :  381
Len with outliers :  35
Len without outliers (25+ values) :  35
1266


In [None]:
allowedCkMetrics = ["cbo", "rfc", "loc"]
createScatterPlotForEachRepo(allRepoDf, allowedCkMetrics)

In [41]:
allRepoDf5HighestValues = getTheFirstHighestValues(allRepoDf, 10)
print(allRepoDf5HighestValues)
allowedCkMetrics = ["cbo"]
createBoxplotBarchartForEachRepo(allRepoDf5HighestValues, allowedCkMetrics)

  Repository                                              Class  \
0      spoon      spoon.reflect.visitor.EarlyTerminatingScanner   
1      spoon      spoon.reflect.visitor.EarlyTerminatingScanner   
2      spoon      spoon.reflect.visitor.EarlyTerminatingScanner   
3      spoon  spoon.reflect.visitor.chain.CtQueryImpl$Anonym...   
4      spoon         spoon.support.visitor.equals.EqualsChecker   
5      spoon                    spoon.reflect.visitor.CtScanner   
6      spoon                    spoon.reflect.visitor.CtScanner   
7      spoon           spoon.support.SerializationModelStreamer   
8      spoon                    spoon.reflect.visitor.CtScanner   
9      spoon               spoon.reflect.factory.PackageFactory   

                                              Method  \
0  scan/2[spoon.reflect.path.CtRole,java.util.Col...   
1  doScan/3[spoon.reflect.path.CtRole,spoon.refle...   
2        scan/1[spoon.reflect.declaration.CtElement]   
3                                     

In [51]:
createKmeans(allRepoDf, 2)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



               Repository                                              Class  \
0                   spoon      spoon.reflect.visitor.EarlyTerminatingScanner   
1                   spoon      spoon.reflect.visitor.EarlyTerminatingScanner   
2                   spoon      spoon.reflect.visitor.EarlyTerminatingScanner   
3                   spoon  spoon.reflect.visitor.chain.CtQueryImpl$Anonym...   
4                   spoon         spoon.support.visitor.equals.EqualsChecker   
..                    ...                                                ...   
10  commons-configuration  org.apache.commons.configuration2.tree.xpath.X...   
11  commons-configuration  org.apache.commons.configuration2.beanutils.Be...   
12  commons-configuration  org.apache.commons.configuration2.beanutils.Be...   
13  commons-configuration  org.apache.commons.configuration2.JSONConfigur...   
14  commons-configuration  org.apache.commons.configuration2.beanutils.Be...   

                                     Me

ValueError: String or int arguments are only possible when a DataFrame or an array is provided in the `data_frame` argument. No DataFrame was provided, but argument 'hover_data_0' is of type str or int.

In [97]:
perplexities = [5, 10, 20, 30, 40, 50]
for i in perplexities:
    createTsneFigure(allRepoDf, 'MeanJoularValue', i)
#createTsne(allRepoDf, 'rfc')
#createTsne(allRepoDf, 'cbo')
#createTsne(allRepoDf, 'MeanJoularValue')




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.




The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



### Data with aberrant negative values

The method *verifyCollection* has the value -530894.

In [12]:
createBoxplotAndBarChart("12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "method", "org.hibernate.orm.test.bootstrap.registry.classloading.PhantomReferenceLeakDetector", "verifyCollection/3[java.lang.ref.ReferenceQueue<T>,int,int]")

The method *accept* below has the value -217371.5

In [13]:
createBoxplotAndBarChart("12442bd8c7cde6e7c006a6277eeb8e81ad0c2219", "method", "org.hibernate.tool.schema.internal.exec.GenerationTargetToDatabase", "accept/1[java.lang.String]")