# Joular and CK data analysis

In [89]:
import requests
import plotly.express as px
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Base URL of the HTTP API queried by the request helpers in this notebook.
BASE_URL = "http://localhost:8080/api/v1"

# Exploration of data

- **First objective**: aggregate all the joular data for each method and keep the methods having 30 values
- **Second objective**: remove the aberrant values (outliers), i.e. the values that are far too high or too low compared to the other values
- **Third objective**: sort all the data by value, in descending order
- **Fourth objective**: for a few methods (those having the highest values), compare the distribution of the values with the CK analysis of this method

## Aggregating the Joular values and getting the methods with 30 values

In [76]:
def getAllJoularDataForMethodsHaving30Values(timeout=30):
    """Fetch the payload of the ``/joular-entities/allValues`` endpoint.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive
            server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = BASE_URL + "/joular-entities/allValues"
    response = requests.get(url=url, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error payload to callers.
    response.raise_for_status()
    return response.json()

def createBoxplotJoular(repositoryName, methodSignature, allValues):
    """Display a box plot, with every point drawn, of the given Joular values.

    Args:
        repositoryName: Repository name, used in the plot title.
        methodSignature: Method signature, used in the plot title.
        allValues: Values to plot on the y axis.
    """
    valuesFrame = pd.DataFrame({"allValues": allValues})
    plotTitle = "".join(
        ("Joular values for ", methodSignature, " of repository ", repositoryName)
    )
    boxplot = px.box(valuesFrame, y="allValues", title=plotTitle, points="all")
    boxplot.show()

In [99]:
def removeOutliers(data, threshold=3):
    """Drop the values whose absolute z-score is ``threshold`` or more.

    The z-score uses NumPy's default (population) standard deviation.

    Args:
        data: Sequence or NumPy array of numeric values.
        threshold: A value is kept only when its absolute z-score is strictly
            below this number.

    Returns:
        A NumPy array holding the kept values in their original order. Empty
        input and input with zero spread are returned unfiltered.
    """
    values = np.asarray(data)
    if values.size == 0:
        return values

    std = np.std(values)
    if std == 0:
        # Every value equals the mean: dividing by zero would turn all z-scores
        # into NaN and silently discard the whole series.
        return values

    zScores = np.abs((values - np.mean(values)) / std)
    return values[zScores < threshold]

In [97]:
def determine_outlier_thresholds_std(dataframe, col_name, n_std=3):
    """Return the ``mean - n_std * std`` and ``mean + n_std * std`` boundaries of a column.

    Args:
        dataframe: DataFrame holding the column.
        col_name: Label of the column to analyse.
        n_std: Number of standard deviations between the mean and each
            boundary. Defaults to 3, the value that used to be hard-coded.

    Returns:
        A ``(lower_boundary, upper_boundary)`` tuple.
    """
    column = dataframe[col_name]
    # Compute each statistic once. ``Series.std`` uses pandas' default ddof=1
    # (sample standard deviation).
    center = column.mean()
    spread = n_std * column.std()
    return center - spread, center + spread

def check_outliers_std(dataframe, col_name):
    """Tell whether a column has values outside its standard-deviation boundaries.

    Args:
        dataframe: DataFrame holding the column.
        col_name: Label of the column to check.

    Returns:
        True when at least one value of ``col_name`` lies strictly above the
        upper boundary or strictly below the lower boundary returned by
        ``determine_outlier_thresholds_std``, False otherwise.
    """
    lower_boundary, upper_boundary = determine_outlier_thresholds_std(dataframe, col_name)
    column = dataframe[col_name]
    is_outlier = (column > upper_boundary) | (column < lower_boundary)
    # Test the mask itself. Calling ``.any()`` on the filtered rows checks the
    # truthiness of the cells, so an outlier equal to 0 goes unnoticed and the
    # values of unrelated columns leak into the answer.
    return bool(is_outlier.any())

def replace_with_thresholds_std(dataframe, cols, replace=False):
    """Report, and optionally cap, the standard-deviation outliers of several columns.

    Each column of ``cols`` except ``'Outcome'`` is compared against the limits
    returned by ``determine_outlier_thresholds_std``. One summary row per
    column is printed as a table.

    Args:
        dataframe: DataFrame to inspect. It is modified in place when
            ``replace`` is true and outliers are found.
        cols: Iterable of column labels to process.
        replace: When true, values above the upper limit are set to the upper
            limit. Values below the lower limit are set to the lower limit only
            if that limit is not negative.

    Returns:
        None.
    """
    # Kept local, as in the original cell: tabulate is only needed here.
    from tabulate import tabulate

    def _outlier_mask(col_name):
        """Return (outlier mask, lower limit, upper limit) for the column's current values."""
        lower, upper = determine_outlier_thresholds_std(dataframe, col_name)
        column = dataframe[col_name]
        return (column > upper) | (column < lower), lower, upper

    rows = []
    for col_name in cols:
        if col_name == 'Outcome':
            continue

        outlier_mask, lower_limit, upper_limit = _outlier_mask(col_name)
        # Decide from the mask itself so that an outlier whose value is 0 is
        # still detected and counted.
        had_outliers = bool(outlier_mask.any())
        has_outliers = had_outliers
        count = None

        if had_outliers:
            count = outlier_mask.sum()
            if replace:
                # NOTE(review): with a negative lower limit, low outliers are
                # deliberately left untouched so that no negative value is
                # written - confirm that this is still the wanted behaviour.
                if lower_limit >= 0:
                    dataframe.loc[(dataframe[col_name] < lower_limit), col_name] = lower_limit
                dataframe.loc[(dataframe[col_name] > upper_limit), col_name] = upper_limit
                # The data changed, so the limits move too: re-evaluate the
                # status against the updated column.
                has_outliers = bool(_outlier_mask(col_name)[0].any())

        rows.append([had_outliers, has_outliers, count, col_name, lower_limit, upper_limit])

    table = tabulate(rows, headers=['Outlier (Previously)','Outliers','Count', 'Column','Lower Limit', 'Upper Limit'], tablefmt='rst', numalign='right')
    print("Removing Outliers using 3 Standard Deviation")
    print(table)

Removing Outliers using 3 Standard Deviation
Outlier (Previously)    Outliers      Count  Column       Lower Limit    Upper Limit
True                    True              1  allValues       -160.478        2020.69


In [100]:
allData = getAllJoularDataForMethodsHaving30Values()["content"]
# NOTE(review): the sort key is the whole "allValues" entry; if it is a list,
# entries are ranked by list comparison (first value, then second, ...) rather
# than by an aggregate such as the mean - confirm this is the intended ranking.
sortedData = sorted(allData, key=lambda x: x["allValues"], reverse=True)

# Slice instead of indexing with range(5, 10): same entries (positions 5 to 9),
# but no IndexError when the API returns fewer than 10 methods.
for entry in sortedData[5:10]:
    repositoryName = entry["commit"]["repository"]["name"]
    methodSignature = entry["measurableElement"]["methodSignature"]
    allValues = entry["allValues"]
    allValuesWithoutOutliers = removeOutliers(np.array(allValues))

    df = pd.DataFrame({"allValues": allValues})
    replace_with_thresholds_std(df, df.columns, replace=False)

    # Uncomment to compare the distributions with and without outliers:
    #createBoxplotJoular(repositoryName, methodSignature, allValues)
    #createBoxplotJoular(repositoryName, methodSignature, allValuesWithoutOutliers)



Removing Outliers using 3 Standard Deviation
Outlier (Previously)    Outliers      Count  Column       Lower Limit    Upper Limit
True                    True              1  allValues       -4042.95        7454.05
Removing Outliers using 3 Standard Deviation
Outlier (Previously)    Outliers    Count    Column       Lower Limit    Upper Limit
False                   False                allValues         171.78        2129.87
Removing Outliers using 3 Standard Deviation
Outlier (Previously)    Outliers      Count  Column       Lower Limit    Upper Limit
True                    True              1  allValues        542.118        696.996
Removing Outliers using 3 Standard Deviation
Outlier (Previously)    Outliers    Count    Column       Lower Limit    Upper Limit
False                   False                allValues        453.746        910.452
Removing Outliers using 3 Standard Deviation
Outlier (Previously)    Outliers      Count  Column       Lower Limit    Upper Limit
True      

# Test 

In [2]:
def getJoularDataFromOneMethodFromOneCommit(commitSha, className, methodSignature, timeout=30):
    """Fetch the Joular entities of one method at one commit.

    Args:
        commitSha: Commit identifier appended to the endpoint path.
        className: Sent as the ``className`` query parameter.
        methodSignature: Sent as the ``methodSignature`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive
            server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = BASE_URL + "/joular-entities/by-commit-and-ast-elem/" + commitSha
    params = {"className": className, "methodSignature": methodSignature}
    response = requests.get(url=url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error payload to callers.
    response.raise_for_status()
    return response.json()

In [3]:
def getCkDataFromOneCommit(commitSha, astElem, className, methodSignature=None, timeout=30):
    """Fetch the CK entities of one AST element at one commit.

    Args:
        commitSha: Commit identifier appended to the endpoint path.
        astElem: Sent as the ``astElem`` query parameter.
        className: Sent as the ``className`` query parameter.
        methodSignature: Sent as the ``methodSignature`` query parameter;
            omitted from the request when None.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive
            server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = BASE_URL + "/ck-entities/by-commit-and-ast-elem/" + commitSha
    params = {"astElem": astElem, "className": className}
    # Identity check: ``!= None`` can be fooled by objects overriding ``__ne__``.
    if methodSignature is not None:
        params["methodSignature"] = methodSignature
    response = requests.get(url=url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error payload to callers.
    response.raise_for_status()
    return response.json()

In [4]:
def createJoularBoxplot(commitSha, className, methodSignature):
    """Display a box plot of the strictly positive Joular values of one method.

    The entries come from ``getJoularDataFromOneMethodFromOneCommit``; entries
    whose ``"value"`` is zero or negative are left out of the plot.

    Args:
        commitSha: Commit identifier forwarded to the data request.
        className: Class name forwarded to the data request.
        methodSignature: Method signature forwarded to the data request and
            used in the plot title.
    """
    payload = getJoularDataFromOneMethodFromOneCommit(commitSha, className, methodSignature)

    positiveValues = []
    for record in payload["content"]:
        if record["value"] > 0:
            positiveValues.append(record["value"])

    valuesFrame = pd.DataFrame({"value": positiveValues})
    plotTitle = "Joular values for " + methodSignature
    boxplot = px.box(valuesFrame, y="value", title=plotTitle, points='all')
    boxplot.show()

In [5]:
def createCkBarChart(commitSha, astElem, className, methodSignature):
    """Display a bar chart of a fixed selection of CK metrics for one element.

    The entries come from ``getCkDataFromOneCommit``; only those whose
    ``"name"`` is one of wmc, cbo, cboModified, loc, fanin or fanout are kept.
    When a name occurs more than once, the last value wins.

    Args:
        commitSha: Commit identifier forwarded to the data request.
        astElem: AST element kind forwarded to the data request.
        className: Class name forwarded to the data request.
        methodSignature: Method signature forwarded to the data request and
            used in the chart title.
    """
    wantedMetrics = ["wmc", "cbo", "cboModified", "loc", "fanin", "fanout"]
    payload = getCkDataFromOneCommit(commitSha, astElem, className, methodSignature)

    metricValues = {}
    for record in payload["content"]:
        if record["name"] in wantedMetrics:
            metricName = record["name"]
            metricValues[metricName] = record["value"]

    metricsFrame = pd.DataFrame(list(metricValues.items()), columns=["Metric", "Value"])
    chartTitle = "Ck metrics for " + methodSignature
    barChart = px.bar(metricsFrame, x="Metric", y="Value", title=chartTitle)
    barChart.show()

In [6]:
def createBoxplotAndBarChart(commitSha, astElem, className, methodSignature):
    """Display the Joular box plot, then the CK bar chart, of the same element.

    Args:
        commitSha: Commit identifier passed to both plotting helpers.
        astElem: AST element kind, passed to the CK bar chart only.
        className: Class name passed to both plotting helpers.
        methodSignature: Method signature passed to both plotting helpers.
    """
    createJoularBoxplot(
        commitSha=commitSha,
        className=className,
        methodSignature=methodSignature,
    )
    createCkBarChart(
        commitSha=commitSha,
        astElem=astElem,
        className=className,
        methodSignature=methodSignature,
    )

In [None]:
# (commit SHA, class name, method signature) of each method to plot, in display order.
methodsToPlot = [
    (
        "3ed1f1a064a10e53adc2ad8c0b46a4b2c148ee21",
        "org.springframework.boot.web.servlet.server.StaticResourceJars",
        "isResourcesJar/1[java.util.jar.JarFile]",
    ),
    (
        "12442bd8c7cde6e7c006a6277eeb8e81ad0c2219",
        "org.hibernate.engine.jdbc.internal.ResultSetReturnImpl",
        "executeUpdate/2[java.sql.PreparedStatement,java.lang.String]",
    ),
    (
        "12442bd8c7cde6e7c006a6277eeb8e81ad0c2219",
        "org.hibernate.bytecode.internal.bytebuddy.ByteBuddyState",
        "make/2[TypePool,DynamicType.Builder<?>]",
    ),
    (
        "12442bd8c7cde6e7c006a6277eeb8e81ad0c2219",
        "org.hibernate.boot.registry.classloading.internal.AggregatedServiceLoader$ClassPathAndModulePathAggregatedServiceLoader",
        "hasNextIgnoringServiceConfigurationError/1[java.util.Iterator<?>]",
    ),
    (
        "3ed1f1a064a10e53adc2ad8c0b46a4b2c148ee21",
        "org.springframework.boot.web.embedded.tomcat.TomcatWebServer",
        "initialize/0",
    ),
    (
        "12442bd8c7cde6e7c006a6277eeb8e81ad0c2219",
        "org.hibernate.testing.junit4.FailureExpectedHandler",
        "evaluate/0",
    ),
]

# One box plot + bar chart pair per selected method.
for sha, klass, signature in methodsToPlot:
    createBoxplotAndBarChart(sha, "method", klass, signature)

In [None]:
# Joular box plot and CK bar chart for ModifiedClassPathClassLoader.loadClass.
createBoxplotAndBarChart(
    commitSha="3ed1f1a064a10e53adc2ad8c0b46a4b2c148ee21",
    astElem="method",
    className="org.springframework.boot.testsupport.classpath.ModifiedClassPathClassLoader",
    methodSignature="loadClass/1[java.lang.String]",
)

In [None]:
# Joular box plot and CK bar chart for AggregatedClassLoader.getResources.
createBoxplotAndBarChart(
    commitSha="12442bd8c7cde6e7c006a6277eeb8e81ad0c2219",
    astElem="method",
    className="org.hibernate.boot.registry.classloading.internal.AggregatedClassLoader",
    methodSignature="getResources/1[java.lang.String]",
)

### Data with aberrant negative values

The method *verifyCollection* has the value -530894.

In [12]:
# Joular box plot and CK bar chart for PhantomReferenceLeakDetector.verifyCollection.
# NOTE(review): createJoularBoxplot keeps only values > 0, so a negative value
# will not show up in the box plot produced here.
createBoxplotAndBarChart(
    commitSha="12442bd8c7cde6e7c006a6277eeb8e81ad0c2219",
    astElem="method",
    className="org.hibernate.orm.test.bootstrap.registry.classloading.PhantomReferenceLeakDetector",
    methodSignature="verifyCollection/3[java.lang.ref.ReferenceQueue<T>,int,int]",
)

The method *accept* below has the value -217371.5

In [13]:
# Joular box plot and CK bar chart for GenerationTargetToDatabase.accept.
# NOTE(review): createJoularBoxplot keeps only values > 0, so a negative value
# will not show up in the box plot produced here.
createBoxplotAndBarChart(
    commitSha="12442bd8c7cde6e7c006a6277eeb8e81ad0c2219",
    astElem="method",
    className="org.hibernate.tool.schema.internal.exec.GenerationTargetToDatabase",
    methodSignature="accept/1[java.lang.String]",
)