In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Name of the analysed project. It selects the input folders
# data/designite-{project}/, data/sonarqube-{project}/ and data/lamp-{project}/
# and is used in the name of the saved figure images/{project}-distributions.png.
project = "antlr"

# Designite Metrics

In [None]:
# Designite reports package, type and method names in separate columns.
# Fold them into a single DeclarableId key ("<package>.<type>" for types,
# "<package>.<type>://<method>" for methods) and drop the name columns.
type_name_cols = ["Package Name", "Type Name"]
type_metric_cols = ["NOF", "NOM", "NOPF", "NOPM", "LOC", "WMC", "NC", "DIT", "LCOM", "FANIN", "FANOUT"]

types_raw = pd.read_csv(f'data/designite-{project}/TypeMetrics.csv', usecols=type_name_cols + type_metric_cols)
type_ids = types_raw[type_name_cols].agg('.'.join, axis=1)
df_types = types_raw.drop(columns=type_name_cols)
df_types.insert(0, "DeclarableId", type_ids)

method_name_cols = ["Package Name", "Type Name", "Method Name"]
method_metric_cols = ["LOC", "CC", "PC"]

methods_raw = pd.read_csv(f'data/designite-{project}/MethodMetrics.csv', usecols=method_name_cols + method_metric_cols)
owner_ids = methods_raw[["Package Name", "Type Name"]].agg('.'.join, axis=1)
method_ids = pd.concat([owner_ids, methods_raw["Method Name"]], axis=1).agg('://'.join, axis=1)
df_methods = methods_raw.drop(columns=method_name_cols)
df_methods.insert(0, "DeclarableId", method_ids)

In [None]:
# Show the Designite type-level metrics, keyed by DeclarableId.
print(df_types)

In [None]:
# Show the Designite method-level metrics, keyed by DeclarableId.
print(df_methods)

# SonarQube metrics

In [None]:
# Load the SonarQube export (DeclarableId, S_COCO, S_LOC) for the project.
df_sonar = pd.read_csv(f'data/sonarqube-{project}/CognitiveComplexity.csv', usecols=["DeclarableId","S_COCO", "S_LOC"])
df_sonar['DeclarableId'] = df_sonar['DeclarableId'].astype(str)
# pd.to_numeric returns a new Series instead of converting in place, so the
# result has to be assigned back for the conversion to take effect.
df_sonar['S_COCO'] = pd.to_numeric(df_sonar['S_COCO'])
df_sonar['S_LOC'] = pd.to_numeric(df_sonar['S_LOC'])
print(df_sonar)

# LAMP Metrics

In [None]:
# LAMP writes one CSV per granularity: modules.csv and units.csv.
lamp_module_cols = ["DeclarableId","MLOC","WMC","CWMC","DIT","NOC","CBO","RFC","LCOM","NOU","LC"]
lamp_unit_cols = ["DeclarableId","ULOC","CC","COCO","PC","LLOC"]

df_modules = pd.read_csv(f'data/lamp-{project}/modules.csv', usecols=lamp_module_cols)
df_units = pd.read_csv(f'data/lamp-{project}/units.csv', usecols=lamp_unit_cols)

In [None]:
# sort_values returns a new, sorted frame and leaves df_modules untouched,
# so the sorted result is what has to be printed. df_modules itself keeps
# its original row order for the cells below.
print(df_modules.sort_values(by=["DeclarableId"]))

In [None]:
# Show the LAMP unit-level metrics as loaded from units.csv.
print(df_units)

## Distribution analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric LAMP module metrics.
print(df_modules.describe())

Within the codebase, inheritance and lambdas are rarely used.

Let's take a closer look at the lambdas that are used.

In [None]:
# Keep only the modules with a positive LC value (presumably the lambda
# count — confirm against the LAMP column definitions).
uses_lambda = df_modules["LC"] > 0
df_lambda = df_modules.loc[uses_lambda, ['DeclarableId', 'LC']]
print(df_lambda)

Lambdas are used in 5 places in the codebase. These lambda expressions are very short and only describe a call to another method.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric LAMP unit metrics.
print(df_units.describe())

# Analysing module differences

In [None]:
# Ids that occur exactly once across both tools, i.e. modules only one tool reports.
# NOTE(review): keep=False also removes ids that are duplicated within a
# single tool's list, so such ids never show up here — confirm ids are unique.
designite_type_ids = df_types["DeclarableId"]
lamp_module_ids = df_modules["DeclarableId"]
module_diff = pd.concat([designite_type_ids, lamp_module_ids]).drop_duplicates(keep=False)
print(module_diff)

33 modules aren't supported by the LAMP framework because it does not support inner modules.

In [None]:
# Cut every LAMP unit id at its first "$" before comparing it with the
# Designite method ids, then keep the ids that occur exactly once overall.
lamp_unit_ids = df_units["DeclarableId"].map(lambda unit_id: unit_id.split("$")[0])
unit_diff = pd.concat([df_methods["DeclarableId"], lamp_unit_ids]).drop_duplicates(keep=False)
print(unit_diff)

At first glance, 261 units aren't recognized by the LAMP framework. When looking at each method more closely, we see that constructors and initializers aren't matched by name because of the naming conventions the LAMP framework uses for them.

In [None]:
# Same comparison as before, but additionally strip the ".constructor" and
# ".initializer" suffixes from the LAMP ids after cutting them at "$".
lamp_ids_cut = df_units["DeclarableId"].map(lambda unit_id: unit_id.split("$")[0])
lamp_ids_normalised = lamp_ids_cut.map(
    lambda unit_id: unit_id.removesuffix(".constructor").removesuffix(".initializer")
)
unit_diff_with_constructors = pd.concat(
    [df_methods["DeclarableId"], lamp_ids_normalised]
).drop_duplicates(keep=False)
print(unit_diff_with_constructors)

After removing the `.constructor` and `.initializer` suffixes from the declarable id, these units are matched to their corresponding method ids from Designite.
This leaves the remaining 18 units that still aren't matched. They fall into three groups:

- Enum declarations: these are reported by the LAMP framework but aren't evaluated by Designite.
- Anonymous class declarations: these aren't evaluated by the LAMP framework.
- Inner module units: these aren't evaluated by the LAMP framework.

# Metric performance

In [None]:
def _with_prefix(frame, prefix):
    """Return a renamed copy of ``frame``: every column except the
    ``DeclarableId`` join key gets ``prefix`` prepended."""
    return frame.rename(columns=lambda col: col if col == "DeclarableId" else prefix + col)


# Build new, prefixed frames instead of renaming the loaded ones in place.
# df_types/df_methods/df_modules/df_units stay as loaded, so this cell gives
# the same result on every run and the earlier cells can still be re-run.
types = _with_prefix(df_types, "D_")      # Designite type metrics
methods = _with_prefix(df_methods, "D_")  # Designite method metrics

mods = _with_prefix(df_modules, "L_")     # LAMP module metrics
units = _with_prefix(df_units, "L_")      # LAMP unit metrics

# Normalise the LAMP unit ids: cut at the first "$" and strip the
# ".constructor"/".initializer" suffixes so they can be joined on the
# Designite method ids.
units["DeclarableId"] = units["DeclarableId"].apply(lambda s: s.split("$")[0].removesuffix(".constructor").removesuffix(".initializer"))

In [None]:
# Inner-join the module-level metrics of all three tools on DeclarableId.
m_mods = types.merge(mods, on="DeclarableId").merge(df_sonar, on="DeclarableId")
print(m_mods)

In [None]:
# Inner-join the Designite and LAMP unit-level metrics on DeclarableId.
# NOTE(review): ids that occur more than once on both sides pair up
# many-to-many in this join — confirm whether DeclarableId is unique per unit.
m_units = methods.merge(units, on=["DeclarableId"])
print(m_units)

In [None]:
# One figure with a 4x2 grid of panels; the cells below each fill one panel via axis[row, col].
figure, axis = plt.subplots(4, 2, figsize=(10, 15))

## Module Lines Of Code

In [None]:
# Module lines of code as reported by Designite (D_LOC) and LAMP (L_MLOC).
mloc = m_mods.loc[:, ['DeclarableId', 'D_LOC', 'L_MLOC']]
print(mloc)

In [None]:
x1 = mloc['D_LOC']
x3 = mloc['L_MLOC']

# Bind the panel to `ax`, not `plt`: rebinding `plt` would shadow the
# matplotlib.pyplot module and break every later `plt.*` call.
ax = axis[0, 0]

# Side-by-side density histograms over 0-1000 lines; SonarQube's S_LOC is not plotted here.
ax.hist([x1, x3], bins=np.linspace(0, 1000, 30), density=True, color=['#3C5DA1', '#469B55'], label=['Designite', 'LAMP'])
ax.set(title= "MLOC Distribution", xlabel= 'Module Lines of Code', ylabel='Density')
ax.legend()

## Module Number Of Units

In [None]:
# Number of units per module: Designite's NOM next to LAMP's NOU.
nou = m_mods.loc[:, ['DeclarableId', 'D_NOM', 'L_NOU']]
print(nou)

In [None]:
x1 = m_mods['D_NOM']
x2 = m_mods['L_NOU']

# Bind the panel to `ax`, not `plt`: rebinding `plt` would shadow the
# matplotlib.pyplot module and break every later `plt.*` call.
ax = axis[0, 1]

# Side-by-side density histograms over the 0-50 range.
ax.hist([x1, x2], bins=np.linspace(0, 50, 30), density=True, label=['Designite', 'LAMP'])
ax.set(title= "NOU Distribution", xlabel= 'Number of Units', ylabel='Density')
ax.legend()

## Module Weighted Methods per Class (WMC)

In [None]:
# WMC per module as reported by Designite (D_WMC) and LAMP (L_WMC).
wmc = m_mods.loc[:, ['DeclarableId', 'D_WMC', 'L_WMC']]
print(wmc)

In [None]:
x1 = m_mods['D_WMC']
x2 = m_mods['L_WMC']

# Bind the panel to `ax`, not `plt`: rebinding `plt` would shadow the
# matplotlib.pyplot module and break every later `plt.*` call.
ax = axis[1, 0]

# Side-by-side density histograms over the 0-100 range.
ax.hist([x1, x2], bins=np.linspace(0, 100, 30), density=True, label=['Designite', 'LAMP'])
ax.set(title= "WMC Distribution", xlabel= 'Weighted Methods per Class', ylabel='Density')
ax.legend()

## Module - Cognitively Weighted Methods per Class (CWMC)

In [None]:
# Independent copy, because a Diff column is added to this frame afterwards.
cwmc = m_mods.loc[:, ['DeclarableId', 'S_COCO', 'L_CWMC']].copy()
print(cwmc)

In [None]:
# Signed difference SonarQube minus LAMP; list the modules where SonarQube
# reports more than 5 above LAMP.
cwmc = cwmc.assign(Diff=cwmc["S_COCO"] - cwmc["L_CWMC"])
print(cwmc[cwmc["Diff"] > 5])

In [None]:
x1 = m_mods['S_COCO']
x2 = m_mods['L_CWMC']

# Bind the panel to `ax`, not `plt`: rebinding `plt` would shadow the
# matplotlib.pyplot module and break every later `plt.*` call.
ax = axis[1, 1]

# Side-by-side density histograms over the 0-100 range.
ax.hist([x1, x2], bins=np.linspace(0, 100, 30), density=True, color=['#B53941', '#469B55'], label=['SonarQube', 'LAMP'])
ax.set(title= "CWMC Distribution", xlabel= 'Cognitively Weighted Methods per Class', ylabel='Density')
ax.legend()

## Module Depth of Inheritance (DIT)

In [None]:
# Depth of inheritance tree per module: Designite (D_DIT) next to LAMP (L_DIT).
dit = m_mods.loc[:, ['DeclarableId', 'D_DIT', 'L_DIT']]
print(dit)

In [None]:
# `dit` is a column selection of m_mods, so assigning a column to it in place
# triggers pandas' SettingWithCopyWarning. assign() builds a new frame with
# the extra column instead.
dit = dit.assign(Diff=dit["D_DIT"] - dit["L_DIT"])
# Modules where Designite reports a deeper inheritance tree than LAMP.
print(dit.loc[dit["Diff"] > 0])

In [None]:
x1 = m_mods['D_DIT']
x2 = m_mods['L_DIT']

# Bind the panel to `ax`, not `plt`: rebinding `plt` would shadow the
# matplotlib.pyplot module and break every later `plt.*` call.
ax = axis[2, 0]

# Side-by-side density histograms over the 0-6 range.
ax.hist([x1, x2], bins=np.linspace(0, 6, 10), density=True, label=['Designite', 'LAMP'])
ax.set(title= "DIT Distribution", xlabel= 'Depth of Inheritance Tree', ylabel='Density')
ax.legend()

## Module - Number of Children

In [None]:
# Number of children per module: Designite's NC next to LAMP's NOC.
noc = m_mods.loc[:, ['DeclarableId', 'D_NC', 'L_NOC']]
print(noc)

In [None]:
x1 = m_mods['D_NC']
x2 = m_mods['L_NOC']

# Bind the panel to `ax`, not `plt`: rebinding `plt` would shadow the
# matplotlib.pyplot module and break every later `plt.*` call.
ax = axis[2, 1]

# Side-by-side density histograms over the 0-10 range.
ax.hist([x1, x2], bins=np.linspace(0, 10, 10), density=True, label=['Designite', 'LAMP'])
ax.set(title= "NOC Distribution", xlabel= 'Number of Children', ylabel='Density')
ax.legend()

## Module - Lack of Cohesion in Methods (LCOM)

In [None]:
# LCOM per module as reported by Designite (D_LCOM) and LAMP (L_LCOM).
lcom = m_mods.loc[:, ['DeclarableId', 'D_LCOM', 'L_LCOM']]
print(lcom)

## Unit - Lines of Code

In [None]:
# Lines of code per unit: Designite's LOC next to LAMP's ULOC.
uloc = m_units.loc[:, ['DeclarableId', 'D_LOC', 'L_ULOC']]
print(uloc)

## Unit - Cyclomatic Complexity

In [None]:
# Cyclomatic complexity per unit as reported by Designite (D_CC) and LAMP (L_CC).
cc = m_units.loc[:, ['DeclarableId', 'D_CC', 'L_CC']]
print(cc)

In [None]:
x1 = m_units['D_CC']
x2 = m_units['L_CC']

# Bind the panel to `ax`, not `plt`: rebinding `plt` would shadow the
# matplotlib.pyplot module and break every later `plt.*` call.
ax = axis[3, 0]

# Side-by-side density histograms over the 0-10 range.
ax.hist([x1, x2], bins=np.linspace(0, 10, 10), density=True, label=['Designite', 'LAMP'])
ax.set(title= "CC Distribution", xlabel= 'Cyclomatic Complexity', ylabel='Density')
ax.legend()

## Unit - Parameter Count

In [None]:
# Parameter count per unit as reported by Designite (D_PC) and LAMP (L_PC).
pc = m_units.loc[:, ['DeclarableId', 'D_PC', 'L_PC']]
print(pc)

In [None]:
x1 = m_units['D_PC']
x2 = m_units['L_PC']

# Bind the panel to `ax`, not `plt`: rebinding `plt` would shadow the
# matplotlib.pyplot module and break every later `plt.*` call.
ax = axis[3, 1]

# Side-by-side density histograms over the 0-10 range.
ax.hist([x1, x2], bins=np.linspace(0, 10, 10), density=True, label=['Designite', 'LAMP'])
ax.set(title= "PC Distribution", xlabel= 'Parameter Count', ylabel='Density')
ax.legend()

In [None]:
# Work on copies so the merged frames stay free of the DELTA_* columns.
m = m_mods.copy()
u = m_units.copy()

# (metric, reference-tool column, LAMP column); DELTA_<metric> is the absolute difference.
module_pairs = [
    ("MLOC", "D_LOC", "L_MLOC"),
    ("WMC", "D_WMC", "L_WMC"),
    ("CWMC", "S_COCO", "L_CWMC"),
    ("DIT", "D_DIT", "L_DIT"),
    ("NOC", "D_NC", "L_NOC"),
    ("NOU", "D_NOM", "L_NOU"),
]
for metric, reference_col, lamp_col in module_pairs:
    m[f"DELTA_{metric}"] = (m[reference_col] - m[lamp_col]).abs()

unit_pairs = [
    ("CC", "D_CC", "L_CC"),
    ("PC", "D_PC", "L_PC"),
]
for metric, reference_col, lamp_col in unit_pairs:
    u[f"DELTA_{metric}"] = (u[reference_col] - u[lamp_col]).abs()

In [None]:
# Display the module-level frame, including its DELTA_* columns.
m

In [None]:
# Display the unit-level frame, including its DELTA_* columns.
u

In [None]:
def stats(column):
    """Print the mean, the median and the 90th/95th/99th percentiles of ``column``.

    ``column`` is anything offering ``mean()``, ``median()`` and
    ``quantile(q)`` (e.g. a pandas Series). Nothing is returned.
    """
    summary = [
        ("mean", column.mean()),
        ("median", column.median()),
        ("90%", column.quantile(0.9)),
        ("95%", column.quantile(0.95)),
        ("99%", column.quantile(0.99)),
    ]
    for label, value in summary:
        print(f"{label} = {value}")

In [None]:
import matplotlib.pyplot as plt

# Fresh 4x2 grid for the DELTA_* histograms.
figure, axis = plt.subplots(4, 2, figsize=(10, 15))
# plt.locator_params only acts on the current axes (the last subplot
# created), so configure integer tick locations on every panel explicitly.
for panel in axis.flat:
    panel.locator_params(axis="both", integer=True, tight=True)

def plot(subplot, df, metric: str):
    """Draw the histogram of ``df["DELTA_<metric>"]`` on ``subplot`` and mark its 95th percentile.

    Parameters:
        subplot: the axes object to draw on (``hist``, ``set``, ``legend``,
            ``axvline`` and ``text`` are called on it).
        df: frame holding a ``DELTA_<metric>`` column.
        metric: metric name, e.g. ``"MLOC"``; used for the column lookup and
            for the title, axis label and legend entry.
    """
    deltas = df[f"DELTA_{metric}"]
    delta_label = fr"$\Delta$ {metric}"

    counts, _edges, _patches = subplot.hist(deltas, bins=50, label=[metric])
    subplot.set(title=f"{delta_label} Distribution", xlabel=delta_label, ylabel='Count')
    subplot.legend()

    # Dotted marker at the 95th percentile; the line spans the lower 40% of
    # the panel and the caption is placed relative to the tallest bar.
    quant_95 = deltas.quantile(0.95)
    subplot.axvline(quant_95, alpha=0.8, ymax=0.4, linestyle=":")
    subplot.text(quant_95 - 0.20, counts.max() * 0.425, "95th", size=10, alpha=0.85)

# One panel per metric: module-level deltas first, unit-level deltas in the last row.
panels = [
    (axis[0, 0], m, "MLOC"),
    (axis[0, 1], m, "WMC"),
    (axis[1, 0], m, "CWMC"),
    (axis[1, 1], m, "DIT"),
    (axis[2, 0], m, "NOC"),
    (axis[2, 1], m, "NOU"),
    (axis[3, 0], u, "CC"),
    (axis[3, 1], u, "PC"),
]
for subplot, frame, metric in panels:
    plot(subplot, frame, metric)

figure.tight_layout()
figure.savefig(f'images/{project}-distributions.png')

In [None]:
def get_outliers(df, metric: str):
    """Return the rows of ``df`` whose ``DELTA_<metric>`` value is at or above
    that column's 95th percentile."""
    delta_col = f"DELTA_{metric}"
    threshold = df[delta_col].quantile(0.95)
    return df.loc[df[delta_col] >= threshold]

In [None]:
# Largest MLOC disagreements first.
mloc_outliers = get_outliers(m, "MLOC")[["DeclarableId", "D_LOC", "L_MLOC", "DELTA_MLOC"]]
mloc_outliers.sort_values(by="DELTA_MLOC", ascending=False)

In [None]:
# WMC outliers, listed in descending DeclarableId order.
wmc_outliers = get_outliers(m, "WMC")[["DeclarableId", "D_WMC", "L_WMC", "DELTA_WMC"]]
wmc_outliers.sort_values(by="DeclarableId", ascending=False)

In [None]:
# CC outliers, listed in descending DeclarableId order.
cc_outliers = get_outliers(u, "CC")[["DeclarableId", "D_CC", "L_CC", "DELTA_CC"]]
cc_outliers.sort_values(by="DeclarableId", ascending=False)

In [None]:
# Every module where SonarQube and LAMP disagree on cognitive complexity.
cwmc_mismatch_cols = ["DeclarableId", "S_COCO", "L_CWMC", "DELTA_CWMC"]
m.loc[m["DELTA_CWMC"] > 0, cwmc_mismatch_cols].sort_values(by="DeclarableId")

In [None]:
# Units for which LAMP reports a positive COCO value.
u.loc[u["L_COCO"] > 0, ["DeclarableId", "L_COCO"]].sort_values(by="DeclarableId")

In [None]:
# Largest DIT disagreements first.
dit_outliers = get_outliers(m, "DIT")[["DeclarableId", "D_DIT", "L_DIT", "DELTA_DIT"]]
dit_outliers.sort_values(by="DELTA_DIT", ascending=False)

In [None]:
# Largest NOC disagreements first.
noc_outliers = get_outliers(m, "NOC")[["DeclarableId", "D_NC", "L_NOC", "DELTA_NOC"]]
noc_outliers.sort_values(by="DELTA_NOC", ascending=False)

In [None]:
# Modules ranked by LAMP's number of children, highest first.
m[["DeclarableId", "L_NOC"]].sort_values(by="L_NOC", ascending=False)

In [None]:
# Largest NOU disagreements first.
nou_outliers = get_outliers(m, "NOU")[["DeclarableId", "D_NOM", "L_NOU", "DELTA_NOU"]]
nou_outliers.sort_values(by="DELTA_NOU", ascending=False)

In [None]:
# Units whose parameter counts differ between Designite and LAMP.
# Filter on DELTA_PC itself: replacing every 0 with NaN and dropping NaN rows
# also discarded real mismatches in which one tool reports 0 parameters
# (D_PC == 0 or L_PC == 0 with DELTA_PC > 0), and turned the counts into floats.
pc_mismatches = u.loc[u["DELTA_PC"] > 0, ["DeclarableId", "D_PC", "L_PC", "DELTA_PC"]]
(
    pc_mismatches
    # Drop every id that still occurs more than once: those rows cannot be
    # attributed to a single unit.
    .drop_duplicates(subset="DeclarableId", keep=False)
    .sort_values(by=["DELTA_PC"], ascending=False)
)