Skip to content


Initial commit with all data and some implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
MrShoenel committed Aug 15, 2022
1 parent 3eb129d commit 266d010
Show file tree
Hide file tree
Showing 14 changed files with 543 additions and 0 deletions.
25 changes: 25 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit:
"version": "0.2.0",
"configurations": [
"name": "Bokeh server",
"type": "python",
"request": "attach",
"port": 5678,
"host": "localhost",
"preLaunchTask": "launch Bokeh server",
"postDebugTask": "kill Bokeh server"
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true
54 changes: 54 additions & 0 deletions .vscode/tasks.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// See
// for the documentation about the tasks.json format
"version": "2.0.0",
"tasks": [
"label": "kill Bokeh server",
"command": "${command:workbench.action.tasks.terminate}",
"problemMatcher": {},
"presentation": {
"reveal": "never",
"panel": "shared",
"showReuseMessage": false

"label": "launch Bokeh server",
"type": "shell",
"isBackground": true,
"command": "./venv/Scripts/bokeh",
"args": [
"--port", "5678"
"options": {
"env": {
"BOKEH_VS_DEBUG": "true"
// you have to allow the task to not complete by specifying a complete problem matcher
"problemMatcher": {
"fileLocation": [
"pattern": [
"regexp": ".",
"file": 1,
"location": 2,
"message": 3
"background": {
"activeOnStart": true,
"endsPattern": "^.*Waiting for debugger attach.*",
"beginsPattern": "^this should not match on anything"
Binary file added csv/metrics.csv.7z
Binary file not shown.
5 changes: 5 additions & 0 deletions csv/
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@

Contains an archive with all extracted metrics values from all systems.
Please refer to this [`README.md`](../files/README.md) for how the `metrics.csv.7z` was created.
Binary file added files/metrics.7z
Binary file not shown.
8 changes: 8 additions & 0 deletions files/
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

The original [``]( (84.2mb) from []( was repacked into `metrics.7z` (30.1mb) using PPMd.

The metrics values from all these files (systems/projects) have been previously extracted into separate CSV files and merged into one large file. Those are stored under [`../csv/metrics.csv.7z`](../csv/metrics.csv.7z) (496kb).

You can, however, extract your own metrics using methods of the class `QualitasCorpusMetricsExtractor`. The merged file from above does not retain any other information than the system, the metric, and the value, because the primary purpose of this repository is to approximate distributions.
83 changes: 83 additions & 0 deletions
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from os import walk
from typing import Iterable
from import QualitasCorpusMetricsExtractor, MetricID
import pandas as pd
import numpy as np

from import MetricID
from src.distribution.distribution import Distribution

# Load the merged metrics table and wrap it in a Distribution.
d = Distribution(df=pd.read_csv('csv/metrics.csv'))
# CDF input data for the RMA metric; unique_vals=False presumably keeps
# duplicate observations (confirm against Distribution.get_cdf_data).
data = d.get_cdf_data(metric_id=MetricID.RMA, unique_vals=False)
# Presumably fits a parametric distribution to that data (see
# Distribution.fit_parametric for what is actually returned).
cdf = Distribution.fit_parametric(data=data)

# Second experiment on the concatenated per-project CSV: add a tiny offset
# to every value and compare the number of distinct values before/after.
temp = pd.read_csv('csv/__ALL__.csv')

# Fixed seed so the offsets are reproducible between runs.
rng = np.random.default_rng(seed=1337)
# Drawing len(temp) items without replacement from len(temp) evenly spaced
# points in [0, 1e-6] yields a random permutation of those points.
r = rng.choice(np.linspace(0, 1e-6, len(temp)), len(temp), replace=False)
#r = np.linspace(0, 1e-12, len(temp))

nu0 = len(np.unique(temp['value']))  # distinct values before adding the offsets
temp['value'] += r
nu = len(np.unique(temp['value']))  # distinct values after adding the offsets

def get_file_metrics(files: list[str], proj: str, files_dir: str='./files', csv_dir: str='./csv') -> pd.DataFrame:
    """Extract all metric values of one project and store them as a CSV file.

    Every file in *files* is opened with ``QualitasCorpusMetricsExtractor``
    and queried for every member of ``MetricID``. Each extracted value
    becomes one row ``(project, metric, value)``.

    Args:
        files: File names (relative to *files_dir*) that belong to the project.
        proj: Project identifier; written to the ``project`` column and used
            as the base name of the CSV file.
        files_dir: Directory that contains the metrics files.
        csv_dir: Directory the resulting ``<proj>.csv`` is written to.

    Returns:
        The data frame that was written, with the columns ``project``,
        ``metric`` and ``value``.
    """
    rows = []

    for file in files:
        qcme = QualitasCorpusMetricsExtractor(file=f'{files_dir}/{file}')
        # Iterate the enum itself (definition order). Going through
        # set(MetricID) made the row order depend on string hashing, which
        # is randomised per interpreter run.
        for mid in MetricID:
            rows.extend(
                {'project': proj, 'metric': mid.name, 'value': v}
                for v in qcme.metrics_values(metric_id=mid))

    # Explicit columns keep the CSV header intact even if nothing was
    # extracted; a header-less file cannot be read back by pd.read_csv.
    df = pd.DataFrame(rows, columns=['project', 'metric', 'value'])
    df.to_csv(f'{csv_dir}/{proj}.csv', index=False)
    return df

def convert_xml_to_csv(directory: str='./files'):
    """Convert all metrics files below *directory* into one CSV per project.

    Files are grouped by the first seven characters of their name and each
    group is handed to ``get_file_metrics`` exactly once.

    Args:
        directory: Root directory that is walked for metrics files.
    """
    for dirpath, _, files in walk(directory):
        # The first 7 characters suffice to split by system,
        # yet to retain separate projects that make it up
        groups = {}
        # walk() makes no ordering guarantee, so group explicitly instead of
        # relying on files with equal prefixes being listed consecutively.
        for file in sorted(files):
            groups.setdefault(file[0:7], []).append(file)

        for prefix, members in groups.items():
            # Read from the directory actually being visited; previously the
            # default './files' was used regardless of `directory`.
            get_file_metrics(files=members, proj=prefix, files_dir=dirpath)

def concat_csv_files(directory: str='./csv', target_file_name: str='__ALL__.csv'):
    """Merge all CSV files of *directory* into a single CSV file.

    Only files directly inside *directory* whose name ends in ``.csv`` are
    read. The target file itself is skipped, so running the function again
    does not duplicate rows, and other files in the directory (archives,
    READMEs) are ignored.

    Args:
        directory: Directory that holds the per-project CSV files.
        target_file_name: Name of the merged file, created inside *directory*.

    Raises:
        ValueError: If there is no CSV file to merge (raised by ``pd.concat``).
    """
    # Only the top level of `directory` is of interest, i.e. the first
    # triple produced by walk(); fall back to "no files" if there is none.
    _, __, files = next(walk(directory), (directory, [], []))

    # Sorted for a reproducible row order in the merged file.
    sources = sorted(
        name for name in files
        if name.lower().endswith('.csv') and name != target_file_name)

    df = pd.concat(
        (pd.read_csv(f'{directory}/{name}') for name in sources),
        ignore_index=True)

    df.to_csv(f'{directory}/{target_file_name}', index=False)
8 changes: 8 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
103 changes: 103 additions & 0 deletions
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
''' Present an interactive function explorer with slider widgets.
Scrub the sliders to change the properties of the ``sin`` curve, or
type into the title text box to update the title of the plot.
Use the ``bokeh serve`` command to run the example by executing:
    bokeh serve <this_script>.py
at your command prompt. Then navigate to the URL that the server prints
on startup in your browser.
'''
import os
import bokeh
import ptvsd

# Enable remote debugging only when explicitly requested. Use .get() so that
# starting the server without BOKEH_VS_DEBUG set does not raise a KeyError.
if os.environ.get('BOKEH_VS_DEBUG') == 'true':
    # 5678 is the default attach port in the VS Code debug configurations
    # The message below is what the "launch Bokeh server" task's endsPattern
    # matches on, so its wording must stay as it is.
    print('Waiting for debugger attach')
    ptvsd.enable_attach(address=('localhost', 5678), redirect_output=True)

import numpy as np

from import MenuItemClick
from import curdoc
from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, Slider, TextInput, Dropdown
from bokeh.plotting import figure

# Set up data
N = 200  # number of sample points of the curve
x = np.linspace(0, 4*np.pi, N)
y = np.sin(x)
# Data source holding the 'x' and 'y' columns that the line glyph refers to.
source = ColumnDataSource(data=dict(x=x, y=y))

# Set up plot
plot = figure(height=400, width=400, title="my sine wave",
              x_range=[0, 4*np.pi], y_range=[-2.5, 2.5])

plot.line('x', 'y', source=source, line_width=3, line_alpha=0.6)

# Set up widgets
# The initial text box value equals the plot title set above.
text = TextInput(title="title", value='my sine wave')
offset = Slider(title="offset", value=0.0, start=-5.0, end=5.0, step=0.1)
amplitude = Slider(title="amplitude", value=1.0, start=-5.0, end=5.0, step=0.1)
phase = Slider(title="phase", value=0.0, start=0.0, end=2*np.pi)
freq = Slider(title="frequency", value=1.0, start=0.1, end=5.1, step=0.1)

# Menu entries are (label, item) pairs; the item is what a click event reports.
# NOTE(review): the None entry presumably renders as a separator - confirm
# against the Bokeh Dropdown documentation.
menu = [("Item 1", "item_1"), ("Item 2", "item_2"), None, ("Item 3", "item_3")]
# The dropdown initially shows the label of the first menu entry.
ddown = Dropdown(label=menu[0][0], menu=menu)

def temp(evt: MenuItemClick):
    """Handle a click on a dropdown menu entry.

    Logs the event, then shows the clicked entry as the dropdown's label.
    Clicking ``item_3`` additionally replaces the whole menu by the single
    entry ``Item X``.

    NOTE(review): no registration of this handler (e.g. via
    ``ddown.on_click``) is visible in this file - confirm it is attached.

    Args:
        evt: The click event; ``evt.item`` is the item of the clicked entry.
    """
    print(f'{evt.event_name}: {evt.item}')
    # Derive the label from the item, e.g. 'item_1' -> 'Item 1'.
    ddown.label = evt.item.replace('i', 'I').replace('_', ' ')
    if evt.item == 'item_3':
        ddown.menu = [('Item X', 'item_x')]
        ddown.label = 'Item X'


# Set up callbacks
def update_title(attrname, old, new):
    """Copy the current content of the title text box into the plot title.

    Used as an ``on_change`` callback; the callback arguments are not
    used, the widget's current value is read directly instead.
    """
    current_title = text.value
    plot.title.text = current_title

text.on_change('value', update_title)

def update_data(attrname, old, new):
    """Recompute the curve from the current slider values.

    Used as an ``on_change`` callback for all four sliders; the callback
    arguments are not used, the sliders are read directly. The curve is
    ``amplitude * sin(frequency * x + phase) + offset`` over ``[0, 4*pi]``.
    """
    # Get the current slider values
    a = amplitude.value
    b = offset.value
    w = phase.value
    k = freq.value

    # Generate the new curve
    x = np.linspace(0, 4*np.pi, N)
    y = a*np.sin(k*x + w) + b

    # Hand the new curve to the data source that the line glyph reads from.
    source.data = dict(x=x, y=y)

# Every slider triggers the same recomputation of the curve.
for w in [offset, amplitude, phase, freq]:
    w.on_change('value', update_data)

# Set up layouts and add to document
# All widgets go into one column, the dropdown last.
inputs = column(text, offset, amplitude, phase, freq, ddown)

# The widget column and the plot share one row, which becomes the document root.
curdoc().add_root(row(inputs, plot, width=800))
curdoc().title = "Sliders"
48 changes: 48 additions & 0 deletions src/data/
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from typing import Union
from typing import Iterable
from xml.dom.minidom import Document, Element, parse
from strenum import StrEnum
from re import match

class MetricID(StrEnum):
    """Identifiers of the software metrics handled by this module.

    Each member name is the metric's short identifier (e.g. ``TLOC``) and
    each member value is its human-readable description.
    """
    TLOC = 'Total Lines of Code'
    NOP = 'Number of Packages'
    NOC = 'Number of Classes'
    NOI = 'Number of Interfaces'
    NOM = 'Number of Methods'
    NOF = 'Number of Attributes'
    NORM = 'Number of Overridden Methods'
    PAR = 'Number of Parameters'
    NSM = 'Number of Static Methods'
    NSF = 'Number of Static Attributes'
    WMC = 'Weighted methods per Class'
    DIT = 'Depth of Inheritance Tree'
    NSC = 'Number of Children'
    LCOM = 'Lack of Cohesion of Methods'
    MLOC = 'Method Lines of Code'
    SIX = 'Specialization Index'
    VG = 'McCabe Cyclomatic Complexity'
    NBD = 'Nested Block Depth'
    RMD = 'Normalized Distance'
    CA = 'Afferent Coupling'
    CE = 'Efferent Coupling'
    RMI = 'Instability'
    RMA = 'Abstractness'

class QCMetricsExtractor:
    """Reads metric values from a single metrics XML file."""

    def __init__(self, file: str) -> None:
        """Parse the XML document stored at *file*."""
        self.xml: Document = parse(file=file)

    def metrics_values(self, metric_id: MetricID) -> Iterable[Union[float, int]]:
        """Yield all values recorded for one metric.

        The metric's element is the first element of the document whose
        ``id`` attribute equals the short identifier of *metric_id*; the
        ``value`` attribute of every ``Value`` element below it is yielded.
        NOTE(review): assumes metrics are stored as
        ``<Metric id="TLOC" ...>`` with ``<Value value="..."/>`` children or
        grandchildren - confirm against an actual metrics file.

        Args:
            metric_id: The metric to read, as a ``MetricID`` member or as the
                name of one (e.g. ``'TLOC'``).

        Yields:
            Each value as ``int`` if it consists of digits only, else as
            ``float``. Nothing is yielded if the file lacks the metric.

        Raises:
            KeyError: If *metric_id* is a string that names no ``MetricID``.
            ValueError: If a ``value`` attribute is not a number.
        """
        # Indexing the enum with a member (instead of a name) raises a
        # KeyError, so only plain names go through the lookup.
        member = metric_id if isinstance(metric_id, MetricID) else MetricID[metric_id]

        # minidom's getElementById() only considers attributes that were
        # declared to be of type ID (by a DTD or setIdAttribute()); after a
        # plain parse() it finds nothing. Compare the attribute manually.
        metric = next(
            (elem for elem in self.xml.getElementsByTagName('*')
             if elem.getAttribute('id') == member.name),
            None)
        if metric is None:
            return

        # Covers both a metric with only one value, like TLOC, and metrics
        # that carry one value per measured entity. Searching by tag name
        # also skips the whitespace text nodes between elements.
        for value_elem in metric.getElementsByTagName('Value'):
            yield self._to_number(value_elem.getAttribute('value'))

    @staticmethod
    def _to_number(text: str) -> Union[float, int]:
        """Convert *text* to ``int`` if it is all digits, otherwise to ``float``."""
        if match(pattern=r'^\d+$', string=text):
            return int(text)
        return float(text)


# The rest of the project refers to the extractor by its long name.
QualitasCorpusMetricsExtractor = QCMetricsExtractor


0 comments on commit 266d010

Please sign in to comment.