Initial commit with all data and some implementation

MrShoenel · Aug 15, 2022 · 266d010 · 266d010
1 parent 3eb129d
commit 266d010
Show file tree

Hide file tree

Showing 14 changed files with 543 additions and 0 deletions.
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,25 @@
+{
+	// Use IntelliSense to learn about possible attributes.
+	// Hover to view descriptions of existing attributes.
+	// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+	"version": "0.2.0",
+	"configurations": [
+		{
+			"name": "Bokeh server",
+			"type": "python",
+			"request": "attach",
+			"port": 5678,
+			"host": "localhost",
+			"preLaunchTask": "launch Bokeh server",
+			"postDebugTask": "kill Bokeh server"
+		},
+		{
+			"name": "Python: Current File",
+			"type": "python",
+			"request": "launch",
+			"program": "${file}",
+			"console": "integratedTerminal",
+			"justMyCode": true
+		}
+	]
+}
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
@@ -0,0 +1,54 @@
+{
+	// See https://go.microsoft.com/fwlink/?LinkId=733558
+	// for the documentation about the tasks.json format
+	"version": "2.0.0",
+	"tasks": [
+		{
+			"label": "kill Bokeh server",
+			"command": "${command:workbench.action.tasks.terminate}",
+			"problemMatcher": {},
+			"presentation": {
+					"reveal": "never",
+					"panel": "shared",
+					"showReuseMessage": false
+			}
+		},
+
+		{
+			"label": "launch Bokeh server",
+			"type": "shell",
+			"isBackground": true,
+			"command": "./venv/Scripts/bokeh",
+			"args": [
+					"serve",
+					"sliders.py",
+					"--port", "5678"
+			],
+			"options": {
+					"env": {
+							"BOKEH_VS_DEBUG": "true"
+					}
+			},
+			// you have to allow the task to not complete by specifying a complete problem matcher
+			"problemMatcher": {
+					"fileLocation": [
+							"relative",
+							"${workspaceFolder}"
+					],
+					"pattern": [
+							{
+									"regexp": ".",
+									"file": 1,
+									"location": 2,
+									"message": 3
+							}
+					],
+					"background": {
+							"activeOnStart": true,
+							"endsPattern": "^.*Waiting for debugger attach.*",
+							"beginsPattern": "^this should not match on anything"
+					}
+			}
+		}
+	]
+}
diff --git a/csv/metrics.csv.7z b/csv/metrics.csv.7z
diff --git a/csv/readme.md b/csv/readme.md
@@ -0,0 +1,5 @@
+CSV
+=====
+
+Contains an archive with all metric values extracted from all systems.
+Please refer to this [`readme.md`](../files/readme.md) for how `metrics.csv.7z` was created.
diff --git a/files/metrics.7z b/files/metrics.7z
diff --git a/files/readme.md b/files/readme.md
@@ -0,0 +1,8 @@
+Files
+======
+
+The original [`metrics.zip`](http://web.archive.org/web/20220814110913/http://java.labsoft.dcc.ufmg.br/qualitas.class/corpus/metrics.zip) (84.2 MB) from [http://java.labsoft.dcc.ufmg.br/qualitas.class/download.html](https://web.archive.org/web/20191223234321/http://java.labsoft.dcc.ufmg.br/qualitas.class/download.html) was repacked into `metrics.7z` (30.1 MB) using PPMd.
+
+The metric values from all these files (systems/projects) were previously extracted into separate CSV files and merged into one large file, which is stored as [`../csv/metrics.csv.7z`](../csv/metrics.csv.7z) (496 KB).
+
+You can, however, extract your own metrics using the methods of the class `QualitasCorpusMetricsExtractor`. The merged file mentioned above retains only the system, the metric, and the value, because the primary purpose of this repository is to approximate distributions.
diff --git a/main.py b/main.py
@@ -0,0 +1,83 @@
+from os import walk
+from typing import Iterable
+from src.data.metrics import QualitasCorpusMetricsExtractor, MetricID
+import pandas as pd
+import numpy as np
+
+
+
+
+from src.data.metrics import MetricID
+from src.distribution.distribution import Distribution
+
+
+# Ad-hoc experiment executed at import time: wrap the merged metrics CSV in a
+# Distribution, request the CDF data for one metric, hand it to
+# Distribution.fit_parametric and save the returned object to disk.
+d = Distribution(df=pd.read_csv('csv/metrics.csv'))
+data = d.get_cdf_data(metric_id=MetricID.RMA, unique_vals=False)
+cdf = Distribution.fit_parametric(data=data)
+# NOTE(review): the metric requested above is RMA, yet the output file is
+# named after VG -- confirm which of the two is intended.
+cdf.save_to_file(file='./results/cdf_VG.pickle')
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# Second import-time experiment: add a tiny, reproducible jitter to every
+# value and count distinct values before and after (presumably to check that
+# the jitter breaks ties -- TODO confirm the intent).
+# '__ALL__.csv' is the default file name written by concat_csv_files() below.
+temp = pd.read_csv('csv/__ALL__.csv')
+
+# Seeded generator so the jitter is identical on every run.
+rng = np.random.default_rng(seed=1337)
+# len(temp) evenly spaced offsets in [0, 1e-6], drawn without replacement,
+# i.e. a random permutation of those offsets (one distinct offset per row).
+r = rng.choice(np.linspace(0, 1e-6, len(temp)), len(temp), replace=False)
+#r = np.linspace(0, 1e-12, len(temp))
+#np.random.seed(1337)
+#np.random.shuffle(r)
+
+# nu0: number of distinct values before the jitter is applied.
+nu0 = len(np.unique(temp['value']))
+temp['value'] += r
+# nu: number of distinct values after the jitter has been applied.
+nu = len(np.unique(temp['value']))
+
+
+def get_file_metrics(files: list[str], proj: str, files_dir: str='./files', csv_dir: str='./csv') -> pd.DataFrame:
+    """Extract every metric from the given files and store them as one CSV.
+
+    Each file is parsed once with ``QualitasCorpusMetricsExtractor``; for each
+    member of ``MetricID`` all values the extractor yields are collected as
+    rows of the form ``project, metric, value``.
+
+    Args:
+        files: File names (relative to ``files_dir``) that make up the project.
+        proj: Project label; written to the ``project`` column and used as the
+            stem of the CSV file name.
+        files_dir: Directory that contains ``files``.
+        csv_dir: Directory the resulting ``<proj>.csv`` is written to.
+
+    Returns:
+        The data frame that was written to ``<csv_dir>/<proj>.csv``.
+    """
+    rows = []
+
+    for file in files:
+        qcme = QualitasCorpusMetricsExtractor(file=f'{files_dir}/{file}')
+        # Iterate the enum itself (definition order). Going through a set
+        # would make the row order depend on per-process string hashing and
+        # hence differ between runs.
+        for mid in MetricID:
+            rows.extend(
+                {'project': proj, 'metric': mid.name, 'value': v}
+                for v in qcme.metrics_values(metric_id=mid))
+
+    # Naming the columns keeps the header in the CSV even when no value was
+    # extracted, so the file can still be read back with pd.read_csv.
+    df = pd.DataFrame(rows, columns=['project', 'metric', 'value'])
+    df.to_csv(f'{csv_dir}/{proj}.csv', index=False)
+    return df
+
+
+
+def convert_xml_to_csv(directory: str='./files'):
+    """Convert all metrics files below ``directory`` to one CSV per project.
+
+    Files are grouped by the first 7 characters of their name and every group
+    is handed to ``get_file_metrics``, which writes ``<prefix>.csv`` into its
+    default CSV directory.
+
+    Args:
+        directory: Root directory that is walked for metrics files.
+    """
+    for root, _, files in walk(directory):
+        # os.walk() makes no promise about the order of file names, so sort
+        # them; this keeps the grouping and the resulting row order stable.
+        names = sorted(files)
+        # The first 7 characters suffice to split by system,
+        # yet to retain separate projects that make it up.
+        # dict.fromkeys() de-duplicates while preserving first-seen order, so
+        # each prefix is processed exactly once per directory.
+        for prefix in dict.fromkeys(name[0:7] for name in names):
+            members = [name for name in names if name.startswith(prefix)]
+            # NOTE(review): every file found is passed to the XML extractor;
+            # non-metrics files in the directory (archives, readmes) will
+            # presumably make it fail -- consider filtering by extension.
+            # Pass the walked directory along so files are opened where they
+            # were found instead of in get_file_metrics' default directory.
+            get_file_metrics(files=members, proj=prefix, files_dir=root)
+
+
+
+
+def concat_csv_files(directory: str='./csv', target_file_name: str='__ALL__.csv'):
+    """Merge all CSV files in ``directory`` into a single CSV file.
+
+    Only files directly inside ``directory`` whose name ends in ``.csv`` are
+    read. The target file itself is skipped, so running the function again
+    does not feed the previous result back into the merge.
+
+    Args:
+        directory: Directory holding the per-project CSV files; the merged
+            file is written here as well.
+        target_file_name: Name of the merged file inside ``directory``.
+
+    Raises:
+        ValueError: Raised by ``pd.concat`` when there is nothing to merge
+            (no matching file, or ``directory`` does not exist).
+    """
+    # Take only the first entry of the walk (the directory itself); the
+    # default covers a missing directory, for which walk() yields nothing.
+    _, __, files = next(walk(directory), (directory, [], []))
+
+    # Sorted for a reproducible row order in the merged file.
+    sources = sorted(
+        name for name in files
+        if name.lower().endswith('.csv') and name != target_file_name)
+
+    df = pd.concat(
+        [pd.read_csv(f'{directory}/{name}') for name in sources], ignore_index=True)
+
+    df.to_csv(f'{directory}/{target_file_name}', index=False)
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,8 @@
+bokeh==2.4.3
+jupyterlab==3.4.5
+matplotlib==3.5.3
+nptyping==2.2.0
+pip-chill==1.0.1
+ptvsd==4.3.2
+statsmodels==0.13.2
+strenum==0.4.8
diff --git a/sliders.py b/sliders.py
@@ -0,0 +1,103 @@
+''' Present an interactive function explorer with slider widgets.
+
+Scrub the sliders to change the properties of the ``sin`` curve, or
+type into the title text box to update the title of the plot.
+
+Use the ``bokeh serve`` command to run the example by executing:
+
+    bokeh serve sliders.py
+
+at your command prompt. Then navigate to the URL
+
+    http://localhost:5006/sliders
+
+in your browser.
+
+'''
+import os
+import bokeh
+import ptvsd
+
+# Optionally block until a debugger attaches. The switch is an environment
+# variable that is absent when the app is started with a plain
+# `bokeh serve sliders.py`, so it is read with .get(); indexing os.environ
+# directly would raise KeyError in that case.
+if os.environ.get('BOKEH_VS_DEBUG') == 'true':
+    # 5678 is the default attach port in the VS Code debug configurations
+    print('Waiting for debugger attach')
+    ptvsd.enable_attach(address=('localhost', 5678), redirect_output=True)
+    ptvsd.wait_for_attach()
+
+
+
+
+
+
+import numpy as np
+
+from bokeh.events import MenuItemClick
+from bokeh.io import curdoc
+from bokeh.layouts import column, row
+from bokeh.models import ColumnDataSource, Slider, TextInput, Dropdown
+from bokeh.plotting import figure
+
+# Set up data
+# N sample points of sin(x) on [0, 4*pi]; `source` is the data source the
+# plotted line reads from and that update_data() later replaces.
+N = 200
+x = np.linspace(0, 4*np.pi, N)
+y = np.sin(x)
+source = ColumnDataSource(data=dict(x=x, y=y))
+
+
+# Set up plot
+plot = figure(height=400, width=400, title="my sine wave",
+              tools="crosshair",
+              x_range=[0, 4*np.pi], y_range=[-2.5, 2.5])
+
+plot.line('x', 'y', source=source, line_width=3, line_alpha=0.6)
+
+
+# Set up widgets
+# The text input holds the plot title; the four sliders hold the parameters
+# that update_data() reads when it recomputes the curve.
+text = TextInput(title="title", value='my sine wave')
+offset = Slider(title="offset", value=0.0, start=-5.0, end=5.0, step=0.1)
+amplitude = Slider(title="amplitude", value=1.0, start=-5.0, end=5.0, step=0.1)
+phase = Slider(title="phase", value=0.0, start=0.0, end=2*np.pi)
+freq = Slider(title="frequency", value=1.0, start=0.1, end=5.1, step=0.1)
+
+# Dropdown entries are (label, item value) pairs; the None entry presumably
+# renders as a divider (see Bokeh's Dropdown documentation).
+menu = [("Item 1", "item_1"), ("Item 2", "item_2"), None, ("Item 3", "item_3")]
+ddown = Dropdown(label=menu[0][0], menu=menu)
+
+def temp(evt: MenuItemClick):
+    """Handle a click on a dropdown entry.
+
+    Prints the event, then shows the clicked item on the dropdown button
+    (every 'i' upper-cased, underscores turned into spaces). Clicking
+    'item_3' additionally swaps the menu for a single 'Item X' entry.
+    """
+    chosen = evt.item
+    print(f'{evt.event_name}: {chosen}')
+
+    pretty = chosen.replace('i', 'I').replace('_', ' ')
+    ddown.label = pretty
+
+    if chosen != 'item_3':
+        return
+    ddown.menu = [('Item X', 'item_x')]
+    ddown.label = 'Item X'
+
+ddown.on_click(temp)
+
+
+# Set up callbacks
+def update_title(attrname, old, new):
+    """Copy the current content of the title text box into the plot title."""
+    current = text.value
+    plot.title.text = current
+
+text.on_change('value', update_title)
+
+def update_data(attrname, old, new):
+    """Recompute the curve from the slider values and push it to the plot."""
+    # Current slider values
+    amp, shift = amplitude.value, offset.value
+    phi, k = phase.value, freq.value
+
+    # New curve on the same N-point grid over [0, 4*pi]
+    xs = np.linspace(0, 4*np.pi, N)
+    ys = amp*np.sin(k*xs + phi) + shift
+
+    source.data = {'x': xs, 'y': ys}
+
+# Any slider change triggers a recomputation of the curve.
+for w in (offset, amplitude, phase, freq):
+    w.on_change('value', update_data)
+
+
+# Set up layouts and add to document
+inputs = column(text, offset, amplitude, phase, freq, ddown)
+
+curdoc().add_root(row(inputs, plot, width=800))
+curdoc().title = "Sliders"
diff --git a/src/data/__init__.py b/src/data/__init__.py
@@ -0,0 +1,48 @@
+from typing import Union
+from typing import Iterable
+from xml.dom.minidom import Document, Element, parse
+from strenum import StrEnum
+from re import match
+
+
+class MetricID(StrEnum):
+    """Software metrics known to the extractor.
+
+    The member name is the metric's abbreviation, the member value its
+    human-readable description. ``QCMetricsExtractor.metrics_values`` takes a
+    member of this enum to select the metric to read.
+    """
+    TLOC = 'Total Lines of Code'
+    NOP = 'Number of Packages'
+    NOC = 'Number of Classes'
+    NOI = 'Number of Interfaces'
+    NOM = 'Number of Methods'
+    NOF = 'Number of Attributes'
+    NORM = 'Number of Overridden Methods'
+    PAR = 'Number of Parameters'
+    NSM = 'Number of Static Methods'
+    NSF = 'Number of Static Attributes'
+    WMC = 'Weighted methods per Class'
+    DIT = 'Depth of Inheritance Tree'
+    NSC = 'Number of Children'
+    LCOM = 'Lack of Cohesion of Methods'
+    MLOC = 'Method Lines of Code'
+    SIX = 'Specialization Index'
+    VG = 'McCabe Cyclomatic Complexity'
+    NBD = 'Nested Block Depth'
+    RMD = 'Normalized Distance'
+    CA = 'Afferent Coupling'
+    CE = 'Efferent Coupling'
+    RMI = 'Instability'
+    RMA = 'Abstractness'
+
+
+class QCMetricsExtractor:
+    """Reads metric values from one XML metrics file."""
+
+    def __init__(self, file: str) -> None:
+        """Parse ``file`` into a DOM document kept in ``self.xml``."""
+        self.xml: Document = parse(file)
+
+    def metrics_values(self, metric_id: MetricID) -> Iterable[Union[float, int]]:
+        """Yield the value(s) stored for ``metric_id``.
+
+        Only metrics whose first child element is a single ``Value`` element
+        are handled; for any other layout nothing is yielded.
+
+        Args:
+            metric_id: A ``MetricID`` member, or the name of one (e.g.
+                ``'TLOC'``).
+
+        Yields:
+            The value as ``int`` when it consists of digits only, otherwise
+            as ``float``.
+
+        Raises:
+            KeyError: If ``metric_id`` names no member, or the document has no
+                element for the metric.
+        """
+        # MetricID[...] is a lookup by member *name*. Handing it a member
+        # (whose string value is the description) raises KeyError, so members
+        # are used directly and only plain names go through the lookup.
+        member = metric_id if isinstance(metric_id, MetricID) else MetricID[metric_id]
+
+        # NOTE(review): getElementById only finds attributes the DOM knows to
+        # be of type ID, and the id searched for is the member's value (the
+        # description) -- confirm both against the actual file format.
+        metric: Element = self.xml.getElementById(member)
+        if metric is None:
+            raise KeyError(f'No element found for metric {member.name}')
+
+        # Skip text/comment nodes (e.g. indentation whitespace) which have no
+        # tagName, and look at the first child that is an element.
+        fc = next(
+            (node for node in metric.childNodes if node.nodeType == node.ELEMENT_NODE),
+            None)
+        if fc is not None and fc.tagName == 'Value':
+            # This metric has only one value, like TLOC.
+            v = fc.getAttribute('value')
+            if match(pattern=r'^\d+$', string=v):
+                yield int(v)
+            else:
+                yield float(v)
+