diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..9a9972c --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,25 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Bokeh server", + "type": "python", + "request": "attach", + "port": 5678, + "host": "localhost", + "preLaunchTask": "launch Bokeh server", + "postDebugTask": "kill Bokeh server" + }, + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "justMyCode": true + } + ] +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..8132481 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,54 @@ +{ + // See https://go.microsoft.com/fwlink/?LinkId=733558 + // for the documentation about the tasks.json format + "version": "2.0.0", + "tasks": [ + { + "label": "kill Bokeh server", + "command": "${command:workbench.action.tasks.terminate}", + "problemMatcher": {}, + "presentation": { + "reveal": "never", + "panel": "shared", + "showReuseMessage": false + } + }, + + { + "label": "launch Bokeh server", + "type": "shell", + "isBackground": true, + "command": "./venv/Scripts/bokeh", + "args": [ + "serve", + "sliders.py", + "--port", "5678" + ], + "options": { + "env": { + "BOKEH_VS_DEBUG": "true" + } + }, + // you have to allow the task to not complete by specifying a complete problem matcher + "problemMatcher": { + "fileLocation": [ + "relative", + "${workspaceFolder}" + ], + "pattern": [ + { + "regexp": ".", + "file": 1, + "location": 2, + "message": 3 + } + ], + "background": { + "activeOnStart": true, + "endsPattern": "^.*Waiting for debugger attach.*", + "beginsPattern": "^this should not match on anything" + } + } + } + ] +} \ 
No newline at end of file diff --git a/csv/metrics.csv.7z b/csv/metrics.csv.7z new file mode 100644 index 0000000..93cbd66 Binary files /dev/null and b/csv/metrics.csv.7z differ diff --git a/csv/readme.md b/csv/readme.md new file mode 100644 index 0000000..5adf00a --- /dev/null +++ b/csv/readme.md @@ -0,0 +1,5 @@ +CSV +===== + +Contains an archive with all extracted metric values from all systems. +Please refer to this [`readme.md`](../files/readme.md) for how `metrics.csv.7z` was created. diff --git a/files/metrics.7z b/files/metrics.7z new file mode 100644 index 0000000..a8f2cbb Binary files /dev/null and b/files/metrics.7z differ diff --git a/files/readme.md b/files/readme.md new file mode 100644 index 0000000..75002ec --- /dev/null +++ b/files/readme.md @@ -0,0 +1,8 @@ +Files +====== + +The original [`metrics.zip`](http://web.archive.org/web/20220814110913/http://java.labsoft.dcc.ufmg.br/qualitas.class/corpus/metrics.zip) (84.2 MB) from [http://java.labsoft.dcc.ufmg.br/qualitas.class/download.html](https://web.archive.org/web/20191223234321/http://java.labsoft.dcc.ufmg.br/qualitas.class/download.html) was repacked into `metrics.7z` (30.1 MB) using PPMd. + +The metric values from all these files (systems/projects) have previously been extracted into separate CSV files and merged into one large file, which is stored under [`../csv/metrics.csv.7z`](../csv/metrics.csv.7z) (496 KB). + +You can, however, extract your own metrics using methods of the class `QualitasCorpusMetricsExtractor`. The merged file from above retains no information other than the system, the metric, and the value, because the primary purpose of this repository is to approximate distributions. 
def get_file_metrics(files: list[str], proj: str, files_dir: str='./files', csv_dir: str='./csv') -> pd.DataFrame:
    """Extract every metric from the given files and store them as one CSV.

    Args:
        files: Names of the metrics files (relative to ``files_dir``).
        proj: Label written into the ``project`` column; also the name of
            the CSV file that is written (``{csv_dir}/{proj}.csv``).
        files_dir: Directory that contains ``files``.
        csv_dir: Directory the CSV file is written to.

    Returns:
        A data frame with the columns ``project``, ``metric`` and ``value``.
    """
    rows = []

    for file in files:
        qcme = QualitasCorpusMetricsExtractor(file=f'{files_dir}/{file}')
        # Iterate the enum itself rather than ``set(MetricID)``: a set of
        # strings has no stable order between interpreter runs, which made
        # the row order of the written CSV nondeterministic.
        for mid in MetricID:
            rows.extend(
                {'project': proj, 'metric': mid.name, 'value': v}
                for v in qcme.metrics_values(metric_id=mid))

    # Passing the columns explicitly keeps the header even when no value was
    # extracted; a header-less empty CSV cannot be read back by pandas.
    df = pd.DataFrame(rows, columns=['project', 'metric', 'value'])
    df.to_csv(f'{csv_dir}/{proj}.csv', index=False)
    return df


def convert_xml_to_csv(directory: str='./files'):
    """Convert all metrics files below ``directory`` into one CSV per prefix.

    Files are grouped by the first 7 characters of their name. That prefix
    suffices to split by system, yet retains the separate projects that make
    it up.

    Args:
        directory: Root directory that is walked for metrics files.
    """
    # NOTE(review): every file found is handed to the XML parser; files that
    # are not metrics XML (e.g. a readme or an archive) will make it fail.
    for root, _, files in walk(directory):
        seen = set()
        for file in sorted(files):
            prefix = file[0:7]
            if prefix in seen:
                # All files sharing this prefix were already converted.
                continue
            seen.add(prefix)
            get_file_metrics(
                files=[name for name in files if name.startswith(prefix)],
                proj=prefix,
                # Read from the directory actually being walked; previously
                # the default './files' was used regardless of `directory`.
                files_dir=root)


def concat_csv_files(directory: str='./csv', target_file_name: str='__ALL__.csv'):
    """Merge all CSV files located directly in ``directory`` into one file.

    Only ``*.csv`` files are read, and a previously written target file is
    skipped so that re-running does not duplicate its rows.

    Args:
        directory: Directory holding the per-prefix CSV files.
        target_file_name: Name of the merged file written into ``directory``.

    Raises:
        ValueError: If ``directory`` contains no CSV file to merge.
    """
    _, _, files = next(walk(directory), (directory, [], []))
    sources = sorted(
        name for name in files
        if name.lower().endswith('.csv') and name != target_file_name)
    if not sources:
        raise ValueError(f'No CSV files to concatenate in {directory!r}.')

    df = pd.concat(
        (pd.read_csv(f'{directory}/{name}') for name in sources),
        ignore_index=True)

    df.to_csv(f'{directory}/{target_file_name}', index=False)


if __name__ == '__main__':
    # The driver code is guarded so that importing this module does not read
    # CSV files or fit distributions as a side effect.
    d = Distribution(df=pd.read_csv('csv/metrics.csv'))
    data = d.get_cdf_data(metric_id=MetricID.RMA, unique_vals=False)
    cdf = Distribution.fit_parametric(data=data)
    # NOTE(review): the data fitted above is for RMA, but the result is
    # stored under a "VG" file name -- confirm which metric is intended.
    cdf.save_to_file(file='./results/cdf_VG.pickle')

    temp = pd.read_csv('csv/__ALL__.csv')

    # Adds tiny, distinct offsets to every value and counts the number of
    # unique values before (nu0) and after (nu) the perturbation.
    rng = np.random.default_rng(seed=1337)
    r = rng.choice(np.linspace(0, 1e-6, len(temp)), len(temp), replace=False)

    nu0 = len(np.unique(temp['value']))
    temp['value'] += r
    nu = len(np.unique(temp['value']))
''' Present an interactive function explorer with slider widgets.

Scrub the sliders to change the properties of the ``sin`` curve, or
type into the title text box to update the title of the plot.

Use the ``bokeh serve`` command to run the example by executing:

    bokeh serve sliders.py

at your command prompt. Then navigate to the URL

    http://localhost:5006/sliders

in your browser.

Set the environment variable ``BOKEH_VS_DEBUG`` to ``true`` to make the
script wait for a debugger to attach before the document is built.

'''
import os
import bokeh

# Use .get(): the variable is optional, and indexing os.environ directly
# raised a KeyError whenever the script was started without it.
if os.environ.get('BOKEH_VS_DEBUG') == 'true':
    # Imported here so that ptvsd is only needed when actually debugging.
    import ptvsd

    # 5678 is the default attach port in the VS Code debug configurations.
    # The printed text is what the "launch Bokeh server" task waits for.
    print('Waiting for debugger attach')
    ptvsd.enable_attach(address=('localhost', 5678), redirect_output=True)
    ptvsd.wait_for_attach()


import numpy as np

from bokeh.events import MenuItemClick
from bokeh.io import curdoc
from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, Slider, TextInput, Dropdown
from bokeh.plotting import figure

# Set up data
N = 200
x = np.linspace(0, 4*np.pi, N)
y = np.sin(x)
source = ColumnDataSource(data=dict(x=x, y=y))


# Set up plot
plot = figure(height=400, width=400, title="my sine wave",
              tools="crosshair",
              x_range=[0, 4*np.pi], y_range=[-2.5, 2.5])

plot.line('x', 'y', source=source, line_width=3, line_alpha=0.6)


# Set up widgets
text = TextInput(title="title", value='my sine wave')
offset = Slider(title="offset", value=0.0, start=-5.0, end=5.0, step=0.1)
amplitude = Slider(title="amplitude", value=1.0, start=-5.0, end=5.0, step=0.1)
phase = Slider(title="phase", value=0.0, start=0.0, end=2*np.pi)
freq = Slider(title="frequency", value=1.0, start=0.1, end=5.1, step=0.1)

menu = [("Item 1", "item_1"), ("Item 2", "item_2"), None, ("Item 3", "item_3")]
ddown = Dropdown(label=menu[0][0], menu=menu)


def temp(evt: MenuItemClick):
    """Show the clicked menu entry as the dropdown's label.

    Clicking ``item_3`` additionally replaces the whole menu by a single
    entry, ``Item X``.
    """
    print(f'{evt.event_name}: {evt.item}')
    if evt.item == 'item_3':
        ddown.menu = [('Item X', 'item_x')]
        ddown.label = 'Item X'
        return

    # Look the label up in the current menu instead of deriving it from the
    # value; the derived form turned 'item_x' into 'Item x', which does not
    # match the menu entry 'Item X'. None entries are menu separators.
    labels = {entry[1]: entry[0] for entry in ddown.menu if isinstance(entry, tuple)}
    fallback = evt.item.replace('i', 'I').replace('_', ' ')
    ddown.label = labels.get(evt.item, fallback)


ddown.on_click(temp)


# Set up callbacks
def update_title(attrname, old, new):
    """Copy the text box content into the plot title."""
    plot.title.text = text.value


text.on_change('value', update_title)


def update_data(attrname, old, new):
    """Recompute the curve from the current slider values."""
    # Get the current slider values
    a = amplitude.value
    b = offset.value
    w = phase.value
    k = freq.value

    # Generate the new curve
    x = np.linspace(0, 4*np.pi, N)
    y = a*np.sin(k*x + w) + b

    source.data = dict(x=x, y=y)


for widget in [offset, amplitude, phase, freq]:
    widget.on_change('value', update_data)


# Set up layouts and add to document
inputs = column(text, offset, amplitude, phase, freq, ddown)

curdoc().add_root(row(inputs, plot, width=800))
curdoc().title = "Sliders"
class QCMetricsExtractor:
    """Reads single-valued metrics from a metrics XML file using minidom."""

    def __init__(self, file: str) -> None:
        """Parse the XML document stored at ``file``."""
        self.xml: Document = parse(file=file)

    def metrics_values(self, metric_id: 'MetricID') -> Iterable[Union[float, int]]:
        """Yield the value of a metric that carries exactly one value.

        The metric element is the first element whose ``id`` attribute equals
        ``metric_id.name``. Only metrics whose first child element is a
        ``Value`` element (like TLOC) are supported; for anything else, and
        for metrics that are absent from the document, nothing is yielded.

        Args:
            metric_id: The metric to look up; its ``name`` is the XML id.

        Yields:
            The value as ``int`` if it consists of digits only, else ``float``.

        Raises:
            ValueError: If the ``value`` attribute is not numeric.
        """
        # Match on the plain `id` attribute. minidom's getElementById() only
        # knows attributes that a DTD declares to be of type ID, and
        # `MetricID[metric_id]` looked a member up by its *value* instead of
        # its name, which raises a KeyError.
        metric = next(
            (elem for elem in self.xml.getElementsByTagName('*')
             if elem.getAttribute('id') == metric_id.name),
            None)
        if metric is None:
            return

        # `firstChild` may be a whitespace text node that has no `tagName`,
        # so pick the first child that is an element.
        fc = next(
            (node for node in metric.childNodes
             if node.nodeType == node.ELEMENT_NODE),
            None)
        if fc is None or fc.tagName != 'Value':
            return

        # This metric has only one value, like TLOC.
        v = fc.getAttribute('value')
        if match(pattern=r'^\d+$', string=v):
            yield int(v)
        else:
            yield float(v)
class QualitasCorpusMetricsExtractor:
    """Extracts metric values from a metrics XML file and converts them to CSV.

    NOTE(review): this module imports from ``xml.etree.cElementTree`` at file
    level. That module was removed in Python 3.9, so the file-level import
    should be switched to ``xml.etree.ElementTree``; this class imports what
    it needs from there itself.
    """

    def __init__(self, file: str) -> None:
        """Parse ``file`` and keep its root element and namespace map."""
        # One pass yields both the root and the namespaces; the file used to
        # be parsed twice, with the first result being discarded.
        self.xml, self.ns = QualitasCorpusMetricsExtractor.parse_xml(file=file)

    @staticmethod
    def parse_xml(file: str) -> 'Tuple[Element, dict[str, str]]':
        """Parse an XML file, collecting its namespace declarations.

        Args:
            file: Path of the XML file.

        Returns:
            The root element and a mapping of namespace prefix to URI.
        """
        from xml.etree.ElementTree import iterparse

        # With only 'start-ns' events requested, each event's payload is a
        # (prefix, uri) pair; the root is available once the iterator is
        # exhausted.
        xml_iter = iterparse(file, events=['start-ns'])
        xml_namespaces = dict(prefix_namespace_pair for _, prefix_namespace_pair in xml_iter)
        return xml_iter.root, xml_namespaces

    @staticmethod
    def to_numeric(value: str) -> Union[float, int]:
        """Convert ``value`` to ``int`` if it is digits only, else to ``float``.

        Raises:
            ValueError: If ``value`` is not a number.
        """
        if match(pattern=r'^\d+$', string=value):
            return int(value)
        return float(value)

    def metrics_values(self, metric_id: 'MetricID') -> Iterable[Union[float, int]]:
        """Yield all numeric values recorded for a metric.

        The metric element is the first element whose ``id`` attribute equals
        ``metric_id.name``; all ``Value`` elements below it are considered.
        ``Value`` elements without a ``value`` attribute or with non-numeric
        content are skipped. Nothing is yielded if the metric is absent.

        Args:
            metric_id: The metric to look up; its ``name`` is the XML id.
        """
        metric = self.xml.find(f'.//*[@id="{metric_id.name}"]', self.ns)
        if metric is None:
            # The file does not contain this metric at all.
            return

        for elem in metric.findall('.//Value', self.ns):
            try:
                yield QualitasCorpusMetricsExtractor.to_numeric(value=elem.attrib['value'])
            except (KeyError, ValueError):
                # Best effort: ignore entries that carry no usable number.
                continue

    @staticmethod
    def get_file_metrics(files: list[str], system: str, files_dir: str='./files', csv_dir: str='./csv') -> pd.DataFrame:
        """Extract every metric from the given files and store them as one CSV.

        Args:
            files: Names of the metrics files (relative to ``files_dir``).
            system: Label written into the ``system`` column; also the name
                of the CSV file that is written (``{csv_dir}/{system}.csv``).
            files_dir: Directory that contains ``files``.
            csv_dir: Directory the CSV file is written to.

        Returns:
            A data frame with the columns ``system``, ``metric``, ``value``.
        """
        rows = []

        for file in files:
            qcme = QualitasCorpusMetricsExtractor(file=f'{files_dir}/{file}')
            # Iterate the enum itself rather than ``set(MetricID)``: a set of
            # strings has no stable order between interpreter runs, which
            # made the row order of the written CSV nondeterministic.
            for mid in MetricID:
                rows.extend(
                    {'system': system, 'metric': mid.name, 'value': v}
                    for v in qcme.metrics_values(metric_id=mid))

        # Explicit columns keep the header even when nothing was extracted;
        # a header-less empty CSV cannot be read back by pandas.
        df = pd.DataFrame(rows, columns=['system', 'metric', 'value'])
        df.to_csv(f'{csv_dir}/{system}.csv', index=False)
        return df

    @staticmethod
    def convert_xml_to_csv(directory: str='./files'):
        """Convert all metrics files below ``directory`` to one CSV per prefix.

        Files are grouped by the first 7 characters of their name. That
        prefix suffices to split by system, yet retains the separate projects
        that make it up.

        Args:
            directory: Root directory that is walked for metrics files.
        """
        # NOTE(review): every file found is handed to the XML parser; files
        # that are not metrics XML will make it fail.
        for root, _, files in walk(directory):
            seen = set()
            for file in sorted(files):
                prefix = file[0:7]
                if prefix in seen:
                    # All files sharing this prefix were already converted.
                    continue
                seen.add(prefix)
                QualitasCorpusMetricsExtractor.get_file_metrics(
                    files=[name for name in files if name.startswith(prefix)],
                    system=prefix,
                    # Read from the directory actually being walked; the
                    # default './files' used to be applied regardless of
                    # `directory`.
                    files_dir=root)

    @staticmethod
    def concat_csv_files(directory: str='./csv', target_file_name: str='__ALL__.csv'):
        """Merge all CSV files located directly in ``directory`` into one file.

        Only ``*.csv`` files are read, and a previously written target file
        is skipped so that re-running does not duplicate its rows.

        Args:
            directory: Directory holding the per-system CSV files.
            target_file_name: Name of the merged file written to ``directory``.

        Raises:
            ValueError: If ``directory`` contains no CSV file to merge.
        """
        _, _, files = next(walk(directory), (directory, [], []))
        sources = sorted(
            name for name in files
            if name.lower().endswith('.csv') and name != target_file_name)
        if not sources:
            raise ValueError(f'No CSV files to concatenate in {directory!r}.')

        df = pd.concat(
            (pd.read_csv(f'{directory}/{name}') for name in sources),
            ignore_index=True)

        df.to_csv(f'{directory}/{target_file_name}', index=False)
@staticmethod + def concat_csv_files(directory: str='./csv', target_file_name: str='__ALL__.csv'): + _, __, files = list(walk(directory))[0] + + df = pd.concat( + map(pd.read_csv, list(map(lambda s: f'{directory}/{s}', files))), ignore_index=True) + + df.to_csv(f'{directory}/{target_file_name}', index=False) diff --git a/src/distribution/__init__.py b/src/distribution/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/distribution/distribution.py b/src/distribution/distribution.py new file mode 100644 index 0000000..01fd6ef --- /dev/null +++ b/src/distribution/distribution.py @@ -0,0 +1,109 @@ +from typing import Any, Callable +from nptyping import NDArray, Shape, Float, String +from src.data.metrics import MetricID +from statsmodels.distributions import ECDF as SMEcdf +from scipy.stats import kstest +import pandas as pd +import numpy as np +import scipy.stats +import pickle + + + + +class DensityFunc: + def __init__(self, range: tuple[float, float], func: Callable[[float], float]) -> None: + self.range = range + self.func = np.vectorize(func) + + def __call__(self, x: NDArray[Shape["*"], Float]) -> NDArray[Shape["*"], Float]: + x = np.max(self.range[0], np.min(self.range[1], x)) + return self.func(x) + + def save_to_file(self, file: str) -> None: + with open(file=file, mode='wb') as f: + pickle.dump(obj=self, file=f) + + @staticmethod + def load_from_file(file: str) -> 'DensityFunc': + with open(file=file, mode='rb') as f: + return pickle.load(file=f) + + + +class ECDF(DensityFunc): + def __init__(self, data: NDArray[Shape["*"], Float]) -> None: + super().__init__(range=(np.min(data), np.max(data)), func=SMEcdf(data)) + + + +class CDF(DensityFunc): + def __init__(self, name: str, range: tuple[float, float], func: Callable[[float], float], pval: float, dstat: float) -> None: + super().__init__(range, func) + self.name = name + self.pval = pval + self.dstat = dstat + + + +class Distribution: + def __init__(self, df: pd.DataFrame) -> None: + 
self.df = df + + + @property + def available_systems(self) -> NDArray[Shape["*"], String]: + return self.df['project'].unique() + + + def get_cdf_data(self, metric_id: MetricID, system: str=None, unique_vals: bool=True) -> NDArray[Shape["*"], Float]: + new_df = self.df[self.df['metric'] == metric_id.name] + if system is not None: + new_df = new_df[new_df['system'] == system] + + vals = new_df['value'] + if unique_vals: + rng = np.random.default_rng(seed=1337) + r = rng.choice(np.linspace(0, 1e-6, vals.size), vals.size, replace=False) + # Add small but insignificant perturbations to the data to produce unique + # values that would otherwise be eliminated by certain methods. + vals += r + + return vals.to_numpy() + + @staticmethod + def fit_parametric(data: NDArray[Shape["*"], Float], alpha: float=0.05, max_samples: int=10_000) -> CDF: + distNames = ['gamma', 'gennorm', 'genexpon', 'expon', 'exponnorm', + 'exponweib', 'exponpow', 'genextreme', 'gausshyper', 'dweibull', 'invgamma', 'gilbrat','genhalflogistic', 'ncf', 'nct', 'ncx2', 'pareto', 'uniform', 'pearson3', 'mielke', 'moyal', 'nakagami', 'laplace', 'laplace_asymmetric', 'rice', 'rayleigh', 'trapezoid', 'vonmises','kappa4', 'lomax', 'loguniform', 'loglaplace', 'foldnorm', 'kstwobign', 'erlang', 'ksone','chi2', 'logistic', 'johnsonsb', 'gumbel_l', 'gumbel_r', 'genpareto', 'powerlognorm', 'bradford', 'alpha', 'tukeylambda', 'wald', 'maxwell', 'loggamma', 'fisk', 'cosine', 'burr', + 'beta', 'betaprime', 'crystalball', 'burr12', 'anglit', 'arcsine', 'gompertz', 'geninvgauss'] + + if data.shape[0] > max_samples: + # Then we will sub-sample to speed up the process. 
+ data = np.random.choice(data, size=max_samples, replace=False) + + best_kst = None + use_dist: tuple = None + for distName in distNames: + res = float('inf') + + try: + dist = getattr(scipy.stats, distName) + distParams = dist.fit(data) + kst = kstest(data, cdf=dist.cdf, args=distParams) + + if kst.pvalue >= alpha and kst.statistic < res: + res = kst.statistic + best_kst = kst + use_dist = (distName, dist.cdf, distParams) + except Exception as ex: + print(ex) + pass + + if use_dist is None: + raise Exception('Cannot fit parametric distribution for given data.') + + + def cdf(x): + return use_dist[1](*(x, *use_dist[2])) + + return CDF(name=use_dist[0], range=(np.min(data), np.max(data)), func=cdf, pval=best_kst.pvalue, dstat=best_kst.statistic) diff --git a/src/webapp/__init__.py b/src/webapp/__init__.py new file mode 100644 index 0000000..e69de29