In [1]:
#hide
#default_exp test_ratio
from nbdev.showdoc import show_doc

In [68]:
# export

from collections import Counter
import ast
import nbformat

from nbdev.export import read_nb
from sciflow.utils import load_nb_module

In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Quality relevant data extraction

## Definitions
* Function ($f$) = function in `# export` block
* Test ($\tau$) = call of exported function outside `# export` block

## Metrics
1. Tests per Function: $\mathrm{TpF}$ = $\dfrac{|\tau|}{f}$,when $f=0; \mathrm{TpF} = 0$
2. In-function Percentage: $\mathrm{IP} = $\# $\mathrm{statements-in-function}:$ \# $\mathrm{all-statements}$ 
3. MD to Code Ratio: $\mathrm{CMR}$ = # $ \mathrm{markdown-cells}:$ \# $\mathrm{code-cells}$ 
4. Total Code Lines: $\mathrm{TCL}$ = \# $\mathrm{all-code-lines}$ 

# 1. Tests-per-Function

In [124]:
def get_function_defs(code):
    func_names = []
    for stmt in ast.walk(ast.parse(code)):
        if isinstance(stmt, ast.FunctionDef) and not stmt.name.startswith('_'):
            func_names.append(stmt.name)
    return func_names

In [125]:
def count_func_calls(code, func_defs):
    func_calls = Counter({k: 0 for k in func_defs})
    for stmt in ast.walk(ast.parse(code)):
        if isinstance(stmt, ast.Call):
            func_name = stmt.func.id if 'id' in stmt.func.__dict__ else stmt.func.attr
            if func_name in func_defs:
                if func_name in func_calls:
                    func_calls[func_name] += 1
    return func_calls

In [126]:
test_code = """self.hierarchical_topic_reduction(3); 
topic_reduction(3); 
lambda x: topic(x); 
hierarchical_topic_reduction[4]; 
hierarchical_topic_reduction(4); 
blabla()
"""
test_func_defs = ['topic', 'topic_reduction', 'blablabla', 'hierarchical_topic_reduction']

In [127]:
assert count_func_calls(test_code, test_func_defs) == Counter({'topic': 1,
         'topic_reduction': 1,
         'blablabla': 0,
         'hierarchical_topic_reduction': 2})

In [128]:
def calc_tpf(num_tests, num_funcs):
    return 0 if num_funcs == 0 else num_tests/num_funcs

In [129]:
assert calc_tpf(1,1) == 1
assert calc_tpf(2,1) == 2
assert calc_tpf(1,2) == 0.5
assert calc_tpf(0,1) == 0
assert calc_tpf(1,0) == 0
assert calc_tpf(10,1) == 10

In [130]:
def tpf(nb_path):
    nb, module_code = load_nb_module(nb_path)
    pnb = nbformat.from_dict(nb)
    nb_cell_code = '\n'.join([c['source'].replace('%', '#') for c in pnb.cells if c['cell_type'] == 'code'])
    func_defs = get_function_defs(module_code)
    func_calls = count_func_calls(nb_cell_code, func_defs)
    num_funcs = len(func_calls.keys())
    num_tests = sum(func_calls.values())
    print(num_tests, num_funcs)
    return calc_tpf(num_tests, num_funcs)

In [131]:
tpf("test/test_module.ipynb")

0 1


0.0

In [132]:
tpf("test/test_export.ipynb")

2 5


0.4

In [133]:
tpf("test/test_data_handling.ipynb")

3 3


1.0

# 2. In-function Percentage

In [134]:
nb_cell_code = """
def something():
    pass; pass
#load_ext autoreload
#autoreload 2
# export


import numpy as np
import pandas as pd
from sciflow.utils import lib_path, odbc_connect, query
pd.set_option("display.max_colwidth", 800)
# export
"""

In [138]:
def ifp(nb_path):
    nb = read_nb(nb_path)
    nb_cell_code = '\n'.join([c['source'].replace('%', '#') for c in nb.cells if c['cell_type'] == 'code'])
    stmts_in_func = 0
    stmts_outside_func = 0
    for stmt in ast.walk(ast.parse(nb_cell_code)):
        if isinstance(stmt, ast.FunctionDef) and not stmt.name.startswith('_'):
            for body_item in stmt.body:
                stmts_in_func += 1
        else:
            stmts_outside_func += 1
    return 0 if stmts_outside_func + stmts_in_func == 0 else stmts_in_func/ (stmts_outside_func + stmts_in_func)

In [141]:
ifp("test/test_data_handling.ipynb")

0.032679738562091505

In [142]:
ifp("test/test_export.ipynb")

0.07009345794392523

In [143]:
ifp("test/test_clustering.ipynb")

0.03551401869158879

# 3. Markdown to Code Ratio

In [81]:
def cmr(nb_path):
    nb = read_nb(nb_path)
    md_cells = [c for c in nb.cells if c['cell_type'] == 'markdown']
    code_cells = [c for c in nb.cells if c['cell_type'] == 'code']
    num_code_cells = len(code_cells)
    num_md_cells = len(md_cells)
    return 0 if num_code_cells == 0 else num_md_cells/num_code_cells

In [82]:
cmr("test/test_module.ipynb")

0.5

In [83]:
cmr("test/test_export.ipynb")

0.3157894736842105

In [84]:
cmr("test/test_clustering.ipynb")

0.30303030303030304

In [85]:
cmr("test/test_module.ipynb")

0.5

# 4. Total Code Length

In [75]:
def tcl(nb_path):
    nb = read_nb(nb_path)
    return sum([len(c['source']) for c in nb.cells if c['cell_type'] == 'code'])

In [76]:
tcl("test/test_module.ipynb")

290

In [77]:
tcl("test/test_export.ipynb")

1375

In [78]:
tcl("test/test_data_handling.ipynb")

1367

In [80]:
tcl("test/test_clustering.ipynb")

6406