In [1]:
# hide
# default_exp scilint

# Data Science Notebook Linting

[nbQA](https://github.com/nbQA-dev/nbQA) lets you run any standard Python code quality tool on a Jupyter Notebook. The changes can be suggested or made in-place using `nbqa-mutate`.

Command Line Examples:
```
> nbqa autoflake my_notebook.ipynb -i --remove-unused-variables
--remove-all-unused-imports --nbqa-mutate
> nbqa black my_notebook.ipynb --nbqa-mutate
> nbqa check-ast my_notebook.ipynb
> nbqa doctest my_notebook.ipynb
> nbqa flake8 my_notebook.ipynb --extend-ignore=E203,E302,E305,E703
> nbqa isort my_notebook.ipynb --nbqa-mutate
> nbqa mypy my_notebook.ipynb --ignore-missing-imports
> nbqa pylint my_notebook.ipynb --disable=C0114
> nbqa pyupgrade my_notebook.ipynb --py36-plus --nbqa-mutate
```

In [2]:
# export

import ast
import os
from collections import Counter
from pathlib import Path

import nbformat
from fastcore.script import call_parse
from nbdev.export import read_nb
from nbqa.__main__ import _get_configs, _main
from nbqa.cmdline import CLIArgs
from nbqa.find_root import find_project_root
from sciflow.utils import load_nb_module

In [3]:
%load_ext autoreload
%autoreload 2

# NB Code Style

In [4]:
# export


def run_nbqa_cmd(cmd):
    print(f"Running {cmd}")
    project_root: Path = find_project_root(tuple([str(Path(".").resolve())]))
    args = CLIArgs.parse_args([cmd, str(project_root)])
    configs = _get_configs(args, project_root)
    output_code = _main(args, configs)
    return output_code

In [5]:
project_root: Path = find_project_root(tuple([str(Path(".").resolve())]))
assert os.path.basename(project_root) == "sciflow"

In [6]:
# export


@call_parse
def sciflow_tidy():
    "Run notebook formatting and tidy utilities. \
    These tools should be configured to run automatically without intervention."
    tidy_tools = ["black", "isort", "autoflake"]
    [run_nbqa_cmd(c) for c in tidy_tools]

# Quality relevant data extraction

## Definitions
* Function ($f$) = function in `# export` block
* Test ($\tau$) = call of exported function outside `# export` block

## Metrics
1. Tests per Function: $\mathrm{TpF}$ = $\dfrac{|\tau|}{f}$,when $f=0; \mathrm{TpF} = 0$
2. In-function Percentage: $\mathrm{IP} = $\# $\mathrm{statements-in-function}:$ \# $\mathrm{all-statements}$ 
3. MD to Code Ratio: $\mathrm{CMR}$ = # $ \mathrm{markdown-cells}:$ \# $\mathrm{code-cells}$ 
4. Total Code Lines: $\mathrm{TCL}$ = \# $\mathrm{all-code-lines}$ 

# 1. Tests-per-Function

In [7]:
# export


def get_function_defs(code):
    func_names = []
    for stmt in ast.walk(ast.parse(code)):
        if isinstance(stmt, ast.FunctionDef) and not stmt.name.startswith("_"):
            func_names.append(stmt.name)
    return func_names

In [8]:
# export


def count_func_calls(code, func_defs):
    func_calls = Counter({k: 0 for k in func_defs})
    for stmt in ast.walk(ast.parse(code)):
        if isinstance(stmt, ast.Call):
            func_name = stmt.func.id if "id" in stmt.func.__dict__ else stmt.func.attr
            if func_name in func_defs:
                if func_name in func_calls:
                    func_calls[func_name] += 1
    return func_calls

In [9]:
test_code = """self.hierarchical_topic_reduction(3); 
topic_reduction(3); 
lambda x: topic(x); 
hierarchical_topic_reduction[4]; 
hierarchical_topic_reduction(4); 
blabla()
"""
test_func_defs = [
    "topic",
    "topic_reduction",
    "blablabla",
    "hierarchical_topic_reduction",
]

In [10]:
assert count_func_calls(test_code, test_func_defs) == Counter(
    {
        "topic": 1,
        "topic_reduction": 1,
        "blablabla": 0,
        "hierarchical_topic_reduction": 2,
    }
)

In [11]:
# export


def calc_tpf(num_tests, num_funcs):
    return 0 if num_funcs == 0 else num_tests / num_funcs

In [12]:
assert calc_tpf(1, 1) == 1
assert calc_tpf(2, 1) == 2
assert calc_tpf(1, 2) == 0.5
assert calc_tpf(0, 1) == 0
assert calc_tpf(1, 0) == 0
assert calc_tpf(10, 1) == 10

In [13]:
# export


def tpf(nb_path):
    nb, module_code = load_nb_module(nb_path)
    pnb = nbformat.from_dict(nb)
    nb_cell_code = "\n".join(
        [c["source"].replace("%", "#") for c in pnb.cells if c["cell_type"] == "code"]
    )
    func_defs = get_function_defs(module_code)
    func_calls = count_func_calls(nb_cell_code, func_defs)
    num_funcs = len(func_calls.keys())
    num_tests = sum(func_calls.values())
    print(num_tests, num_funcs)
    return calc_tpf(num_tests, num_funcs)

In [14]:
tpf("test/test_module.ipynb")

0 1


0.0

In [15]:
tpf("test/test_data_handling.ipynb")

3 3


1.0

In [16]:
tpf("test/test_export.ipynb")

2 5


0.4

# 2. In-function Percentage

In [17]:
nb_cell_code = """
def something():
    pass; pass
#load_ext autoreload
#autoreload 2
# export


import numpy as np
import pandas as pd
from sciflow.utils import lib_path, odbc_connect, query
pd.set_option("display.max_colwidth", 800)
# export
"""

In [18]:
# export


def ifp(nb_path):
    nb = read_nb(nb_path)
    nb_cell_code = "\n".join(
        [c["source"].replace("%", "#") for c in nb.cells if c["cell_type"] == "code"]
    )
    stmts_in_func = 0
    stmts_outside_func = 0
    for stmt in ast.walk(ast.parse(nb_cell_code)):
        if isinstance(stmt, ast.FunctionDef) and not stmt.name.startswith("_"):
            for body_item in stmt.body:
                stmts_in_func += 1
        else:
            stmts_outside_func += 1
    return (
        0
        if stmts_outside_func + stmts_in_func == 0
        else stmts_in_func / (stmts_outside_func + stmts_in_func)
    )

In [19]:
ifp("test/test_data_handling.ipynb")

0.03289473684210526

In [20]:
ifp("test/test_export.ipynb")

0.07009345794392523

In [21]:
ifp("test/test_clustering.ipynb")

0.03773584905660377

# 3. Markdown to Code Ratio

In [22]:
# export


def mcr(nb_path):
    nb = read_nb(nb_path)
    md_cells = [c for c in nb.cells if c["cell_type"] == "markdown"]
    code_cells = [c for c in nb.cells if c["cell_type"] == "code"]
    num_code_cells = len(code_cells)
    num_md_cells = len(md_cells)
    return 0 if num_code_cells == 0 else num_md_cells / num_code_cells

In [23]:
mcr("test/test_module.ipynb")

0.25

In [24]:
mcr("test/test_export.ipynb")

0.3157894736842105

In [25]:
mcr("test/test_data_handling.ipynb")

0.0

# 4. Total Code Length

In [26]:
# export


def tcl(nb_path):
    nb = read_nb(nb_path)
    return sum([len(c["source"]) for c in nb.cells if c["cell_type"] == "code"])

In [27]:
tcl("test/test_module.ipynb")

290

In [28]:
tcl("test/test_clustering.ipynb")

4599

In [29]:
tcl("test/test_data_handling.ipynb")

1361