In [2]:
# | include: false
# | default_exp scilint

In [3]:
# | export

import ast
import json
import operator
import os
import re
import shutil
import sys
import warnings
from collections import Counter
from configparser import InterpolationMissingOptionError
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, Tuple

import nbformat
import numpy as np
import pandas as pd
import yaml
from execnb.nbio import read_nb
from fastcore.script import Param, call_parse, store_false
from fastcore.xtras import globtastic
from nbdev.clean import nbdev_clean
from nbdev.config import get_config
from nbdev.doclinks import nbdev_export, nbglob
from nbdev.quarto import nbdev_docs, nbdev_readme
from nbdev.test import nbdev_test
from nbqa.__main__ import _get_configs, _main
from nbqa.cmdline import CLIArgs
from nbqa.find_root import find_project_root
from scilint.indicators import *

In [4]:
# Enable IPython's autoreload extension (mode 2) for this notebook session.
%load_ext autoreload
%autoreload 2

# Test Data Prep

In [5]:
# Paths of the example notebooks used as fixtures, relative to the current working directory.
nbdev_path = Path(Path(".").resolve(), "example_nbs", "nbdev.ipynb")
nbdev_hq_path = Path(Path(".").resolve(), "example_nbs", "nbdev_high_quality.ipynb")
non_nbdev_path = Path(Path(".").resolve(), "example_nbs", "non_nbdev.ipynb")
non_nbdev_lq_path = Path(
    Path(".").resolve(), "example_nbs", "non_nbdev_low_quality.ipynb"
)
index_path = Path(Path(".").resolve(), "index.ipynb")
syntax_error_path = Path(Path(".").resolve(), "syntax_error.ipynb")

# Parsed notebook objects for each fixture.
nbdev_nb = read_nb(nbdev_path)
nbdev_hq_nb = read_nb(nbdev_hq_path)
non_nbdev_nb = read_nb(non_nbdev_path)
non_nbdev_lq_nb = read_nb(non_nbdev_lq_path)
index = read_nb(index_path)
# Load the syntax-error fixture from its own path (not from `index_path`).
syntax_error = read_nb(syntax_error_path)

# Tidy 

A simple wrapper around `nbqa` that runs a fixed set of tidy tools over the whole project, with no options to choose.

In [6]:
# | export


def run_nbqa_cmd(cmd):
    """Run a single tool through nbqa against the project root.

    `cmd` is the tool name handed to nbqa's CLI argument parser together with
    the project root located from the current working directory. The value
    returned by nbqa's `_main` is passed back to the caller unchanged.
    """
    print(f"Running {cmd}")
    cwd = str(Path(".").resolve())
    root: Path = find_project_root((cwd,))
    cli_args = CLIArgs.parse_args([cmd, str(root)])
    # Configuration is resolved by nbqa itself from the parsed args and the root.
    return _main(cli_args, _get_configs(cli_args, root))

In [7]:
# Sanity check: the project root located from the current directory is the "scilint" folder.
project_root: Path = find_project_root(tuple([str(Path(".").resolve())]))
assert os.path.basename(project_root) == "scilint"

In [8]:
# | export


def tidy():
    """Run black, isort and autoflake (in that order) over the project via nbqa."""
    for tool in ("black", "isort", "autoflake"):
        run_nbqa_cmd(tool)

# Helpers

In [9]:
# | export


def get_project_root(path: Path = None):
    """Return the project root that nbqa's `find_project_root` locates for `path`.

    Args:
        path: Directory (or file) to start the search from. When omitted, the
            current working directory at call time is used.

    Returns:
        Whatever `find_project_root` returns for the resolved `path`.
    """
    if path is None:
        # Resolve the default lazily so it tracks the working directory at call
        # time instead of being frozen when the module is imported.
        path = Path(".")
    # Pass the resolved path itself; an empty string would discard the argument.
    return find_project_root((str(Path(path).resolve()),))

In [10]:
# | export


def is_nbdev_project(project_path: Path = Path(".")):
    """Tell whether `project_path` belongs to an nbdev project.

    The answer is True only when the located project root contains a
    `settings.ini` file and reading `lib_name` from the nbdev config does not
    raise `InterpolationMissingOptionError`.
    """
    root = find_project_root((str(project_path.resolve()),))
    has_settings = Path(root, "settings.ini").exists()

    try:
        # Accessing the attribute is what triggers the interpolation error.
        get_config().lib_name
    except InterpolationMissingOptionError:
        return False

    return has_settings

In [11]:
# With the default path (the current directory) the project is detected as nbdev.
assert is_nbdev_project()

In [12]:
import tempfile

# A fresh temporary directory must not be detected as an nbdev project.
with tempfile.TemporaryDirectory() as tmp_dir:
    assert not is_nbdev_project(Path(tmp_dir))

# Linting Functions

## `lint_nb`

In [13]:
# | export


def lint_nb(
    spec_name: str,
    nb_path: Path,
    conf: Dict[str, Any],
    indicators: Dict[str, Callable],
    include_in_scoring: bool,
) -> Tuple[float]:
    """Compute every indicator for a single notebook.

    Each indicator callable receives the notebook read from `nb_path`; its
    result is rounded to `conf["precision"]` digits. If an indicator raises
    `SyntaxError`, evaluation stops, the remaining values stay NaN and the
    error is printed when `conf["print_syntax_errors"]` is truthy.

    Returns:
        A tuple laid out as `(spec_name, <one value per indicator>,
        has_syntax_error, include_in_scoring)`.
    """
    notebook = read_nb(nb_path)

    # Pre-fill with NaN so indicators skipped after a syntax error stay missing.
    values = list(np.repeat(np.nan, len(indicators)))
    syntax_error_found = False
    try:
        for pos, indicator in enumerate(indicators.values()):
            values[pos] = round(indicator(notebook), conf["precision"])
    except SyntaxError as err:
        if conf["print_syntax_errors"]:
            print(f"Syntax error in notebook: {nb_path} reason: ", err)
        syntax_error_found = True

    return (spec_name, *values, syntax_error_found, include_in_scoring)

In [31]:
# TODO test

## `_get_excluded_paths`

In [14]:
# | export


def _get_excluded_paths(paths: Iterable[Path], exclude_pattern: str) -> Iterable[Path]:
    """Excluded paths should either be absolute paths or paths rooted at the project root directory"""
    excl_paths = []
    paths = [p.absolute() for p in paths]

    for ex_pattern in exclude_pattern.split(","):
        if Path(ex_pattern).is_absolute():
            ex_path = Path(ex_pattern)
        else:
            ex_path = Path(get_project_root(), ex_pattern)

        if ex_path.exists():
            excl_paths.extend([p for p in paths if ex_pattern in str(p)])
        elif not ex_path.exists():
            raise ValueError(f"Path component: {ex_path} does not exist")
        else:
            raise ValueError(
                f"Invalid exclusion pattern: {ex_path} pattern is comma separrated list of 'dir/' for directories and 'name.ipynb' for specific notebook"
            )
    return excl_paths

In [15]:
# All notebooks found under the current directory are the exclusion candidates.
paths = [Path(p) for p in nbglob(Path("."))]
# A directory entry plus a single-notebook entry.
assert sorted(
    [
        p.name
        for p in _get_excluded_paths(
            paths, exclude_pattern="nbs/example_nbs/experimental,nbs/index.ipynb"
        )
    ]
) == sorted(["non_nbdev.ipynb", "nbdev.ipynb", "index.ipynb"])
# A single-notebook entry on its own.
assert sorted(
    [
        p.name
        for p in _get_excluded_paths(
            paths, exclude_pattern="nbs/example_nbs/nbdev.ipynb"
        )
    ]
) == sorted(["nbdev.ipynb"])

## `_calculate_warnings`

In [16]:
# | export


def _calculate_warnings(
    spec_name: str,
    scoring_report: pd.DataFrame,
    conf: Dict[str, Any],
    include_missing: bool = False,
) -> Tuple[pd.DataFrame, int]:
    """Find every indicator value that breaches a threshold in `conf["warnings"]`.

    `conf["warnings"]` maps an operator name to a mapping of indicator name ->
    threshold. The operator names "lt" and "gt" select less-than and
    greater-than; any other name is treated as equality.

    Args:
        spec_name: Name of the spec, recorded on every warning row.
        scoring_report: One row per notebook, one column per indicator.
        conf: Spec configuration; only the "warnings" entry is read here.
        include_missing: When True, missing indicator values also count as
            warnings.

    Returns:
        A tuple `(warnings, count)` where `warnings` is the frame built by
        `_reshape_warnings` and `count` is its number of rows.
    """
    comparators = {"lt": operator.lt, "gt": operator.gt}
    warning_details = []

    for op_text, thresholds in conf["warnings"].items():
        # Unknown operator names fall back to equality.
        compare = comparators.get(op_text, operator.eq)
        for indic, threshold in thresholds.items():
            metric_series = scoring_report[indic]
            breaches = compare(metric_series, threshold)
            if include_missing:
                breaches = breaches | pd.isnull(metric_series)
            # Keyed by notebook (the report index); each value carries what is
            # needed to explain the warning.
            warning_details.append(
                {
                    nb: (indic, val, op_text, threshold)
                    for nb, val in metric_series[breaches].to_dict().items()
                }
            )

    all_warns = _reshape_warnings(spec_name, scoring_report, warning_details)
    return all_warns, len(all_warns)

In [32]:
# TODO test

## `_reshape_warnings`

In [17]:
# | export


def _reshape_warnings(
    spec_name: str, scoring_report: pd.DataFrame, warning_details: Iterable[Any]
) -> Dict[str, Iterable[Tuple]]:
    warnings_by_nb = {nb: [] for nb in scoring_report.index}
    for nb in scoring_report.index:
        for wd in warning_details:
            if nb in wd:
                warnings_by_nb[nb].append(tuple([spec_name, nb] + list(wd[nb])))
    warnings_by_nb = {key: val for key, val in warnings_by_nb.items() if len(val) > 0}
    flattened_warns = [item for sublist in warnings_by_nb.values() for item in sublist]
    return pd.DataFrame.from_records(
        data=flattened_warns,
        columns=[
            "spec_name",
            "notebook",
            "indicator",
            "value",
            "operator",
            "threshold",
        ],
    )

In [33]:
# TODO test

## `lint_nbs`

In [18]:
# | export


def lint_nbs(
    spec_name: str,
    conf: Dict[str, Any],
    indicators: Dict[str, Callable],
    nb_paths: Iterable[Path] = None,
    nb_glob: Path = None,
):
    """Lint a set of notebooks against one spec.

    The notebooks are taken from `nb_paths` when given, otherwise they are
    discovered with `nbglob(nb_glob)`. Notebooks matched by
    `conf["exclusions"]` are still linted but flagged as not included in
    scoring, so they produce no warnings.

    Returns:
        `(lint_report, all_warns, num_warnings)`, or `(None, None, None)` when
        there is no notebook to lint. `lint_report` is indexed by notebook
        stem and sorted by `in_func_pct` then `markdown_code_pct`, descending.
    """
    candidates = nbglob(nb_glob) if nb_paths is None else nb_paths
    nb_paths = [Path(p).absolute() for p in candidates]

    if not nb_paths:
        return None, None, None

    exclusions = conf["exclusions"]
    excluded_paths = (
        _get_excluded_paths(nb_paths, exclude_pattern=exclusions)
        if exclusions is not None
        else None
    )

    rows, nb_names = [], []
    for nb_path in nb_paths:
        in_scoring = exclusions is None or nb_path not in excluded_paths
        nb_names.append(nb_path.stem)
        rows.append(lint_nb(spec_name, nb_path, conf, indicators, in_scoring))

    columns = ["spec_name", *indicators.keys(), "has_syntax_error", "include_in_scoring"]
    lint_report = pd.DataFrame.from_records(data=rows, index=nb_names, columns=columns)
    lint_report = lint_report.sort_values(
        ["in_func_pct", "markdown_code_pct"], ascending=False
    )

    # Only notebooks flagged for scoring can raise warnings.
    scoring_report = lint_report[lint_report.include_in_scoring].copy()
    all_warns, num_warnings = _calculate_warnings(spec_name, scoring_report, conf)
    return lint_report, all_warns, num_warnings

## `_map_paths_specs`

In [19]:
# | export


def _map_paths_specs(nb_glob: Path = None, specs_glob: Path = Path(".").resolve()):
    """Group notebooks by the spec file that applies to them.

    Spec files are the `scilint-*.yaml` files found under `specs_glob`. A
    notebook is assigned to every spec file located in the notebook's own
    directory. Notebooks without such a spec go to `scilint-default.yaml` when
    one was found, otherwise to the placeholder key `Path("scilint-default")`.

    Returns:
        A dict mapping each spec path to its list of notebook paths; spec
        files with no matching notebook map to an empty list.
    """
    nbs = nbglob(nb_glob)
    spec_files = [
        Path(found)
        for found in globtastic(
            specs_glob,
            file_glob="scilint-*.yaml",
            skip_folder_re="ipynb_checkpoints|_proc",
        )
    ]
    default_spec_file = next(
        (spec for spec in spec_files if spec.name == "scilint-default.yaml"), None
    )
    # Special case: not actually a valid file path - triggers loading a fallback
    fallback_path = Path("scilint-default")

    spec_nbs = {spec: [] for spec in spec_files}
    for nb in map(Path, nbs):
        same_dir_specs = [spec for spec in spec_files if spec.parent == nb.parent]
        if same_dir_specs:
            for spec in same_dir_specs:
                spec_nbs[spec].append(nb)
        elif default_spec_file is not None:
            spec_nbs[default_spec_file].append(nb)
        else:
            spec_nbs.setdefault(fallback_path, []).append(nb)

    return spec_nbs

In [20]:
# Notebooks and specs both searched in the legacy folder: its spec owns both notebooks.
legacy_dir = Path(Path(".").resolve(), "example_nbs/legacy")
legacy_spec_nbs = _map_paths_specs(legacy_dir, legacy_dir)
legacy_spec = Path(Path(".").resolve(), "example_nbs", "legacy", "scilint-legacy.yaml")
assert legacy_spec in legacy_spec_nbs
assert len(legacy_spec_nbs[legacy_spec]) == 2

In [21]:
# Same check, but specs are searched from the default `specs_glob`.
legacy_dir = Path(Path(".").resolve(), "example_nbs/legacy")
legacy_spec_nbs = _map_paths_specs(legacy_dir)
legacy_spec = Path(Path(".").resolve(), "example_nbs", "legacy", "scilint-legacy.yaml")
assert legacy_spec in legacy_spec_nbs
assert len(legacy_spec_nbs[legacy_spec]) == 2

In [22]:
# With no spec file in scope, the notebook is filed under the placeholder key.
no_spec_dir = Path(Path(".").resolve(), "example_nbs/no_spec_provided/")
no_spec_nbs = _map_paths_specs(no_spec_dir, no_spec_dir)
assert Path("scilint-default") in no_spec_nbs
assert len(no_spec_nbs) == 1
assert len(no_spec_nbs[Path("scilint-default")]) == 1

In [25]:
spec_nbs = _map_paths_specs()

# Expected number of notebooks per spec file, keyed by the spec's absolute path.
nbs_dir = Path(".").resolve()
expected_nb_counts = {
    Path(nbs_dir, "scilint-default.yaml"): 9,
    Path(nbs_dir, "example_nbs", "exploratory", "scilint-exploratory.yaml"): 3,
    Path(nbs_dir, "example_nbs", "experimental", "scilint-experimental.yaml"): 2,
    Path(nbs_dir, "example_nbs", "validated", "scilint-validated.yaml"): 1,
}
for spec_path, expected_count in expected_nb_counts.items():
    assert len(spec_nbs[spec_path]) == expected_count, spec_path

In [26]:
# Searching for specs from the project root finds all five spec files.
spec_nbs = _map_paths_specs(specs_glob=get_project_root())
assert sorted([k.name for k in spec_nbs.keys()]) == sorted(
    [
        "scilint-default.yaml",
        "scilint-validated.yaml",
        "scilint-experimental.yaml",
        "scilint-exploratory.yaml",
        "scilint-legacy.yaml",
    ]
)

## Testing `lint_nbs`

In [27]:
conf = yaml.safe_load(Path("scilint-default.yaml").read_text())
# Look the default spec up by its key instead of relying on it being the first
# entry of the mapping.
default_spec_paths = _map_paths_specs()[
    Path(Path(".").resolve(), "scilint-default.yaml")
]

In [28]:
# Lint the notebooks governed by the default spec, using the spec's own exclusions.
# NOTE(review): the output recorded for this cell is an AssertionError, so the
# expected warning count no longer matches — confirm the intended value.
lint_report, all_warns, num_warns = lint_nbs(
    "scilint-default.yaml", conf, indicator_funcs, nb_paths=default_spec_paths
)
assert num_warns == 0

AssertionError: 

In [30]:
# Display the full lint report produced by the previous cell.
lint_report

Unnamed: 0,spec_name,calls_per_func_mean,calls_per_func_median,asserts_func_ratio,inline_asserts_per_func_mean,inline_asserts_per_func_median,in_func_pct,markdown_code_pct,loc_per_md_section,total_code_len,has_syntax_error,include_in_scoring
nbdev,scilint-default.yaml,2.231,1.0,1.308,0.846,0.0,50.725,30.769,546.444,4918.0,False,False
nbdev,scilint-default.yaml,2.231,1.0,1.308,0.846,0.0,50.725,30.769,546.444,4918.0,False,False
non_nbdev_low_quality,scilint-default.yaml,1.625,1.0,0.0,0.0,0.0,45.0,15.789,2955.0,2955.0,False,False
nbdev_high_quality,scilint-default.yaml,2.5,1.5,1.667,1.0,0.0,44.118,30.769,553.111,4978.0,False,False
scilint,scilint-default.yaml,1.923,1.0,2.308,0.154,0.0,37.805,32.308,1001.286,21027.0,False,True
non_nbdev,scilint-default.yaml,1.0,1.0,0.0,0.0,0.0,35.714,0.0,,1233.0,False,False
indicators,scilint-default.yaml,4.0,5.0,2.947,2.789,1.0,34.395,20.0,1128.091,12409.0,False,True
index,scilint-default.yaml,,,,,,,,,0.0,False,True
syntax_error,scilint-default.yaml,,,,,,,,,,True,False


In [None]:
# With exclusions switched off every notebook is scored.
# Note: this mutates the shared `conf` loaded earlier in the notebook.
conf["exclusions"] = None
lint_report, all_warns, num_warns = lint_nbs(
    "scilint-default.yaml", conf, indicator_funcs, nb_paths=default_spec_paths
)
assert num_warns == 7

In [None]:
# Switch `conf` to the experimental spec and lint that folder through `nb_glob`.
conf = yaml.safe_load(
    Path(
        Path(".").resolve(), "example_nbs", "experimental", "scilint-experimental.yaml"
    ).read_text()
)
lint_report, all_warns, num_warns = lint_nbs(
    "scilint-experimental.yaml",
    conf,
    indicator_funcs,
    nb_glob=Path("example_nbs/experimental/"),
)
assert num_warns == 3

In [None]:
# Excluding the whole example_nbs directory leaves nothing to score.
conf["exclusions"] = """nbs/example_nbs/,nbs/index.ipynb"""
_, all_warns, num_warns = lint_nbs(
    "scilint-experimental.yaml",
    conf,
    indicator_funcs,
    nb_glob=Path("example_nbs/experimental/"),
)
assert num_warns == 0
# Reset the shared `conf` so later cells start without exclusions.
conf["exclusions"] = None

In [None]:
# Excluding a single notebook by its path.
conf["exclusions"] = """nbs/example_nbs/experimental/non_nbdev.ipynb"""
_, all_warns, num_warns = lint_nbs(
    "scilint-experimental.yaml",
    conf,
    indicator_funcs,
    nb_glob=Path("example_nbs/experimental/"),
)
assert num_warns == 0
# Reset the shared `conf` so later cells start without exclusions.
conf["exclusions"] = None

In [None]:
# Excluding the syntax-error notebook of the exploratory folder.
# NOTE(review): `conf` is not reloaded in this cell, so when the notebook runs
# top to bottom it is still the experimental spec while the label passed is
# "scilint-exploratory.yaml" — confirm this is intended.
conf["exclusions"] = """nbs/example_nbs/exploratory/syntax_error.ipynb"""
_, all_warns, num_warns = lint_nbs(
    "scilint-exploratory.yaml",
    conf,
    indicator_funcs,
    nb_glob=Path("example_nbs/exploratory/"),
)
assert num_warns == 3
# Reset the shared `conf` so later cells start without exclusions.
conf["exclusions"] = None

## `display_warning_report`

In [None]:
# | export


def display_warning_report(all_warns: pd.DataFrame):
    print(
        "\n******************************************Begin Scilint Warning Report*****************************************"
    )
    print(all_warns.to_markdown(tablefmt="grid", index=False))
    print(
        "\n******************************************End Scilint Warning Report*******************************************\n"
    )

## `_persist_results`

In [None]:
# | export


def _persist_results(
    lint_report: pd.DataFrame, all_warns: pd.DataFrame, conf: Dict[str, Any]
):
    out_dir = Path(conf["out_dir"])
    conf_to_persist = {k: v for k, v in conf.items() if k != "indicators"}
    if not out_dir.exists():
        Path(out_dir).mkdir()
    with open(Path(out_dir, "scilint_config.json"), "w") as outfile:
        json.dump(conf_to_persist, outfile)
    all_warns.to_csv(Path(out_dir, "scilint_warnings.csv"), index=False)
    lint_report.to_csv(Path(out_dir, "scilint_report.csv"))

In [None]:
# Round trip: the persisted CSV/JSON files read back equal to what was written,
# and the "indicators" entry is left out of the saved config.
with tempfile.TemporaryDirectory() as tmp_dir:
    report = pd.DataFrame({"a": [1, 2, 3]})
    _persist_results(report, report, {"indicators": [], "out_dir": tmp_dir})
    assert pd.read_csv(Path(tmp_dir, "scilint_report.csv"), index_col=0).equals(
        pd.DataFrame({"a": [1, 2, 3]})
    )
    assert pd.read_csv(Path(tmp_dir, "scilint_warnings.csv")).equals(
        pd.DataFrame({"a": [1, 2, 3]})
    )
    with open(Path(tmp_dir, "scilint_config.json")) as infile:
        assert json.load(infile) == {"out_dir": tmp_dir}

## `_load_conf`

In [None]:
# | export


def _load_conf(
    conf_path: str = None,
    exclusions: str = None,
    fail_over: int = None,
    out_dir: str = None,
    precision: int = None,
    print_syntax_errors: bool = None,
):
    """Load a spec YAML file and apply any explicitly given overrides.

    Args:
        conf_path: Spec file to load. When omitted,
            `<project root>/nbs/scilint-default.yaml` is loaded.
        exclusions, fail_over, out_dir, precision, print_syntax_errors:
            Values replacing the same-named entries of the loaded spec. An
            argument left as None keeps the value from the file.

    Returns:
        The configuration loaded from YAML with the overrides applied.
    """
    if conf_path is None:
        project_root = find_project_root(tuple([str(Path(".").resolve())]))
        conf_path = Path(project_root, "nbs", "scilint-default.yaml")
        print(f"Loading default lint config: {conf_path}")
    else:
        conf_path = Path(conf_path)

    conf = yaml.safe_load(conf_path.read_text())
    overrides = {
        "exclusions": exclusions,
        "fail_over": fail_over,
        "out_dir": out_dir,
        "precision": precision,
        "print_syntax_errors": print_syntax_errors,
    }
    # None means "not provided", so it never overwrites the value from the file.
    conf.update({name: value for name, value in overrides.items() if value is not None})
    return conf

In [None]:
# Without overrides the values come straight from the spec file.
experimental_spec_path = Path(
    Path("."), "example_nbs", "experimental", "scilint-experimental.yaml"
)
experimental_spec = _load_conf(experimental_spec_path)
assert experimental_spec["precision"] == 3
assert experimental_spec["fail_over"] == 3

In [None]:
# Explicit arguments override the values from the spec file.
experimental_spec = _load_conf(experimental_spec_path, fail_over=-1, precision=1)
assert experimental_spec["precision"] == 1
assert experimental_spec["fail_over"] == -1

## `lint`

In [None]:
# | export


def lint(
    display_report: bool = True,
    nb_glob: Path = None,
    specs_glob: Path = Path(".").resolve(),
    exclusions: str = None,
    fail_over: int = None,
    out_dir: str = None,
    precision: int = None,
    print_syntax_errors: bool = None,
):
    """Lint all notebooks against their specs, report warnings and persist the results.

    Notebooks are grouped per spec by `_map_paths_specs` and each group is
    linted with `lint_nbs`. Specs whose `evaluate` entry is false, and specs
    without any notebook, are skipped. As soon as one spec produces more
    warnings than its `fail_over` threshold the process exits through
    `sys.exit` with a non-zero status; a `fail_over` of -1 disables that check.

    Args:
        display_report: Print the combined warning table when warnings exist.
        nb_glob: Where to look for notebooks (passed to `_map_paths_specs`).
        specs_glob: Where to look for spec files.
        exclusions, fail_over, out_dir, precision, print_syntax_errors:
            Overrides applied on top of every spec file loaded from disk.
    """
    # Key used by `_map_paths_specs` for notebooks that no spec file covers.
    fallback_spec = Path("scilint-default")

    spec_nbs = _map_paths_specs(nb_glob, specs_glob)
    lint_reports = []
    all_warns = []
    warns_count = []
    conf = None
    for spec, nbs in spec_nbs.items():
        # `spec` is a Path and a Path never compares equal to a str, so the
        # comparison has to be made against a Path.
        if spec == fallback_spec:
            # `get_default_spec` is not defined in this notebook; presumably it
            # comes from the `scilint.indicators` star import.
            conf = get_default_spec()
        else:
            conf = _load_conf(
                spec, exclusions, fail_over, out_dir, precision, print_syntax_errors
            )
        if conf["evaluate"] == False:
            print(f"Linting skipped for: {spec.name} as evaluate is set to false")
            continue
        lint_report, report_warns, num_warnings = lint_nbs(
            spec.name, conf, indicator_funcs, nb_paths=nbs
        )
        if lint_report is None:
            # `lint_nbs` returns (None, None, None) when the spec has no
            # notebooks; there is nothing to compare against the threshold.
            print(f"Linting skipped for: {spec.name} as no notebooks were found")
            continue
        lint_reports.append(lint_report)
        all_warns.append(report_warns)
        warns_count.append(num_warnings)

        fail_over_conf = conf["fail_over"]
        if fail_over_conf == -1:
            print(f"Linting warnings ignored for: {spec.name} as fail_over set to -1")
        elif num_warnings == 0:
            print(f"Linting success for: {spec.name}, no issues found")
        elif num_warnings <= fail_over_conf:
            print(
                f"Linting success for: {spec.name}, warnings ({num_warnings}) <= than threshold ({fail_over_conf}) "
            )
        else:
            print(
                f"Linting failed for: {spec.name}, total warnings ({num_warnings}) exceeded threshold ({fail_over_conf})"
            )
            # Keep the status within 1..255: larger values are truncated by the
            # OS and a multiple of 256 would be reported as success.
            sys.exit(min(num_warnings, 255))

    if not lint_reports:
        # Every spec was skipped (or none was found): nothing to report or save.
        print("No notebooks were linted")
        return

    lint_report = pd.concat(lint_reports)
    all_warns = pd.concat(all_warns)
    num_warnings = sum(warns_count)

    if num_warnings > 0:
        print(
            f"{num_warnings} warnings founds, within tolerated thresholds for all specs"
        )
        if display_report:
            display_warning_report(all_warns)
    else:
        print("No issues found during linting")

    # The configuration saved with the results is the last one loaded in the loop.
    _persist_results(lint_report, all_warns, conf)
    print("Linting completed")
In [None]:
# Run the linter with its default arguments from the current directory.
lint()

## `build`

In [None]:
# | export


def build(
    display_report: bool = True,
    nb_glob: Path = None,
    specs_glob: Path = Path(".").resolve(),
    exclusions: str = None,
    fail_over: int = None,
    out_dir: str = None,
    precision: int = None,
    print_syntax_errors: bool = None,
):
    """Tidy, (for nbdev projects) export and test, lint, then clean the notebooks.

    The steps run in this order: `tidy`, then for nbdev projects
    `nbdev_export` and `nbdev_test`, then `lint` with the given arguments, and
    finally `nbdev_clean` for nbdev projects.

    All arguments are forwarded unchanged to `lint`.
    """
    print("Tidying notebooks..")
    tidy()
    if is_nbdev_project():
        # `__wrapped__` is used to reach the function behind the nbdev entry point.
        nbdev_export.__wrapped__()
        print("Converted notebooks to modules")
        print("Testing notebooks..")
        nbdev_test.__wrapped__()
    print("Running notebook linter..")
    lint(
        display_report,
        nb_glob,
        specs_glob,
        exclusions,
        fail_over,
        out_dir,
        precision,
        print_syntax_errors,
    )
    if is_nbdev_project():
        nbdev_clean.__wrapped__()
        print("Cleaned notebooks")

# Console Scripts

## `scilint_tidy`

In [None]:
# | export


@call_parse
def scilint_tidy():
    """Console script: run the tidy tools over the project's notebooks."""
    tidy()

## `scilint_lint`

In [None]:
# | export


@call_parse
def scilint_lint(
    display_report: Param("Print the lint report", store_false) = False,
    nb_glob: Path = None,
    specs_glob: Path = Path(".").resolve(),
    exclusions: str = None,
    fail_over: int = None,
    out_dir: str = None,
    precision: int = None,
    print_syntax_errors: bool = None,
):
    """Console script: lint the project's notebooks against their specs."""
    # `out_dir` is a directory path, so it is annotated `str`: the annotation is
    # what the command-line parser uses to convert the argument.
    # NOTE(review): a `bool` annotation presumably becomes a flag that defaults
    # to False rather than None on the command line — confirm that an absent
    # flag does not override `print_syntax_errors` from the spec file.
    lint(
        display_report,
        nb_glob,
        specs_glob,
        exclusions,
        fail_over,
        out_dir,
        precision,
        print_syntax_errors,
    )

In [None]:
# Call the console-script entry point directly from the notebook, with its defaults.
scilint_lint()

## `scilint_build`

In [None]:
# | export


@call_parse
def scilint_build(
    display_report: Param("Print the lint report", store_false) = False,
    nb_glob: Path = None,
    specs_glob: Path = Path(".").resolve(),
    exclusions: str = None,
    fail_over: int = None,
    out_dir: str = None,
    precision: int = None,
    print_syntax_errors: bool = None,
):
    """Console script: run the full build (tidy, export/test for nbdev, lint, clean)."""
    # `out_dir` is a directory path, so it is annotated `str`: the annotation is
    # what the command-line parser uses to convert the argument.
    build(
        display_report,
        nb_glob,
        specs_glob,
        exclusions,
        fail_over,
        out_dir,
        precision,
        print_syntax_errors,
    )

## `scilint_ci`

In [None]:
# | export


@call_parse
def scilint_ci(
    display_report: Param("Print the lint report", store_false) = False,
    nb_glob: Path = None,
    specs_glob: Path = Path(".").resolve(),
    exclusions: str = None,
    fail_over: int = None,
    out_dir: str = None,
    precision: int = None,
    print_syntax_errors: bool = None,
):
    """Console script: run the build, then generate the README and docs (nbdev projects only)."""
    # `out_dir` is a directory path, so it is annotated `str`: the annotation is
    # what the command-line parser uses to convert the argument.
    if not is_nbdev_project():
        print("scilint_ci feature is only available for nbdev projects")
        return

    build(
        display_report,
        nb_glob,
        specs_glob,
        exclusions,
        fail_over,
        out_dir,
        precision,
        print_syntax_errors,
    )

    # The README and docs steps below need the quarto executable on PATH.
    if not shutil.which("quarto"):
        print(
            "Quarto is not installed. A working quarto install is required for the CI build"
        )
        sys.exit(-1)
    nbdev_readme.__wrapped__()
    nbdev_docs.__wrapped__()