# Experimental Study Notebook

This notebook mirrors `dev.ipynb` but exercises the refactored modules under `modules`.
Use it to benchmark the updated pipeline without touching the legacy code.

**Default selections:** history = top 10% of activation scores; co-occurrence = top 5% of pair activations.
**Default history weights:** direct = 6/9, hierarchical = 1/9, mereological = 2/9.


In [None]:
from __future__ import annotations

import pathlib
import sys

# The notebook resolves project-relative paths (modules, output, ontologies)
# against the directory the kernel was started in.
NOTEBOOK_DIR = pathlib.Path.cwd()

# Re-running this cell must not stack duplicate entries on sys.path, so the
# insertion is done once and remembered through a module-level flag.
if not ('_NB_SYS_PATH_ADJUSTED' in globals()):
    sys.path[:0] = [str(NOTEBOOK_DIR)]
    _NB_SYS_PATH_ADJUSTED = True

import pandas as pd
import rdflib

from modules import potential as exp_potential
from modules import rdf_utils
from modules.query_runner import QueryRunner

# Show which Python interpreter is executing this notebook.
print(sys.version)


In [None]:
def load_graph(input_ttl: pathlib.Path) -> rdflib.Graph:
    """Load the RDF graph stored at ``input_ttl``.

    The path is converted to ``str`` and handed to ``rdf_utils.read_graph``;
    whatever that helper returns is passed straight back to the caller.
    """
    return rdf_utils.read_graph(str(input_ttl))

from modules.surprise_score import SelectionCriteria, SelectionMode
from modules.file_utils import output_df

def analyse(
    upper_proposition_number: int,
    history_weights: tuple[float, float, float],
    history_selection: SelectionCriteria,
    cooccurrence_selection: SelectionCriteria,
    verbose: bool = False,
    graph: rdflib.Graph | None = None,
    runner: QueryRunner | None = None,
) -> pd.DataFrame:
    """Run ``exp_potential.main`` for propositions 1..N and collect the results.

    For every proposition number from 1 to ``upper_proposition_number``
    (inclusive) the background and surprising concepts returned by
    ``exp_potential.main`` are sorted and joined with ``" ; "``. The rows are
    written through ``output_df`` to ``NOTEBOOK_DIR / "output"`` under a file
    name derived from the selection criteria and weights, and are also
    returned as a DataFrame.

    Args:
        upper_proposition_number: Last proposition number to analyse; a value
            below 1 yields no rows.
        history_weights: Three weights forwarded to ``exp_potential.main``
            (the notebook header lists them as direct, hierarchical,
            mereological).
        history_selection: Selection criteria forwarded as
            ``history_selection``; its ``mode.value`` and ``value`` are also
            used in the output file name.
        cooccurrence_selection: Selection criteria forwarded as
            ``cooccurrence_selection``; also used in the output file name.
        verbose: When true, print a progress line per proposition and forward
            the flag to ``exp_potential.main``.
        graph: Graph forwarded unchanged as the first positional argument.
            NOTE(review): ``None`` is passed through as-is; whether that is
            accepted depends on ``exp_potential.main`` -- confirm.
        runner: Query runner forwarded unchanged (``None`` included).

    Returns:
        A DataFrame built from one record per proposition with the keys
        ``proposition``, ``background_concepts`` and ``surprising_concepts``.
    """

    def _format_value(value: object) -> str:
        # Floats are shortened to at most four decimals with trailing zeros
        # (and a dangling ".") removed so they stay compact in file names.
        if isinstance(value, float):
            return f"{value:.4f}".rstrip("0").rstrip(".")
        return str(value)

    analyses = []
    for proposition in range(1, upper_proposition_number + 1):
        if verbose:
            print(f"Analysing proposition {proposition}")

        background_concepts, surprising = exp_potential.main(
            graph,
            proposition,
            history_weights=history_weights,
            history_selection=history_selection,
            cooccurrence_selection=cooccurrence_selection,
            verbose=verbose,
            runner=runner,
        )

        analyses.append({
            "proposition": proposition,
            "background_concepts": " ; ".join(sorted(background_concepts)),
            "surprising_concepts": " ; ".join(sorted(surprising)),
        })

    # The file writer receives plain tuples with the proposition as a string,
    # derived from the same records that back the returned DataFrame.
    output_rows = [
        (
            str(record["proposition"]),
            record["background_concepts"],
            record["surprising_concepts"],
        )
        for record in analyses
    ]

    # Encode the run configuration in the file name so different parameter
    # combinations do not overwrite each other's output.
    description = "__".join([
        f"history-{history_selection.mode.value}-{_format_value(history_selection.value)}",
        f"coocc-{cooccurrence_selection.mode.value}-{_format_value(cooccurrence_selection.value)}",
        "weights-" + "-".join(_format_value(weight) for weight in history_weights),
    ])
    output_dir = NOTEBOOK_DIR / "output"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"analyses_{description}"
    output_df(output_rows, filename=str(output_path))
    return pd.DataFrame(analyses)


In [None]:
# The ontology file is looked up in an "ontologies" directory next to the
# notebook directory (i.e. under NOTEBOOK_DIR's parent).
INPUT_TTL = NOTEBOOK_DIR.parent.joinpath("ontologies", "ontology_euclid_book1.ttl")

# Load the graph once; the same graph and runner objects are reused by the
# analysis cells.
graph = load_graph(INPUT_TTL)
runner = QueryRunner(graph)

In [None]:
# Baseline run with the defaults stated in the notebook header:
# history = top 1/10, co-occurrence = top 1/20, weights = (6/9, 1/9, 2/9).
DEFAULT_HISTORY_WEIGHTS = (6 / 9, 1 / 9, 2 / 9)
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 10)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 20)

# Analyse propositions 1..49 with per-proposition progress output.
results_df = analyse(
    49,
    DEFAULT_HISTORY_WEIGHTS,
    DEFAULT_HISTORY_SELECTION,
    DEFAULT_COOCCURRENCE_SELECTION,
    verbose=True,
    graph=graph,
    runner=runner,
)
results_df

In [None]:
# Sweep point: history = top 1/20, co-occurrence = top 1/40, same weights.
# NOTE(review): this rebinds the DEFAULT_* names to values that differ from
# the defaults stated in the notebook header.
DEFAULT_HISTORY_WEIGHTS = (6 / 9, 1 / 9, 2 / 9)
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 20)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 40)

# Analyse propositions 1..49 without progress output.
results_df = analyse(
    49,
    DEFAULT_HISTORY_WEIGHTS,
    DEFAULT_HISTORY_SELECTION,
    DEFAULT_COOCCURRENCE_SELECTION,
    verbose=False,
    graph=graph,
    runner=runner,
)
results_df

In [None]:
# Sweep point: history = top 1/2, co-occurrence = top 1/2, same weights.
# NOTE(review): this rebinds the DEFAULT_* names to values that differ from
# the defaults stated in the notebook header.
DEFAULT_HISTORY_WEIGHTS = (6 / 9, 1 / 9, 2 / 9)
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 2)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 2)

# Analyse propositions 1..49 without progress output.
results_df = analyse(
    49,
    DEFAULT_HISTORY_WEIGHTS,
    DEFAULT_HISTORY_SELECTION,
    DEFAULT_COOCCURRENCE_SELECTION,
    verbose=False,
    graph=graph,
    runner=runner,
)
results_df

In [None]:
# Sweep point: history = top 2/3, co-occurrence = top 2/3, same weights.
# NOTE(review): this rebinds the DEFAULT_* names to values that differ from
# the defaults stated in the notebook header.
DEFAULT_HISTORY_WEIGHTS = (6 / 9, 1 / 9, 2 / 9)
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 2 / 3)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 2 / 3)

# Analyse propositions 1..49 without progress output.
results_df = analyse(
    49,
    DEFAULT_HISTORY_WEIGHTS,
    DEFAULT_HISTORY_SELECTION,
    DEFAULT_COOCCURRENCE_SELECTION,
    verbose=False,
    graph=graph,
    runner=runner,
)
results_df

In [None]:
# Sweep point: history = top 3/4, co-occurrence = top 3/4, same weights.
# NOTE(review): this rebinds the DEFAULT_* names to values that differ from
# the defaults stated in the notebook header.
DEFAULT_HISTORY_WEIGHTS = (6 / 9, 1 / 9, 2 / 9)
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 3 / 4)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 3 / 4)

# Analyse propositions 1..49 without progress output.
results_df = analyse(
    49,
    DEFAULT_HISTORY_WEIGHTS,
    DEFAULT_HISTORY_SELECTION,
    DEFAULT_COOCCURRENCE_SELECTION,
    verbose=False,
    graph=graph,
    runner=runner,
)
results_df

In [None]:
# Sweep point: history = top 4/5, co-occurrence = top 4/5, same weights.
# NOTE(review): this rebinds the DEFAULT_* names to values that differ from
# the defaults stated in the notebook header.
DEFAULT_HISTORY_WEIGHTS = (6 / 9, 1 / 9, 2 / 9)
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 4 / 5)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 4 / 5)

# Analyse propositions 1..49 without progress output.
results_df = analyse(
    49,
    DEFAULT_HISTORY_WEIGHTS,
    DEFAULT_HISTORY_SELECTION,
    DEFAULT_COOCCURRENCE_SELECTION,
    verbose=False,
    graph=graph,
    runner=runner,
)
results_df

In [None]:
# Sweep point: history = top 1/1, co-occurrence = top 1/1 (the full
# fraction), same weights.
# NOTE(review): this rebinds the DEFAULT_* names to values that differ from
# the defaults stated in the notebook header.
DEFAULT_HISTORY_WEIGHTS = (6 / 9, 1 / 9, 2 / 9)
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)

# Analyse propositions 1..49 without progress output.
results_df = analyse(
    49,
    DEFAULT_HISTORY_WEIGHTS,
    DEFAULT_HISTORY_SELECTION,
    DEFAULT_COOCCURRENCE_SELECTION,
    verbose=False,
    graph=graph,
    runner=runner,
)
results_df