# Experimental Study Notebook

To-do: use RDFox instead of rdflib so that inferred triples (in particular, those entailed by owl:sameAs) are taken into account.


In [None]:
from __future__ import annotations

import pathlib
import sys

# Working directory at the time this cell runs; used below as the base for
# the "ontologies" input folder and the "output" folder.
NOTEBOOK_DIR = pathlib.Path.cwd()
# Put NOTEBOOK_DIR at the front of sys.path once per kernel session; the
# module-level flag prevents duplicate entries when the cell is re-run.
# Presumably this is what lets the `modules` package imported below resolve —
# confirm against the repository layout.
if '_NB_SYS_PATH_ADJUSTED' not in globals():
    sys.path.insert(0, str(NOTEBOOK_DIR))
    _NB_SYS_PATH_ADJUSTED = True

import pandas as pd
import rdflib

from modules import file_utils
from modules import potential
from modules import rdf_utils
from modules.query_runner import QueryRunner

print(sys.version)

In [None]:
def load_graph(input_ttl: pathlib.Path) -> rdflib.Graph:
    """Return the graph that ``rdf_utils.read_graph`` produces for *input_ttl*.

    The path is converted to ``str`` before it is handed to the reader.
    """
    return rdf_utils.read_graph(str(input_ttl))

from modules.surprise_score import SelectionCriteria, SelectionMode
from modules.file_utils import output_df

def print_results_stats(results_df: pd.DataFrame) -> None:
    """Print how many rows of *results_df* have no surprising concepts.

    A ``surprising_concepts`` cell counts as empty when it is missing
    (``None``/NaN), a blank or whitespace-only string, or a list-like whose
    items are all empty by the same rule (an empty list included).

    ``analyse()`` stores Python lists in this column. The pandas ``.str``
    accessor yields NaN for non-string cells, so a ``.str.strip().eq("")``
    chain never matches such rows; each cell is therefore inspected
    explicitly.

    Args:
        results_df: Frame with a ``surprising_concepts`` column. A frame
            without rows is reported as ``0 out of 0`` even when the column
            is absent (``pd.DataFrame([])`` has no columns).

    Raises:
        KeyError: If a frame that has rows lacks the column.
    """

    def _is_blank(cell: object) -> bool:
        # Strings: ignore surrounding whitespace.
        if isinstance(cell, str):
            return not cell.strip()
        # Lists/tuples/sets/arrays: empty when no item carries content.
        if pd.api.types.is_list_like(cell):
            return all(_is_blank(item) for item in cell)
        # Remaining scalars: only missing values count as empty.
        return bool(pd.isna(cell))

    if len(results_df) == 0:
        empty_surprising = 0
    else:
        empty_surprising = int(
            results_df["surprising_concepts"].map(_is_blank).sum()
        )
    print(f"Empty surprising_concepts rows: {empty_surprising} out of {len(results_df)}")

def analyse(
    upper_proposition_number: int,
    history_weights: tuple[float, float, float, float],
    history_selection: SelectionCriteria,
    cooccurrence_selection: SelectionCriteria,
    verbose: bool = False,
    graph: rdflib.Graph | None = None,
    runner: QueryRunner | None = None,
    type_selection: bool = False,
) -> pd.DataFrame:
    """Run ``potential.main`` for propositions ``1..upper_proposition_number``.

    For every proposition the returned background and surprising concepts are
    sorted, collected into a result frame, and also flattened into
    ``" ; "``-joined strings (with the ``https://www.foom.com/core#`` prefix
    removed) that are handed to ``output_df`` together with a target path
    under ``NOTEBOOK_DIR / "output"``. The file name encodes the selection
    criteria, the weights and the ``type_selection`` flag. Finally
    ``print_results_stats`` is called on the result frame.

    Args:
        upper_proposition_number: Last proposition number (inclusive); the
            loop starts at 1.
        history_weights: Forwarded to ``potential.main`` and encoded in the
            output file name.
        history_selection: Forwarded to ``potential.main``; its ``mode.value``
            and ``value`` appear in the output file name.
        cooccurrence_selection: Same as ``history_selection``, for the
            co-occurrence part.
        verbose: Print a progress line per proposition.
        graph: Forwarded unchanged to ``potential.main``.
        runner: Forwarded unchanged to ``potential.main``.
        type_selection: Forwarded to ``potential.main``; selects the
            ``"type"`` / ``"no-type"`` suffix of the output file name.

    Returns:
        One row per proposition with columns ``proposition``,
        ``background_concepts`` (sorted list), ``surprising_concepts``
        (sorted list) and ``number_of_surprising_concepts`` (count of truthy
        entries).
    """
    namespace_prefix = "https://www.foom.com/core#"

    def _format_value(value: float | int) -> str:
        # Floats: at most four decimals, trailing zeros and a dangling dot
        # removed (1.0 -> "1"); anything else is rendered with str().
        if isinstance(value, float):
            return f"{value:.4f}".rstrip("0").rstrip(".")
        return str(value)

    analyses = []
    output_rows = []
    for proposition in range(1, upper_proposition_number + 1):
        if verbose:
            print(f"Analysing proposition {proposition}")

        background_concepts, surprising = potential.main(
            graph,
            proposition,
            history_weights=history_weights,
            history_selection=history_selection,
            cooccurrence_selection=cooccurrence_selection,
            runner=runner,
            type_selection=type_selection,
        )

        background_list = sorted(background_concepts)
        surprising_list = sorted(surprising)
        # The prefix is removed from the joined string, i.e. wherever it occurs.
        background_str = " ; ".join(background_list).replace(namespace_prefix, "")
        surprising_str = " ; ".join(surprising_list).replace(namespace_prefix, "")
        analyses.append({
            "proposition": proposition,
            "background_concepts": background_list,
            "surprising_concepts": surprising_list,
            "number_of_surprising_concepts": len([concept for concept in surprising_list if concept]),
        })
        output_rows.append((str(proposition), background_str, surprising_str))

    # Encode the run configuration in the output file name.
    description_parts = [
        f"history-{history_selection.mode.value}-{_format_value(history_selection.value)}",
        f"coocc-{cooccurrence_selection.mode.value}-{_format_value(cooccurrence_selection.value)}",
        "weights-" + "-".join(_format_value(weight) for weight in history_weights),
        "type" if type_selection else "no-type",
    ]
    description = "__".join(description_parts)
    output_dir = NOTEBOOK_DIR / "output"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"analyses_{description}"
    output_df(output_rows, filename=str(output_path))

    results_df = pd.DataFrame(analyses)
    print_results_stats(results_df)
    return results_df


In [None]:
# Select the ontology file through file_utils.latest_file, load it, and wrap
# the resulting graph in a QueryRunner that the analysis cells reuse.
INPUT_TTL = file_utils.latest_file(
    folder=NOTEBOOK_DIR / "ontologies",
    filename_fragment="ontology_",
    extension="ttl",
)
graph = load_graph(INPUT_TTL)
runner = QueryRunner(graph)

In [None]:
# Run 1: four history weights, type_selection left at its default (False).
# Both selections use TOP_FRACTION with value 1 / 1 (== 1.0); presumably this
# keeps every candidate — confirm against SelectionCriteria.
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)
DEFAULT_HISTORY_WEIGHTS = (6 / 9, 1 / 9, 1 / 9, 1 / 9)

results_df = analyse(
    upper_proposition_number=48,
    history_selection=DEFAULT_HISTORY_SELECTION,
    cooccurrence_selection=DEFAULT_COOCCURRENCE_SELECTION,
    history_weights=DEFAULT_HISTORY_WEIGHTS,
    verbose=True,
    graph=graph,
    runner=runner,
)
# Bare expression: shown as the cell output when run in a notebook.
results_df

In [None]:
# Run 2: same selections as the previous cell, but with type_selection=True.
# These assignments rebind the DEFAULT_* names set by the previous cell, so
# their values depend on which cell was executed last.
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)
# NOTE(review): only three weights here, while analyse() annotates
# history_weights as a 4-tuple and the previous cell passes four — confirm
# that potential.main accepts three weights, or add the missing one.
DEFAULT_HISTORY_WEIGHTS = (6 / 9, 1 / 9, 2 / 9)

results_df = analyse(
    upper_proposition_number=48,
    history_selection=DEFAULT_HISTORY_SELECTION,
    cooccurrence_selection=DEFAULT_COOCCURRENCE_SELECTION,
    history_weights=DEFAULT_HISTORY_WEIGHTS,
    verbose=True,
    graph=graph,
    runner=runner,
    type_selection=True
)
# Bare expression: shown as the cell output when run in a notebook.
results_df