Docstring: Title cell introducing the analyses notebook.
# ANALYSES

In [None]:
"""Set up notebook paths, imports, and helper classes for analyses."""
from __future__ import annotations

import pathlib
import sys

# Directory the kernel was started in; the ontology folder and analysis outputs
# in later cells are addressed relative to it.
NOTEBOOK_DIR = pathlib.Path.cwd()
# Guard flag kept in the kernel's globals so that re-running this cell does not
# prepend the same directory to sys.path a second time.
if '_NB_SYS_PATH_ADJUSTED' not in globals():
    # Put the notebook directory first on the import path; presumably this is
    # what lets the `modules` imports below resolve — confirm the package
    # actually lives next to the notebook.
    sys.path.insert(0, str(NOTEBOOK_DIR))
    _NB_SYS_PATH_ADJUSTED = True

# Record the interpreter version in the cell output.
print(sys.version)

from modules import rdf_utils, file_utils
from modules.analyses import analyse, print_results_stats
from modules.surprise_score import SelectionCriteria, SelectionMode
from modules.query_runner import QueryRunner


In [None]:
"""Load the latest ontology TTL graph and create a query runner."""
# Locate the ontology Turtle file inside the notebook's 'ontologies' folder
# (file names containing 'ontology_' with a 'ttl' extension) and parse it.
# NOTE(review): "latest" is decided by file_utils.latest_file — confirm whether
# it ranks by modification time or by file name.
INPUT_TTL = file_utils.latest_file(
    extension='ttl',
    filename_fragment='ontology_',
    folder=NOTEBOOK_DIR / 'ontologies',
)
graph = rdf_utils.load_graph(INPUT_TTL)

# QueryRunner is an in-memory helper that runs SPARQL queries against an rdflib.Graph,
# caches each query's dataframe result, and reuses it unless refresh=True is passed.
runner = QueryRunner(graph)

## 4 WEIGHTS

In [None]:
"""Run analysis up to proposition 48 using full history and co-occurrence selections with four history weights, no type filtering."""
# 4 weights: analysis 1

# Four history weights expressed in ninths: 6/9, 1/9, 1/9, 1/9.
DEFAULT_HISTORY_WEIGHTS = tuple(ninths / 9 for ninths in (6, 1, 1, 1))
# Both selections use the TOP_FRACTION mode with a fraction of 1/1.
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)

results_df = analyse(
    # Shared inputs created in the setup cells.
    graph=graph,
    runner=runner,
    output_base_dir=NOTEBOOK_DIR,
    # Scope of this run.
    upper_proposition_number=48,
    exclude_numeric_concepts=True,
    type_selection=False,
    # Scoring configuration defined above.
    history_selection=DEFAULT_HISTORY_SELECTION,
    cooccurrence_selection=DEFAULT_COOCCURRENCE_SELECTION,
    history_weights=DEFAULT_HISTORY_WEIGHTS,
    verbose=True,
)
# Leave the frame as the last expression so the notebook renders it.
results_df

In [None]:
"""Repeat the four-weight run but enable type-based selection."""
# 4 weights: analysis 2

# Four history weights expressed in ninths: 6/9, 1/9, 1/9, 1/9.
DEFAULT_HISTORY_WEIGHTS = tuple(ninths / 9 for ninths in (6, 1, 1, 1))
# Both selections use the TOP_FRACTION mode with a fraction of 1/1.
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)

results_df = analyse(
    # Shared inputs created in the setup cells.
    graph=graph,
    runner=runner,
    output_base_dir=NOTEBOOK_DIR,
    # Scope of this run; type selection is switched on here.
    upper_proposition_number=48,
    exclude_numeric_concepts=True,
    type_selection=True,
    # Scoring configuration defined above.
    history_selection=DEFAULT_HISTORY_SELECTION,
    cooccurrence_selection=DEFAULT_COOCCURRENCE_SELECTION,
    history_weights=DEFAULT_HISTORY_WEIGHTS,
    verbose=True,
)
# Leave the frame as the last expression so the notebook renders it.
results_df

In [None]:
"""Use the top half of history with full co-occurrence and four weights without type selection."""
# 4 weights: analysis 3

# Four history weights expressed in ninths: 6/9, 1/9, 1/9, 1/9.
DEFAULT_HISTORY_WEIGHTS = tuple(ninths / 9 for ninths in (6, 1, 1, 1))
# History uses a TOP_FRACTION of 1/2; co-occurrence keeps a fraction of 1/1.
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 2)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)

results_df = analyse(
    # Shared inputs created in the setup cells.
    graph=graph,
    runner=runner,
    output_base_dir=NOTEBOOK_DIR,
    # Scope of this run; type selection is switched off here.
    upper_proposition_number=48,
    exclude_numeric_concepts=True,
    type_selection=False,
    # Scoring configuration defined above.
    history_selection=DEFAULT_HISTORY_SELECTION,
    cooccurrence_selection=DEFAULT_COOCCURRENCE_SELECTION,
    history_weights=DEFAULT_HISTORY_WEIGHTS,
    verbose=True,
)
# Leave the frame as the last expression so the notebook renders it.
results_df

In [None]:
"""Use the top half of history with type selection and four weights."""
# 4 weights: analysis 4

# Four history weights expressed in ninths: 6/9, 1/9, 1/9, 1/9.
DEFAULT_HISTORY_WEIGHTS = tuple(ninths / 9 for ninths in (6, 1, 1, 1))
# History uses a TOP_FRACTION of 1/2; co-occurrence keeps a fraction of 1/1.
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 2)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)

results_df = analyse(
    # Shared inputs created in the setup cells.
    graph=graph,
    runner=runner,
    output_base_dir=NOTEBOOK_DIR,
    # Scope of this run; type selection is switched on here.
    upper_proposition_number=48,
    exclude_numeric_concepts=True,
    type_selection=True,
    # Scoring configuration defined above.
    history_selection=DEFAULT_HISTORY_SELECTION,
    cooccurrence_selection=DEFAULT_COOCCURRENCE_SELECTION,
    history_weights=DEFAULT_HISTORY_WEIGHTS,
    verbose=True,
)
# Leave the frame as the last expression so the notebook renders it.
results_df

In [None]:
"""Use the top three-quarters of history with no type selection and four weights."""
# 4 weights: analysis 5

# Four history weights expressed in ninths: 6/9, 1/9, 1/9, 1/9.
DEFAULT_HISTORY_WEIGHTS = tuple(ninths / 9 for ninths in (6, 1, 1, 1))
# History uses a TOP_FRACTION of 3/4; co-occurrence keeps a fraction of 1/1.
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 3 / 4)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)

results_df = analyse(
    # Shared inputs created in the setup cells.
    graph=graph,
    runner=runner,
    output_base_dir=NOTEBOOK_DIR,
    # Scope of this run; type selection is switched off here.
    upper_proposition_number=48,
    exclude_numeric_concepts=True,
    type_selection=False,
    # Scoring configuration defined above.
    history_selection=DEFAULT_HISTORY_SELECTION,
    cooccurrence_selection=DEFAULT_COOCCURRENCE_SELECTION,
    history_weights=DEFAULT_HISTORY_WEIGHTS,
    verbose=True,
)
# Leave the frame as the last expression so the notebook renders it.
results_df

In [None]:
"""Use the top three-quarters of history with type selection and four weights."""
# 4 weights: analysis 6
# History uses a TOP_FRACTION of 3/4; co-occurrence keeps a fraction of 1/1.
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 3 / 4)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)
# Four history weights: 6/9, 1/9, 1/9, 1/9.
DEFAULT_HISTORY_WEIGHTS = (6 / 9, 1 / 9, 1 / 9, 1 / 9)

results_df = analyse(
    upper_proposition_number=48,
    history_selection=DEFAULT_HISTORY_SELECTION,
    cooccurrence_selection=DEFAULT_COOCCURRENCE_SELECTION,
    history_weights=DEFAULT_HISTORY_WEIGHTS,
    verbose=True,
    graph=graph,
    runner=runner,
    # Type-selection counterpart of analysis 5 (same 3/4 history fraction).
    # With False this cell would be an exact duplicate of analysis 5, whereas
    # every other pair of analyses toggles this flag False -> True.
    type_selection=True,
    exclude_numeric_concepts=True,
    output_base_dir=NOTEBOOK_DIR
)
# Leave the frame as the last expression so the notebook renders it.
results_df

## 3 WEIGHTS

In [None]:
"""Run analysis with three-history weights, full selections, and no type selection."""
# 3 weights: analysis 1

# Three history weights expressed in ninths: 6/9, 1/9, 2/9.
DEFAULT_HISTORY_WEIGHTS = tuple(ninths / 9 for ninths in (6, 1, 2))
# Both selections use the TOP_FRACTION mode with a fraction of 1/1.
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)

results_df = analyse(
    # Shared inputs created in the setup cells.
    graph=graph,
    runner=runner,
    output_base_dir=NOTEBOOK_DIR,
    # Scope of this run; type selection is switched off here.
    upper_proposition_number=48,
    exclude_numeric_concepts=True,
    type_selection=False,
    # Scoring configuration defined above.
    history_selection=DEFAULT_HISTORY_SELECTION,
    cooccurrence_selection=DEFAULT_COOCCURRENCE_SELECTION,
    history_weights=DEFAULT_HISTORY_WEIGHTS,
    verbose=True,
)
# Leave the frame as the last expression so the notebook renders it.
results_df

In [None]:
"""Run the three-weight analysis with type selection enabled."""
# 3 weights: analysis 2

# Three history weights expressed in ninths: 6/9, 1/9, 2/9.
DEFAULT_HISTORY_WEIGHTS = tuple(ninths / 9 for ninths in (6, 1, 2))
# Both selections use the TOP_FRACTION mode with a fraction of 1/1.
DEFAULT_HISTORY_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)
DEFAULT_COOCCURRENCE_SELECTION = SelectionCriteria(SelectionMode.TOP_FRACTION, 1 / 1)

results_df = analyse(
    # Shared inputs created in the setup cells.
    graph=graph,
    runner=runner,
    output_base_dir=NOTEBOOK_DIR,
    # Scope of this run; type selection is switched on here.
    upper_proposition_number=48,
    exclude_numeric_concepts=True,
    type_selection=True,
    # Scoring configuration defined above.
    history_selection=DEFAULT_HISTORY_SELECTION,
    cooccurrence_selection=DEFAULT_COOCCURRENCE_SELECTION,
    history_weights=DEFAULT_HISTORY_WEIGHTS,
    verbose=True,
)
# Leave the frame as the last expression so the notebook renders it.
results_df