# Example Notebook for Graphrag-Tagger

This notebook demonstrates how to use the Graphrag-Tagger tool. It shows how to import the module, run the tagging pipeline, and inspect the resulting chunk classifications.

In [None]:
# Change the working directory to the parent folder so that the relative paths
# used below (e.g. 'notebook/example') resolve from there.
# Note: this is not idempotent - re-running the cell moves up one more level.
%cd ..

In [None]:
import os
from graphrag_tagger import tagger

# Sample settings handed to tagger.main() further down.
# Adjust the two folder paths to match your own setup before running.
params = dict(
    pdf_folder='notebook/example',  # update path to your PDF folder
    chunk_size=256,
    chunk_overlap=25,
    n_components=None,
    n_features=512,
    min_df=2,
    max_df=0.95,
    llm_model='ollama:qwen2.5',
    output_folder='notebook/example/results',  # update path to your output folder
    model_choice='kt',  # kt for ktrain or sk for scikit-learn
)

In [None]:
# Create output folder if it doesn't exist
# (exist_ok=True means re-running this cell does not fail when it is already there).
os.makedirs(params['output_folder'], exist_ok=True)

# Run the tagging pipeline with the settings collected in `params`.
tagger.main(params)

# The results will be saved in the specified output folder.
# The cells below read them back from there as chunk_*.json files.

In [None]:
from glob import glob
import os

# Collect the per-chunk JSON files from the output folder.
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so
# that the positional lookups in later cells (e.g. raws[19]) are reproducible
# across runs and machines.
files = sorted(glob(os.path.join(params["output_folder"], "chunk_*.json")))

# Number of chunk files found.
len(files)

In [None]:
import json

# Parse every chunk file into a dict. Each file is opened in a `with` block so
# its handle is closed as soon as parsing finishes; json.load(open(file)) would
# leave the handles open until garbage collection.
raws: list[dict] = []
for file in files:
    with open(file) as fh:
        raws.append(json.load(fh))

# Sanity check: print the position and content of any record without a "chunk" key.
for i, raw in enumerate(raws):
    if "chunk" not in raw:
        print(i, raw)

In [None]:
# Display `raw`, the loop variable left over from the check in the previous cell,
# i.e. the last record that was loaded. Requires that cell to have run with at
# least one chunk file.
raw

In [None]:
import json

# Re-read the chunk files and flatten each record into a single-level dict:
# the chunk text, its source file, the path of the chunk file itself, plus every
# key of the nested "classification" dict.
# The files are reloaded here (rather than reusing the earlier list) because the
# name `raws` is overwritten with the flattened records; reloading keeps this
# cell safe to re-run. Each file is opened in a `with` block so the handle is
# closed right after parsing.
raws: list[dict] = []
for file in files:
    with open(file) as fh:
        raw = json.load(fh)
    raws.append({
        "chunk": raw["chunk"],
        "source_file": raw["source_file"],
        "chunk_file": file,
        **raw["classification"],
    })

# Show which fields a flattened record has.
raws[0].keys()

In [None]:
# Print the text of one example chunk. Position 19 is hard-coded, so this needs
# at least 20 chunk files; which chunk it is depends on the order of `files`.
print(raws[19]["chunk"])

In [None]:
# Print the complete flattened record (all fields) for the same example position.
print(raws[19])

In [None]:
# Print the source_file value recorded for the same example chunk.
print(raws[19]["source_file"])

In [None]:
import pandas as pd

# Build a DataFrame with one row per record in `raws`; the record keys become columns.
chunk_classification = pd.DataFrame(raws)

# Display the table (last expression of the cell).
chunk_classification

In [None]:
# Summary statistics of the number of topics per chunk.
# A falsy "topics" value (None, empty list) is counted as zero topics.
chunk_classification["topics"].apply(lambda topics: len(topics or ())).describe()

In [None]:
# Rows whose chunk has no topics.
# Uses the same falsy-counts-as-zero rule as the summary-statistics cell, so a
# None "topics" value is listed here instead of making len() raise TypeError.
chunk_classification[
    chunk_classification["topics"].apply(lambda x: len(x) if x else 0) == 0
]

In [None]:
# Print the text of the chunk at hard-coded position 36 (needs at least 37 chunk files).
# NOTE(review): presumably picked by hand from the zero-topic table above - confirm
# the index still points at the intended chunk.
print(raws[36]["chunk"])

In [None]:
# Rows whose chunk has exactly one topic.
# A falsy "topics" value (e.g. None) is counted as zero topics, matching the
# summary-statistics cell, instead of making len() raise TypeError.
chunk_classification[
    chunk_classification["topics"].apply(lambda x: len(x) if x else 0) == 1
]

In [None]:
# Print the text of the chunk at hard-coded position 42 (needs at least 43 chunk files).
# NOTE(review): presumably picked by hand from the one-topic table above - confirm
# the index still points at the intended chunk.
print(raws[42]["chunk"])

In [None]:
# Rows whose chunk has exactly two topics.
# A falsy "topics" value (e.g. None) is counted as zero topics, matching the
# summary-statistics cell, instead of making len() raise TypeError.
chunk_classification[
    chunk_classification["topics"].apply(lambda x: len(x) if x else 0) == 2
]

In [None]:
# Rows whose chunk has exactly three topics.
# A falsy "topics" value (e.g. None) is counted as zero topics, matching the
# summary-statistics cell, instead of making len() raise TypeError.
chunk_classification[
    chunk_classification["topics"].apply(lambda x: len(x) if x else 0) == 3
]

In [None]:
# Absolute number of chunks for each content_type value (normalize=False gives raw counts).
chunk_classification["content_type"].value_counts(normalize=False)

In [None]:
# Relative frequency of each content_type value (normalize=True gives fractions of the total).
chunk_classification["content_type"].value_counts(normalize=True)

In [None]:
# Mean of the is_sufficient column.
# NOTE(review): if the column is boolean, this is the share of chunks flagged as
# sufficient - confirm the dtype.
chunk_classification["is_sufficient"].mean()

Before running the notebook, update the `pdf_folder` and `output_folder` paths in the `params` cell near the top.