In [16]:
import os
from dotenv import load_dotenv

# Read settings from config.env via python-dotenv; `env` keeps whatever
# load_dotenv returns. This runs before the `src.*` imports below.
env = load_dotenv(dotenv_path='config.env')

from src.agents.ontology_explorer import OntologyExplorer
from src.config import LLMConf, EmbedderConf, KnowledgeGraphConfig, ChunkerConf, Source
from src.graph.graph_model import Ontology
from src.ingestion.local_ingestor import LocalIngestor
from src.ingestion.chunker import Chunker
from src.ingestion.cleaner import Cleaner

In [17]:
# Short description of the document corpus; passed to OntologyExplorer as
# its `domain_description` argument.
DOMAIN_DESCRIPTION = (
    "Press releases and updates from the European Commission"
)

In [18]:
# Document source handed to the LocalIngestor (a local folder).
source = Source(folder='source_docs')

# Chunker settings: "recursive" chunker, chunk_size 1500, no overlap.
# NOTE(review): the unit of chunk_size is defined by ChunkerConf/Chunker,
# which is not visible in this notebook.
chunker_conf = ChunkerConf(type="recursive", chunk_size=1500, chunk_overlap=0)

# LLM settings are read from RE_* environment variables.
# os.getenv always returns a string (or None when the variable is unset), so
# the temperature is parsed into a float here instead of being forwarded as
# text. An unset or empty variable yields None.
# NOTE(review): assumes LLMConf.temperature is a numeric (float) field — confirm.
_raw_temperature = os.getenv("RE_MODEL_TEMPERATURE")

model_conf = LLMConf(
    type=os.getenv("RE_MODEL_TYPE"),
    model=os.getenv("RE_MODEL_NAME"),
    temperature=float(_raw_temperature) if _raw_temperature else None,
    deployment=os.getenv("RE_MODEL_DEPLOYMENT"),
    api_key=os.getenv("RE_API_KEY"),
    endpoint=os.getenv("RE_MODEL_ENDPOINT"),
    # An empty RE_MODEL_API_VERSION is treated the same as an unset one.
    api_version=os.getenv("RE_MODEL_API_VERSION") or None,
)

# Pipeline components, created in the order they are later applied:
# ingest -> clean -> chunk.
ingestor = LocalIngestor(source=source)
cleaner = Cleaner()
chunker = Chunker(conf=chunker_conf)

# Ontology explorer configured with the LLM settings and the domain description.
ontology_explorer = OntologyExplorer(model_conf, domain_description=DOMAIN_DESCRIPTION)

In [19]:
# Run the ingestion pipeline stage by stage; `docs` is the final, chunked
# result used by the following cells.
ingested_docs = ingestor.batch_ingest()
cleaned_docs = cleaner.clean_documents(ingested_docs)
docs = chunker.chunk_documents(cleaned_docs)

# Report how many chunks each document was split into.
for i, doc in enumerate(docs):
    print(f"Number of chunks in doc {i}: {len(doc.chunks)}")

Number of chunks in doc 0: 6
Number of chunks in doc 1: 6
Number of chunks in doc 2: 1


In [20]:
# Ask the explorer to propose an ontology for the chunked documents.
# NOTE(review): pct_chunks=0.5 presumably means only half of the chunks are
# inspected — confirm against OntologyExplorer.find_suitable_ontology.
ontology = ontology_explorer.find_suitable_ontology(
    docs,
    pct_chunks=0.5,
)

In [21]:
# Display the proposed ontology as a plain dict (last expression of the cell).
# NOTE(review): the recorded output lists some labels more than once in
# 'allowed_labels' (e.g. 'Infrastructure', 'Enforcement', 'Termination');
# consider de-duplicating the labels before using the ontology downstream.
ontology.model_dump()

{'allowed_labels': ['Country',
  'Payment',
  'Milestone',
  'Reform',
  'Investment',
  'Policy',
  'Infrastructure',
  'Loan',
  'Disbursement',
  'Committee',
  'Regulation',
  'Mechanism',
  'Asset',
  'Revenue',
  'Facility',
  'Plan',
  'Strategy',
  'Initiative',
  'Committee Opinion',
  'Assessment',
  'Request',
  'Suspension',
  'Expenditure',
  'Infrastructure',
  'Transport',
  'Energy',
  'Water',
  'Healthcare',
  'Housing',
  'Education',
  'Digitalization',
  'Climate',
  'Economy',
  'Governance',
  'Security',
  'Defense',
  'Aid',
  'Support',
  'Repayment',
  'Profit',
  'Loss',
  'Grant',
  'Allocation',
  'Implementation',
  'Monitoring',
  'Evaluation',
  'Compliance',
  'Enforcement',
  'Amendment',
  'Termination',
  'Extension',
  'Renewal',
  'Review',
  'Appeal',
  'Mediation',
  'Arbitration',
  'Litigation',
  'Settlement',
  'Enforcement',
  'Termination',
  'Extension',
  'Renewal',
  'Review',
  'Appeal',
  'Mediation',
  'Arbitration',
  'Litigation',
