In [1]:
from PyPDF2 import PdfReader
import re
import os

def clean_text_for_markdown_pypdf2(text):
    """Cleans text for better Markdown display, handling multiple newlines."""
    if text is None:
        return ""
    text = re.sub(r'\n\s*\n', '\n\n', text)  # Consolidate paragraph breaks
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)  # Replace single newlines with space
    return text.strip()

def pypdf2_to_markdown_chunks(pdf_path):
    """
    Extracts text from a PDF using PyPDF2 and splits it per page.
    
    Returns:
        list of str: Each string is a Markdown formatted page.
    """
    try:
        reader = PdfReader(pdf_path)
    except Exception as e:
        return [f"<!-- Error opening PDF {pdf_path}: {e} -->"]

    chunks = []
    for page_num, page in enumerate(reader.pages):
        page_md = f"\n## Page {page_num + 1}\n\n"
        try:
            # Extract and clean the page text
            page_text = page.extract_text()
            cleaned_text = clean_text_for_markdown_pypdf2(page_text)
            if cleaned_text:
                page_md += f"### Text on Page {page_num + 1}\n" + cleaned_text + "\n"
            else:
                page_md += f"<!-- No text found on Page {page_num + 1} -->\n"
        except Exception as e:
            page_md += f"<!-- Error extracting text from Page {page_num + 1}: {e} -->\n"
        chunks.append(page_md)
    return chunks

# --- Example Usage ---
if __name__ == "__main__":
    folder_path = "docs"  # Folder containing PDF files
    list_docs_md = []
    
    if not os.path.exists(folder_path):
        print(f"The folder '{folder_path}' does not exist. Please provide a valid folder path.")
    else:
        for file_name in os.listdir(folder_path):
            if file_name.endswith(".pdf"):
                pdf_file_path = os.path.join(folder_path, file_name)
                print(f"Processing file: {pdf_file_path}")
                # Get each page separately
                page_chunks = pypdf2_to_markdown_chunks(pdf_file_path)
                list_docs_md.extend(page_chunks)

    print("Total markdown chunks extracted:", len(list_docs_md))

Processing file: docs/Legge regionale n_37_2014 artt. 20-21-22.pdf
Processing file: docs/Direttiva 2014_25_UE.pdf
Processing file: docs/Direttiva 2014_23_UE.pdf
Processing file: docs/Decreto Legislativo 7 marzo 2005_agg_L_147_2013.pdf
Processing file: docs/L. 27 Dicembre 2006 n.296 (Finanziaria 2007).pdf
Processing file: docs/L. 23 Dicembre 2000 n.388 (Finanziaria 2001).pdf
Processing file: docs/dPR 5 ottobre 2010_207_agg_DM_infrastrutture_24apr2014.pdf
Processing file: docs/Direttiva 2014_24_UE.pdf
Processing file: docs/D.Lgs. 50_2016.pdf
Processing file: docs/DGR_17_2024_01_22_signed_signed.pdf
Processing file: docs/Decreto legislativo 12 aprile  2006_163_agg_DL_24apr2014_n_66.pdf
Processing file: docs/L. 23 Dicembre 1999 n.488 (Finanziaria 2000).pdf
Processing file: docs/BURP_n.177_del_17112008.pdf
Processing file: docs/DELIBERAZIONE DELLA GIUNTA REGIONALE 21 marzo 2017 n_354.pdf
Processing file: docs/Dir.1999 93 CE del Parlamento Europeo e del Consiglio.pdf
Total markdown chunks ex

In [2]:
import os

# Create an output directory for the text file
output_folder = "extracted_txt"
os.makedirs(output_folder, exist_ok=True)

combined_file_path = os.path.join(output_folder, "combined_output.txt")
with open(combined_file_path, "w", encoding="utf-8") as f:
    for idx, chunk in enumerate(list_docs_md):
        if chunk.strip():
            # Optional: add a separator between chunks
            f.write(chunk)
            f.write("\n------------------\n")
            print(f"Appended chunk {idx+1} to {combined_file_path}")

print(f"All chunks have been combined into {combined_file_path}")

Appended chunk 1 to extracted_txt/combined_output.txt
Appended chunk 2 to extracted_txt/combined_output.txt
Appended chunk 3 to extracted_txt/combined_output.txt
Appended chunk 4 to extracted_txt/combined_output.txt
Appended chunk 5 to extracted_txt/combined_output.txt
Appended chunk 6 to extracted_txt/combined_output.txt
Appended chunk 7 to extracted_txt/combined_output.txt
Appended chunk 8 to extracted_txt/combined_output.txt
Appended chunk 9 to extracted_txt/combined_output.txt
Appended chunk 10 to extracted_txt/combined_output.txt
Appended chunk 11 to extracted_txt/combined_output.txt
Appended chunk 12 to extracted_txt/combined_output.txt
Appended chunk 13 to extracted_txt/combined_output.txt
Appended chunk 14 to extracted_txt/combined_output.txt
Appended chunk 15 to extracted_txt/combined_output.txt
Appended chunk 16 to extracted_txt/combined_output.txt
Appended chunk 17 to extracted_txt/combined_output.txt
Appended chunk 18 to extracted_txt/combined_output.txt
Appended chunk 19 t

In [3]:
import ssl
import nltk

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/giacomo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
from kg_gen import KGGen
import json

# Initialize KGGen as before
kg = KGGen(
    model="ollama_chat/gemma3:1b",
    temperature=0.0,
    api_key="sk-no-key-needed",
)

graph_list = []
for idx, chunk in enumerate(list_docs_md):
    if not chunk.strip():
        print(f"Skipping empty chunk {idx}")
        continue
    print(f"Processing chunk {idx} of length:", len(chunk))
    print("Chunk preview:", chunk[:500])
    try:
        # You could try without cluster if needed, or adjust as desired
        graph = kg.generate(input_data=chunk, context="Business regulations", cluster=True)
    except Exception as e:
        print(f"Error processing chunk {idx}: {e}")
        continue
    # Check that the response contains the expected output
    if not getattr(graph, "relations", None):
        print(f"Chunk {idx} produced an unexpected response and is being skipped.")
        continue
    print("Individual Graph:", graph)
    graph_list.append(graph)

if graph_list:
    aggregated_graph = kg.aggregate(graph_list)
    print("Aggregated Graph:", aggregated_graph)
    with open("aggregated_graph.json", "w", encoding="utf-8") as outfile:
        json.dump(aggregated_graph, outfile, ensure_ascii=False, indent=4)
    print("Graph saved to aggregated_graph.json")
else:
    print("No valid graphs were generated.")

  from .autonotebook import tqdm as notebook_tqdm


Processing chunk 0 of length: 201
Chunk preview: 
## Page 1

### Text on Page 1
REPUBBLICA ITALIANA ANNO XLV BARI, 8 AGOSTO 2014 n. 109BOLLETTINO UFFICIALE della Regione Puglia Leggi e regolamenti regionali VOLUME PRIMO 2014.08. 19  09:09:08 +02'00'



[92m11:32:49 - LiteLLM:ERROR[0m: litellm_logging.py:3482 - Error creating standard logging object - __annotations__
Traceback (most recent call last):
  File "/Users/giacomo/Documents/kg+llm_task2_nlp/.venv/lib/python3.11/site-packages/litellm/litellm_core_utils/litellm_logging.py", line 3464, in get_standard_logging_object_payload
    model_parameters=ModelParamHelper.get_standard_logging_model_parameters(
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/giacomo/Documents/kg+llm_task2_nlp/.venv/lib/python3.11/site-packages/litellm/litellm_core_utils/model_param_helper.py", line 28, in get_standard_logging_model_parameters
    ModelParamHelper._get_relevant_args_to_use_for_logging()
  File "/Users/giacomo/Documents/kg+llm_task2_nlp/.venv/lib/python3.11/site-packages/litellm/litellm_core_utils/model_param_helper.py", line 45, in _get_relevant_args_to_use_for_logging
    all_openai_llm_api_params = ModelParamHelper._get_all_llm_api_params()
  

KeyboardInterrupt: 