In [1]:
import os

os.environ["OPENAI_API_KEY"] = "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"

In [5]:
# Cell 1: Extract Details (Prompt V1 - Weak Prompt)

import os
import json
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI

def load_prompt_from_file(prompt_file: str) -> str:
    """Utility function to load prompt text from a file."""
    with open(prompt_file, 'r', encoding='utf-8') as pf:
        return pf.read()

def extract_full_details_realtime(
    base_dir: str,
    output_dir: str,
    output_prefix: str,
    chat_model,
    prompt_file: str
):
    """
    Traverses `base_dir` (each top-level subfolder is a 'company'),
    processes all .md files in real-time, writing extracted info to
    separate JSON files in `output_dir`.

    For each company:
      - Creates (or appends to) a JSON file named "{output_prefix}_{company_name}.json"
      - After each .md file is processed, the data is written immediately.
    """
    # Make sure output_dir exists
    os.makedirs(output_dir, exist_ok=True)

    # Load the prompt content
    prompt_text = load_prompt_from_file(prompt_file)
    prompt = PromptTemplate(input_variables=["content"], template=prompt_text)

    for company_folder in os.listdir(base_dir):
        company_path = os.path.join(base_dir, company_folder)

        # Process only directories
        if not os.path.isdir(company_path):
            continue

        print(f"\n=== Processing Company: {company_folder} ===")

        # Prepare the company's JSON output file
        company_json_filename = f"{output_prefix}_{company_folder}.json"
        company_json_path = os.path.join(output_dir, company_json_filename)

        # If a JSON file already exists, load it to avoid overwriting
        if os.path.exists(company_json_path):
            with open(company_json_path, 'r', encoding='utf-8') as f:
                company_data = json.load(f)
        else:
            company_data = {}

        # Recursively walk the company's folder
        for root, dirs, files in os.walk(company_path):
            for filename in files:
                if filename.endswith(".md"):
                    file_path = os.path.join(root, filename)
                    rel_path = os.path.relpath(file_path, company_path)

                    try:
                        with open(file_path, 'r', encoding='utf-8') as file:
                            markdown_content = file.read()

                        # Format the LLM prompt
                        formatted_prompt = prompt.format(content=markdown_content)
                        # Get model response
                        response = chat_model.predict(formatted_prompt).strip()

                        # Parse response
                        product_name = None
                        details = []

                        for line in response.splitlines():
                            if line.startswith("Produktname:"):
                                product_name = line.replace("Produktname:", "").strip()
                            elif line.startswith("- "):
                                details.append(line[2:].strip())

                        company_data[rel_path] = {
                            "product_name": product_name,
                            "details": details
                        }

                        # Write out the JSON immediately after each file
                        with open(company_json_path, 'w', encoding='utf-8') as out_f:
                            json.dump(company_data, out_f, ensure_ascii=False, indent=4)

                        print(f"Extrahiert aus {rel_path} => Produkt: {product_name}, Details: {len(details)} Einträge")

                    except Exception as e:
                        print(f"Fehler beim Verarbeiten der Datei {rel_path}: {e}")

        print(f"→ Gesamte Ergebnisse für {company_folder} in: {company_json_path}")

# -------------------------------------------------------------------
# MAIN EXECUTION FOR PROMPT V1
# -------------------------------------------------------------------
BASE_DIR_V1 = "03_textbinary_products_v1"
OUTPUT_DIR_V1 = "04_product_details_extraction_v1"
OUTPUT_PREFIX_V1 = "report_full_details_v1"  # => e.g. "report_full_details_v1_arag.json", etc.
PROMPT_FILE_V1 = "extract_full_details_prompt_v1.txt"

chat_model_v1 = ChatOpenAI(model="gpt-4o", temperature=0)

print("=== Extracting with Prompt V1 (Weak Prompt) ===")
extract_full_details_realtime(
    base_dir=BASE_DIR_V1,
    output_dir=OUTPUT_DIR_V1,
    output_prefix=OUTPUT_PREFIX_V1,
    chat_model=chat_model_v1,
    prompt_file=PROMPT_FILE_V1
)


=== Extracting with Prompt V1 (Weak Prompt) ===

=== Processing Company: generali ===
Extrahiert aus privatkunden_rundum-schutz_young-and-drive.md => Produkt: Generali Protect Me App, Details: 22 Einträge
Extrahiert aus service-kontakt_apps_generali-protect-me-app.md => Produkt: Generali Protect Me App, Details: 23 Einträge
Extrahiert aus geschaeftskunden_gesundheit-betriebliche-vorsorge_betriebliche-krankenversicherung-inland.md => Produkt: Business+ Die betriebliche Krankenversicherung der Generali, Details: 12 Einträge
Extrahiert aus privatkunden_gesundheit-freizeit_unfallversicherung_unfall-assistance-xxl.md => Produkt: Kinderunfallversicherung, Details: 20 Einträge
Extrahiert aus privatkunden_gesundheit-freizeit_reise-krankenversicherung.md => Produkt: None, Details: 22 Einträge
Extrahiert aus privatkunden_gesundheit-freizeit_wassersportversicherung.md => Produkt: Generali Protect Me App, Details: 16 Einträge
Extrahiert aus geschaeftskunden_geschaeft-gebaeude_ertragsausfallversich

KeyboardInterrupt: 

In [7]:
# -----------------------------------------------------
# Cell: Extract Details with Enhanced Prompt v3
# -----------------------------------------------------
import os
import json
from typing import Tuple
from langchain.chat_models import ChatOpenAI
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)

def parse_system_human_prompt(prompt_file: str) -> Tuple[str, str]:
    """
    Reads a text file containing two sections:
      [System]
      ...some text...
      [Human]
      ...some text...
    Splits them and returns (system_text, human_text).
    """
    with open(prompt_file, 'r', encoding='utf-8') as f:
        full_prompt = f.read()
    if "[System]" not in full_prompt or "[Human]" not in full_prompt:
        raise ValueError("Prompt file must contain both [System] and [Human] sections.")
    
    # Split on "[System]" -> discard anything before it
    _, after_system = full_prompt.split("[System]", 1)
    # Then split on "[Human]"
    system_part, human_part = after_system.split("[Human]", 1)
    return system_part.strip(), human_part.strip()

def load_and_create_chat_prompt(prompt_file: str) -> ChatPromptTemplate:
    """
    Parses a refined prompt file with [System] + [Human] sections
    and creates a ChatPromptTemplate from them.
    """
    system_text, human_text = parse_system_human_prompt(prompt_file)
    return ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template(system_text),
        HumanMessagePromptTemplate.from_template(human_text)
    ])

def extract_enhanced_details_realtime(
    base_dir: str,
    output_dir: str,
    output_prefix: str,
    chat_prompt: ChatPromptTemplate,
    chat_model,
):
    """
    Traverses `base_dir` (each top-level subfolder is a 'company'),
    applies a ChatPromptTemplate with system+human messages to each .md file,
    and writes results to separate JSON files in `output_dir`.
    
    Data is written to disk after each file is processed to avoid data loss.
    """
    os.makedirs(output_dir, exist_ok=True)

    for company_folder in os.listdir(base_dir):
        company_path = os.path.join(base_dir, company_folder)
        if not os.path.isdir(company_path):
            continue  # Only process directories

        print(f"\n=== Processing Company: {company_folder} ===")
        company_json_filename = f"{output_prefix}_{company_folder}.json"
        company_json_path = os.path.join(output_dir, company_json_filename)

        # If a JSON file already exists, load it; else start fresh
        if os.path.exists(company_json_path):
            with open(company_json_path, 'r', encoding='utf-8') as cf:
                company_data = json.load(cf)
        else:
            company_data = {}

        # Recursively walk all .md files for the current company
        for root, dirs, files in os.walk(company_path):
            for filename in files:
                if filename.endswith(".md"):
                    file_path = os.path.join(root, filename)
                    rel_path = os.path.relpath(file_path, company_path)

                    try:
                        with open(file_path, 'r', encoding='utf-8') as md_file:
                            markdown_content = md_file.read()

                        # Format the messages
                        # "content" placeholder -> markdown_content
                        messages = chat_prompt.format_messages(content=markdown_content)
                        # Use the LLM
                        response = chat_model(messages).content.strip()

                        # Parse the result
                        # We'll look for lines that start with: 
                        # "Produktname:", "Versicherungsart:", "Deckungssumme:", etc.
                        # If not found, store "Keine Angabe" or an empty list for bullet points.
                        
                        # Default dictionary with possible fields
                        extracted_info = {
                            "Produktname": "Keine Angabe",
                            "Versicherungsart": "Keine Angabe",
                            "Deckungssumme": "Keine Angabe",
                            "Selbstbeteiligung": "Keine Angabe",
                            "Preis": "Keine Angabe",
                            "Gültigkeitsbereich": "Keine Angabe",
                            "Vertragslaufzeit": "Keine Angabe",
                            "Weitere Leistungen": [],
                            "Ausnahmen oder Ausschlüsse": []
                        }
                        
                        # We'll parse line by line
                        current_section = None
                        for line in response.splitlines():
                            line = line.strip()
                            if line.startswith("Produktname:"):
                                extracted_info["Produktname"] = line.replace("Produktname:", "").strip() or "Keine Angabe"
                            elif line.startswith("Versicherungsart:"):
                                extracted_info["Versicherungsart"] = line.replace("Versicherungsart:", "").strip() or "Keine Angabe"
                            elif line.startswith("Deckungssumme:"):
                                extracted_info["Deckungssumme"] = line.replace("Deckungssumme:", "").strip() or "Keine Angabe"
                            elif line.startswith("Selbstbeteiligung:"):
                                extracted_info["Selbstbeteiligung"] = line.replace("Selbstbeteiligung:", "").strip() or "Keine Angabe"
                            elif line.startswith("Preis:"):
                                extracted_info["Preis"] = line.replace("Preis:", "").strip() or "Keine Angabe"
                            elif line.startswith("Gültigkeitsbereich:"):
                                extracted_info["Gültigkeitsbereich"] = line.replace("Gültigkeitsbereich:", "").strip() or "Keine Angabe"
                            elif line.startswith("Vertragslaufzeit:"):
                                extracted_info["Vertragslaufzeit"] = line.replace("Vertragslaufzeit:", "").strip() or "Keine Angabe"
                            elif line.startswith("Weitere Leistungen:"):
                                current_section = "Weitere Leistungen"
                            elif line.startswith("Ausnahmen oder Ausschlüsse:"):
                                current_section = "Ausnahmen oder Ausschlüsse"
                            elif line.startswith("- "):
                                # A bullet point
                                bullet_text = line[2:].strip()
                                if current_section in ["Weitere Leistungen", "Ausnahmen oder Ausschlüsse"]:
                                    extracted_info[current_section].append(bullet_text)

                        # Store in company_data dictionary
                        company_data[rel_path] = extracted_info

                        # Write out the JSON in real-time
                        with open(company_json_path, 'w', encoding='utf-8') as out_f:
                            json.dump(company_data, out_f, ensure_ascii=False, indent=4)

                        print(f"Extrahiert aus {rel_path}: {extracted_info['Produktname']}")

                    except Exception as err:
                        print(f"Fehler bei {rel_path}: {err}")

        print(f"→ Ergebnisse für {company_folder} in {company_json_path}")

# -----------------------------
# MAIN EXECUTION
# -----------------------------
BASE_DIR_V3 = "03_textbinary_products_v2"  # or any folder with .md subfolders
OUTPUT_DIR_V3 = "04_product_details_extraction_v2"
OUTPUT_PREFIX_V3 = "report_full_details_v2"
PROMPT_FILE_V3 = "extract_full_details_prompt_v2.txt"

# 1) Create ChatPromptTemplate from system+human messages
chat_prompt_v3 = load_and_create_chat_prompt(PROMPT_FILE_V3)

# 2) Initialize Chat Model
chat_model_v3 = ChatOpenAI(model="gpt-4o", temperature=0)

# 3) Extract details in real-time, storing them in separate JSON for each company
extract_enhanced_details_realtime(
    base_dir=BASE_DIR_V3,
    output_dir=OUTPUT_DIR_V3,
    output_prefix=OUTPUT_PREFIX_V3,
    chat_prompt=chat_prompt_v3,
    chat_model=chat_model_v3
)

print("\n*** Enhanced Prompt Extraction (v2) completed! ***")



=== Processing Company: generali ===


  response = chat_model(messages).content.strip()


Extrahiert aus privatkunden_rundum-schutz_young-and-drive.md => Produkt: Young & Drive, Details: 8 Einträge
Extrahiert aus service-kontakt_apps_generali-protect-me-app.md => Produkt: Vermögensaufbau4you, Details: 4 Einträge
Extrahiert aus geschaeftskunden_gesundheit-betriebliche-vorsorge_betriebliche-krankenversicherung-inland.md => Produkt: Business+ Die betriebliche Krankenversicherung der Generali, Details: 7 Einträge
Extrahiert aus privatkunden_gesundheit-freizeit_unfallversicherung_unfall-assistance-xxl.md => Produkt: Unfall Assistance XXL, Details: 7 Einträge
Extrahiert aus privatkunden_gesundheit-freizeit_reise-krankenversicherung.md => Produkt: Generali GesundheitsApp, Details: 9 Einträge
Extrahiert aus privatkunden_gesundheit-freizeit_wassersportversicherung.md => Produkt: Wassersportversicherung, Details: 7 Einträge
Extrahiert aus geschaeftskunden_geschaeft-gebaeude_ertragsausfallversicherung.md => Produkt: Ertragsausfallversicherung, Details: 15 Einträge
Extrahiert aus priva

KeyboardInterrupt: 