In [1]:
# --- Dependencies ---
# Ensure you have these packages installed. You can install them using pip:
# pip install langchain langchain-community langchain-ollama unstructured[docx] nltk

import os
import json
import ssl  # Imported to handle SSL certificate verification issues
import nltk  # Required for the data download function
from urllib.error import URLError  # To catch the specific download error
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.document_loaders import UnstructuredWordDocumentLoader

# --- NLTK Data Management ---
# This function checks for necessary NLTK data and downloads it if missing.
# Unstructured typically requires 'punkt' for sentence tokenization.
def ensure_nltk_data():
    """
    Make sure the NLTK 'punkt' tokenizer data is available locally.

    The function registers ``~/nltk_data`` (creating it if necessary) on
    ``nltk.data.path`` and looks up ``tokenizers/punkt``. If the lookup fails,
    it downloads the package into that directory. When the download raises a
    ``URLError`` mentioning ``CERTIFICATE_VERIFY_FAILED``, it retries once with
    certificate verification disabled, then restores the original HTTPS
    context factory so the rest of the process keeps verifying certificates.

    Progress and results are reported with ``print``; nothing is returned.

    Raises:
        URLError: If the download raises a URLError that is not an SSL
            certificate verification failure.
    """
    required_package = 'punkt'  # Minimal requirement for Unstructured
    nltk_data_path = os.path.join(os.path.expanduser("~"), "nltk_data")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(nltk_data_path, exist_ok=True)

    if nltk_data_path not in nltk.data.path:
        nltk.data.path.append(nltk_data_path)

    print("--- Verifying NLTK data packages... ---")
    try:
        nltk.data.find(f'tokenizers/{required_package}')
    except LookupError:
        data_available = False
    else:
        data_available = True

    if data_available:
        print(f"  [✓] NLTK '{required_package}' data is available.")
    else:
        print(f"  [!] NLTK '{required_package}' data not found. Attempting to download...")
        used_ssl_workaround = False
        try:
            # First, try downloading normally.
            downloaded = nltk.download(required_package, download_dir=nltk_data_path)
        except URLError as e:
            # Only the certificate-verification failure gets the workaround;
            # any other URLError is propagated unchanged.
            if "CERTIFICATE_VERIFY_FAILED" not in str(e):
                raise
            print("  [!] SSL certificate verification failed. Applying workaround...")
            # SECURITY: the retry runs without certificate verification. The
            # override is scoped to this single download and undone afterwards
            # so later HTTPS requests in this process are verified again.
            original_context_factory = ssl._create_default_https_context
            ssl._create_default_https_context = ssl._create_unverified_context
            try:
                downloaded = nltk.download(required_package, download_dir=nltk_data_path)
            finally:
                ssl._create_default_https_context = original_context_factory
            used_ssl_workaround = True

        # NOTE(review): nltk.download appears to report many failures through
        # its boolean return value rather than by raising -- confirm whether
        # the URLError branch above is reachable with the installed version.
        if not downloaded:
            print(f"  [✗] '{required_package}' could not be downloaded. "
                  "Document loading may fail if this data is required.")
        elif used_ssl_workaround:
            print(f"  [✓] '{required_package}' downloaded successfully using SSL workaround.")
        else:
            print(f"  [✓] '{required_package}' downloaded successfully.")

    print("--- NLTK setup complete. ---")

# --- Configuration ---
# Name of the Ollama model that performs the extraction.
ollama_model_name = "mixtral"
llm = ChatOllama(model=ollama_model_name)

# --- NLTK Data Path Configuration ---
# Called purely for its side effects: it registers the data directory on
# nltk.data.path and fetches 'punkt' when it is missing.
ensure_nltk_data()

# --- Document Loading with Error Handling ---
# Loads the design document that is later passed to the LLM. Every failure
# path prints a diagnostic and terminates with a non-zero exit status via
# SystemExit, so callers (shell scripts, CI) can detect the failure. The
# site-provided exit() helper is avoided because it is intended for the
# interactive interpreter and, called without arguments, exits with status 0.
document_path = "./docs/designdoc.docx"
document = None

try:
    loader = UnstructuredWordDocumentLoader(document_path)
    document = loader.load()
except FileNotFoundError:
    print(f"--- FATAL ERROR: Input document not found at '{document_path}' ---")
    print("Please ensure the 'designdoc.docx' file exists in a 'docs' subdirectory.")
    raise SystemExit(1) from None
except Exception as e:
    # Top-level boundary: report any loader failure instead of a raw traceback.
    print("--- FATAL ERROR: An unexpected error occurred while loading the document. ---")
    print(f"Error details: {e}")
    raise SystemExit(1) from None
else:
    print(f"--- Successfully loaded document: {document_path} ---")

# Only the first loaded document is used downstream, so that is the one
# checked here. Whitespace-only content is treated as empty as well.
if not document or not document[0].page_content.strip():
    print("--- FATAL ERROR: Document loaded but is empty. ---")
    raise SystemExit(1)

# --- Prompt Engineering ---
# Instruction text sent to the model. The single placeholder, {document_text},
# is filled with the design document's content when the chain is invoked.
prompt_template = """
You are an expert cybersecurity architect specializing in threat modeling.
Your task is to read the provided system design document and extract the core components
for a Data Flow Diagram (DFD) in a valid JSON format.

The JSON output must contain three keys: 'assets', 'processes', and 'data_flows'.
- An 'asset' is a data store where data rests (e.g., a database, a cache, a log file). List as an array of strings.
- A 'process' is a component that acts on or transforms data (e.g., an API, a microservice, a user-facing application). List as an array of strings.
- A 'data_flow' is an array of objects, each with 'source', 'destination', and 'data_description'. The 'source' and 'destination'
  must be one of the previously identified processes or assets.

System Design Document:
---
{document_text}
---

Now, generate the JSON object based on the document. Output ONLY the JSON object itself, with no
additional commentary, explanations, or markdown formatting.
"""

prompt = ChatPromptTemplate.from_template(prompt_template)

# --- Chain Construction with JSON Output Parser ---
# Pipeline: render the prompt, send it to the LLM, then parse the reply as JSON.
output_parser = JsonOutputParser()
chain = prompt.pipe(llm, output_parser)

# --- Invocation and Output ---
# Top-level keys the prompt instructs the model to return.
required_dfd_keys = ("assets", "processes", "data_flows")

print("\n--- Invoking Local LLM Chain (Mixtral) to extract DFD components ---")
try:
    # Only the call that talks to the model and parses its reply is guarded.
    response_dict = chain.invoke({"document_text": document[0].page_content})
except Exception as e:
    # Top-level boundary of the script: report the failure rather than
    # letting a raw traceback end the run.
    print("\n--- An error occurred during chain invocation or parsing ---")
    print(f"Error: {e}")
    print("This may be due to the LLM not returning a well-formed JSON object.")
else:
    print("\n--- LLM Output (Parsed JSON) ---")
    # ensure_ascii=False keeps non-ASCII text from the document readable
    # instead of printing \uXXXX escapes.
    print(json.dumps(response_dict, indent=2, ensure_ascii=False))

    # Parsing alone does not prove the reply has the requested structure:
    # any JSON value parses, so check the shape before reporting success.
    if not isinstance(response_dict, dict):
        print("\n--- JSON Validation: Failed ---")
        print("The LLM output is valid JSON but is not a JSON object (dictionary).")
    else:
        missing_keys = [key for key in required_dfd_keys if key not in response_dict]
        if missing_keys:
            print("\n--- JSON Validation: Failed ---")
            print(f"The LLM output is missing required keys: {', '.join(missing_keys)}")
        else:
            print("\n--- JSON Validation: Success ---")
            print("The LLM output was successfully parsed into a dictionary.")

--- Verifying NLTK data packages... ---
  [!] NLTK 'punkt' data not found. Attempting to download...


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeffreyvonrotz/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


  [✓] 'punkt' downloaded successfully.
--- NLTK setup complete. ---
--- Successfully loaded document: ./docs/designdoc.docx ---

--- Invoking Local LLM Chain (Mixtral) to extract DFD components ---

--- LLM Output (Parsed JSON) ---
{
  "assets": [
    "Azure Database for MySQL",
    "Azure Blob Storage"
  ],
  "processes": [
    "Customer Browser",
    "Azure App Service (WordPress/WooCommerce)",
    "Azure Database for MySQL",
    "Stripe/PayPal API",
    "Admin Browser",
    "/wp-admin"
  ],
  "data_flows": [
    {
      "source": "Customer Browser",
      "destination": "Azure App Service (WordPress/WooCommerce)",
      "data_description": "Browsing, product catalog"
    },
    {
      "source": "Azure App Service (WordPress/WooCommerce)",
      "destination": "Azure Database for MySQL",
      "data_description": "Products, cart, orders, user data"
    },
    {
      "source": "Customer Browser",
      "destination": "Stripe/PayPal API",
      "data_description": "Payment informatio