In [None]:
# --- Dependencies ---
# Ensure you have these packages installed. You can install them using pip:
# pip install langchain langchain-community langchain-ollama unstructured[docx] nltk

import os
import json
import ssl  # Imported to handle SSL certificate verification issues
import nltk  # Required for the data download function
from urllib.error import URLError  # To catch the specific download error
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from datetime import datetime

# --- NLTK Data Management ---
# This function checks for necessary NLTK data and downloads it if missing.
# Unstructured typically requires 'punkt' for sentence tokenization.
def ensure_nltk_data():
    """
    Checks for and downloads required NLTK data, handling potential SSL errors.

    The ``~/nltk_data`` directory is created if it does not exist and is
    appended to ``nltk.data.path``. If ``tokenizers/punkt`` cannot be found,
    the 'punkt' package is downloaded into that directory. When the download
    raises a ``URLError`` mentioning ``CERTIFICATE_VERIFY_FAILED``, it is
    retried once with HTTPS certificate verification disabled.

    Progress, success and failure are reported on stdout; nothing is returned.

    Raises:
        URLError: If the download raises a URL error that is not an SSL
            certificate verification failure.
    """
    # NOTE(review): recent NLTK releases ship the tokenizer as 'punkt_tab';
    # confirm which package the installed Unstructured version needs.
    required_package = 'punkt'  # Minimal requirement for Unstructured
    nltk_data_path = os.path.join(os.path.expanduser("~"), "nltk_data")

    # exist_ok removes the gap between "check" and "create".
    os.makedirs(nltk_data_path, exist_ok=True)

    if nltk_data_path not in nltk.data.path:
        nltk.data.path.append(nltk_data_path)

    print("--- Verifying NLTK data packages... ---")
    try:
        # Check for 'punkt'
        nltk.data.find('tokenizers/punkt')
        print("  [✓] NLTK 'punkt' data is available.")
    except LookupError:
        print("  [!] NLTK 'punkt' data not found. Attempting to download...")
        used_ssl_workaround = False
        try:
            # First, try downloading normally. nltk.download() reports most
            # failures through its boolean return value, so keep it.
            downloaded = nltk.download(required_package, download_dir=nltk_data_path)
        except URLError as e:
            # Any URLError that is not an SSL certificate problem is re-raised.
            if "CERTIFICATE_VERIFY_FAILED" not in str(e):
                raise
            print("  [!] SSL certificate verification failed. Applying workaround...")
            # SECURITY NOTE: this disables HTTPS certificate verification for
            # the rest of the process, not only for this download. It is left
            # in place deliberately so that later downloads on the same
            # machine keep working; review before using outside a trusted
            # network.
            ssl._create_default_https_context = ssl._create_unverified_context
            # Retry the download with the unverified context.
            downloaded = nltk.download(required_package, download_dir=nltk_data_path)
            used_ssl_workaround = True

        # Report exactly one outcome, based on what the download returned.
        if not downloaded:
            print(f"  [✗] Failed to download '{required_package}'. See the NLTK messages above.")
        elif used_ssl_workaround:
            print(f"  [✓] '{required_package}' downloaded successfully using SSL workaround.")
        else:
            print(f"  [✓] '{required_package}' downloaded successfully.")

    print("--- NLTK setup complete. ---")

# --- Configuration ---
# Local chat model served by Ollama; used by the extraction chain below.
llm = ChatOllama(model="mixtral")

# --- NLTK Data Path Configuration ---
ensure_nltk_data()  # No need to store the path, as it's appended to nltk.data.path

# --- Document Loading with Error Handling ---
# The design document is read once at start-up. Every failure here is fatal,
# so the script stops with a non-zero status instead of calling exit():
# exit() is a helper injected by the `site` module for interactive use, and
# calling it without arguments reports success (status 0).
document_path = "./docs/designdoc.docx"
document = None

try:
    loader = UnstructuredWordDocumentLoader(document_path)
    document = loader.load()
    print(f"--- Successfully loaded document: {document_path} ---")
except FileNotFoundError:
    print(f"--- FATAL ERROR: Input document not found at '{document_path}' ---")
    print("Please ensure the 'designdoc.docx' file exists in a 'docs' subdirectory.")
    raise SystemExit(1) from None
except Exception as e:
    print("--- FATAL ERROR: An unexpected error occurred while loading the document. ---")
    print(f"Error details: {e}")
    raise SystemExit(1) from None

# An empty list, or a first document without text, leaves nothing to analyse.
if not document or not document[0].page_content:
    print("--- FATAL ERROR: Document loaded but is empty. ---")
    raise SystemExit(1)

# --- Prompt Engineering ---
# Instruction text sent to the model. It asks for a JSON object with the keys
# 'assets', 'processes' and 'data_flows'. `{document_text}` is the only
# placeholder in the template; it is filled with the loaded document's text
# when the chain is invoked.
# NOTE(review): the template is assumed to use the default f-string style
# formatting, so any literal braces added later would need escaping — confirm
# against the ChatPromptTemplate documentation before adding JSON examples.
prompt_template = """
You are an expert cybersecurity architect specializing in threat modeling.
Your task is to read the provided system design document and extract the core components
for a Data Flow Diagram (DFD) in a valid JSON format.

The JSON output must contain three keys: 'assets', 'processes', and 'data_flows'.
- An 'asset' is a data store where data rests (e.g., a database, a cache, a log file). List as an array of strings.
- A 'process' is a component that acts on or transforms data (e.g., an API, a microservice, a user-facing application). List as an array of strings.
- A 'data_flow' is an array of objects, each with 'source', 'destination', and 'data_description'. The 'source' and 'destination'
  must be one of the previously identified processes or assets.

System Design Document:
---
{document_text}
---

Now, generate the JSON object based on the document. Output ONLY the JSON object itself, with no
additional commentary, explanations, or markdown formatting.
"""

# Wrap the raw template string in a chat prompt object for use in the chain.
prompt = ChatPromptTemplate.from_template(prompt_template)

# --- Chain Construction with JSON Output Parser ---
# prompt -> model -> parser: the parser turns the model's text reply into
# Python data.
output_parser = JsonOutputParser()
chain = prompt | llm | output_parser

# --- Invocation and Output ---
print("\n--- Invoking Local LLM Chain (Mixtral) to extract DFD components ---")
output_dir = "./output"
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# A fixed filename is used so that the threat-generation step, which reads
# "./output/dfd_components.json", always finds the latest result. The
# timestamped alternative is kept for reference:
#output_path = os.path.join(output_dir, f"dfd_components_{timestamp}.json")
output_path = os.path.join(output_dir, "dfd_components.json")

try:
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    response_dict = chain.invoke({"document_text": document[0].page_content})

    # The parser accepts any valid JSON value. A list or scalar would fail on
    # the metadata assignment below with a confusing message, so reject
    # anything that is not a JSON object up front.
    if not isinstance(response_dict, dict):
        raise TypeError(
            f"Expected a JSON object from the LLM, got {type(response_dict).__name__}."
        )

    # Add timestamp to the JSON data itself
    response_dict["metadata"] = {
        "timestamp": datetime.now().isoformat(),
        "source_document": document_path
    }

    print("\n--- LLM Output (Parsed JSON) ---")
    print(json.dumps(response_dict, indent=2))

    # Save the dictionary to a JSON file. The encoding is explicit so the
    # file does not depend on the platform's default locale.
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(response_dict, f, indent=2)

    print(f"\n--- DFD components successfully saved to '{output_path}' ---")


except Exception as e:
    # Top-level boundary: report the failure instead of showing a traceback.
    print("\n--- An error occurred during chain invocation or parsing ---")
    print(f"Error: {e}")
    print("This may be due to the LLM not returning a well-formed JSON object.")

--- Verifying NLTK data packages... ---
  [✓] NLTK 'punkt' data is available.
--- NLTK setup complete. ---
--- Successfully loaded document: ./docs/designdoc.docx ---

--- Invoking Local LLM Chain (Mixtral) to extract DFD components ---

--- LLM Output (Parsed JSON) ---
{
  "assets": [
    "Azure Database for MySQL",
    "Azure Blob Storage"
  ],
  "processes": [
    "Browser",
    "Azure App Service (WordPress/WooCommerce)",
    "Azure Database for MySQL",
    "Stripe/PayPal API",
    "Admin Browser",
    "/wp-admin"
  ],
  "data_flows": [
    {
      "source": "Browser",
      "destination": "Azure App Service (WordPress/WooCommerce)",
      "data_description": "Customer browsing products"
    },
    {
      "source": "Azure App Service (WordPress/WooCommerce)",
      "destination": "Azure Database for MySQL",
      "data_description": "Product data, customer actions"
    },
    {
      "source": "Browser",
      "destination": "Stripe/PayPal API",
      "data_description": "Payment 

In [None]:
# --- Dependencies ---
import os
import json
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

# --- Configuration ---
# Local chat model served by Ollama, plus the paths of the input file and of
# the file the identified threats are written to.
llm = ChatOllama(model="mixtral")
input_dir = "./output"
dfd_input_path = os.path.join(input_dir, "dfd_components.json")
threats_output_path = os.path.join(input_dir, "identified_threats.json")

# --- Load DFD Components ---
# Both failure cases are fatal. The script stops with a non-zero status
# instead of calling exit(): exit() is a helper injected by the `site` module
# for interactive use, and calling it without arguments reports success.
print(f"--- Loading DFD components from '{dfd_input_path}' ---")
try:
    with open(dfd_input_path, 'r', encoding="utf-8") as f:
        dfd_data = json.load(f)
    print("--- DFD components loaded successfully. ---")
except FileNotFoundError:
    print(f"--- FATAL ERROR: Input file not found at '{dfd_input_path}' ---")
    print("Please run the first script (to generate DFD components) before running this one.")
    raise SystemExit(1) from None
except json.JSONDecodeError:
    print(f"--- FATAL ERROR: Could not parse JSON from '{dfd_input_path}'. ---")
    print("The file may be corrupted or empty.")
    raise SystemExit(1) from None

# --- Prompt Engineering for Threat Generation ---
# Instruction text sent to the model. It asks for a JSON object with a single
# 'threats' key whose entries carry 'component_name', 'stride_category',
# 'threat_description' and 'mitigation_suggestion'. `{dfd_json}` is the only
# placeholder in the template; it is filled with the DFD components serialized
# as a JSON string when the chain is invoked.
# NOTE(review): the template is assumed to use the default f-string style
# formatting, so any literal braces added later would need escaping — confirm
# against the ChatPromptTemplate documentation before adding JSON examples.
threat_prompt_template = """
You are a senior cybersecurity analyst specializing in threat modeling using the STRIDE methodology.
Based on the provided Data Flow Diagram (DFD) components in JSON format, identify potential threats for each data flow, process, and asset.

For each identified threat, provide the following details:
- 'component_name': The name of the affected asset, process, or data flow.
- 'stride_category': The relevant STRIDE category (Spoofing, Tampering, Repudiation, Information Disclosure, Denial of Service, Elevation of Privilege).
- 'threat_description': A clear and concise description of the specific threat.
- 'mitigation_suggestion': A practical suggestion for mitigating the threat.

DFD Components:
---
{dfd_json}
---

Generate a JSON object with a single key 'threats', which contains a list of all identified threats.
Output ONLY the JSON object itself, with no additional commentary or markdown formatting.
"""

# Wrap the raw template string in a chat prompt object for use in the chain.
threat_prompt = ChatPromptTemplate.from_template(threat_prompt_template)

# --- Chain Construction with JSON Output Parser ---
# Pipeline: threat prompt -> local model -> JSON parser.
threat_parser = JsonOutputParser()
threat_chain = threat_prompt | llm | threat_parser

# --- Invocation and Output ---
print("\n--- Invoking Local LLM Chain (Mixtral) to generate STRIDE threats ---")
try:
    # The prompt needs text, so the loaded DFD dictionary is serialized again.
    dfd_json_string = json.dumps(dfd_data, indent=2)

    # Run the threat analysis chain on the serialized components.
    threats_dict = threat_chain.invoke({"dfd_json": dfd_json_string})

    # Create the target directory if it is missing, then persist the result.
    os.makedirs(input_dir, exist_ok=True)
    with open(threats_output_path, 'w') as threats_file:
        json.dump(threats_dict, threats_file, indent=2)

    # Echo the result and report where it was stored.
    print("\n--- LLM Output (Identified Threats) ---")
    print(json.dumps(threats_dict, indent=2))
    print(f"\n--- Identified threats successfully saved to '{threats_output_path}' ---")

except Exception as err:
    # Top-level boundary: report the failure instead of showing a traceback.
    print(
        "\n--- An error occurred during threat generation ---",
        f"Error: {err}",
        "This could be due to the LLM not returning a well-formed JSON object or an issue with the input data.",
        sep="\n",
    )