In [None]:
import nltk
import ssl

# SECURITY NOTE: the block below disables TLS certificate verification for
# every default HTTPS context created in this kernel, not only for NLTK.
# It is kept as a workaround for machines whose Python has no usable CA bundle
# (the "CERTIFICATE_VERIFY_FAILED" error). Prefer installing proper
# certificates and deleting this workaround.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build exposes no unverified-context factory; leave the
    # default (verifying) HTTPS context in place.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Resources to fetch. 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the
# names the document loader actually asks for at load time; the two legacy
# names are still downloaded so that environments with an older NLTK keep
# working.
NLTK_RESOURCES = (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
)

failed_resources = []
for resource_name in NLTK_RESOURCES:
    print(f"\nAttempting to download NLTK data ({resource_name})...")
    # nltk.download() reports failure through its boolean return value rather
    # than by raising, so the result has to be checked explicitly.
    if nltk.download(resource_name):
        print("Download complete.")
    else:
        print("Download FAILED.")
        failed_resources.append(resource_name)

if failed_resources:
    # Stop here with a clear message instead of letting a later cell fail with
    # an opaque LookupError.
    raise RuntimeError(
        "Could not download NLTK resources: " + ", ".join(failed_resources)
    )

print("\nAll necessary NLTK data has been downloaded.")

In [1]:
import os
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
# FIX 2: Import the correct document loader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
import json

# --- Configuration ---
# Everything a reader might want to tune lives here.
MODEL_NAME = "mixtral"
DESIGN_DOC_PATH = "./docs/designdoc.docx"
# Top-level keys the prompt below instructs the model to produce.
REQUIRED_KEYS = ("assets", "processes", "data_flows")

llm = ChatOllama(model=MODEL_NAME)

# --- Load the design document (Word format) ---
loader = UnstructuredWordDocumentLoader(DESIGN_DOC_PATH)
document = loader.load()

if not document:
    # Fail with an actionable message instead of an IndexError further down.
    raise ValueError(f"No content could be loaded from {DESIGN_DOC_PATH!r}")

# Use every loaded element, not just the first one, so nothing is silently
# dropped if the loader returns more than one Document.
document_text = "\n\n".join(doc.page_content for doc in document)

# Prompt sent to the model; {document_text} is filled in at invoke time.
prompt_template = """
You are an expert cybersecurity architect specializing in threat modeling.
Your task is to read the provided system design document and extract the core components for a Data Flow Diagram (DFD) in a valid JSON format.

The JSON output must contain lists for 'assets', 'processes', and 'data_flows'.
- An 'asset' is a data store where data rests (e.g., database, log file).
- A 'process' is a component that acts on data (e.g., API, service).
- A 'data_flow' describes data movement between a source and a destination process/asset.

System Design Document:
---
{document_text}
---

Now, generate the JSON object based on the document. Output ONLY the JSON object, with no additional commentary or explanations.
"""

prompt = ChatPromptTemplate.from_template(prompt_template)
output_parser = StrOutputParser()

# Create the chain to link everything together
chain = prompt | llm | output_parser


def _extract_json_object(text):
    """Return the outermost ``{...}`` span of ``text``.

    Models frequently wrap their answer in Markdown code fences or add a
    sentence before/after it despite being told not to. Slicing from the first
    ``{`` to the last ``}`` discards that wrapping. If no such span exists the
    stripped input is returned unchanged so the JSON decoder can report the
    real problem.
    """
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        return text.strip()
    return text[start:end + 1]


# Invoke the chain and print the output
print("--- Invoking Local LLM Chain (Mixtral) ---")
response = chain.invoke({"document_text": document_text})
print("--- LLM Output ---")
print(response)

# Validate if the output is valid JSON. parsed_json is always defined after
# this point: the decoded value on success, None on failure.
parsed_json = None
try:
    parsed_json = json.loads(_extract_json_object(response))
    print("\n--- JSON Validation: Success ---")
except json.JSONDecodeError as e:
    print(f"\n--- JSON Validation: Failed --- \nError: {e}")
else:
    # Syntactically valid JSON is not enough; check the structure the prompt
    # asked for so downstream cells do not fail on a missing key.
    if not isinstance(parsed_json, dict):
        print("Warning: expected a JSON object at the top level.")
    else:
        missing_keys = [key for key in REQUIRED_KEYS if key not in parsed_json]
        if missing_keys:
            print(f"Warning: JSON is missing expected keys: {missing_keys}")

[nltk_data] Error loading averaged_perceptron_tagger_eng: <urlopen
[nltk_data]     error [SSL: CERTIFICATE_VERIFY_FAILED] certificate
[nltk_data]     verify failed: unable to get local issuer certificate
[nltk_data]     (_ssl.c:1028)>
[nltk_data] Error loading punkt_tab: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('punkt_tab')

  For more information see: https://www.nltk.org/data.html

  Attempted to load tokenizers/punkt_tab/english/

  Searched in:
    - '/Users/jeffreyvonrotz/nltk_data'
    - '/Users/jeffreyvonrotz/SynologyDrive/Projects/Threatalicious/venv/nltk_data'
    - '/Users/jeffreyvonrotz/SynologyDrive/Projects/Threatalicious/venv/share/nltk_data'
    - '/Users/jeffreyvonrotz/SynologyDrive/Projects/Threatalicious/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
