In [29]:
import os
import re
import json
import pytesseract
from PIL import Image
from docx import Document
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage
import logging

# Logging configuration
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
logging.basicConfig(level=LOG_LEVEL, format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# Max input size for LLM prompt (characters). Can be overridden via env var.
MAX_INPUT_CHARS = int(os.getenv("MAX_INPUT_CHARS", "30000"))


In [30]:
load_dotenv()
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")

In [31]:
folder_path = "./Data"

# Allow overriding tesseract command via environment variable (Windows)
tess_cmd = os.getenv("TESSERACT_CMD")
if tess_cmd:
    pytesseract.pytesseract.tesseract_cmd = tess_cmd

all_texts = []

for filename in os.listdir(folder_path):
    if filename.lower().endswith((".png", ".jpg", ".jpeg")):
        image_path = os.path.join(folder_path, filename)
        try:
            image = Image.open(image_path)
            text = pytesseract.image_to_string(image)
        except Exception as e:
            logger.exception(f"Error processing {filename}: {e}")
            continue

        logger.info(f"Text from {filename}:")
        logger.debug(text)
        logger.info("-" * 50)
        # Append OCR output so all images' text is collected
        all_texts.append(text)

text = "\n".join([t for t in all_texts if t and t.strip()])

# Truncate very long inputs to avoid exceeding LLM token limits
if len(text) > MAX_INPUT_CHARS:
    # Truncate at the last newline before the limit if possible
    cutoff = text.rfind("\n", 0, MAX_INPUT_CHARS)
    if cutoff == -1:
        cutoff = MAX_INPUT_CHARS
    logger.warning(f"OCR text exceeds MAX_INPUT_CHARS ({MAX_INPUT_CHARS}). Truncating to {cutoff} chars.")
    text = text[:cutoff]


2025-10-14 11:24:12,609 INFO: Text from Screenshot 2025-10-14 100949.png:
2025-10-14 11:24:12,610 INFO: --------------------------------------------------
2025-10-14 11:24:12,883 INFO: Text from Screenshot 2025-10-14 101001.png:
2025-10-14 11:24:12,884 INFO: --------------------------------------------------


In [32]:
OUTPUT_DOCX = "project_proposal.docx"

prompt_text = f"""
You are an expert project analyst. Your task is to read the text below and create a complete, structured project proposal.
STRICTLY OUTPUT VALID JSON ONLY, using the exact template provided. 
DO NOT include any explanations, notes, or text outside the JSON.

Guidelines:

1. Every field that is a list (Objectives, Goals, Expected Outcomes, Success Metrics, Core Features, Core Features & Functionalities, Tech Stack, Integration Needs, Security & Compliance, Performance Criteria, App Flow Summary, Deliverables, Milestones, Team Roles, Dependencies, Potential Risks, Mitigation Strategies) MUST be a proper JSON array of strings. 
2. Fields that are single values (Project Title, Client Name, Project Summary, Target Audience, Monetization Strategy, Estimated Duration, Estimated Budget, Estimated Timeline & Pricing, Other Notes) MUST be strings.
3. Nested dictionaries (like Cost Breakdown under Budget & Costing) must have string keys and string or number values.
4. Ensure all sections are filled logically based on the input text. If no information is present, leave the field empty ("" for strings, [] for lists, {{}} for dictionaries).
5. Avoid any extra formatting, markdown, or comments.
6. Follow this **exact JSON template structure**:

{{
    "Project Overview": {{
        "Project Title": "",
        "Client Name": "",
        "Project Summary": "",
        "Objectives": []
    }},
    "Business Requirements": {{
        "Goals": [],
        "Target Audience": "",
        "Expected Outcomes": [],
        "Success Metrics": [],
        "Monetization Strategy": ""
    }},
    "Technical Requirements": {{
        "Core Features": [],
        "Core Features & Functionalities": [],
        "Tech Stack": [],
        "Integration Needs": [],
        "Security & Compliance": [],
        "Performance Criteria": []
    }},
    "App Flow": {{
        "App Flow Summary": []
    }},
    "Project Scope": {{
        "Inclusions": [],
        "Exclusions": [],
        "Deliverables": [],
        "Milestones": [],
        "Estimated Timeline & Pricing": ""
    }},
    "Timeline & Resources": {{
        "Estimated Duration": "",
        "Team Roles": [],
        "Dependencies": []
    }},
    "Budget & Costing": {{
        "Estimated Budget": "",
        "Cost Breakdown": {{}}
    }},
    "Risk Assessment": {{
        "Potential Risks": [],
        "Mitigation Strategies": []
    }},
    "Other Notes": ""
}}

Text to analyze:
{text}
"""

messages = [HumanMessage(content=prompt_text)]

import time

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY environment variable not set!")

client = ChatGroq(
    groq_api_key=GROQ_API_KEY,
    model="openai/gpt-oss-120b",
    temperature=1.5,
)

def extract_first_json(s: str):
    """Extract the first balanced JSON object from a string. Returns the JSON substring or None."""
    s = s.strip()
    start = None
    brace_count = 0
    for i, ch in enumerate(s):
        if ch == '{':
            if start is None:
                start = i
            brace_count += 1
        elif ch == '}':
            brace_count -= 1
            if brace_count == 0 and start is not None:
                return s[start:i+1]
    return None

# Try calling the LLM with simple retry/backoff
llm_output = None
for attempt in range(3):
    try:
        response = client(messages)
        llm_output = response.content.strip()
        break
    except Exception as e:
        logger.exception(f"LLM call failed (attempt {attempt+1}): {e}")
        if attempt < 2:
            time.sleep(2 ** attempt)
        else:
            raise

try:
    structured_output = json.loads(llm_output)
except json.JSONDecodeError:
    # fallback: extract first balanced JSON block
    json_block = extract_first_json(llm_output or "")
    if json_block:
        try:
            structured_output = json.loads(json_block)
        except json.JSONDecodeError as e:
            raise ValueError("LLM returned JSON-like block but parsing failed: " + str(e))
    else:
        # try fenced code block with JSON
        fence_match = re.search(r"```(?:json)?\s*(\{.*\})\s*```", llm_output or "", re.DOTALL)
        if fence_match:
            structured_output = json.loads(fence_match.group(1))
        else:
            raise ValueError("Could not parse JSON from LLM output!")

# basic validation
if not isinstance(structured_output, dict):
    raise ValueError("Parsed structured_output is not a JSON object/dict!")


doc = Document()
doc.add_heading("Project Proposal", 0)

def add_section(section_name, content):
    """Add a section to the Word document"""
    doc.add_heading(section_name, level=1)
    if not isinstance(content, dict):
        doc.add_paragraph(str(content))
        return
    for key, value in content.items():
        # Special handling for Budget & Costing
        if section_name == "Budget & Costing":
            if key == "Estimated Budget":
                doc.add_heading("Estimated Budget", level=2)
                doc.add_paragraph(str(value))
            elif key == "Cost Breakdown":
                doc.add_heading("Cost Breakdown", level=2)
                if isinstance(value, dict) and value:
                    table = doc.add_table(rows=1, cols=2)
                    table.style = "Light Grid"
                    hdr_cells = table.rows[0].cells
                    hdr_cells[0].text = "Item"
                    hdr_cells[1].text = "Amount"
                    for item_name, amount in value.items():
                        row_cells = table.add_row().cells
                        row_cells[0].text = str(item_name)
                        row_cells[1].text = str(amount)
                else:
                    doc.add_paragraph("No cost breakdown provided.")
            continue

        doc.add_heading(key, level=2)
        if isinstance(value, list):
            for item in value:
                doc.add_paragraph(item, style="List Bullet")
        elif isinstance(value, dict):
            for sub_key, sub_value in value.items():
                doc.add_paragraph(f"{sub_key}: {sub_value}", style="List Bullet")
        else:
            doc.add_paragraph(str(value))

# Sections to include
sections = [
    "Project Overview",
    "Business Requirements",
    "Technical Requirements",
    "App Flow",
    "Project Scope",
    "Timeline & Resources",
    "Budget & Costing",
    "Risk Assessment"
]

for section in sections:
    content = structured_output.get(section, {})
    if content:
        add_section(section, content)

# Other Notes
other_notes = structured_output.get("Other Notes", "")
if other_notes:
    doc.add_heading("Other Notes", level=1)
    doc.add_paragraph(other_notes)

# Save Document
# Final validation: ensure at least one section was written
if not any(structured_output.get(s) for s in sections):
    logger.warning("No recognised sections found in LLM output; saving whatever was returned.")

try:
    doc.save(OUTPUT_DOCX)
    logger.info(f"✅ Project proposal created successfully: {OUTPUT_DOCX}")
except Exception as e:
    logger.exception(f"Failed to save document: {e}")


2025-10-14 11:24:17,659 INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-10-14 11:24:17,741 INFO: ✅ Project proposal created successfully: project_proposal.docx


In [33]:
# === Validation & Local Test Helpers ===
# Optional: use pydantic to validate the LLM output structure and provide defaults
try:
    from pydantic import BaseModel, Field
except Exception:
    print("pydantic not installed. Install with: pip install pydantic")
    BaseModel = object
    Field = lambda *a, **k: None

class ProjectOverviewModel(BaseModel):
    Project_Title: str = Field("", alias="Project Title")
    Client_Name: str = Field("", alias="Client Name")
    Project_Summary: str = Field("", alias="Project Summary")
    Objectives: list[str] = Field(default_factory=list, alias="Objectives")

class RootModel(BaseModel):
    Project_Overview: dict = Field(default_factory=dict, alias="Project Overview")
    Business_Requirements: dict = Field(default_factory=dict, alias="Business Requirements")
    Technical_Requirements: dict = Field(default_factory=dict, alias="Technical Requirements")
    App_Flow: dict = Field(default_factory=dict, alias="App Flow")
    Project_Scope: dict = Field(default_factory=dict, alias="Project Scope")
    Timeline_Resources: dict = Field(default_factory=dict, alias="Timeline & Resources")
    Budget_Costing: dict = Field(default_factory=dict, alias="Budget & Costing")
    Risk_Assessment: dict = Field(default_factory=dict, alias="Risk Assessment")
    Other_Notes: str = Field("", alias="Other Notes")

# MOCK mode for local testing without calling the LLM
MOCK_LLM = False
MOCK_RESPONSE = {
    "Project Overview": {
        "Project Title": "Sample Project",
        "Client Name": "ACME Corp",
        "Project Summary": "A sample project for testing.",
        "Objectives": ["Test OCR", "Generate docx"]
    },
    "Business Requirements": {
        "Goals": ["Goal A"],
        "Target Audience": "Developers",
        "Expected Outcomes": ["Outcome 1"],
        "Success Metrics": ["Metric 1"],
        "Monetization Strategy": "Subscription"
    }
}

if MOCK_LLM:
    structured_output = MOCK_RESPONSE
    print("Using MOCK LLM response for testing.")
else:
    print("Using live LLM response (structured_output as parsed earlier).")

# If pydantic available, validate
if BaseModel is not object:
    try:
        validated = RootModel.parse_obj(structured_output)
        print("Structured output validated with pydantic.")
    except Exception as e:
        print(f"Validation warning: {e}")

# You can re-run the doc generation cells now to test with MOCK_RESPONSE


Using live LLM response (structured_output as parsed earlier).
Structured output validated with pydantic.


C:\Users\Bd Calling\AppData\Local\Temp\ipykernel_2292\1497845967.py:54: PydanticDeprecatedSince20: The `parse_obj` method is deprecated; use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated = RootModel.parse_obj(structured_output)
