<a href="https://colab.research.google.com/github/Nagarjun-07/Advance-Face-recognition-attendance-management/blob/main/Grant_Automation_Agent_Proj_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
%pip install pdfplumber
%pip install pytesseract
%pip install pdf2image
import os
import re
import json
import torch
import pdfplumber
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
from datetime import datetime
from transformers import pipeline
from openai import OpenAI, AuthenticationError
from google.colab import files, userdata



In [15]:
# Re-run this cell to ensure the corrected class definition is loaded
class BioMetallicaLLMAgent:
    """
    OCR, LLM-briefing, and TRL-assessment helper for bioreactor documents.

    Uses OpenRouter (through the OpenAI-compatible client) as the primary LLM
    provider. ``run_llm_insight_analysis`` falls back to a local Transformers
    pipeline when OpenRouter rejects the API key.
    """

    # Upper bound, in characters, on the text sent to the LLM in one request.
    MAX_INPUT_CHARS = 5000
    # Suffixes are compared against the lower-cased path, so lower case suffices.
    PDF_SUFFIXES = (".pdf",)
    IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg")
    # Component keywords looked for in the technical content (case-insensitive).
    COMPONENT_PATTERN = re.compile(
        r"(bioreactor|reactor|sensor|control system|pump|valve)", re.IGNORECASE
    )

    def __init__(self):
        """
        Initialize the OpenRouter client from the Colab secret.

        Raises:
            ValueError: If OPENROUTER_API_KEY is not available in Colab Secrets.
        """
        # SECURITY: a literal API key used to be hard-coded here as a fallback.
        # Secrets must never live in source; the previously committed key
        # should be treated as compromised and revoked.
        self.api_key = self._load_api_key()
        if not self.api_key:
            raise ValueError("Error: No OpenRouter API key found in Colab Secrets. Please add OPENROUTER_API_KEY via the Secrets panel.")
        self.openrouter_client = OpenAI(
            api_key=self.api_key,
            base_url="https://openrouter.ai/api/v1"
        )
        # Created lazily by _initialize_transformers() on first fallback use.
        self.transformers_generator = None

    @staticmethod
    def _load_api_key():
        """Return the OPENROUTER_API_KEY Colab secret, or None when unavailable."""
        try:
            return userdata.get('OPENROUTER_API_KEY')
        except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
            # Missing secret / notebook not granted access: report as "no key"
            # so __init__ raises the documented ValueError.
            return None

    def process_uploaded_file(self, file_path: str) -> str:
        """
        Converts an uploaded PDF or scanned handwritten file into raw text using OCR.

        Args:
            file_path (str): Path to the uploaded file (PDF or image).

        Returns:
            str: Extracted text.

        Raises:
            ValueError: If the file format is unsupported or processing fails.
        """
        lowered = file_path.lower()
        if lowered.endswith(self.PDF_SUFFIXES):
            extractor = self._extract_pdf_text
            hint = " Ensure poppler is installed for PDF processing."
        elif lowered.endswith(self.IMAGE_SUFFIXES):
            extractor = self._extract_image_text
            hint = ""
        else:
            # Raised outside the try below so the message is not re-wrapped
            # with an unrelated poppler hint.
            raise ValueError("Unsupported file format. Use PDF, PNG, JPG, or JPEG.")

        try:
            return extractor(file_path)
        except Exception as e:
            raise ValueError(f"Error processing file: {str(e)}.{hint}") from e

    @staticmethod
    def _extract_pdf_text(file_path):
        """Return the PDF's text layer, or OCR output when no text layer exists."""
        with pdfplumber.open(file_path) as pdf:
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Join with newlines so the last word of one page is not glued to the
        # first word of the next.
        text = "\n".join(chunk for chunk in page_texts if chunk)
        if text.strip():
            return text

        print("Fallback to OCR for scanned PDF...")
        images = convert_from_path(file_path)
        return "\n".join(pytesseract.image_to_string(image).strip() for image in images)

    @staticmethod
    def _extract_image_text(file_path):
        """Return OCR text for a single image file."""
        # Context manager closes the underlying file handle after OCR.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image).strip()

    def _initialize_transformers(self):
        """Lazy initialization of Transformers pipeline for fallback LLM."""
        if self.transformers_generator is None:
            # NOTE(review): facebook/bart-large is an encoder-decoder model; it is
            # probably not a good fit for the "text-generation" task. Confirm the
            # intended fallback model before relying on this path.
            self.transformers_generator = pipeline("text-generation", model="facebook/bart-large", device=0 if torch.cuda.is_available() else -1)

    def run_llm_insight_analysis(self, extracted_text: str, system_prompt: str) -> str:
        """
        Generates a whitepaper-style technical briefing using OpenRouter LLM.

        Args:
            extracted_text (str): Text extracted from the file.
            system_prompt (str): Prompt to guide LLM generation.

        Returns:
            str: Insight report in whitepaper format. A default Markdown title
                and Introduction heading are prepended when the model output
                does not start with '#'.

        Raises:
            ValueError: If the input is empty/too long or LLM generation fails.
        """
        if (
            not extracted_text
            or not extracted_text.strip()
            or len(extracted_text) > self.MAX_INPUT_CHARS
        ):
            raise ValueError("Extracted text is empty or exceeds 5000 characters.")
        try:
            response = self.openrouter_client.chat.completions.create(
                model="openrouter/auto",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": extracted_text}
                ],
                max_tokens=1500,
                temperature=0.7
            )
            # `content` may be None; normalise to "" and validate below.
            insight_report = (response.choices[0].message.content or "").strip()
        except AuthenticationError as ae:
            print(f"Warning: OpenRouter authentication failed: {str(ae)}. Switching to Transformers.")
            self._initialize_transformers()
            prompt = f"{system_prompt}\n{extracted_text}"
            insight_report = self.transformers_generator(prompt, max_length=1500, num_return_sequences=1)[0]["generated_text"]
        except Exception as e:
            raise ValueError(f"Error in LLM analysis: {str(e)}") from e

        if not insight_report:
            raise ValueError("Error in LLM analysis: the model returned an empty response.")
        if not insight_report.startswith("#"):
            insight_report = f"# Technical Briefing: BioMetallica System\n\n## Introduction\n{insight_report}"
        return insight_report

    @staticmethod
    def _parse_json_object(content):
        """
        Parse an LLM reply into a dict, tolerating Markdown fences or prose.

        Raises:
            ValueError: If no JSON object can be decoded from the reply.
        """
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError as je:
            # Models often wrap JSON in ```json fences or add commentary;
            # retry on the outermost {...} span before giving up.
            start, end = content.find("{"), content.rfind("}")
            if start == -1 or end <= start:
                raise ValueError(f"Failed to parse JSON response: {str(je)}. Raw content: {content}") from je
            try:
                parsed = json.loads(content[start:end + 1])
            except json.JSONDecodeError:
                raise ValueError(f"Failed to parse JSON response: {str(je)}. Raw content: {content}") from je
        if not isinstance(parsed, dict):
            raise ValueError("Response is not a valid JSON object.")
        return parsed

    def extract_trl_breakdown(self, technical_content: str) -> dict:
        """
        Assesses maturity of bioreactor system components using OpenRouter LLM.

        Component keywords found in the content (case-insensitively, normalised
        to lower case) are guaranteed an entry in the result: missing or
        malformed entries default to TRL 1, out-of-range TRLs are corrected to
        1, and each of those entries is stamped with the current IST time.

        Args:
            technical_content (str): Technical description of the system.

        Returns:
            dict: TRL breakdown with components, TRL levels, justifications, and timestamps.

        Raises:
            ValueError: If TRL extraction fails.
        """
        # Local import: the file-level import only brings in `datetime`.
        from datetime import timedelta, timezone

        if (
            not technical_content
            or not technical_content.strip()
            or len(technical_content) > self.MAX_INPUT_CHARS
        ):
            raise ValueError("Technical content is empty or exceeds 5000 characters.")
        try:
            # Lower-case and de-duplicate so "Sensor" and "sensor" are one component.
            components = sorted(
                {match.lower() for match in self.COMPONENT_PATTERN.findall(technical_content)}
            )

            # The example is real JSON (double quotes) so the model is not
            # nudged toward a reply that json.loads cannot parse.
            system_prompt = (
                "You are a TRL assessment expert. Analyze the technical content and assign a TRL (1-9) "
                "to each mentioned component (e.g., sensor, control system, pump, valve). Return ONLY a valid JSON object "
                "mapping components to their TRL levels and justifications, with a 'timestamp' field. Example: "
                '{"sensor": {"trl": 4, "justification": "Lab validated", "timestamp": "2025-07-19 17:32:00 IST"}}'
            )
            response = self.openrouter_client.chat.completions.create(
                model="openrouter/auto",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": technical_content}
                ],
                max_tokens=1000,
                temperature=0.5
            )
            content = (response.choices[0].message.content or "").strip()
            trl_breakdown = self._parse_json_object(content)

            # Validate and enhance TRL breakdown.
            # The label says IST, so compute the time in IST (UTC+05:30) rather
            # than in whatever the host's local zone happens to be.
            ist = timezone(timedelta(hours=5, minutes=30))
            current_time = datetime.now(ist).strftime("%Y-%m-%d %H:%M:%S IST")
            # Match the model's keys case-insensitively against our components.
            keys_by_lower = {str(key).lower(): key for key in trl_breakdown}
            for component in components:
                key = keys_by_lower.get(component, component)
                entry = trl_breakdown.get(key)
                if not isinstance(entry, dict):
                    # Missing or malformed entry: fall back to the default.
                    trl_breakdown[key] = {
                        "trl": 1,
                        "justification": "No data, defaulting to theoretical stage.",
                        "timestamp": current_time
                    }
                    continue

                trl = entry.get("trl")
                # bool is a subclass of int, so exclude it explicitly.
                if isinstance(trl, bool) or not isinstance(trl, int) or not 1 <= trl <= 9:
                    entry["trl"] = 1
                    justification = str(entry.get("justification") or "")
                    entry["justification"] = f"{justification} (Invalid TRL corrected to 1).".strip()
                entry["timestamp"] = current_time

            return trl_breakdown
        except AuthenticationError as ae:
            raise ValueError("Error: Invalid OpenRouter API key. Check Colab Secrets or ensure the key is correct.") from ae
        except Exception as e:
            raise ValueError(f"Error in TRL breakdown: {str(e)}") from e

In [16]:
# Re-run this cell to continue the process
# Driver: initialise the agent, OCR one uploaded file, then run the LLM
# briefing and the TRL breakdown on the extracted text.
if __name__ == "__main__":
    try:
        agent = BioMetallicaLLMAgent()
        print("BioMetallica LLM Agent initialized successfully.\n")
    except ValueError as e:
        print(f"Initialization failed: {str(e)}")
        # SystemExit instead of the `exit` helper, which is only injected by
        # the `site` module / IPython and is not guaranteed to exist.
        raise SystemExit(1)

    # Upload file for processing
    print("Please upload a PDF or image file...")
    uploaded = files.upload()
    if not uploaded:
        # A cancelled upload yields an empty mapping; next(iter(...)) would
        # otherwise raise StopIteration.
        print("No file uploaded; nothing to process.")
        raise SystemExit(1)
    file_path = next(iter(uploaded))  # Use the first uploaded file

    # Pre-bind so the later steps never hit a NameError when extraction fails.
    extracted_text = ""
    try:
        print(f"\nProcessing file: {file_path}")
        extracted_text = agent.process_uploaded_file(file_path)
        print("Extracted Text:")
        print(extracted_text if extracted_text else "No text extracted.")
    except ValueError as e:
        print(f"File Processing Error: {str(e)}")

    if not extracted_text.strip():
        # Both LLM steps reject empty input, so skip them with one clear message.
        print("\nSkipping LLM analysis and TRL extraction: no text was extracted.")
    else:
        print("\nRunning LLM Insight Analysis...")
        try:
            system_prompt = (
                "Generate a whitepaper-style technical briefing summarizing the bioreactor system details. "
                "Include sections for Introduction, Components, and Conclusion."
            )
            report = agent.run_llm_insight_analysis(extracted_text, system_prompt)
            print("Insight Report:\n", report)
        except ValueError as e:
            print(f"LLM Error: {str(e)}")

        print("\nRunning TRL Breakdown Extraction...")
        try:
            trl_output = agent.extract_trl_breakdown(extracted_text)
            print("TRL Breakdown:\n", json.dumps(trl_output, indent=2))
        except ValueError as e:
            print(f"TRL Error: {str(e)}")

BioMetallica LLM Agent initialized successfully.

Please upload a PDF or image file...


Saving Bioreactor_Technical_Report_Sample.pdf to Bioreactor_Technical_Report_Sample (4).pdf

Processing file: Bioreactor_Technical_Report_Sample (4).pdf
Extracted Text:
Technical Report: Design and Development of an Automated Bioreactor System
Introduction:
This report outlines the architecture and development of a fully automated bioreactor system designed for controlled fermentation processes. The system includes modular components that integrate sensors, actuators, and control software.
Components:
1. Sensor Module:
- Temperature sensor (experimental validation achieved)
- pH sensor (prototype stage)
2. Control System:
- Microcontroller-based PID controller
- Communication via RS485 protocol
3. Pump and Valve Mechanisms:
- Peristaltic pump with flow control
- Solenoid valves for media input/output
Technology Readiness Levels:
- Temperature sensor module is at TRL 4 (lab validation).
- pH sensor module is at TRL 3 (proof of concept).
- Control system software is at TRL 2 (initial dev