In [1]:
!pip install  google-generativeai
!pip install pdfplumber
!pip install BeautifulSoup4

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.0-p

In [2]:
import os
import json
import google.generativeai as genai
import pdfplumber
from bs4 import BeautifulSoup

In [14]:

# Credentials must never be stored in the notebook itself: read the key from
# the GEMINI_API_KEY environment variable and, if it is not set, prompt for it
# without echoing the input. Any key that was ever committed in this notebook
# should be treated as compromised and rotated.
from getpass import getpass

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    GEMINI_API_KEY = getpass("Enter your Gemini API key: ").strip()
if not GEMINI_API_KEY:
    raise RuntimeError("A Gemini API key is required (set GEMINI_API_KEY).")
genai.configure(api_key=GEMINI_API_KEY)

def read_pdf_text(pdf_file_path: str) -> str:
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines, with leading and trailing
        whitespace removed.

    Raises:
        RuntimeError: If the file cannot be opened or text extraction fails.
            The underlying exception is attached as ``__cause__``.
    """
    try:
        with pdfplumber.open(pdf_file_path) as pdf:
            # A page without extractable text may yield None (or ""); treat
            # both as empty so concatenation cannot fail on such pages.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    except Exception as error:
        raise RuntimeError(f"Failed to extract text from PDF: {error}") from error
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page.
    return "\n".join(page_texts).strip()

def read_html_text(html_file_path: str) -> str:
    """Return the text content of an HTML file.

    The file is read as UTF-8 and parsed with BeautifulSoup's ``html.parser``;
    text nodes are joined with newlines.

    Args:
        html_file_path: Path of the HTML file to read.

    Returns:
        The extracted text with leading and trailing whitespace removed.

    Raises:
        RuntimeError: If the file cannot be read or parsed. The underlying
            exception is attached as ``__cause__``.
    """
    try:
        with open(html_file_path, "r", encoding="utf-8") as file:
            # Parse while the file handle is still open.
            html_soup = BeautifulSoup(file, "html.parser")
        return html_soup.get_text(separator="\n").strip()
    except Exception as error:
        # Chain explicitly so the root cause stays visible in the traceback.
        raise RuntimeError(f"Failed to extract text from HTML: {error}") from error


In [15]:
def _strip_code_fence(raw_text: str) -> str:
    """Return ``raw_text`` without a surrounding Markdown code fence, if any."""
    cleaned = raw_text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Drop the optional language tag that follows the opening fence.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def extract_structured_data_from_gemini(user_prompt: str):
    """Ask Gemini to extract the RFP fields from ``user_prompt`` as JSON text.

    A chat session is started with an instruction message that describes the
    expected JSON layout, then ``user_prompt`` (the document text) is sent as
    the next message.

    Args:
        user_prompt: Text of the RFP document to analyze.

    Returns:
        The model's reply as a string, with any surrounding Markdown code
        fence removed so that it can be passed to ``json.loads``. The reply is
        not parsed or validated here.
    """
    config_settings = {
        "temperature": 0.7,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
    }

    ai_model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config=config_settings,
    )

    session = ai_model.start_chat(
        history=[
            {
                "role": "user",
                "parts": [
                    "You are an intelligent assistant tasked with analyzing RFP documents. Your goal is to extract and structure relevant information from the input text."
                    " Carefully analyze the content and return the structured data in the following JSON format, which includes all fields, even if some are missing or have no value."
                    "\n\n"
                    "JSON Format:\n"
                    "{\n"
                    "  \"Fields\": {\n"
                    "    \"Bid Number\": \"value\",\n"
                    "    \"Title\": \"value\",\n"
                    "    \"Due Date\": \"value\",\n"
                    "    \"Bid Submission Type\": \"value\",\n"
                    "    \"Term of Bid\": \"value\",\n"
                    "    \"Pre Bid Meeting\": \"value\",\n"
                    "    \"Installation\": \"value\",\n"
                    "    \"Bid Bond Requirement\": \"value\",\n"
                    "    \"Delivery Date\": \"value\",\n"
                    "    \"Payment Terms\": \"value\",\n"
                    "    \"Any Additional Documentation Required\": \"value\",\n"
                    "    \"MFG for Registration\": \"value\",\n"
                    "    \"Contract or Cooperative to use\": \"value\",\n"
                    "    \"Model_no\": \"value\",\n"
                    "    \"Part_no\": \"value\",\n"
                    "    \"Product\": \"value\",\n"
                    "    \"Contact Info\": \"value\",\n"
                    "    \"Company Name\": \"value\",\n"
                    "    \"Bid Summary\": \"value\",\n"
                    "    \"Product Specification\": \"value\"\n"
                    "  }\n"
                    "}\n\n"
                    "Notes:\n"
                    "1. Ensure every field is included in the JSON response, even if the value is 'null' or empty.\n"
                    "2. Identify and map field names carefully to their corresponding values from the input text.\n"
                    "3. Handle cases where data is missing or not explicitly mentioned in the input by assigning an empty string (\"\").\n"
                    "4. For fields like 'Bid Summary' and 'Product Specification', provide detailed and structured information where available.\n"
                    # Each note ends with a newline so the numbered items do
                    # not run together in the prompt.
                    "5. Avoid returning data in any format other than JSON.\n"
                    "6. Give the respone in plain text but in json structure so that your response starts with { and ends with } and nothing else should be in respone\n"
                    "7. Dont use ```json in respone as it cannot pass python json parsor"
                ],
            },
            {
                "role": "model",
                "parts": [
                    "Understood. I will analyze the input RFP text and return the data in the requested JSON format. I will ensure that all specified fields are present and correctly mapped."
                ],
            },
        ]
    )

    api_response = session.send_message(user_prompt)
    structured_response = api_response.text
    print(f"Raw response from Gemini API: {structured_response}")

    # The model may wrap its answer in a Markdown fence despite the
    # instructions; remove it. Backslashes and newlines are deliberately left
    # untouched: the reply is already JSON, and rewriting them would corrupt
    # its escape sequences and make the text unparseable.
    structured_response = _strip_code_fence(structured_response)
    # Drop any characters that cannot be encoded as UTF-8 (e.g. lone surrogates).
    structured_response = structured_response.encode("utf-8", "ignore").decode("utf-8")

    return structured_response


In [16]:
def process_rfp_document(file_path: str):
    """Read an RFP document and return the structured data extracted from it.

    The reader is chosen from the file extension (case-insensitively):
    ``.pdf`` files go through ``read_pdf_text`` and ``.html``/``.htm`` files
    through ``read_html_text``. The extracted text is then passed to
    ``extract_structured_data_from_gemini``.

    Args:
        file_path: Path of the PDF or HTML file to process.

    Returns:
        Whatever ``extract_structured_data_from_gemini`` returns for the
        document text.

    Raises:
        ValueError: If the file extension is not a supported one.
    """
    # Compare on a lowercased copy so "REPORT.PDF" is accepted, but hand the
    # original path to the readers.
    normalized_path = file_path.lower()
    if normalized_path.endswith(".pdf"):
        document_text = read_pdf_text(file_path)
    elif normalized_path.endswith((".html", ".htm")):
        document_text = read_html_text(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a PDF or HTML file.")

    structured_output = extract_structured_data_from_gemini(document_text)
    return structured_output


In [17]:
if __name__ == "__main__":
    # Ask for the document location; strip() removes whitespace around the typed path.
    file_path = input("Enter the path to the RFP file (PDF or HTML): ").strip()


    # Run the extraction pipeline and keep its return value in `structured_data`.
    # Nothing is displayed by this statement itself.
    structured_data = process_rfp_document(file_path)




Enter the path to the RFP file (PDF or HTML): /Dell Laptops w_Extended Warranty - Bid Information - {3} _ BidNet Direct.html
Raw response from Gemini API: ```json
{
  "Fields": {
    "Bid Number": "BPM044557",
    "Title": "Dell Laptops w/Extended Warranty",
    "Due Date": "06/10/2024 02:00 PM EDT",
    "Bid Submission Type": "RFP - Request for Proposal (Informal)",
    "Term of Bid": null,
    "Pre Bid Meeting": null,
    "Installation": null,
    "Bid Bond Requirement": null,
    "Delivery Date": null,
    "Payment Terms": null,
    "Any Additional Documentation Required": null,
    "MFG for Registration": null,
    "Contract or Cooperative to use": null,
    "Model_no": "Latitude 5550, WD22TB4",
    "Part_no": "CC7802",
    "Product": "Dell Latitude 5550 Laptops, Dell Thunderbolt 4 Dock – WD22TB4",
    "Contact Info": "Tamaira Hawkins\n410-260-7533\nThawkins@treasurer.state.md.us",
    "Company Name": "State of Maryland Treasurer's Office",
    "Bid Summary": "1. SI# CC7802 Dell La

# New Section