In [191]:
import pdfplumber
import re
import os
import json
import google.generativeai as genai
from sentence_transformers import SentenceTransformer, util


In [192]:

# Open the PDF
with pdfplumber.open("/content/document.pdf") as pdf:
    text_data = []
    tables_data = []

    # Loop through each page in the PDF
    for page in pdf.pages:
        # Extract text and append to text_data
        text_data.append(page.extract_text())

        # Extract tables and append to tables_data
        tables_data.extend(page.extract_tables())


In [193]:
len(text_data)

23

In [194]:
len(tables_data)

4

In [195]:
tables_data[0]

[['Please find enclosed herewith the following:', None, 'Page No.'],
 ['1.',
  'e-Tender Notice - NITJ/DRC/PUR/TT/36/2024\n& Schedule for Opening of the e-Tender',
  '2'],
 ['2.', 'Annexure-I (Detail of Equipment, Tender Fee & EMD)', '3'],
 ['3.', 'Important Note', '4'],
 ['4.', 'Instruction to Tenderers', '5-7'],
 ['5.', 'Tender Evaluation', '8-9'],
 ['6.', 'Terms & Conditions', '10-11'],
 ['7.', 'Acceptance of terms and conditions', '12'],
 ['8.', 'Questionnaires A & B', '13-14'],
 ['9.', 'Performa for performance statement', '15'],
 ['10.', 'Annexure ‘A’ (Specifications of the Equipment)', '16'],
 ['11.',
  'Annexure ‘B’ & ‘C’ (Format for Performance Bond/\nGuarantee & Format For Performance Bond (Bank\nGuarantee)',
  '17-18'],
 ['12.',
  'Annexure – ‘D’ (Format for Manufacturer’s Authorization\nForm)',
  '19'],
 ['13.',
  'Annexure- ‘E’ (Declaration Regarding Blacklisting/ Debarring\nfor taking part in Tender)',
  '20'],
 ['14.', 'Annexure –‘F’ (Certificate of Warranty)', '21'],
 [

In [196]:
# Combine text from all pages into a single string
text_combined = " ".join(text_data)

# Optional: Clean the text by removing excessive whitespace and newline characters
text_cleaned = text_combined.replace("\n", " ").strip()

# Split the text into smaller chunks (for example, by sentence or paragraph)
text_chunks = text_cleaned.split(". ")  # Splits by sentence; adjust based on your needs

# Inspect the result
for chunk in text_chunks[:5]:  # View the first few chunks
    print(chunk)


Dr B R AMBEDKAR NATIONAL INSTITUTE OF TECHNOLOGY G T Road By Pass, Jalandhar-144008, Punjab (India) EPABX-0181-2690301-453 Email drc@nitj.ac.in Ref
e-Tender Notice - NITJ/DRC/PUR/TT/36/2024 Please find enclosed herewith the following: Page No
1
e-Tender Notice - NITJ/DRC/PUR/TT/36/2024 2 & Schedule for Opening of the e-Tender 2
Annexure-I (Detail of Equipment, Tender Fee & EMD) 3 3


In [197]:

# Pages that contain tables
table_pages = [0, 1, 2, 14]

# Append the table data to text data on the respective pages
for i, page in enumerate(table_pages):
    # Append table data from tables_data[i] to the respective page in text_data
    text_data[page] += f"\nTable Data:\n{tables_data[i]}"

# Display the modified text_data to confirm the changes
for page, content in enumerate(text_data):
    print(f"Page {page} Content:\n{content}\n")


Page 0 Content:
Dr B R AMBEDKAR NATIONAL INSTITUTE OF TECHNOLOGY
G T Road By Pass, Jalandhar-144008, Punjab (India)
EPABX-0181-2690301-453 Email drc@nitj.ac.in
Ref. e-Tender Notice - NITJ/DRC/PUR/TT/36/2024
Please find enclosed herewith the following: Page No.
1. e-Tender Notice - NITJ/DRC/PUR/TT/36/2024 2
& Schedule for Opening of the e-Tender
2. Annexure-I (Detail of Equipment, Tender Fee & EMD) 3
3. Important Note 4
4. Instruction to Tenderers 5-7
5. Tender Evaluation 8-9
6. Terms & Conditions 10-11
7. Acceptance of terms and conditions 12
8. Questionnaires A & B 13-14
9. Performa for performance statement 15
10. Annexure ‘A’ (Specifications of the Equipment) 16
11. Annexure ‘B’ & ‘C’ (Format for Performance Bond/ 17-18
Guarantee & Format For Performance Bond (Bank
Guarantee)
12. Annexure – ‘D’ (Format for Manufacturer’s Authorization 19
Form)
13. Annexure- ‘E’ (Declaration Regarding Blacklisting/ Debarring 20
for taking part in Tender)
14. Annexure –‘F’ (Certificate of Warranty) 21

In [198]:


# Example of the repeated header text to remove
header_text = """Dr B R AMBEDKAR NATIONAL INSTITUTE OF TECHNOLOGY
G T Road By Pass, Jalandhar-144008, Punjab (India)
EPABX-0181-2690301-453 Email drc@nitj.ac.in"""


# Function to clean each page
def preprocess_text(text):
    # Remove the header text
    text = re.sub(re.escape(header_text), "", text, flags=re.IGNORECASE)

    # Remove dotted lines for signatures
    text = re.sub(r'\.{5,}', '', text)
    text = re.sub(r'\_{5,}', '', text)
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Apply preprocessing to each page
cleaned_text_data = [preprocess_text(page) for page in text_data]

# Optionally, add the header to the first page or a specific page if needed
cleaned_text_data.insert(0, header_text)

# Display cleaned text data
for i, content in enumerate(cleaned_text_data):
    print(f"Page {i} Content:\n{content}\n")


Page 0 Content:
Dr B R AMBEDKAR NATIONAL INSTITUTE OF TECHNOLOGY
G T Road By Pass, Jalandhar-144008, Punjab (India)
EPABX-0181-2690301-453 Email drc@nitj.ac.in

Page 1 Content:
Ref. e-Tender Notice - NITJ/DRC/PUR/TT/36/2024 Please find enclosed herewith the following: Page No. 1. e-Tender Notice - NITJ/DRC/PUR/TT/36/2024 2 & Schedule for Opening of the e-Tender 2. Annexure-I (Detail of Equipment, Tender Fee & EMD) 3 3. Important Note 4 4. Instruction to Tenderers 5-7 5. Tender Evaluation 8-9 6. Terms & Conditions 10-11 7. Acceptance of terms and conditions 12 8. Questionnaires A & B 13-14 9. Performa for performance statement 15 10. Annexure ‘A’ (Specifications of the Equipment) 16 11. Annexure ‘B’ & ‘C’ (Format for Performance Bond/ 17-18 Guarantee & Format For Performance Bond (Bank Guarantee) 12. Annexure – ‘D’ (Format for Manufacturer’s Authorization 19 Form) 13. Annexure- ‘E’ (Declaration Regarding Blacklisting/ Debarring 20 for taking part in Tender) 14. Annexure –‘F’ (Certificat

In [199]:
for i in range(24):
  print(i)
  print(cleaned_text_data[i])
  print("------------")

0
Dr B R AMBEDKAR NATIONAL INSTITUTE OF TECHNOLOGY
G T Road By Pass, Jalandhar-144008, Punjab (India)
EPABX-0181-2690301-453 Email drc@nitj.ac.in
------------
1
Ref. e-Tender Notice - NITJ/DRC/PUR/TT/36/2024 Please find enclosed herewith the following: Page No. 1. e-Tender Notice - NITJ/DRC/PUR/TT/36/2024 2 & Schedule for Opening of the e-Tender 2. Annexure-I (Detail of Equipment, Tender Fee & EMD) 3 3. Important Note 4 4. Instruction to Tenderers 5-7 5. Tender Evaluation 8-9 6. Terms & Conditions 10-11 7. Acceptance of terms and conditions 12 8. Questionnaires A & B 13-14 9. Performa for performance statement 15 10. Annexure ‘A’ (Specifications of the Equipment) 16 11. Annexure ‘B’ & ‘C’ (Format for Performance Bond/ 17-18 Guarantee & Format For Performance Bond (Bank Guarantee) 12. Annexure – ‘D’ (Format for Manufacturer’s Authorization 19 Form) 13. Annexure- ‘E’ (Declaration Regarding Blacklisting/ Debarring 20 for taking part in Tender) 14. Annexure –‘F’ (Certificate of Warranty) 2

In [200]:
from nltk import sent_tokenize  # You can use NLTK for sentence tokenization

# Chunk text by sentence or paragraph (adjust chunk size as needed)
chunked_data = []
chunk_size = 5  # Example size; adjust based on desired chunk granularity

for page_num, page_content in enumerate(cleaned_text_data):
    sentences = sent_tokenize(page_content)  # Split text into sentences
    for i in range(0, len(sentences), chunk_size):
        chunk = " ".join(sentences[i:i + chunk_size])  # Group sentences into a chunk
        chunked_data.append({
            "page_num": page_num,
            "content": chunk,
            "metadata": {"page": page_num}
        })


In [201]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [202]:
import json

with open("formatted_data.json", "w") as f:
    json.dump(chunked_data, f)


In [203]:
from sentence_transformers import SentenceTransformer
import faiss
import json
import numpy as np

# Load the formatted text JSON
with open('/content/formatted_data.json', 'r') as f:
    formatted_text = json.load(f)

# Initialize the model for generating embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Extract the content for each page
texts = [page_data["content"] for page_data in formatted_text]

# Generate embeddings for each page's content
embeddings = model.encode(texts, convert_to_numpy=True)

# Check the shape of embeddings (Should be (num_pages, embedding_dim))
print(embeddings.shape)


(83, 384)


In [204]:
# Dimensionality of the embeddings (this is 384 for the 'all-MiniLM-L6-v2' model)
dim = embeddings.shape[1]

# Create a FAISS index using L2 distance (Euclidean distance)
index = faiss.IndexFlatL2(dim)

# Add the embeddings to the FAISS index
index.add(embeddings)

# Check the number of items in the index
print(f"Number of items in the index: {index.ntotal}")


Number of items in the index: 83


In [205]:
# Print the structure of formatted_text to understand its format
print(formatted_text)


[{'page_num': 0, 'content': 'Dr B R AMBEDKAR NATIONAL INSTITUTE OF TECHNOLOGY\nG T Road By Pass, Jalandhar-144008, Punjab (India)\nEPABX-0181-2690301-453 Email drc@nitj.ac.in', 'metadata': {'page': 0}}, {'page_num': 1, 'content': 'Ref. e-Tender Notice - NITJ/DRC/PUR/TT/36/2024 Please find enclosed herewith the following: Page No. 1. e-Tender Notice - NITJ/DRC/PUR/TT/36/2024 2 & Schedule for Opening of the e-Tender 2. Annexure-I (Detail of Equipment, Tender Fee & EMD) 3 3. Important Note 4 4.', 'metadata': {'page': 1}}, {'page_num': 1, 'content': 'Instruction to Tenderers 5-7 5. Tender Evaluation 8-9 6. Terms & Conditions 10-11 7. Acceptance of terms and conditions 12 8. Questionnaires A & B 13-14 9.', 'metadata': {'page': 1}}, {'page_num': 1, 'content': 'Performa for performance statement 15 10. Annexure ‘A’ (Specifications of the Equipment) 16 11. Annexure ‘B’ & ‘C’ (Format for Performance Bond/ 17-18 Guarantee & Format For Performance Bond (Bank Guarantee) 12. Annexure – ‘D’ (Format 

In [206]:
# Example search function
def search(query, top_k=5):
    # Generate the embedding for the query
    query_embedding = model.encode([query], convert_to_numpy=True)

    # Perform the search for the top_k most similar documents
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve the corresponding texts for the top_k results
    results = [formatted_text[idx]["content"] for idx in indices[0]]
    return results, distances[0]

# Example query
query = "emd for Fabrication of Machine for Continuous Production of Textile Waste Based Composite Materials"
similar_texts, distances = search(query, top_k=5)

# Display the results
for i, (text, dist) in enumerate(zip(similar_texts, distances)):
    print(f"Result {i+1}:")
    print(f"Text: {text}")
    print(f"Distance: {dist}\n")


Result 1:
Text: Fabrication of Machine for Continuous 01 Rs. 500/- Rs.90,000/- Production of Textile Waste Based Composite Materials Note: The quantity of required equipment/item may vary as per requirement. *Exemption of Tender fee & EMD will only be given to MSME/NSIC registered bidders. The bidder need to fill the Annexure-H also duly stamped and signed failing which, the bid will not be considered valid for technical evaluation. 3 Table Data: [['Sr.
Distance: 0.7218655943870544

Result 2:
Text: EMD in the form of a Demand Draft in favour of the Security – A/c, Dr B R Ambedkar NIT, payable at Jalandhar (refundable separate) and Tender Fee in the form of a Demand Draft in favour of the Director, Dr B R Ambedkar NIT, payable at Jalandhar (Non- refundable separate) should also be submitted in physical form to the following address as per scheduled time given for physical submission of EMD and Tender fee. The Envelope should be super- scribed as EMD and Tender Fee for Fabrication of Mac

In [207]:

# Set up API key for Google Generative AI
genai.configure(api_key="AIzaSyA-0rPJIY5Y4tmIlaoG5DgmjjuaxBcFHIA")

# Step 1: Initialize the SentenceTransformer model for query embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
# Print the generated response
print(answer.text)

import google.generativeai as genai
from sentence_transformers import SentenceTransformer, util

# Set up API key for Google Generative AI
genai.configure(api_key="AIzaSyA-0rPJIY5Y4tmIlaoG5DgmjjuaxBcFHIA")

# Step 1: Initialize the SentenceTransformer model for query embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Placeholder for similar_texts - assume this is generated by the `search` function
similar_texts, distances = search(query, top_k=10)  # Replace `search` function with your actual function

# Step 2: Combine all similar texts into one context
combined_context = " ".join(similar_texts)
print(combined_context)

# Define the comprehensive query to extract information on all categories of tender information
query = ("Please provide a detailed overview of the tender, including basic information such as reference numbers, "
         "titles, institution, timeline-related details (submission dates, deadlines), financial requirements (fees, "
         "payment terms), eligibility criteria (registrations, certifications), technical specifications, and contact "
         "information.")

# Step 4: Generate response and save to JSON
def get_gemini_response(question, context):
    # Create a model instance with 'gemini'
    model = genai.GenerativeModel('models/gemini-1.0-pro-latest')

    # Formulate the prompt to guide the model's response
    prompt = (f"Please summarize all the important information regarding the tender based on the following context. "
              f"Include details such as reference number, title, submission deadline, payment terms, eligibility "
              f"criteria, fees, and any other relevant details to help someone understand the key aspects of the "
              f"tender contract:\n\n{context}\n\nQuestion: {question}")

    # Generate response using the model
    response = model.generate_content([prompt, question])
    return response

# Get the response from Gemini AI
answer = get_gemini_response(query, combined_context)

# Store the generated response in a dictionary
response_data = {
    "question": query,
    "context": combined_context,
    "response": answer.text
}

# Save the response data in a JSON file
with open("tender_info_response1.json", "w") as json_file:
    json.dump(response_data, json_file, indent=4)

# Print confirmation message
print("Response saved to tender_info_response1.json")


**Reference Number:** NITJ/DRC/PUR/TT/36/2024

**Title:** Fabrication of Machine for Continuous Production of Textile Waste Based Composite Materials

**Institution:** National Institute of Technology, Jalandhar (NITJ)

**Timeline:**

* Start Date for e-tender Submission: 16.10.2024 at 03:00 PM
* Last Date for Online Bid Submission: 06.11.2024 at 03:00 PM
* Opening of Technical e-Bid (online): 07.11.2024 at 03:00 PM

**Financial Requirements:**

* Tender Fee: As per Annexure-I of the tender document
* EMD (Earnest Money Deposit): As per Annexure-I of the tender document

**Eligibility Criteria:**

* Registered on the CPP Portal for e-tendering
* Only e-tenders will be accepted
* Class-I and Class-II Local Suppliers are eligible to participate
* Non-Local Suppliers are not eligible
* Self-Declaration for local content must be submitted with the Technical Bid

**Fee and EMD:** As per Annexure-I of the tender document

**Technical Specifications:**

* Detailed technical specifications are