In [2]:
!pip install pymupdf sentence-transformers faiss-cpu openai google-generativeai

import os
import fitz  # PyMuPDF
import re
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from google.colab import files # Import files for upload
import google.generativeai as genai # Import for Gemini API
from google.colab import userdata # Import for Colab secrets

###########################
# 1. PDF TEXT EXTRACTION
###########################


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    If ``pdf_path`` does not exist, the user is prompted to upload a file
    through the Colab upload widget; the first uploaded file is moved to
    ``manual.pdf`` in the current working directory and read from there.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The text of all pages, each page followed by a newline.

    Raises:
        FileNotFoundError: If the file is missing and nothing was uploaded.
    """
    if not os.path.exists(pdf_path):
        print(f"Error: The file '{pdf_path}' was not found.")
        print("Please upload your PDF manual.")
        uploaded = files.upload()
        if not uploaded:
            raise FileNotFoundError(f"No file uploaded. Expected '{pdf_path}'.")
        # Only the first uploaded file is used; any others are ignored.
        uploaded_filename = next(iter(uploaded))
        if uploaded_filename != "manual.pdf":
            # os.replace overwrites a stale manual.pdf on every platform,
            # whereas os.rename fails on Windows if the target exists.
            os.replace(uploaded_filename, "manual.pdf")
            print(f"Uploaded file '{uploaded_filename}' renamed to 'manual.pdf'.")
        pdf_path = "manual.pdf"  # Read from the newly available file

    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() + "\n" for page in doc)


###########################
# 2. CHUNKING
###########################

def chunk_text(text, chunk_size=800, max_paragraph_words=500):
    """Split text into whitespace-normalized chunks, one per paragraph.

    Paragraphs are delimited by runs of two or more newlines. A paragraph
    with more than ``max_paragraph_words`` words is cut into consecutive
    pieces of at most ``chunk_size`` words; any other non-empty paragraph
    becomes a single chunk. Words inside a chunk are joined by one space.

    Args:
        text: The text to split.
        chunk_size: Maximum words per piece when a long paragraph is cut.
        max_paragraph_words: Word count above which a paragraph is cut.

    Returns:
        A list of chunk strings in document order.
    """
    pieces = []
    for paragraph in re.split(r'\n\n+', text):
        tokens = paragraph.split()
        count = len(tokens)
        if count > max_paragraph_words:
            # Oversized paragraph: slice it into fixed-size word windows.
            for start in range(0, count, chunk_size):
                pieces.append(" ".join(tokens[start:start + chunk_size]))
            continue
        if count:
            # Normal paragraph: keep it whole (blank paragraphs are dropped).
            pieces.append(" ".join(tokens))
    return pieces


###########################
# 2a. TEXT CLEANING
###########################

def clean_text(text):
    """Normalize text: collapse whitespace, trim the ends, and lowercase.

    Every run of whitespace (spaces, tabs, newlines) is replaced by a single
    space, so the result is always a single line.

    Args:
        text: The text to normalize.

    Returns:
        The normalized, lowercased string.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    # Lowercasing keeps embedding and retrieval input consistent.
    return single_spaced.strip().lower()


###########################
# 3. EMBEDDINGS + FAISS DB
###########################

def build_faiss_index(chunks):
    """Embed the chunks and load the vectors into a flat L2 FAISS index.

    Args:
        chunks: The text chunks to embed.

    Returns:
        A ``(index, model)`` tuple: the populated ``faiss.IndexFlatL2`` and
        the ``SentenceTransformer`` used to produce the embeddings, so the
        same model can embed queries later.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = encoder.encode(chunks)
    # The index dimensionality is taken from the embedding width.
    l2_index = faiss.IndexFlatL2(vectors.shape[1])
    l2_index.add(np.array(vectors))
    return l2_index, encoder


###########################
# 4. RETRIEVAL FUNCTION
###########################

def retrieve(query, index, model, chunks, k=3):
    """Return up to ``k`` chunks nearest to the query, as ranked by the index.

    Args:
        query: The search string to embed.
        index: An object with a FAISS-style ``search(vectors, k)`` method.
        model: An object with an ``encode(list_of_str)`` method.
        chunks: The chunk list whose positions match the ids in ``index``.
        k: Maximum number of chunks to return.

    Returns:
        The matching chunks in the order the index returned them. The list
        may be shorter than ``k`` when fewer chunks are available, and is
        empty when ``k`` or ``chunks`` is empty or non-positive.
    """
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, which would otherwise index chunks[-1] and return the
    # last chunk as a bogus hit.
    k = min(k, len(chunks))
    if k <= 0:
        return []

    query_vec = model.encode([query])
    _, indices = index.search(np.array(query_vec), k)
    # Still filter ids defensively in case the index holds fewer vectors
    # than `chunks` has entries.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]


###########################
# 5. LLM JSON EXTRACTION
###########################

def extract_structured_data(query, retrieved_text):
    """Ask Gemini to extract vehicle specifications relevant to the query.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable,
    falling back to a Colab secret of the same name. It is deliberately not
    stored in source code.

    Args:
        query: The user's question (embedded verbatim in the prompt).
        retrieved_text: The context text the model should extract from.

    Returns:
        The model's response text with surrounding whitespace stripped. The
        prompt requests a JSON list with the fields ``component``,
        ``spec_type``, ``value`` and ``unit``; the response is not parsed or
        validated here.

    Raises:
        RuntimeError: If no API key can be found in the environment or in
            Colab secrets.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        try:
            api_key = userdata.get("GOOGLE_API_KEY")
        except (userdata.SecretNotFoundError, userdata.NotebookAccessError) as err:
            raise RuntimeError(
                "No Gemini API key available. Set the GOOGLE_API_KEY "
                "environment variable or add a Colab secret named "
                "GOOGLE_API_KEY and grant this notebook access to it."
            ) from err
    if not api_key:
        raise RuntimeError("The GOOGLE_API_KEY secret is empty.")
    genai.configure(api_key=api_key)

    # Use genai.list_models() to check which model names are available
    # for the configured key.
    gemini_model = genai.GenerativeModel('gemini-2.5-flash')

    prompt = f"""
Extract vehicle specifications from the following text based on the user's query.
If the exact component from the query is not explicitly mentioned with a value,
but a relevant value (e.g., torque) is found in close proximity to a general
component (e.g., 'brakes' or 'disc brakes'), infer that general component.

Return ONLY a JSON list with fields:
component, spec_type, value, unit.

If no relevant values are found, return an empty list [].

Query: "{query}"

Text:
{retrieved_text}
"""

    # Temperature 0 keeps the extraction as repeatable as the API allows.
    response = gemini_model.generate_content(
        prompt,
        generation_config={
            "temperature": 0
        }
    )

    return response.text.strip()


###########################
# 6. MAIN EXECUTION
###########################

def run_pipeline(pdf_path, query):
    """Run the full flow: extract, chunk, embed, retrieve, and query the LLM.

    Progress and short previews of each stage are printed along the way.

    Args:
        pdf_path: Path of the PDF manual to read.
        query: The question to answer from the manual.

    Returns:
        The text returned by ``extract_structured_data``, or ``None`` when
        the PDF yielded no chunks.
    """
    print("Extracting text from PDF...")
    raw_text = extract_text_from_pdf(pdf_path)
    print("--- Extracted Text ---")
    print(raw_text[:500] + "...")  # Print first 500 chars for brevity
    print("----------------------")

    print("Cleaning text...")
    print("--- Cleaned Text ---")
    print(clean_text(raw_text)[:500] + "...")  # Print first 500 chars of cleaned text
    print("--------------------")

    print("Chunking...")
    # Chunk the RAW text and clean each chunk afterwards. clean_text collapses
    # every newline into a space, so cleaning first would erase the blank-line
    # paragraph boundaries that chunk_text splits on.
    chunks = [
        cleaned
        for cleaned in (clean_text(piece) for piece in chunk_text(raw_text))
        if cleaned
    ]
    print("--- Chunks (first 3) ---")
    for i, chunk in enumerate(chunks[:3]):
        print(f"Chunk {i+1}: {chunk[:200]}...")  # Print first 200 chars of each chunk
    print("------------------------")

    # Without chunks there is nothing to embed or index.
    if not chunks:
        print("Error: No text could be extracted or chunked from the PDF. Cannot build FAISS index.")
        return None

    print("Building embeddings + FAISS index...")
    index, model = build_faiss_index(chunks)

    print("Retrieving relevant chunks...")
    relevant_chunks = retrieve(query, index, model, chunks, k=3)
    print("--- Retrieved Chunks ---")
    for i, chunk in enumerate(relevant_chunks):
        print(f"Relevant Chunk {i+1}: {chunk[:200]}...")  # Print first 200 chars of each relevant chunk
    print("------------------------")

    # A visible separator lets the model tell the retrieved chunks apart.
    retrieved_text = "\n\n---\n\n".join(relevant_chunks)

    print("\nAsking LLM to extract structure...")
    result = extract_structured_data(query, retrieved_text)

    print("\n================ JSON RESULT ================")
    print(result)
    print("=============================================")

    return result


###########################
# RUN PIPELINE
###########################

if __name__ == "__main__":
    # Default location of the manual; when the file is missing the
    # extraction step prompts for an upload instead.
    pdf_path = "/content/sample-service-manual 1.pdf"
    # The specification to look up in the manual.
    query = "Torque for brake caliper bolts"
    run_pipeline(pdf_path, query)

Extracting text from PDF...
Error: The file '/content/sample-service-manual 1.pdf' was not found.
Please upload your PDF manual.


Saving sample-service-manual 1.pdf to sample-service-manual 1.pdf
Uploaded file 'sample-service-manual 1.pdf' renamed to 'manual.pdf'.
--- Extracted Text ---
Suspension System 
Inspection and Verification 
1.
Road test. 
z Verify the customer concern by carrying out a road test on a smooth road. If any vibrations are 
apparent, refer to Section 100-04 . 
2.
Inspect tires. 
z Check the tire pressure with all normal loads in the vehicle and the tires cold. Refer to the 
Vehicle Certification (VC) label. 
z Verify that all tires are sized to specification. Refer to the VC label. 
z Inspect the tires for incorrect wear and damage. Install new tires as ne...
----------------------
Cleaning text...
--- Cleaned Text ---
suspension system inspection and verification 1. road test. z verify the customer concern by carrying out a road test on a smooth road. if any vibrations are apparent, refer to section 100-04 . 2. inspect tires. z check the tire pressure with all normal loads in the vehicle an