### Install and import requirements

In [None]:
!pip install -q pymupdf sentence-transformers faiss-cpu langgraph langchain-core pydantic groq google-generativeai streamlit

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import re
import json
import fitz  # PyMuPDF
import faiss
import numpy as np
from typing import List, TypedDict
from sentence_transformers import SentenceTransformer
from pydantic import BaseModel, Field
from langgraph.graph import StateGraph, END
from langchain_core.messages import HumanMessage


### Set up the Groq API key

In [None]:
from groq import Groq

# Prefer the GROQ_API_KEY environment variable so the real key never has to be
# typed into (and saved with) the notebook; the placeholder is only a fallback.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "put your Groq api key here")
client = Groq(api_key=GROQ_API_KEY)
# NOTE: the Gemini setup cell assigns MODEL_NAME as well, so run the setup cell
# of only one provider per session.
MODEL_NAME = "llama3-70b-8192"

### Set up the Gemini API key

In [None]:
import google.generativeai as genai

# Prefer the GEMINI_API_KEY environment variable so the real key never has to be
# typed into (and saved with) the notebook; the placeholder is only a fallback.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "Put your gemini key here")
genai.configure(api_key=GEMINI_API_KEY)
# NOTE: the Groq setup cell assigns MODEL_NAME as well, so run the setup cell
# of only one provider per session.
MODEL_NAME = "gemini-2.5-flash"

In [None]:
# Schema for a single extracted vehicle specification.
# Every field is a required string (a bare annotation declares a required field).
class SpecItem(BaseModel):
    component: str   # name of the vehicle component or part
    spec_type: str   # type of specification, such as torque
    value: str       # numeric value of the specification
    unit: str        # unit corresponding to that value


### Extract text from the PDF

In [None]:
def extract_pdf_text(pdf_path: str) -> List[dict]:
    """Read a PDF and return the plain text of each page.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One dict per page, in document order, shaped as
        ``{"page": <1-based page number>, "text": <text of that page>}``.
    """
    # The context manager closes the document even when extraction raises,
    # so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        return [
            {"page": page_num, "text": page.get_text("text")}
            for page_num, page in enumerate(doc, start=1)
        ]

### chunking

In [None]:
def chunk_text(pages, chunk_size=500, overlap=100):
    """Split the text of every page into overlapping, word-based chunks.

    Chunks never span two pages. Consecutive chunks of one page share
    ``overlap`` words so context is preserved across chunk boundaries.

    Args:
        pages: Iterable of dicts with a ``"page"`` entry and a ``"text"`` string.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of ``{"page": ..., "text": ...}`` dicts, one per chunk. Pages whose
        text contains no words contribute no chunks.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check the window would not advance (zero step) or would
            silently skip words (negative overlap).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []  # every chunk of every page, in reading order
    for page in pages:
        words = page["text"].split()
        for start in range(0, len(words), step):
            chunks.append({
                "page": page["page"],
                "text": " ".join(words[start:start + chunk_size]),
            })
            # This window already reached the end of the page; a further
            # window would only repeat words contained in this chunk.
            if start + chunk_size >= len(words):
                break
    return chunks


### Convert text to embeddings

In [None]:
# Sentence-embedding model shared by embed_chunks and retrieve; created once here.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
def embed_chunks(chunks):
    """Encode the text of every chunk with the shared embedding model.

    Args:
        chunks: Sequence of dicts that each carry a ``"text"`` entry.

    Returns:
        The encoder output as a float32 numpy array, one entry per chunk.
    """
    chunk_texts = [chunk["text"] for chunk in chunks]
    vectors = np.array(
        embedding_model.encode(chunk_texts, show_progress_bar=True)
    )
    # float32 is the dtype the FAISS index is fed with
    return vectors.astype("float32")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### build vector database

In [None]:
def build_faiss_index(embeddings):
    """Build a FAISS index from the chunk embeddings.

    Args:
        embeddings: 2-D array; the length of its second axis is used as the
            vector dimension of the index.

    Returns:
        A ``faiss.IndexFlatL2`` (Euclidean / L2 distance) to which all rows
        of ``embeddings`` have been added.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index

### Retrieval

In [None]:
def retrieve(query, index, chunks, k=5):
    """Return the chunks whose embeddings are closest to the query.

    Args:
        query: Search text; embedded with the shared ``embedding_model``.
        index: FAISS index to search.
        chunks: Chunk dicts; position ``i`` must correspond to vector ``i``
            of ``index``.
        k: Maximum number of chunks to return.

    Returns:
        At most ``k`` chunks, in the order reported by ``index.search``.
    """
    query_embedding = embedding_model.encode([query]).astype("float32")
    _, indices = index.search(query_embedding, k)
    # FAISS pads the result with -1 when the index holds fewer than k vectors;
    # unfiltered, chunks[-1] would silently return the last chunk as a hit.
    return [chunks[idx] for idx in indices[0] if idx >= 0]

### llm setup and prompting

#### For the Groq LLM (run either this cell or the Gemini cell below — both define `call_llm`)

In [None]:
def call_llm(context, query):
    """Ask the Groq chat model to extract specifications as JSON text.

    Note: this notebook defines ``call_llm`` twice (Groq and Gemini variants);
    whichever cell ran last is the one that gets called.

    Args:
        context: Retrieved manual text that is placed into the prompt.
        query: The user's question, placed into the prompt after the context.

    Returns:
        The content of the first completion choice, stripped of surrounding
        whitespace. The text is not parsed or validated here.
    """
    prompt = f"""
You are an automotive specification extraction system.

Your task is to extract ONLY structured specification data from the provided context.

You MUST return STRICTLY valid JSON.
Do NOT include explanations.
Do NOT include markdown.
Do NOT include ```json fences.

Return a JSON LIST in EXACTLY this format:

[
  {{
    "component": "string",
    "spec_type": "Torque | Fluid Capacity | Part Number | Other",
    "value": "numeric value only",
    "unit": "unit string"
  }}
]

Important Rules:
- If both Nm and lb-ft exist, create TWO separate entries.
- Do NOT group multiple units inside one object.
- "value" must contain only the number.
- "unit" must contain only the unit.
- No additional fields allowed.
- Output must start with [ and end with ].

Context:
{context}

Query:
{query}
"""
    # Single-turn conversation; temperature 0 is requested for the extraction.
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

#### For the Gemini LLM (alternative to the Groq cell above — this definition of `call_llm` replaces it when run)

In [None]:
def call_llm(context, query):
    """Ask the Gemini model to extract specifications as JSON text.

    Note: this notebook defines ``call_llm`` twice (Groq and Gemini variants);
    whichever cell ran last is the one that gets called.

    Args:
        context: Retrieved manual text that is placed into the prompt.
        query: The user's question, placed into the prompt after the context.

    Returns:
        The ``text`` of the model response, stripped of surrounding
        whitespace. The text is not parsed or validated here.
    """
    gemini = genai.GenerativeModel(MODEL_NAME)
    prompt = f"""
Extract vehicle specifications from the context.

Return STRICTLY valid JSON list.
No markdown.
No explanations.

Format:

[
  {{
    "component": "string",
    "spec_type": "Torque | Fluid Capacity | Part Number | Other",
    "value": "numeric value only",
    "unit": "unit string"
  }}
]

Rules:
- Separate entries for each unit.
- No nested dictionaries.
- No extra fields.
- Output JSON only.

Context:
{context}

Query:
{query}
"""
    reply = gemini.generate_content(prompt)
    answer = reply.text
    return answer.strip()


### Define the LangGraph workflow: state, nodes, and edges

In [None]:
# LangGraph needs a state schema; this is the state shared by the graph's nodes.
class GraphState(TypedDict):
    query: str                     # user question, supplied when the graph is invoked
    retrieved_chunks: List[dict]   # chunks written by retrieval_node
    llm_output: str                # raw LLM reply written by llm_node

In [None]:
def retrieval_node(state):
    """Graph node: look up the chunks relevant to the query in the state.

    Reads ``state["query"]`` and searches the notebook-level ``faiss_index``
    over the notebook-level ``chunks``; both must exist before the graph runs.

    Returns:
        A partial state update holding the hits under ``"retrieved_chunks"``.
    """
    user_query = state["query"]
    return {"retrieved_chunks": retrieve(user_query, faiss_index, chunks)}

In [None]:
def llm_node(state):
    """Graph node: send the retrieved chunks and the query to the LLM.

    The texts of ``state["retrieved_chunks"]`` are joined with blank lines to
    form the context passed to ``call_llm``.

    Returns:
        A partial state update holding the raw reply under ``"llm_output"``.
    """
    chunk_texts = (chunk["text"] for chunk in state["retrieved_chunks"])
    context = "\n\n".join(chunk_texts)
    return {"llm_output": call_llm(context, state["query"])}

In [None]:
# Assemble the LangGraph workflow: retrieve -> llm -> END.
workflow = StateGraph(GraphState)
# Register the node functions under the names the edges below refer to.
workflow.add_node("retrieve", retrieval_node)
workflow.add_node("llm", llm_node)
# Start at retrieval, pass its result on to the LLM node, then finish.
workflow.set_entry_point("retrieve")
workflow.add_edge("retrieve", "llm")
workflow.add_edge("llm", END)
# `app` is the compiled graph that is invoked with a query further down.
app = workflow.compile()

In [None]:
# Build the retrieval index: PDF -> page texts -> chunks -> embeddings -> FAISS.
PDF_PATH = "/content/sample-service-manual 1.pdf"   ## add the path of your pdf
pages = extract_pdf_text(PDF_PATH)
chunks = chunk_text(pages)
# A PDF without extractable text (for example scanned images) produces no
# chunks; stop here with a clear message instead of failing later while the
# index is being built from an empty embedding array.
if not chunks:
    raise ValueError(f"No text could be extracted from {PDF_PATH!r}; cannot build the index.")
embeddings = embed_chunks(chunks)
# retrieval_node reads `faiss_index` and `chunks` from the notebook namespace,
# so this cell has to run before the graph is invoked.
faiss_index = build_faiss_index(embeddings)
print("Index built successfully!")

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

Index built successfully!


In [None]:
# Run the compiled graph for one example query and print the raw LLM reply.
query = "Torque for brake caliper bolts"
result = app.invoke({"query": query})
print(result["llm_output"])

[
  {
    "component": "Brake caliper flow bolt (Rear Disc Brake)",
    "spec_type": "Torque",
    "value": "35",
    "unit": "Nm"
  },
  {
    "component": "Brake caliper flow bolt (Rear Disc Brake)",
    "spec_type": "Torque",
    "value": "26",
    "unit": "lb-ft"
  },
  {
    "component": "Brake caliper guide pin bolts (Rear Disc Brake)",
    "spec_type": "Torque",
    "value": "33",
    "unit": "Nm"
  },
  {
    "component": "Brake caliper guide pin bolts (Rear Disc Brake)",
    "spec_type": "Torque",
    "value": "24",
    "unit": "lb-ft"
  },
  {
    "component": "Brake caliper support bracket bolts (Rear Disc Brake)",
    "spec_type": "Torque",
    "value": "150",
    "unit": "Nm"
  },
  {
    "component": "Brake caliper support bracket bolts (Rear Disc Brake)",
    "spec_type": "Torque",
    "value": "111",
    "unit": "lb-ft"
  },
  {
    "component": "Brake caliper anchor plate bolts (Front Disc Brake)",
    "spec_type": "Torque",
    "value": "250",
    "unit": "Nm"
  },
  

In [None]:
def safe_json_parse(text):
    """Parse an LLM reply as JSON, tolerating text around the JSON list.

    The reply is first parsed as-is. If that fails, the span from the first
    ``[`` to the last ``]`` is extracted and parsed instead, which copes with
    markdown fences or prose wrapped around the list.

    Args:
        text: Raw reply of the LLM.

    Returns:
        The decoded JSON value, or ``[]`` when the reply is not a string,
        contains no bracketed span, or the span is not valid JSON
        (for example a truncated or malformed list).
    """
    try:
        return json.loads(text)
    except (json.JSONDecodeError, TypeError):
        # Only decoding problems are handled; anything else (KeyboardInterrupt
        # included) propagates instead of being swallowed.
        pass

    if not isinstance(text, str):
        return []

    match = re.search(r"\[.*\]", text, re.DOTALL)
    if match is None:
        return []
    try:
        return json.loads(match.group())
    except json.JSONDecodeError:
        # The bracketed span itself is malformed; report "nothing extracted".
        return []

# Convert the raw LLM reply of the query above into Python objects and print them.
structured_output = safe_json_parse(result["llm_output"])
print(structured_output)


[{'component': 'Brake caliper flow bolt (Rear Disc Brake)', 'spec_type': 'Torque', 'value': '35', 'unit': 'Nm'}, {'component': 'Brake caliper flow bolt (Rear Disc Brake)', 'spec_type': 'Torque', 'value': '26', 'unit': 'lb-ft'}, {'component': 'Brake caliper guide pin bolts (Rear Disc Brake)', 'spec_type': 'Torque', 'value': '33', 'unit': 'Nm'}, {'component': 'Brake caliper guide pin bolts (Rear Disc Brake)', 'spec_type': 'Torque', 'value': '24', 'unit': 'lb-ft'}, {'component': 'Brake caliper support bracket bolts (Rear Disc Brake)', 'spec_type': 'Torque', 'value': '150', 'unit': 'Nm'}, {'component': 'Brake caliper support bracket bolts (Rear Disc Brake)', 'spec_type': 'Torque', 'value': '111', 'unit': 'lb-ft'}, {'component': 'Brake caliper anchor plate bolts (Front Disc Brake)', 'spec_type': 'Torque', 'value': '250', 'unit': 'Nm'}, {'component': 'Brake caliper anchor plate bolts (Front Disc Brake)', 'spec_type': 'Torque', 'value': '184', 'unit': 'lb-ft'}, {'component': 'Brake caliper fl