In [1]:
%pip install langchain langchain-community langchain-openai pymupdf faiss-cpu pydantic python-dotenv
%pip install langchain-ollama
%pip install langchain-groq

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-openai
  Downloading langchain_openai-1.1.1-py3-none-any.whl.metadata (2.6 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metad

In [2]:
%pip install sentence-transformers langchain-huggingface
!pip install ipywidgets

Collecting langchain-huggingface
  Downloading langchain_huggingface-1.1.0-py3-none-any.whl.metadata (2.8 kB)
Downloading langchain_huggingface-1.1.0-py3-none-any.whl (29 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-1.1.0
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.6/1.6 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [3]:
# cell -2 Imports and API Setup
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# NOTICE: No OpenAI imports here anymore!
from langchain_ollama import ChatOllama # New free LLM
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List, Optional
# # Securely enter your API Key if not already set in environment
# if not os.environ.get("OPENAI_API_KEY"):
#     os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")

# Configuration
PDF_PATH = "sample-service-manual.pdf"

In [4]:
# Cell 3: Load and Inspect PDF Text

# Sanity check of the raw text PyMuPDF extracts, before any chunking happens.
# If tables already look garbled here, the LLM will not be able to read them either.

print(f"Loading PDF: {PDF_PATH}...")
loader = PyMuPDFLoader(PDF_PATH)
documents = loader.load()

# Which entry of `documents` to preview, and how many characters of it to show.
PREVIEW_INDEX = 1
PREVIEW_CHARS = 1000

# Use the preview to check whether table rows are read line-by-line or
# column-by-column. The label is derived from the index actually read, so the
# heading can no longer disagree with the content shown below it.
if PREVIEW_INDEX < len(documents):
    print(f"--- Preview of document {PREVIEW_INDEX + 1} of {len(documents)} ---")
    print(documents[PREVIEW_INDEX].page_content[:PREVIEW_CHARS])
else:
    # Short PDFs would otherwise fail with an IndexError.
    print(f"Only {len(documents)} document(s) loaded; nothing to preview at index {PREVIEW_INDEX}.")

Loading PDF: sample-service-manual.pdf...
--- Preview of Page 24 ---
Symptom Chart ‚Äî Suspension System 
Condition 
Possible Sources 
Action 
z Incorrect thrust 
angle (dogtracking) 
z Rear 
suspension 
components 
z INSPECT the rear suspension 
system. CHECK the rear alignment 
for the correct thrust angle. 
REPAIR or INSTALL new 
suspension components as 
necessary. REFER to Section 204-
02 . 
z Vehicle drifts/pulls 
z Unevenly loaded 
or overloaded 
vehicle 
z Tires/tire 
pressure 
z Alignment is not 
within 
specification 
z Brake drag 
z Steering 
components 
z GO to Pinpoint Test A . 
z Front bottoming or 
riding low 
z Worn, damaged 
or incorrect 
springs 
z MEASURE the ride height. REFER 
to Ride Height Measurement in this 
section. INSTALL new springs as 
necessary. Refer to the appropriate 
section in Group 204 for the 
procedure. 
z Worn front 
shock absorbers 
z INSTALL new shock absorbers as 
necessary. Refer to the appropriate 
section in Group 204 for the 
procedure. 
z

In [6]:
# cell -4 Chunking Strategy
# Splitter settings gathered in one place so they are easy to tune.
splitter_options = {
    "chunk_size": 2000,       # large enough to hold a small table
    "chunk_overlap": 300,     # keeps context from being cut at chunk edges
    "separators": ["\n\n", "\n", " ", ""],
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split every loaded document into overlapping chunks for indexing.
chunks = text_splitter.split_documents(documents)
chunk_total = len(chunks)
print(f"Document split into {chunk_total} chunks.")

Document split into 966 chunks.


In [7]:
# Cell 5: Vector Store Creation (Indexing)
# --- UPDATED CELL 5: Vector Store with Sentence Transformers ---
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

print("Loading local embedding model (this may take a minute)...")

# Sentence-similarity model that runs locally; "all-MiniLM-L6-v2" is a small
# download (the captured run shows roughly 90 MB of weights).
embedding_model_id = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_id)

print("Creating vector store...")
# Embed every chunk with the local model and index the vectors in FAISS.
vector_store = FAISS.from_documents(chunks, embeddings)
print("Vector store created successfully using Sentence Transformers!")


Loading local embedding model (this may take a minute)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating vector store...
Vector store created successfully using Sentence Transformers!


In [8]:
# Cell 6: Test Retrieval (Debugging Step)
# Smoke test: fetch the ten nearest chunks for a sample query and show the best one.
test_query = "Torque specifications for suspension"
results = vector_store.similarity_search(test_query, k=10)
best_match = results[0]

print(f"--- Top Retrieval Result for '{test_query}' ---")
print(best_match.page_content)

--- Top Retrieval Result for 'Torque specifications for suspension' ---
Torque Specifications 
a Refer to the procedure in this section 
SECTION 204-02: Rear Suspension 
2014 F-150 Workshop Manual 
SPECIFICATIONS 
Procedure revision date: 10/25/2013 
Description 
Nm lb-ft lb-in 
Shock absorber nuts 
90 
66 
‚Äî
Shock absorber shield bolts (SVT Raptor) 
4 
‚Äî
35 
Spring shackle-to-frame nut 
185 136 
‚Äî
Spring-to-frame nut 
350 258 
‚Äî
Spring-to-shackle nut 
185 136 
‚Äî
Spring U-bolt nuts a 
‚Äî
‚Äî
‚Äî
Jounce bumper-to-frame bolt 
35 
26 
‚Äî
Page 1 sur 1
2014 F-150 Workshop Manual
2014-03-01
file:///C:/TSO/tsocache/VDTOM2_10764/SE2~us~en~file=SE242001.HTM~gen~ref.HT...


In [10]:
# Cell 7: Define Output Structure

class VehicleSpec(BaseModel):
    """Information about a specific vehicle specification."""
    # Field names mirror the keys of each "specs" entry requested in the
    # extraction prompt (component / spec_type / value / unit).
    # NOTE(review): no visible cell validates the LLM output against this
    # model — confirm whether that is intended.

    # Required: name of the part the specification applies to.
    component: str = Field(..., description="The specific part or component name (e.g., 'Brake Caliper Bolt').")
    # Required: category of the specification.
    spec_type: str = Field(..., description="The type of specification (e.g., 'Torque', 'Capacity', 'Clearance').")
    # Required: kept as a string, not a numeric type.
    value: str = Field(..., description="The numerical value of the specification.")
    # Optional: defaults to None when no unit is supplied.
    unit: Optional[str] = Field(None, description="The unit of measurement (e.g., 'Nm', 'lb-ft', 'L').")

class SpecList(BaseModel):
    """A list of extracted vehicle specifications."""
    # Top-level wrapper: a single required "specs" field holding VehicleSpec items.
    specs: List[VehicleSpec]

In [11]:
# --- CELL 8: Main Extraction Loop (Robust Groq Version) ---
import json
import time
import re
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate

# ==========================================
# 1. SETUP CLOUD LLM (Groq Llama 3.3)
# ==========================================

# SECURITY: never embed the Groq key in the notebook. The key that used to be
# hard-coded in this cell must be treated as leaked and revoked/rotated.
# The key is read from the GROQ_API_KEY environment variable; if that is not
# set, it is requested interactively (getpass keeps it out of the cell output).
import os
from getpass import getpass

GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "").strip()
if not GROQ_API_KEY:
    GROQ_API_KEY = getpass("Enter Groq API key: ").strip()

# Fail early with a clear message instead of letting every request fail later.
if not GROQ_API_KEY or GROQ_API_KEY.startswith("PASTE"):
    raise ValueError(
        "GROQ_API_KEY is not set. Export it in the environment or enter it when prompted."
    )

# temperature=0 is used for the extraction calls.
llm = ChatGroq(
    temperature=0,
    model_name="llama-3.3-70b-versatile",
    api_key=GROQ_API_KEY
)

# ==========================================
# 2. HELPER: JSON Extractor
# ==========================================
def extract_json_from_text(text):
    """
    Find the largest JSON object {} embedded in a block of model output.

    Markdown code fences are stripped first. The text is then scanned for
    top-level JSON objects with ``json.JSONDecoder.raw_decode``, so
    conversational filler before or after the payload (even filler that itself
    contains braces) does not break parsing. When several objects are present,
    the one spanning the most characters is returned.

    Args:
        text: Raw model response. Anything that is not a ``str`` yields None.

    Returns:
        The decoded JSON object (a dict). If the text contains no decodable
        object, the result of parsing the whole text as JSON (which may be a
        list or scalar), or None when nothing can be parsed. Never raises for
        malformed input.
    """
    if not isinstance(text, str):
        return None

    # 1. Remove markdown code fences if present
    cleaned = text.replace("```json", "").replace("```", "")

    # 2. Try to decode an object at each '{'; keep the longest one that parses
    decoder = json.JSONDecoder()
    best_obj, best_span = None, -1
    pos = cleaned.find("{")
    while pos != -1:
        try:
            obj, end = decoder.raw_decode(cleaned, pos)
        except (ValueError, RecursionError):
            # Not a valid object start (e.g. "{Nm}" in prose): try the next brace.
            pos = cleaned.find("{", pos + 1)
            continue
        if end - pos > best_span:
            best_obj, best_span = obj, end - pos
        # Skip past the decoded object so nested objects are not re-scanned.
        pos = cleaned.find("{", end)

    if best_obj is not None:
        return best_obj

    # 3. No object found: fall back to parsing the whole text
    try:
        return json.loads(cleaned)
    except (ValueError, RecursionError):
        return None  # Failed to find/parse JSON

# ==========================================
# 3. MASTER PROMPT
# ==========================================
# Template later passed to ChatPromptTemplate.from_template with two variables:
# {context} (the retrieved chunk text) and {question} (the query).
# Literal JSON braces are doubled ({{ }}) so they are not read as placeholders.
prompt_template = """
You are a highly accurate technical data extractor.
Your task is to analyze the provided text and extract ALL specifications related to the user's query.

CRITICAL INSTRUCTIONS:
1. The text may contain long tables (20+ rows). You MUST extract EVERY row.
2. **UNIT PATTERN RULE**:
   - Service manuals typically list values in this order: **Nm** (Metric) -> **lb-ft** (Imperial) -> **lb-in** (Small Imperial).
   - If you see "12 ‚Äî 106", the first number (12) is **Nm**, the dash means skip **lb-ft**, and 106 is **lb-in**.
3. **SEPARATION RULE**:
   - "value": MUST contain ONLY the number (e.g., "17"). NO text.
   - "unit": MUST contain ONLY the unit code (e.g., "Nm").

Output Format:
{{
    "specs": [
        {{ "component": "part name", "spec_type": "Torque", "value": "17", "unit": "Nm" }}
    ]
}}

If no relevant data is found, return: {{ "specs": [] }}

Context:
{context}

Query: {question}
"""

# Queries processed by the batch extraction loop; each one triggers a separate
# retrieval and a separate LLM call.
queries = [
    "Torque specifications for front suspension",
    "Torque specifications for braking system",
    "Fluid capacities"
]

# Accumulates the "specs" entries returned for every query.
all_extracted_data = []

print("üöÄ Starting Batch Extraction Job (Groq Cloud)...")

# The prompt -> LLM chain does not depend on the query, so build it once
# instead of rebuilding it on every iteration.
prompt = ChatPromptTemplate.from_template(prompt_template)
chain = prompt | llm

for query in queries:
    print(f"   Processing: {query}...")
    start_ts = time.time()

    # A. Retrieve the three most similar chunks and join them into one context
    docs = vector_store.similarity_search(query, k=3)
    context_text = "\n\n".join(d.page_content for d in docs)

    # B. Generate. Only the model call sits inside the try block, so parsing
    #    problems are never reported as network/API failures.
    try:
        response = chain.invoke({"context": context_text, "question": query})
    except Exception as e:
        print(f"   ‚ùå Network/API Error: {e}")
        continue

    # C. Parse (Using Robust Helper)
    data = extract_json_from_text(response.content)

    # The helper can return None or non-dict JSON (e.g. a bare list); both are
    # unusable here and previously surfaced as a misleading AttributeError.
    if not isinstance(data, dict):
        print("   ‚ùå Error: Could not parse JSON from model response.")
        # Optional: print(response.content[:100]) # Uncomment to debug
        continue

    # Only accept a real list under "specs"; anything else counts as "no items"
    # rather than being spliced into the results element by element.
    items = data.get("specs")
    if isinstance(items, list) and items:
        all_extracted_data.extend(items)
        print(f"   ‚úÖ Found {len(items)} items in {time.time()-start_ts:.2f}s.")
    else:
        print("   ‚ö†Ô∏è Valid JSON, but no items found.")

# 4. Save to File
output_file = "vehicle_specs.json"
with open(output_file, "w") as f:
    json.dump(all_extracted_data, f, indent=4)

print(f"\nüéâ DONE! Saved {len(all_extracted_data)} total specs to '{output_file}'")

üöÄ Starting Batch Extraction Job (Groq Cloud)...
   Processing: Torque specifications for front suspension...
   ‚úÖ Found 30 items in 1.74s.
   Processing: Torque specifications for braking system...
   ‚úÖ Found 43 items in 2.58s.
   Processing: Fluid capacities...
   ‚úÖ Found 2 items in 0.38s.

üéâ DONE! Saved 75 total specs to 'vehicle_specs.json'


In [12]:
# Cell 9: Save and View Results

import json

# Save to JSON file
# Write every extracted spec to disk as pretty-printed JSON.
output_file = "vehicle_specs.json"
with open(output_file, "w") as f:
    f.write(json.dumps(all_extracted_data, indent=4))

print(f"Saved data to {output_file}")

# Show only the first five records as a quick visual check.
preview_records = all_extracted_data[:5]
print(json.dumps(preview_records, indent=2))

Saved data to vehicle_specs.json
[
  {
    "component": "Brake disc shield bolts",
    "spec_type": "Torque",
    "value": "17",
    "unit": "Nm"
  },
  {
    "component": "Brake disc shield bolts",
    "spec_type": "Torque",
    "value": "150",
    "unit": "lb-in"
  },
  {
    "component": "Brake hose bracket bolt",
    "spec_type": "Torque",
    "value": "12",
    "unit": "Nm"
  },
  {
    "component": "Brake hose bracket bolt",
    "spec_type": "Torque",
    "value": "106",
    "unit": "lb-in"
  },
  {
    "component": "Lower arm forward and rearward nuts",
    "spec_type": "Torque",
    "value": "350",
    "unit": "Nm"
  }
]


In [13]:

# Persist the FAISS index to a local folder.
index_dir = "faiss_db_index"
vector_store.save_local(index_dir)
print("‚úÖ Index saved to folder 'faiss_db_index'")

‚úÖ Index saved to folder 'faiss_db_index'


In [15]:
%pip install langchain-groq



In [22]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import json
import time
import re

# -----------------------------
# Configuration
# -----------------------------
# NOTE(review): this rebinds the notebook-level GROQ_API_KEY (also assigned in
# the extraction cell) to a dummy string. Nothing in the visible part of this
# cell reads it — confirm whether the assignment can be dropped.
GROQ_API_KEY = "placeholder_key"

# -----------------------------
# CSS: High Contrast Dark Mode
# -----------------------------
# Stylesheet for the widget UI. It is interpolated into the header HTML widget,
# so it is emitted once when the header renders. The .action-btn / .utility-btn
# rules apply to the buttons that receive those classes via add_class(); the
# !important overrides target the widget-generated dropdown and text inputs.
STYLE = """
<style>
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@500;700&family=Inter:wght@400;600&display=swap');

:root {
    --bg-app: #0f172a;       /* Dark Slate Background */
    --bg-card: #1e293b;      /* Lighter Slate Card */
    --text-main: #f1f5f9;    /* White text */
    --text-sub: #94a3b8;     /* Gray text */
    --accent: #06b6d4;       /* Cyan Neon */
    --border: #334155;       /* Border Color */

    /* Button Colors */
    --btn-bg: #334155;
    --btn-text: #ffffff;
}

/* Container */
.cyber-container {
    background-color: var(--bg-app);
    color: var(--text-main);
    padding: 20px;
    border-radius: 8px;
    border: 1px solid var(--border);
    font-family: 'Inter', sans-serif;
    max-width: 1000px;
}

/* Header */
.cyber-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    border-bottom: 1px solid var(--border);
    padding-bottom: 15px;
    margin-bottom: 20px;
}
.cyber-title {
    font-family: 'JetBrains Mono', monospace;
    font-size: 20px;
    color: var(--accent);
    font-weight: 700;
}

/* INPUT OVERRIDES - Force visibility */
.widget-dropdown > select,
.widget-text > input {
    background-color: #020617 !important;
    color: #fff !important;
    border: 1px solid var(--border) !important;
    height: 40px !important;
    border-radius: 6px !important;
    font-family: 'Inter', sans-serif !important;
}

/* BUTTON OVERRIDES - FORCE VISIBILITY */
/* This targets the specific utility buttons to ensure text is white */
.utility-btn {
    background-color: var(--btn-bg) !important;
    color: var(--btn-text) !important;
    border: 1px solid #475569 !important;
    font-weight: 600 !important;
    border-radius: 6px !important;
    font-family: 'Inter', sans-serif !important;
    font-size: 11px !important; /* Slightly smaller text for utilities */
}
.utility-btn:hover {
    background-color: #475569 !important;
    border-color: var(--text-main) !important;
}

/* Main Action Button */
.action-btn {
    font-weight: 700 !important;
    font-size: 13px !important;
}

/* Results Table */
.results-box {
    background: var(--bg-card);
    border: 1px solid var(--border);
    border-radius: 6px;
    min-height: 150px;
    overflow: hidden;
}
.results-head {
    background: rgba(255,255,255,0.05);
    padding: 10px 15px;
    font-size: 12px;
    color: var(--text-sub);
    border-bottom: 1px solid var(--border);
    text-transform: uppercase;
    letter-spacing: 1px;
}
.tech-table { width: 100%; border-collapse: collapse; font-size: 14px; }
.tech-table th { text-align: left; padding: 12px 15px; color: var(--text-sub); border-bottom: 1px solid var(--border); }
.tech-table td { padding: 12px 15px; color: var(--text-main); border-bottom: 1px solid #334155; }
.tech-val { color: var(--accent); font-family: 'JetBrains Mono', monospace; }

</style>
"""

# -----------------------------
# Components
# -----------------------------

# 1. Header
# Header widget: carries the stylesheet (STYLE) plus the title bar markup.
# NOTE(review): the <div class='cyber-container'> opened here is only closed by
# the separate `footer` widget. Each widgets.HTML presumably renders its value
# in its own element, so this wrapper may not actually enclose the controls and
# results placed between them — confirm in the rendered output.
header = widgets.HTML(f"""
{STYLE}
<div class='cyber-container'>
    <div class='cyber-header'>
        <div class='cyber-title'>‚ö° VEHICLE SPECIFICATION AI <span style='font-size:12px; opacity:0.7; color:#fff'>v3.2</span></div>
        <div style='font-size:12px; color:#94a3b8'>SYSTEM ONLINE</div>
    </div>
""")

# Supplies the closing tag for the container opened in `header`.
footer = widgets.HTML("</div>") # Closes container

# 2. Controls
# Preset queries as (label shown to the user, query text) pairs; the empty
# value is the "nothing selected" entry.
preset_choices = [
    ("--- Select Preset ---", ""),
    ("Torque: Front Suspension", "Torque specs for front suspension"),
    ("Torque: Brake System", "Torque specs for brakes"),
    ("Fluids: Engine Oil", "Engine oil capacity and type"),
]

dropdown = widgets.Dropdown(
    options=preset_choices,
    value="",
    layout=widgets.Layout(width='100%'),
)

# Free-text query box; it is also filled in when a preset is chosen.
query_in = widgets.Text(
    placeholder="Enter custom query...",
    layout=widgets.Layout(width='100%'),
)

# Buttons - NOW WITH TEXT DESCRIPTIONS AND CUSTOM CLASSES
def _styled_button(label, icon, width, css_class, **button_kwargs):
    """Create a 40px-high button with the given text, icon, width and CSS class."""
    button = widgets.Button(
        description=label,
        icon=icon,
        layout=widgets.Layout(width=width, height='40px'),
        **button_kwargs,
    )
    button.add_class(css_class)
    return button


# Main action button (the 'info' style gives it the accent colour).
search_btn = _styled_button('SCAN DOCS', 'bolt', '120px', 'action-btn', button_style='info')

# Utility buttons share the 'utility-btn' class, which forces readable colours.
copy_btn = _styled_button('COPY JSON', 'copy', '110px', 'utility-btn')
clear_btn = _styled_button('CLEAR', 'trash', '90px', 'utility-btn')

# Layout Wrapper
def _section_label(caption):
    """Return the small grey caption shown above a control group."""
    return widgets.HTML(
        f"<div style='color:#94a3b8; font-size:11px; margin-bottom:5px; font-weight:600'>{caption}</div>"
    )


# Captions for the three control groups
l1 = _section_label("QUERY PRESET")
l2 = _section_label("CUSTOM INPUT")
l3 = _section_label("ACTIONS")

# Each group is a caption stacked on top of its control(s)
box1 = widgets.VBox(
    [l1, dropdown],
    layout=widgets.Layout(flex='3', min_width='200px'),
)
box2 = widgets.VBox(
    [l2, query_in],
    layout=widgets.Layout(flex='4', min_width='250px'),
)
btn_row = widgets.HBox(
    [search_btn, copy_btn, clear_btn],
    layout=widgets.Layout(gap='8px'),
)
box3 = widgets.VBox(
    [l3, btn_row],
    layout=widgets.Layout(flex='0 0 auto'),
)

# One row holding all three groups, aligned along their bottom edge
controls_layout = widgets.Layout(
    width='100%',
    align_items='flex-end',
    gap='15px',
    margin='0 0 20px 0',
)
controls = widgets.HBox([box1, box2, box3], layout=controls_layout)

# 3. Output
# One-line status message shown above the results console.
status = widgets.HTML("<div style='color:#64748b; margin-bottom:10px; font-size:13px'>Waiting for input...</div>")
# Results console, initially showing a usage hint.
# NOTE(review): `results` is also the name of the list produced in the
# retrieval test cell; running this cell rebinds it to an HTML widget.
results = widgets.HTML("""
<div class='results-box'>
    <div class='results-head'>Results Console</div>
    <div style='padding:40px; text-align:center; color:#475569'>
        Select a preset or type a query to begin.
    </div>
</div>
""")
# Holds the last result as JSON text; created hidden (display='none') and
# revealed by the copy handler.
raw_json = widgets.Textarea(layout=widgets.Layout(display='none'))

# -----------------------------
# Logic
# -----------------------------
def _render_spec_rows(specs):
    """Build the <tr> markup for a list of spec dicts, HTML-escaping every cell."""
    from html import escape

    def cell(spec, key):
        # A missing or None field renders as an empty cell instead of raising.
        raw = spec.get(key)
        return escape("" if raw is None else str(raw))

    return "".join(
        f"<tr><td>{cell(s, 'component')}</td><td>{cell(s, 'spec_type')}</td>"
        f"<td class='tech-val'>{cell(s, 'value')}</td><td>{cell(s, 'unit')}</td></tr>"
        for s in specs
    )


def on_click_search(b):
    """Handle a click on the SCAN DOCS button.

    Requires a custom query or a selected preset; otherwise shows an
    "input required" status and returns. While running, the button is disabled
    and relabelled, and it is always restored afterwards, even on error.

    NOTE(review): the handler still renders fixed mock data after a short
    sleep. The entered query is only checked for presence and is not sent to
    the vector store or the LLM.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    if not query_in.value and not dropdown.value:
        status.value = "<span style='color:#ef4444'>‚ö† Input required</span>"
        return

    search_btn.disabled = True
    search_btn.description = "SCANNING..."
    results.value = "<div class='results-box'><div style='padding:20px; color:#94a3b8'>Processing...</div></div>"

    try:
        time.sleep(0.5) # Sim

        # Mock result
        mock_data = {
            "specs": [
                {"component": "Front Hub", "spec_type": "Torque", "value": "120", "unit": "Nm"},
                {"component": "Caliper Bolt", "spec_type": "Torque", "value": "90", "unit": "Nm"}
            ]
        }
        raw_json.value = json.dumps(mock_data, indent=2)

        # Escaped rows keep the table well-formed whatever text the specs hold.
        rows = _render_spec_rows(mock_data['specs'])

        table = f"<table class='tech-table'><thead><tr><th>Component</th><th>Type</th><th>Value</th><th>Unit</th></tr></thead><tbody>{rows}</tbody></table>"

        results.value = f"<div class='results-box'><div class='results-head' style='color:#06b6d4'>‚úì Extraction Successful</div>{table}</div>"
        status.value = "<span style='color:#06b6d4'>Done.</span>"
    finally:
        # Without this, any exception above would leave the button stuck in
        # its disabled "SCANNING..." state.
        search_btn.disabled = False
        search_btn.description = "SCAN DOCS"

def on_change_drop(change):
    """Copy a newly selected preset into the custom query box.

    Selecting the empty placeholder entry leaves the query box untouched.
    """
    selected = change['new']
    if not selected:
        return
    query_in.value = selected

def on_clear(b):
    """Reset the inputs, hide the raw-JSON box and restore the idle console and status."""
    cleared_console = "<div class='results-box'><div class='results-head'>Results Console</div><div style='padding:40px; text-align:center; color:#475569'>Cleared.</div></div>"
    ready_status = "<div style='color:#64748b'>Ready.</div>"

    # Empty the query box, the preset selection and the stored JSON, in that order.
    for field in (query_in, dropdown, raw_json):
        field.value = ""

    raw_json.layout.display = 'none'
    results.value = cleared_console
    status.value = ready_status

def on_copy(b):
    """Reveal the raw-JSON text area so the user can select and copy its contents."""
    # Nothing has been stored yet: tell the user and leave the box hidden.
    if not raw_json.value:
        status.value = "<span style='color:#eab308'>No data to copy yet.</span>"
        return

    raw_json.layout.display = 'block'
    status.value = "<span style='color:#22c55e'>JSON revealed below. Select all and Copy.</span>"

# Bind
# Attach click handlers to the three buttons
click_handlers = (
    (search_btn, on_click_search),
    (clear_btn, on_clear),
    (copy_btn, on_copy),
)
for button, handler in click_handlers:
    button.on_click(handler)

# Mirror the chosen preset into the text box
dropdown.observe(on_change_drop, names='value')

# Assemble the page top to bottom, clear earlier cell output, then render
page_sections = [header, controls, status, results, raw_json, footer]
ui = widgets.VBox(page_sections)
clear_output()
display(ui)

VBox(children=(HTML(value="\n\n<style>\n@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:w‚Ä¶