In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm

In [2]:
# Target URL
BASE_URL = "https://data.lhc.gov.pk/reported_judgments/judgments_approved_for_reporting"

# Folder to save PDFs
SAVE_DIR = "lhc_pdfs"
os.makedirs(SAVE_DIR, exist_ok=True)

In [3]:

# Get the page
response = requests.get(BASE_URL, timeout=30)
if response.status_code != 200:
    raise Exception(f"Failed to load page: {response.status_code}")

In [4]:

# Parse HTML
soup = BeautifulSoup(response.content, "html.parser")
pdf_links = []

In [5]:

# Find all anchor tags with hrefs ending in .pdf
for link in soup.find_all("a", href=True):
    href = link["href"]
    if href.lower().endswith(".pdf"):
        full_url = urljoin(BASE_URL, href)
        pdf_links.append(full_url)

print(f"Found {len(pdf_links)} PDFs")

Found 50 PDFs


In [6]:




# Download each PDF
for pdf_url in tqdm(pdf_links, desc="Downloading PDFs"):
    filename = pdf_url.split("/")[-1]
    save_path = os.path.join(SAVE_DIR, filename)

    try:
        with requests.get(pdf_url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(save_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
    except Exception as e:
        print(f"❌ Failed to download {pdf_url}: {e}")


Downloading PDFs: 100%|██████████| 50/50 [00:23<00:00,  2.16it/s]


Extraxting text from pdfs


In [7]:
pip install PyMuPDF


Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.1-cp39-abi3-win_amd64.whl (18.5 MB)
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
    --------------------------------------- 0.3/18.5 MB ? eta -:--:--
   - -------------------------------------- 0.5/18.5 MB 2.1 MB/s eta 0:00:09
   -- ------------------------------------- 1.0/18.5 MB 2.4 MB/s eta 0:00:08
   --- ------------------------------------ 1.6/18.5 MB 2.3 MB/s eta 0:00:08
   ---- ----------------------------------- 2.1/18.5 MB 2.3 MB/s eta 0:00:08
   ----- ---------------------------------- 2.4/18.5 MB 2.3 MB/s eta 0:00:08
   ------ --------------------------------- 2.9/18.5 MB 2.3 MB/s eta 0:00:07
   ------ --------------------------------- 3.1/18.5 MB 2.3 MB/s eta 0:00:07
   -------- ------------------------------- 3.9/18.5 MB 2.3 MB/s eta 0:00:07
   --------- ----------

In [8]:
import fitz  # PyMuPDF

PDF_DIR = "lhc_pdfs"
EXTRACTED_DIR = "extracted_texts"
os.makedirs(EXTRACTED_DIR, exist_ok=True)

In [9]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

In [10]:
# Loop through PDFs and extract text
for filename in os.listdir(PDF_DIR):
    if filename.endswith(".pdf"):
        path = os.path.join(PDF_DIR, filename)
        print(f"📄 Extracting from: {filename}")
        try:
            text = extract_text_from_pdf(path)
            # Save as .txt (optional)
            with open(os.path.join(EXTRACTED_DIR, filename.replace(".pdf", ".txt")), "w", encoding="utf-8") as f:
                f.write(text)
        except Exception as e:
            print(f"❌ Error with {filename}: {e}")

📄 Extracting from: 2025LHC3591.pdf
📄 Extracting from: 2025LHC3608.pdf
📄 Extracting from: 2025LHC3612.pdf
📄 Extracting from: 2025LHC3628.pdf
📄 Extracting from: 2025LHC3636.pdf
📄 Extracting from: 2025LHC3644.pdf
📄 Extracting from: 2025LHC3661.pdf
📄 Extracting from: 2025LHC3670.pdf
📄 Extracting from: 2025LHC3677.pdf
📄 Extracting from: 2025LHC3687.pdf
📄 Extracting from: 2025LHC3697.pdf
📄 Extracting from: 2025LHC3708.pdf
📄 Extracting from: 2025LHC3719.pdf
📄 Extracting from: 2025LHC3751.pdf
📄 Extracting from: 2025LHC3760.pdf
📄 Extracting from: 2025LHC3768.pdf
📄 Extracting from: 2025LHC3791.pdf
📄 Extracting from: 2025LHC3801.pdf
📄 Extracting from: 2025LHC3823.pdf
📄 Extracting from: 2025LHC3828.pdf
📄 Extracting from: 2025LHC3836.pdf
📄 Extracting from: 2025LHC3845.pdf
📄 Extracting from: 2025LHC3866.pdf
📄 Extracting from: 2025LHC3872.pdf
📄 Extracting from: 2025LHC3883.pdf
📄 Extracting from: 2025LHC3888.pdf
📄 Extracting from: 2025LHC3892.pdf
📄 Extracting from: 2025LHC3906.pdf
📄 Extracting from: 2

In [None]:
# ifZ7uhK8Va1D0BsVBNUbwtH2dysmkph4j9RHbvX8

eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.3ufGsS34vNHGU_KvlSrSSv2LjsSLBLJ4q9J_z8BY_P8

In [6]:
import cohere
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams

import uuid

# Load Cohere
co = cohere.Client("ifZ7uhK8Va1D0BsVBNUbwtH2dysmkph4j9RHbvX8")

client = QdrantClient(
    url="https://4ae1bf46-a1be-419c-8f7a-751a29d868c2.eu-west-1-0.aws.cloud.qdrant.io",  # from Qdrant Cloud
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.3ufGsS34vNHGU_KvlSrSSv2LjsSLBLJ4q9J_z8BY_P8"                      # from API Keys
)

In [7]:
collection_name = "lhc_judgments"

# ✅ Recreate the collection (automatically deletes if it exists)
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=384,
        distance=Distance.COSINE
    )
)
print(f"✅ Recreated collection: {collection_name}")


  client.recreate_collection(


✅ Recreated collection: lhc_judgments


In [8]:
info = client.get_collection(collection_name=collection_name)
print(f"Vectors after recreation: {info.points_count}")  # Should be 0


Vectors after recreation: 0


In [52]:
print(client)


<qdrant_client.qdrant_client.QdrantClient object at 0x000001F520CCF770>


In [None]:
# pip install --upgrade qdrant-client


Note: you may need to restart the kernel to use updated packages.


In [10]:
from pathlib import Path

def chunk_text(text, max_tokens=500):
    sentences = text.split(". ")
    chunks, chunk = [], ""
    for sentence in sentences:
        if len((chunk + sentence).split()) > max_tokens:
            chunks.append(chunk.strip())
            chunk = sentence
        else:
            chunk += sentence + ". "
    if chunk:
        chunks.append(chunk.strip())
    return chunks

texts_dir = Path("extracted_texts")
all_chunks = []

In [11]:
for file in texts_dir.glob("*.txt"):
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()
    chunks = chunk_text(text)
    for chunk in chunks:
        all_chunks.append({"text": chunk, "source": file.name})

added delay bcz hitting limit

In [12]:
import time

def get_embeddings_with_retry(texts, retries=10, delay=10):
    for attempt in range(retries):
        try:
            return co.embed(
                texts=texts,
                model="embed-english-light-v3.0",
                input_type="search_document"
            ).embeddings
        except Exception as e:
            if hasattr(e, "status_code") and e.status_code == 429:
                print(f"⚠️ Rate limit hit (429). Retrying in {delay} seconds... (Attempt {attempt + 1}/{retries})")
                time.sleep(delay)
            elif "rate limit" in str(e).lower():
                print(f"⚠️ Rate limit message. Retrying in {delay} seconds... (Attempt {attempt + 1}/{retries})")
                time.sleep(delay)
            else:
                raise e
    raise RuntimeError("❌ Failed after multiple retries.")


In [15]:
batch_size = 10 


# ✅ Now your loop
for i in range(0, len(all_chunks), batch_size):
    batch = all_chunks[i:i+batch_size]
    texts = [x["text"] for x in batch]

    embeddings = get_embeddings_with_retry(texts)

    points = [
        PointStruct(
            id=str(uuid.uuid4()),
            vector=emb,
            payload={
                "text": chunk["text"],
                "source": chunk["source"]
            }
        )
        for emb, chunk in zip(embeddings, batch)
    ]

    client.upsert(collection_name=collection_name, points=points)

    print(f"📤 Uploaded batch {i // batch_size + 1}: {len(points)} vectors")


    time.sleep(2)  # Optional: still good to pause to avoid hitting limits


📤 Uploaded batch 1: 10 vectors
📤 Uploaded batch 2: 10 vectors
📤 Uploaded batch 3: 10 vectors
📤 Uploaded batch 4: 10 vectors
📤 Uploaded batch 5: 10 vectors
📤 Uploaded batch 6: 10 vectors
📤 Uploaded batch 7: 10 vectors
📤 Uploaded batch 8: 10 vectors
📤 Uploaded batch 9: 10 vectors
📤 Uploaded batch 10: 10 vectors
📤 Uploaded batch 11: 10 vectors
📤 Uploaded batch 12: 10 vectors
📤 Uploaded batch 13: 10 vectors
📤 Uploaded batch 14: 10 vectors
📤 Uploaded batch 15: 10 vectors
📤 Uploaded batch 16: 10 vectors
📤 Uploaded batch 17: 10 vectors
📤 Uploaded batch 18: 10 vectors
📤 Uploaded batch 19: 10 vectors
📤 Uploaded batch 20: 10 vectors
📤 Uploaded batch 21: 10 vectors
📤 Uploaded batch 22: 10 vectors
📤 Uploaded batch 23: 10 vectors
📤 Uploaded batch 24: 10 vectors
📤 Uploaded batch 25: 10 vectors
📤 Uploaded batch 26: 10 vectors
📤 Uploaded batch 27: 10 vectors
📤 Uploaded batch 28: 10 vectors
⚠️ Rate limit hit (429). Retrying in 10 seconds... (Attempt 1/10)
📤 Uploaded batch 29: 10 vectors
📤 Uploaded batc

In [16]:
# After uploading all batches
collection_info = client.get_collection(collection_name=collection_name)
print(f"Total vectors stored in Qdrant: {collection_info.vectors_count}")
print(f"Expected vectors: {len(all_chunks)}")


Total vectors stored in Qdrant: None
Expected vectors: 389


In [17]:
# Check how many vectors were actually stored
collection_info = client.get_collection(collection_name=collection_name)
print("✅ Collection info:", collection_info)

# This is the actual stored vector count
count = client.count(collection_name=collection_name, exact=True).count
print(f"✅ Total vectors stored in Qdrant: {count}")
print(f"🔢 Expected vectors: {len(all_chunks)}")


✅ Collection info: status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=389 segments_count=2 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quant

In [18]:
collection_info = client.get_collection(collection_name=collection_name)
print(f"✅ Total vectors stored in Qdrant: {collection_info.points_count}")
print(f"🔢 Expected vectors: {len(all_chunks)}")


✅ Total vectors stored in Qdrant: 389
🔢 Expected vectors: 389


QUERY FUNCTION

In [25]:
def search_query(query, client, collection_name, co, top_k=3):
    # Embed the user query
    query_embed = co.embed(
        texts=[query],
        model="embed-english-light-v3.0",
        input_type="search_query"
    ).embeddings[0]
    
    # Search in Qdrant
    results = client.search(
        collection_name=collection_name,
        query_vector=query_embed,
        limit=top_k
    )
    
    # Return retrieved text chunks
    return [hit.payload.get("text", "") for hit in results]


In [28]:
import requests

def generate_answer_groq(context, question, groq_api_key):
    context_str = "\n".join(context)
    prompt = f"Context:\n{context_str}\n\nQuestion: {question}\nAnswer:"

    headers = {
        "Authorization": f"Bearer {groq_api_key}",
        "Content-Type": "application/json"
    }

    data = {
        "model": "llama3-8b-8192",
        "messages": [
            {"role": "user", "content": prompt}
        ]
    }

    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers=headers,
        json=data
    )

    # Debug if 'choices' is missing
    try:
        return response.json()["choices"][0]["message"]["content"]
    except KeyError:
        print("❌ Error from Groq API:", response.status_code, response.text)
        return "Failed to get a valid response from the LLM."


In [30]:
question = "What is the limitation period for filing a writ petition?"
context = search_query(question, client, collection_name, co)
answer = generate_answer_groq(context, question, "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")

print("🤖 Answer:", answer)


  results = client.search(


🤖 Answer: Based on the court's judgment, the limitation period for filing a writ petition is not explicitly stated. However, the court mentioned Article 120 of the Limitation Act, 1908, which prescribes a period of six years for filing a suit for declaration. This implies that if a writ petition is seeking a declaration, it would likely need to be filed within six years from the date of the cause of action.

However, if the writ petition is seeking other relief, such as a direction to the authority to take a particular action, the limitation period may be different. In general, the limitation period for filing a writ petition is governed by the Rules of the Supreme Court and the High Court, which may vary depending on the nature of the petition and the relief sought.
