In [1]:
!pip install -q qdrant-client cohere python-dotenv requests tqdm PyMuPDF


In [2]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

# ✅ Connect to local Qdrant (no API key needed)
client = QdrantClient(host="localhost", port=6333)
print("✅ Connected to local Qdrant")


✅ Connected to local Qdrant


In [1]:
import os
from dotenv import load_dotenv

load_dotenv()


True

In [2]:
cohere_api_key = os.getenv("COHERE_API_KEY")
qdrant_api_key = os.getenv("QDRANT_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

In [None]:
# BASE_URL = "https://data.lhc.gov.pk/reported_judgments/judgments_approved_for_reporting"
# PDF_DIR = "lhc_pdfs"
# TEXT_DIR = "extracted_texts"

# os.makedirs(PDF_DIR, exist_ok=True)
# os.makedirs(TEXT_DIR, exist_ok=True)


In [6]:
import os
import shutil

# Create target folder if it doesn't exist
os.makedirs("extracted_texts", exist_ok=True)

# Move all .txt files from the current directory to extracted_texts/
for filename in os.listdir():
    if filename.endswith(".txt"):
        source = filename
        destination = os.path.join("extracted_texts", filename)
        shutil.move(source, destination)
        print(f"Moved {filename} to extracted_texts/")


In [7]:
from pathlib import Path

def chunk_text(text, max_tokens=500):
    sentences = text.split(". ")
    chunks, chunk = [], ""
    for sentence in sentences:
        if len((chunk + sentence).split()) > max_tokens:
            chunks.append(chunk.strip())
            chunk = sentence
        else:
            chunk += sentence + ". "
    if chunk:
        chunks.append(chunk.strip())
    return chunks

texts_dir = Path("extracted_texts")
all_chunks = []

for file in texts_dir.glob("*.txt"):
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()
    chunks = chunk_text(text)
    for chunk in chunks:
        all_chunks.append({"text": chunk, "source": file.name})


In [8]:
print(f"✅ Total text chunks: {len(all_chunks)}")


✅ Total text chunks: 499


In [10]:
import cohere

cohere_api_key = "ifZ7uhK8Va1D0BsVBNUbwtH2dysmkph4j9RHbvX8"
co = cohere.Client(cohere_api_key)

print("✅ Cohere client initialized")


✅ Cohere client initialized


In [11]:
collection_name = "lhc_judgments"

if client.collection_exists(collection_name=collection_name):
    client.delete_collection(collection_name=collection_name)
    print(f"🗑️ Deleted existing collection: {collection_name}")

client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=384,
        distance=Distance.COSINE
    )
)

print(f"✅ Created collection: {collection_name}")


✅ Created collection: lhc_judgments


In [12]:
import uuid
import time
from qdrant_client.models import PointStruct
from datetime import datetime

def get_embeddings_with_retry(texts, retries=10, delay=10):
    for attempt in range(retries):
        try:
            return co.embed(
                texts=texts,
                model="embed-english-light-v3.0",
                input_type="search_document"
            ).embeddings
        except Exception as e:
            if hasattr(e, "status_code") and e.status_code == 429:
                print(f"⚠️ Rate limit. Retrying in {delay}s...")
                time.sleep(delay)
            elif "rate limit" in str(e).lower():
                print(f"⚠️ Retry due to rate limit: {e}")
                time.sleep(delay)
            else:
                raise e
    raise RuntimeError("❌ Failed after multiple retries.")

batch_size = 10

for i in range(0, len(all_chunks), batch_size):
    batch = all_chunks[i:i+batch_size]
    texts = [x["text"] for x in batch]
    embeddings = get_embeddings_with_retry(texts)

    points = [
        PointStruct(
            id=str(uuid.uuid4()),
            vector=emb,
            payload={
                "text": chunk["text"],
                "source": chunk["source"],
                "url": f"https://data.lhc.gov.pk/reported_judgments/judgments_approved_for_reporting/{chunk['source'].replace('.txt', '.pdf')}",
                "updated_at": datetime.now().strftime("%Y-%m-%d")
            }
        )
        for emb, chunk in zip(embeddings, batch)
    ]

    client.upsert(collection_name=collection_name, points=points)
    print(f"📤 Uploaded batch {i // batch_size + 1}")
    time.sleep(2)


📤 Uploaded batch 1
📤 Uploaded batch 2
📤 Uploaded batch 3
📤 Uploaded batch 4
📤 Uploaded batch 5
📤 Uploaded batch 6
📤 Uploaded batch 7
📤 Uploaded batch 8
📤 Uploaded batch 9
📤 Uploaded batch 10
📤 Uploaded batch 11
📤 Uploaded batch 12
📤 Uploaded batch 13
📤 Uploaded batch 14
📤 Uploaded batch 15
📤 Uploaded batch 16
📤 Uploaded batch 17
📤 Uploaded batch 18
📤 Uploaded batch 19
📤 Uploaded batch 20
📤 Uploaded batch 21
📤 Uploaded batch 22
📤 Uploaded batch 23
📤 Uploaded batch 24
📤 Uploaded batch 25
📤 Uploaded batch 26
📤 Uploaded batch 27
📤 Uploaded batch 28
📤 Uploaded batch 29
⚠️ Rate limit. Retrying in 10s...
📤 Uploaded batch 30
📤 Uploaded batch 31
📤 Uploaded batch 32
📤 Uploaded batch 33
📤 Uploaded batch 34
📤 Uploaded batch 35
📤 Uploaded batch 36
📤 Uploaded batch 37
📤 Uploaded batch 38
📤 Uploaded batch 39
📤 Uploaded batch 40
📤 Uploaded batch 41
📤 Uploaded batch 42
📤 Uploaded batch 43
📤 Uploaded batch 44
📤 Uploaded batch 45
📤 Uploaded batch 46
📤 Uploaded batch 47
📤 Uploaded batch 48
📤 Uploaded batc

In [5]:
print("🔐 Using API key:", groq_api_key[:5], "...")  # Shows only first 5 chars


🔐 Using API key: gsk_j ...


In [15]:
def search_query(query, client, collection_name, co, top_k=3):
    query_embed = co.embed(
        texts=[query],
        model="embed-english-light-v3.0",
        input_type="search_query"
    ).embeddings[0]

    results = client.search(
        collection_name=collection_name,
        query_vector=query_embed,
        limit=top_k,
        with_payload=True
    )

    return [hit.payload.get("text", "") for hit in results]


In [27]:
def rerank_context(query, context_list, co):
    if not context_list:
        print("⚠️ No context provided for reranking.")
        return []

    results = co.rerank(
        query=query,
        documents=context_list,
        top_n=min(5, len(context_list)),
        model="rerank-english-v3.0"
    )

    print("✅ Got rerank results:", results)
    if results:
        print("👀 Sample item:", results[0])
    else:
        print("⚠️ Rerank results is empty.")

    return [doc["text"] if isinstance(doc, dict) else doc for _, doc in results]


In [23]:
def query_rewriter(conversation_history, groq_api_key):
    prompt = f"""You are an AI assistant. Rewrite the following conversation into a concise and clear search query:

    Conversation:
    {conversation_history}

    Rewritten Query:"""

    headers = {
        "Authorization": f"Bearer {groq_api_key}",
        "Content-Type": "application/json"
    }

    data = {
        "model": "llama3-8b-8192",
        "messages": [{"role": "user", "content": prompt}]
    }

    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers=headers,
        json=data
    )

    try:
        return response.json()["choices"][0]["message"]["content"]
    except Exception:
        print("❌ Query rewriting failed:", response.status_code, response.text)
        return None


In [24]:
# # Simulate conversation
# conversation = f"User: {question}"  # You can expand this if you track full history

# # Rewrite query
# rewritten_query = query_rewriter(conversation, groq_api_key)
# print("📝 Rewritten Query:", rewritten_query)

# # Search original and rewritten in parallel
# orig_results = search_query(question, client, collection_name, co)
# rewrite_results = search_query(rewritten_query, client, collection_name, co) if rewritten_query else []

# # Merge and deduplicate
# combined_context = list(dict.fromkeys(orig_results + rewrite_results))


In [25]:


def generate_answer_groq(context, question, groq_api_key):
    context_str = "\n".join(context)
    prompt = f"""Context:
    {context_str}

    Each context chunk ends with metadata (PDF name, URL, and updated date).
    When answering, include relevant source(s) to justify your answer.

    Question: {question}
    Answer:"""

    headers = {
        "Authorization": f"Bearer {groq_api_key}",
        "Content-Type": "application/json"
    }

    data = {
        "model": "llama3-8b-8192",
        "messages": [
            {"role": "user", "content": prompt}
        ]
    }

    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers=headers,
        json=data
    )

    try:
        return response.json()["choices"][0]["message"]["content"]
    except KeyError:
        print("❌ Error from Groq:", response.status_code, response.text)
        return "❌ No valid answer returned."

In [None]:
# question = "What is the limitation period for filing a writ petition?"
# context = combined_context
# answer = generate_answer_groq(context, question, groq_api_key)

# print("🤖 Answer:", answer)


  results = client.search(


🤖 Answer: The limitation period for filing a writ petition is not explicitly mentioned in the given context. However, the context does mention the period of limitation for filing a reference application under Section 47 of the Sales Tax Act, 1990, which is 30 days from the communication of the order of the Appellate Tribunal or the Commissioner (Appeals).

In the absence of specific information regarding the limitation period for filing a writ petition, it is necessary to consult other sources. According to the Civil Procedure Code (CPC), a writ petition is a special proceeding under Order 45, Rule 1 of the CPC. The CPC provides a limitation period of 90 days for filing a revision application (Order 47, Rule 1), which may be applied as a general guideline. However, the limitation period for filing a writ petition under Article 199 of the Constitution of Pakistan is not clearly defined in the CPC.

In a landmark case, "Asad Ali and 9 others vs. The Bank of Punjab and others" (PLD 2020 S

In [20]:
import requests


In [21]:
question = "What is the limitation period for filing a writ petition?"
conversation = f"User: {question}"

# Rewrite user query
rewritten_query = query_rewriter(conversation, groq_api_key)
print("📝 Rewritten Query:", rewritten_query)

# Vector search using both original and rewritten
orig_results = search_query(question, client, collection_name, co)
rewrite_results = search_query(rewritten_query, client, collection_name, co) if rewritten_query else []
combined_context = list(dict.fromkeys(orig_results + rewrite_results))

# Rerank using Cohere
context = rerank_context(question, combined_context, co)

# Answer using LLM
answer = generate_answer_groq(context, question, groq_api_key)
print("🤖 Answer:", answer)


📝 Rewritten Query: Here is the rewritten query:

"What is the limitation period for filing a writ petition?"

This query is concise and clear, and it directly confronts the user's question about the limitation period for filing a writ petition.


  results = client.search(


🤖 Answer: According to the Supreme Court's guidelines, the limitation period for filing a writ petition is 90 days from the date of the judgment or order that is being challenged. [1]

Specifically, Rule 3 of Order XXXIX of the Code of Civil Procedure, 1908 states that a writ petition must be presented to the Supreme Court "within sixty days from the date of the judgment or order complained of, or within sixty days from the date of the receipt of the notice or from the date of the happening of the event complained of, as the case may be". [2]

However, in the case of Apex Dental College & Hospital versus State of Rajasthan and others, the Supreme Court allowed the petition to be filed beyond the 60-day limitation period, considering it as an isolated case. [3]

In summary, the general limitation period for filing a writ petition is 60 days from the date of the judgment or order complained of, but the Supreme Court may allow exceptions in specific cases.

References:

[1] Supreme Court 