In [1]:
!fuser -k 8000/tcp

In [2]:
!pip install transformers fastapi uvicorn python-magic PyPDF2 sentence-transformers pinecone-client python-dotenv groq

Collecting fastapi
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting groq
  Downloading groq-0.15.0-py3-none-any.whl.metadata (14 kB)
Collecting starlette<0.46.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.45.3-py3-none-any.whl.metadata (6.3 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.

In [3]:
!pip install tiktoken protobuf python-multipart

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_multipart-0.0.20-py3-none-any.whl (24 kB)
Installing collected packages: python-multipart, tiktoken
Successfully installed python-multipart-0.0.20 tiktoken-0.8.0


In [4]:
!pip install --upgrade transformers sentencepiece

Collecting transformers
  Downloading transformers-4.48.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.48.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.1
    Uninstalling transformers-4.47.1:
      Successfully uninstalled transformers-4.47.1
Successfully installed transformers-4.48.1


In [5]:
!pip install pyngrok nest_asyncio

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [6]:
!ngrok config add-authtoken <your_auth_token>

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [7]:
import os

# Credentials are read from the environment so real keys never have to be
# written into (and saved with) the notebook. The original placeholder values
# are kept as fallbacks so the names are always defined for the cells below.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "your_pinecone_api_key")
PINECONE_ENV = os.environ.get("PINECONE_ENV", "aws")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your_groq_api_key")

In [8]:
import json
import traceback
from typing import List, Dict
from groq import Groq
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone


class RAGPipeline:
    """Retrieval-augmented question answering over a Pinecone index.

    A query is embedded with a SentenceTransformer model, the nearest vectors
    are fetched from Pinecone, their ``text`` metadata is joined into a context
    string, and a Groq chat-completion call is asked to answer in JSON using
    that context.
    """

    def __init__(
        self,
        embedding_model_name="all-MiniLM-L6-v2",
        pinecone_api_key=None,
        pinecone_env=None,
        groq_api_key=None,
        index_name="legal-llm",
        llm_model="llama3-70b-8192",
    ):
        """Create the embedding model, the Pinecone index handle and the Groq client.

        Args:
            embedding_model_name: SentenceTransformer model used to embed queries.
            pinecone_api_key: API key passed to the Pinecone client.
            pinecone_env: Forwarded to the Pinecone client as ``environment``.
            groq_api_key: API key passed to the Groq client.
            index_name: Name of the Pinecone index to query.
            llm_model: Chat model name sent to Groq by ``generate_response``.
        """
        # Initialize embedding model
        self.embedding_model = SentenceTransformer(embedding_model_name)

        # Initialize Pinecone
        self.pc = Pinecone(api_key=pinecone_api_key, environment=pinecone_env)
        self.index_name = index_name
        self.index = self.pc.Index(index_name)

        # Initialize Groq client
        self.llm_model = llm_model
        self.client = Groq(api_key=groq_api_key)

    def query_pinecone(self, prompt: str, top_k: int = 10) -> List[Dict]:
        """Embed ``prompt`` and return the matching chunks, best score first.

        Args:
            prompt: Text to embed and search for.
            top_k: Number of nearest vectors requested from the index.

        Returns:
            A list of dicts with keys ``text``, ``score`` and ``metadata``,
            sorted by descending score. Matches that carry no ``text``
            metadata are skipped instead of raising ``KeyError``.
        """
        query_embedding = self.embedding_model.encode(prompt).tolist()

        query_results = self.index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
            filter={
                # Optional: Add metadata filtering logic
                # "domain": "legal"
            },
        )

        results = []
        for match in query_results.matches:
            metadata = match.metadata or {}
            text = metadata.get("text")
            if not text:
                # Nothing usable as context; one bad record should not abort the search.
                continue
            results.append({"text": text, "score": match.score, "metadata": metadata})

        results.sort(key=lambda item: item["score"], reverse=True)
        return results

    def get_context(self, user_prompt: str, top_k: int = 3) -> str:
        """Return the text of the ``top_k`` best matches joined by blank lines.

        The result is an empty string when no match provides text.
        """
        vector_results = self.query_pinecone(user_prompt, top_k)

        return "\n\n".join(result["text"] for result in vector_results)

    def generate_response(self, user_query: str, context: str) -> Dict:
        """Ask the LLM to answer ``user_query`` from ``context`` as a JSON object.

        Returns:
            The parsed JSON object returned by the model. If the request or the
            JSON parsing fails, a dict with ``error`` (message) and ``trace``
            (formatted traceback) is returned instead of raising.
        """
        system_prompt = f"""
        You are an expert legal analyst. Provide precise, evidence-based responses.

        Context: {context}

        Response Guidelines:
        - Analyze the query using ONLY the provided context
        - Structure response as JSON with:
          1. "answer": Comprehensive legal explanation
          2. "reasoning": Logical breakdown
          3. "confidence_score": 0-1 rating
          4. "key_sources": Relevant context snippets
        - Be concise but thorough
        - Explicitly state if context is insufficient
        """

        try:
            response = self.client.chat.completions.create(
                model=self.llm_model,
                response_format={"type": "json_object"},
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_query},
                ],
                max_tokens=2048,
                temperature=0.3,
            )

            return json.loads(response.choices[0].message.content)

        except Exception as e:
            # NOTE(review): the traceback is returned to the caller; confirm this
            # payload is not forwarded verbatim to untrusted clients.
            return {"error": str(e), "trace": traceback.format_exc()}

In [10]:
import os
import torch
import uvicorn
import asyncio
import nest_asyncio
import magic

from torch.amp import autocast
from fastapi import FastAPI, UploadFile, HTTPException, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from io import BytesIO
from PyPDF2 import PdfReader
from pydantic import BaseModel

from contextlib import asynccontextmanager
from pydantic import Field
from time import time
from pyngrok import ngrok
from concurrent.futures import ThreadPoolExecutor
from uvicorn import Config, Server

# Patch asyncio so an event loop can be started from inside the notebook's
# already-running loop (needed by the server launcher at the end of this cell).
nest_asyncio.apply()

# Initialize FastAPI app
app = FastAPI()

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173"],  # Replace with your frontend's URL
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Translation model and resources
translation_model = "facebook/mbart-large-50-many-to-one-mmt"
# These three globals are filled in by lifespan() at application startup.
# embedding_model is declared here as well so that code reading it before
# startup sees None instead of hitting a NameError.
model = None
tokenizer = None
embedding_model = None
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize RAG pipeline
pinecone_api_key = PINECONE_API_KEY
pinecone_env = PINECONE_ENV
groq_api_key = GROQ_API_KEY

rag_pipeline = RAGPipeline(
    embedding_model_name="all-MiniLM-L6-v2",
    pinecone_api_key=pinecone_api_key,
    pinecone_env=pinecone_env,
    groq_api_key=groq_api_key,
)

# ThreadPoolExecutor for parallel tasks
executor = ThreadPoolExecutor(max_workers=4)

# Pydantic model for query request
class QueryRequest(BaseModel):
    # Request body accepted by POST /query/.
    # user_query is required (the `...` default) and must be at least five
    # characters long; title/description are schema metadata for the field.
    user_query: str = Field(
        ...,
        min_length=5,
        title="User Query",
        description="The legal question to be analyzed",
    )

# Asynchronous context manager to load and release model resources
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the translation and embedding models for the lifetime of the app.

    On entry the mBART model, its tokenizer and the sentence-embedding model
    are loaded into the module-level globals ``model``, ``tokenizer`` and
    ``embedding_model``. On exit the globals are reset to ``None`` and the
    CUDA cache is emptied.

    The ``try``/``finally`` guarantees the release step also runs when loading
    fails part-way or when an exception is thrown into the context while the
    application is running.
    """
    global model, tokenizer, embedding_model
    try:
        model = MBartForConditionalGeneration.from_pretrained(translation_model).to(device)
        tokenizer = MBart50TokenizerFast.from_pretrained(translation_model)
        embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").to(device)
        print("Models and tokenizer loaded.")
        yield
    finally:
        # Reset to None (rather than `del`) so the names stay defined and match
        # their pre-startup state; dropping the references frees the models.
        model = tokenizer = embedding_model = None
        torch.cuda.empty_cache()
        print("Models and tokenizer resources released.")

# The app object was created before this function existed, so the lifespan
# handler is attached to its router after the fact.
app.router.lifespan_context = lifespan

# Function to detect MIME type
def get_mime_type(file: UploadFile):
    """Return the MIME type sniffed from the first 2048 bytes of the upload.

    The stream is rewound before reading, so the bytes inspected are always
    the file header even if the upload was partially consumed earlier. It is
    rewound again afterwards (also when sniffing fails) so later readers see
    the whole file.
    """
    stream = file.file
    stream.seek(0)
    try:
        mime = magic.Magic(mime=True)
        header = stream.read(2048)
        return mime.from_buffer(header)
    finally:
        stream.seek(0)

# Function to read PDF content
def read_pdf(file: UploadFile):
    """Extract the text of every page of the uploaded PDF.

    Returns:
        The non-empty page texts joined with newlines, with leading and
        trailing whitespace stripped. Pages for which ``extract_text()``
        returns nothing are skipped.
    """
    reader = PdfReader(BytesIO(file.file.read()))

    page_texts = []
    for page in reader.pages:
        # Extract once per page; the previous version ran extraction twice
        # (once for the filter and once for the value).
        page_text = page.extract_text()
        if page_text:
            page_texts.append(page_text)

    return "\n".join(page_texts).strip()

# Function to split content into chunks
def create_chunks(text: str, chunk_size: int, overlap: int):
    """Split ``text`` into whitespace-delimited chunks with word overlap.

    Args:
        text: Input text; it is split on whitespace.
        chunk_size: Character budget per chunk. Each word counts as
            ``len(word) + 1`` (the word plus one separator).
        overlap: Number of trailing words of a finished chunk that are
            repeated at the start of the next chunk. ``0`` disables overlap.

    Returns:
        A list of chunk strings (words joined by single spaces). Empty input
        yields an empty list. A single word longer than ``chunk_size`` becomes
        a chunk of its own.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must be a non-negative integer")

    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        word_length = len(word) + 1

        # Only flush a non-empty chunk: an oversized first word must not
        # produce an empty string in the output.
        if current_chunk and current_length + word_length > chunk_size:
            chunks.append(" ".join(current_chunk))

            # `lst[-0:]` is the whole list, so overlap == 0 needs its own branch.
            carried = current_chunk[-overlap:] if overlap else []
            carried_length = sum(len(w) + 1 for w in carried)

            # Drop leading overlap words until the incoming word fits, so the
            # overlap can never fill the whole budget and stall progress.
            drop = 0
            while drop < len(carried) and carried_length + word_length > chunk_size:
                carried_length -= len(carried[drop]) + 1
                drop += 1

            current_chunk = carried[drop:]
            current_length = carried_length

        current_chunk.append(word)
        current_length += word_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Asynchronous function to translate chunks
async def translate_chunks(chunks, batch_size):
    """Translate ``chunks`` with the loaded mBART model, ``batch_size`` at a time.

    The translation itself is blocking work, so it is run in a worker thread
    and awaited; the event loop stays free to serve other requests meanwhile.
    Batches are processed one after another and results keep the input order.

    Args:
        chunks: List of text chunks to translate.
        batch_size: Number of chunks tokenized and generated per model call.

    Returns:
        A list with one translated string per input chunk.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if not chunks:
        return []

    def translate_batch(batch):
        encoded_input = tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True, max_length=1024
        ).to(device)

        with autocast(device_type="cuda" if torch.cuda.is_available() else "cpu"):
            generated_tokens = model.generate(
                input_ids=encoded_input["input_ids"],
                # Batches are padded, so the mask is required to keep the
                # model from attending to padding tokens.
                attention_mask=encoded_input["attention_mask"],
                max_length=128,
                num_beams=1,
            )

        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    def translate_all():
        translations = []
        for start in range(0, len(chunks), batch_size):
            translations.extend(translate_batch(chunks[start: start + batch_size]))
        return translations

    # NOTE(review): requests that arrive concurrently can now run generate()
    # at the same time on the shared model; add a lock if that is a problem
    # for memory on the target GPU.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, translate_all)

# Function to store embeddings in Pinecone
def store_embeddings(translated_chunks):
    """Embed the translated chunks and replace the Pinecone index with them.

    The ``legal-llm`` index is deleted if it exists, recreated (384
    dimensions, cosine metric, serverless on aws/us-east-1) and filled with
    one vector per chunk; each vector stores its chunk under the ``text``
    metadata key.

    Failures are reported with ``print`` and do not raise, because this
    function is run as a fire-and-forget job on a worker thread.

    Args:
        translated_chunks: List of translated text chunks.

    Returns:
        The index name, whether or not storing succeeded.
    """
    # Defined before the try block so the final `return` is always valid, even
    # when the API-key check fails first.
    index_name = "legal-llm"

    if not translated_chunks:
        # Nothing to store: do not wipe an existing index for an empty upload.
        print("No translated chunks to embed; Pinecone index left unchanged.")
        return index_name

    try:
        if not pinecone_api_key:
            raise ValueError(
                "PINECONE_API_KEY not found in environment variables. Ensure it is set in the .env file."
            )

        pc = Pinecone(api_key=pinecone_api_key)

        if index_name in pc.list_indexes().names():
            pc.delete_index(index_name)

        pc.create_index(
            name=index_name,
            dimension=384,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )

        index = pc.Index(index_name)
        embeddings = embedding_model.encode(
            translated_chunks, convert_to_tensor=True, show_progress_bar=True
        ).cpu().numpy()

        batch_size = 256  # Increased batch size
        total = len(translated_chunks)
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            batch = [
                # Plain lists of floats rather than numpy rows for the request payload.
                (f"doc_{j}", embeddings[j].tolist(), {"text": translated_chunks[j]})
                for j in range(start, stop)
            ]
            index.upsert(vectors=batch)

        print("Translated chunks embedded and stored in Pinecone.")

    except Exception as e:
        print(f"Error storing embeddings in Pinecone: {str(e)}")

    return index_name

# API Endpoints
@app.get("/")
def read_root():
    # Landing route: answers with a fixed welcome payload.
    greeting = "Welcome to the LegalDoc-Translate-Query-Assistant portal!"
    return {"message": greeting}

@app.post("/translate-pdf/")
async def process_pdf(
    file: UploadFile,
    background_tasks: BackgroundTasks,
    chunk_size: int = 500,
    overlap: int = 50,
    batch_size: int = 256,
):
    """Translate an uploaded PDF and schedule its embeddings for storage.

    The upload must be a PDF of at most 10 MB. Its text is extracted, split
    into chunks, translated, and the translated chunks are handed to
    ``store_embeddings`` on the shared thread pool.

    Args:
        file: The uploaded file.
        background_tasks: Unused; kept so the endpoint signature is unchanged.
        chunk_size: Character budget per chunk (must be >= 1).
        overlap: Words repeated between consecutive chunks (must be >= 0).
        batch_size: Chunks translated per model call (must be >= 1).

    Raises:
        HTTPException: 400 for invalid parameters, a non-PDF upload or an
            oversized file; 500 if processing fails.
    """
    if chunk_size < 1 or overlap < 0 or batch_size < 1:
        raise HTTPException(
            status_code=400,
            detail="chunk_size and batch_size must be positive and overlap must not be negative.",
        )

    mime_type = get_mime_type(file)
    if mime_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a valid PDF file.")

    # UploadFile.size can be None; measure the stream in that case
    # instead of comparing None with an int.
    size = file.size
    if size is None:
        file.file.seek(0, os.SEEK_END)
        size = file.file.tell()
        file.file.seek(0)

    if size > 10 * 1024 * 1024:  # 10 MB limit
        raise HTTPException(status_code=400, detail="File size exceeds 10 MB.")

    try:
        start_time = time()
        text = read_pdf(file)
        chunks = create_chunks(text, chunk_size, overlap)
        translated_chunks = await translate_chunks(chunks, batch_size)
        processing_time = time() - start_time

        if translated_chunks:
            # Run embedding storage in a separate thread
            executor.submit(store_embeddings, translated_chunks)
            pinecone_status = "Embedding storage initiated in the background."
        else:
            # No extractable text (e.g. a scanned PDF): skip storage so the
            # existing index is not replaced with nothing.
            pinecone_status = "No text could be extracted; embedding storage skipped."

        return {
            "mime_type": mime_type,
            "translated_chunks": translated_chunks,
            "processing_time": f"{processing_time:.2f} seconds",
            "pinecone_status": pinecone_status,
        }
    except HTTPException:
        # Preserve deliberate HTTP errors instead of rewrapping them as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing the PDF: {str(e)}")

@app.post("/query/")
async def query_llm(request: QueryRequest):
    """Answer a legal question using context retrieved by the RAG pipeline.

    Raises:
        HTTPException: 404 when no context is found for the query; 500 when
            retrieval or generation raises.
    """
    try:
        context = rag_pipeline.get_context(request.user_query)
        if not context:
            raise HTTPException(status_code=404, detail="No relevant context found for the query.")

        response = rag_pipeline.generate_response(request.user_query, context)
        return response
    except HTTPException:
        # Without this clause the 404 above is caught by the generic handler
        # below and reaches the client as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing query: {str(e)}")

if __name__ == "__main__":
    import errno

    async def main():
        """Expose port 8000 through ngrok and serve the app until it stops.

        The tunnel is closed in ``finally`` so it is released however the
        server exits.
        """
        public_url = None
        try:
            public_url = ngrok.connect(8000).public_url
            print(f"Public URL: {public_url}")

            config = uvicorn.Config(app, host="0.0.0.0", port=8000, log_level="info")
            server = uvicorn.Server(config)

            await server.serve()
        except OSError as e:
            # The OS message is capitalised ("Address already in use"), so the
            # text check is case-insensitive and backed by the errno code.
            if e.errno == errno.EADDRINUSE or "address already in use" in str(e).lower():
                print("Port 8000 is already in use. Please stop the existing process or choose another port.")
            else:
                raise
        finally:
            if public_url is not None:
                ngrok.disconnect(public_url)

    try:
        asyncio.run(main())
    except RuntimeError as e:
        message = str(e)
        if "running event loop" in message or "already running" in message:
            print("Using an alternative execution due to running event loop.")
            # Relies on nest_asyncio.apply() having patched the loop so it can
            # be re-entered; avoids a top-level `await`, which is only valid
            # inside IPython.
            asyncio.get_event_loop().run_until_complete(main())
        else:
            raise
    except KeyboardInterrupt:
        print("Shutting down server.")

Public URL: https://f333-34-90-206-123.ngrok-free.app


INFO:     Started server process [264]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


Models and tokenizer loaded.
INFO:     103.158.138.147:0 - "GET / HTTP/1.1" 200 OK
INFO:     103.158.138.147:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO:     103.158.138.147:0 - "POST /translate-pdf/ HTTP/1.1" 200 OK


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Translated chunks embedded and stored in Pinecone.
INFO:     103.158.138.147:0 - "POST /translate-pdf/ HTTP/1.1" 200 OK


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Translated chunks embedded and stored in Pinecone.
INFO:     103.158.138.147:0 - "POST /translate-pdf/ HTTP/1.1" 200 OK


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Translated chunks embedded and stored in Pinecone.


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [264]


Models and tokenizer resources released.
