<a href="https://colab.research.google.com/github/LIKITH43/legalaa/blob/main/LEGAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required dependencies
!pip install streamlit chromadb torch sentence-transformers pdfplumber gradio_client python-dotenv

import os
import streamlit as st
import chromadb
import logging
import torch
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from gradio_client import Client
from dotenv import load_dotenv
import pdfplumber

# Root logging setup: INFO and above are written to a console stream handler,
# formatted as "<timestamp> - <LEVEL>: <message>".
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)
# Module-scoped logger used by the code in this file.
logger = logging.getLogger(__name__)

class LegalRAG:
    """Retrieval-augmented question answering over uploaded PDF documents.

    The pipeline is: extract text from a PDF, split it into overlapping
    chunks, embed each chunk with a sentence-transformers model, store the
    chunks in a persistent ChromaDB collection, and answer queries by
    retrieving the closest chunks and sending them to a hosted chatbot.

    Every public method catches its own errors, logs them, and returns a
    neutral value ("" / [] / False / an error string) instead of raising.
    """

    def __init__(self, embedding_model='sentence-transformers/all-MiniLM-L6-v2',
                 llm_model='meta-llama/Meta-Llama-3-8B-Instruct',
                 db_path='./chroma_db', collection_name='legal_docs'):
        """Load the embedding model, the remote LLM client and the vector store.

        Args:
            embedding_model: sentence-transformers model name used for
                embedding both chunks and queries.
            llm_model: Recorded on the instance as ``self.llm_model``. It does
                not select the generator: answers always come from the
                "KingNish/Very-Fast-Chatbot" Gradio Space.
            db_path: Directory of the persistent ChromaDB store.
            collection_name: Name of the collection to open or create.
        """
        load_dotenv()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info(f"Using device: {self.device}")

        # Keep the requested name so the argument is not silently dropped.
        self.llm_model = llm_model

        self.embedding_model = SentenceTransformer(embedding_model)
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
        self.llm = Client("KingNish/Very-Fast-Chatbot")

        self.chroma_client = chromadb.PersistentClient(path=db_path)
        self.collection = self.chroma_client.get_or_create_collection(name=collection_name)

    def extract_text(self, uploaded_file) -> str:
        """Return the text of every page of a PDF, joined by blank lines.

        Args:
            uploaded_file: Anything ``pdfplumber.open`` accepts.

        Returns:
            The concatenated page text (pages without extractable text
            contribute an empty string), or "" if extraction fails.
        """
        try:
            with pdfplumber.open(uploaded_file) as pdf:
                return '\n\n'.join(page.extract_text() or '' for page in pdf.pages)
        except Exception as e:
            logger.error(f"PDF extraction error: {e}")
            return ""

    def process_document(self, text: str) -> list:
        """Split ``text`` into chunks and embed each one.

        Args:
            text: Raw document text.

        Returns:
            A list of ``{"id", "chunk", "embedding"}`` dicts, one per chunk.
            The id is the SHA-256 hex digest of the chunk text. Returns []
            when there is nothing to embed or on error.
        """
        import hashlib  # local import keeps this class self-contained

        try:
            chunks = self.text_splitter.split_text(text)
            if not chunks:
                return []
            embeddings = self.embedding_model.encode(chunks, convert_to_tensor=False)
            # The built-in hash() of a str is salted per interpreter process,
            # so ids derived from it change between runs and never match what
            # is already in the persistent collection. A content digest is
            # stable across processes.
            return [
                {
                    "id": hashlib.sha256(chunk.encode("utf-8")).hexdigest(),
                    "chunk": chunk,
                    "embedding": embedding,
                }
                for chunk, embedding in zip(chunks, embeddings)
            ]
        except Exception as e:
            logger.error(f"Chunking & embedding error: {e}")
            return []

    def upload_document(self, document_data: list) -> bool:
        """Store chunks that are not yet in the collection.

        Args:
            document_data: Records as produced by ``process_document``.

        Returns:
            True when the batch was handled (including the case where every
            chunk was already stored); False when the batch is empty or the
            store raised an error.
        """
        try:
            # Identical chunks inside one document share an id; keep only the
            # first occurrence so a single add() call never repeats an id.
            unique_records = {}
            for record in document_data:
                unique_records.setdefault(record['id'], record)

            if not unique_records:
                logger.warning("No chunks to upload.")
                return False

            existing_ids = set(self.collection.get(ids=list(unique_records))['ids'])
            new_data = [
                record for record_id, record in unique_records.items()
                if record_id not in existing_ids
            ]

            if new_data:
                self.collection.add(ids=[d['id'] for d in new_data],
                                    embeddings=[d['embedding'] for d in new_data],
                                    documents=[d['chunk'] for d in new_data])
                logger.info(f"Uploaded {len(new_data)} new chunks.")
            return True
        except Exception as e:
            logger.error(f"ChromaDB upload error: {e}")
            return False

    def retrieve_and_summarize(self, query: str) -> str:
        """Answer ``query`` using the three closest stored chunks as context.

        Args:
            query: The user's question.

        Returns:
            The stripped LLM response, "No relevant information found." when
            retrieval yields no chunks, or "Error generating summary." on
            failure.
        """
        try:
            query_embedding = self.embedding_model.encode(query, convert_to_tensor=False).tolist()
            results = self.collection.query(query_embeddings=[query_embedding], n_results=3)

            # The result holds one inner list per query embedding, so an empty
            # hit list looks like [[]], which is truthy. Inspect the inner list.
            documents = results["documents"] or [[]]
            matched_chunks = [doc for doc in documents[0] if doc]
            if not matched_chunks:
                return "No relevant information found."
            context = '\n\n'.join(matched_chunks)

            template = f'''
            You are a Legal AI Assistant. Answer based on the legal context:

            **Context:** {context}

            **User Query:** {query}

            Provide structured legal analysis.
            '''

            response = self.llm.predict(Query=template, api_name="/predict")
            return response.strip()
        except Exception as e:
            logger.error(f"Retrieval error: {e}")
            return "Error generating summary."

# Streamlit Deployment
def main():
    """Render the Streamlit UI: upload a PDF, index it, and answer queries.

    Streamlit re-executes this function on every widget interaction, so the
    expensive pieces (building ``LegalRAG`` and indexing the uploaded file)
    are remembered in ``st.session_state`` instead of being redone each run.
    """
    st.set_page_config(page_title="LegalBot", page_icon="⚖️", layout="wide")
    st.title("🏛️ LegalBot: Document Summarizer")

    # Reuse the pipeline within a session. Using .get() plus a local variable
    # (rather than reading the key back) also works when session state does
    # not persist, e.g. when the script is run without `streamlit run`.
    rag_system = st.session_state.get("rag_system")
    if rag_system is None:
        rag_system = LegalRAG()
        st.session_state["rag_system"] = rag_system

    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"], help="Upload a legal document for analysis.")
    if uploaded_file:
        # Name and size identify the upload well enough to skip re-indexing
        # the same file on reruns triggered by the query box.
        file_key = (uploaded_file.name, uploaded_file.size)
        processed = st.session_state.get("processed_file") == file_key
        failure_message = None

        if not processed:
            with st.spinner("Processing..."):
                text = rag_system.extract_text(uploaded_file)
                if not text.strip():
                    failure_message = "No text could be extracted from this PDF."
                else:
                    document_data = rag_system.process_document(text)
                    if document_data and rag_system.upload_document(document_data):
                        processed = True
                        st.session_state["processed_file"] = file_key
                    else:
                        failure_message = "Failed to index the document. Check the logs for details."

        if processed:
            st.success("Document processed successfully!")
        elif failure_message:
            st.error(failure_message)

    query = st.text_input("Enter your legal query:", placeholder="Ask a question...")
    if query:
        with st.spinner("Generating Summary..."):
            summary = rag_system.retrieve_and_summarize(query)
        st.markdown("### 📄 Summary")
        st.markdown(summary)

# Entry point: build the UI only when this file is executed as the main
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()


Collecting streamlit
  Downloading streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gradio_client
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.

2025-03-18 09:09:23.440 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loaded as API: https://kingnish-very-fast-chatbot.hf.space ✔


2025-03-18 09:09:33.628 Session state does not function when running a script without `streamlit run`
