In [1]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1


In [2]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from openai import OpenAI
import os
from typing import List, Tuple
import uuid
import joblib

# RAG pipeline: embeds documents, indexes them with FAISS, and answers queries via OpenAI
class RAGSystem:
    """Retrieval-augmented generation over a TSV corpus.

    Documents are embedded with a SentenceTransformer model and stored in a
    flat L2 FAISS index. For each query the nearest documents are retrieved
    and passed as context to the OpenAI chat completions API.
    """

    def __init__(self, tsv_path: str, openai_api_key: str, reload: bool = False):
        """
        Initialize the RAG system with TSV file and OpenAI API key.

        Args:
            tsv_path (str): Path to the gzip-compressed TSV file
            openai_api_key (str): OpenAI API key
            reload (bool): If True, ignore any cached index/data files and
                re-embed the documents from the TSV
        """
        self.model = SentenceTransformer('nlpaueb/legal-bert-base-uncased')
        self.client = OpenAI(api_key=openai_api_key)
        self.index = None
        self.documents = []
        self.metadata = []
        # Initial value only; overwritten with the real vector width once the
        # index has been loaded or built.
        self.dimension = 768
        self.load_and_index_documents(tsv_path, reload)

    def load_and_index_documents(self, tsv_path: str, reload: bool = False) -> None:
        """
        Load the documents and their FAISS index, building and caching them if needed.

        Cache files are written next to the TSV with both extensions stripped,
        e.g. ``bills.tsv.gz`` -> ``bills.faiss`` and ``bills_data.pkl``. The
        cache is reused whenever both files exist; it is not invalidated when
        the TSV changes, so pass ``reload=True`` after modifying the TSV.

        Args:
            tsv_path (str): Path to a gzip-compressed TSV with 'name', 'type'
                and 'content' columns
            reload (bool): Force re-embedding even if cache files exist

        Raises:
            ValueError: If a required column is missing or no row has content.
        """
        # Two splitext calls so that "x.tsv.gz" becomes "x".
        base_path = os.path.splitext(os.path.splitext(tsv_path)[0])[0]
        index_path = f"{base_path}.faiss"
        data_path = f"{base_path}_data.pkl"

        if not reload and os.path.exists(index_path) and os.path.exists(data_path):
            self.index = faiss.read_index(index_path)
            # NOTE: joblib.load unpickles arbitrary objects; only load cache
            # files that this class produced itself.
            data = joblib.load(data_path)
            self.documents = data['documents']
            self.metadata = data['metadata']
            self.dimension = self.index.d
            return

        # Load and index from scratch
        df = pd.read_csv(tsv_path, sep="\t", compression="gzip")
        if not all(col in df.columns for col in ['name', 'type', 'content']):
            raise ValueError("TSV must contain 'name', 'type', and 'content' columns")

        # Rows without content would reach the encoder as float NaN.
        df = df.dropna(subset=['content'])
        if df.empty:
            raise ValueError("TSV contains no documents to index")

        documents = df['content'].astype(str).tolist()
        metadata = df[['name', 'type']].to_dict('records')

        embeddings = np.ascontiguousarray(
            self.model.encode(documents, batch_size=32, show_progress_bar=True),
            dtype=np.float32,
        )
        # Size the index from the vectors actually produced rather than a constant.
        dimension = int(embeddings.shape[1])
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)

        # Publish state only after the index was built successfully.
        self.documents = documents
        self.metadata = metadata
        self.dimension = dimension
        self.index = index

        faiss.write_index(self.index, index_path)
        joblib.dump({'documents': self.documents, 'metadata': self.metadata}, data_path)

    def retrieve(self, query: str, k: int = 5) -> List[Tuple[str, dict, float]]:
        """
        Retrieve top-k relevant documents for a given query.

        Args:
            query (str): User query
            k (int): Number of documents to retrieve (must be positive)

        Returns:
            List of (content, metadata, score) tuples, nearest first. Fewer
            than k tuples are returned when the index holds fewer than k vectors.

        Raises:
            ValueError: If k is not positive.
        """
        if k <= 0:
            raise ValueError("k must be a positive integer")

        # Encode query
        query_embedding = self.model.encode([query])[0]

        # Search FAISS index
        distances, indices = self.index.search(np.array([query_embedding], dtype=np.float32), k)

        # Collect results
        results = []
        for idx, distance in zip(indices[0], distances[0]):
            idx = int(idx)
            # FAISS pads missing neighbours with -1; without the lower bound a
            # -1 would silently index the last document.
            if 0 <= idx < len(self.documents):
                # Convert distance to similarity score (plain float, not a NumPy scalar)
                score = float(1.0 / (1.0 + float(distance)))
                results.append((self.documents[idx], self.metadata[idx], score))

        return results

    def generate_response(self, query: str, retrieved_docs: List[Tuple[str, dict, float]]) -> str:
        """
        Generate a response using OpenAI API with retrieved documents as context.

        Args:
            query (str): User query
            retrieved_docs: List of (content, metadata, score) tuples

        Returns:
            str: Generated response (empty string if the API returned no text)
        """
        # Prepare context from retrieved documents
        context = "\n\n".join(
            f"Document: {content}\nMetadata: {meta}" for content, meta, _score in retrieved_docs
        )

        # Construct prompt
        prompt = f"""You are a legal assistant powered by a RAG system. Use the following context to answer the query accurately and concisely. If the context doesn't provide enough information, state so and provide a general response based on your knowledge.

Context:
{context}

Query:
{query}

Answer:
"""

        # Call OpenAI API
        response = self.client.chat.completions.create(
            model="gpt-4.1-nano",
            messages=[
                {"role": "system", "content": "You are a helpful legal assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500,
            temperature=0.7
        )

        # message.content can be None (e.g. a refusal); avoid calling .strip() on None.
        answer = response.choices[0].message.content
        return (answer or "").strip()

    def query(self, query: str, k: int = 5) -> dict:
        """
        Process a query through the RAG pipeline.

        Args:
            query (str): User query
            k (int): Number of documents to retrieve

        Returns:
            dict: 'query', 'answer', and 'retrieved_documents' (a list of
            dicts with 'content', 'metadata' and 'score')
        """
        # Retrieve relevant documents
        retrieved_docs = self.retrieve(query, k)

        # Generate response
        answer = self.generate_response(query, retrieved_docs)

        return {
            "query": query,
            "answer": answer,
            "retrieved_documents": [
                {"content": content, "metadata": meta, "score": score}
                for content, meta, score in retrieved_docs
            ]
        }

2025-07-30 02:12:04.846772: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753841525.042080      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753841525.101739      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
!gdown 1--7L-BtJwrQB7yXcfPgV9_zQfn9nK-S5

df = pd.read_csv("bills.tsv.gz", sep="\t", compression="gzip")
df = df.rename(columns={"filename": "name"})
df["type"] = "bill"

def chunk_df_by_words(df, chunk_size=500, overlap=100):
    """Split each row's 'content' into overlapping word windows.

    Consecutive windows start ``chunk_size - overlap`` words apart. Chunking
    of a document stops as soon as a window reaches its last word, so no
    trailing chunk is emitted that only repeats words already covered by the
    previous window.

    Args:
        df: DataFrame with 'name', 'type' and 'content' columns.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared by consecutive chunks (must be
            smaller than chunk_size).

    Returns:
        DataFrame with columns 'name', 'type', 'chunk' (0-based chunk number
        within the source row) and 'content'. Rows whose content is missing
        or contains no words contribute no chunks.

    Raises:
        ValueError: If chunk_size is not positive or overlap >= chunk_size.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        # A non-positive step would either crash range() or silently yield nothing.
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    chunks = []
    for name, doc_type, content in zip(df['name'], df['type'], df['content']):
        if pd.isna(content):
            continue  # missing text: nothing to chunk
        words = str(content).split()
        for chunk_no, start in enumerate(range(0, len(words), step)):
            chunks.append({
                'name': name,
                'type': doc_type,
                'chunk': chunk_no,
                'content': ' '.join(words[start:start + chunk_size])
            })
            # This window already reached the end of the document; any further
            # window would lie entirely inside the overlap.
            if start + chunk_size >= len(words):
                break
    # Explicit columns keep the schema stable even when no chunks were produced.
    return pd.DataFrame(chunks, columns=['name', 'type', 'chunk', 'content'])

# Replace the per-bill rows with their word-window chunks.
df = df.pipe(chunk_df_by_words)

# Overwrite the downloaded archive with the chunked rows; the RAG cell reads
# this same path.
df.to_csv("bills.tsv.gz", sep="\t", compression="gzip", index=False)

df.head()

Downloading...
From: https://drive.google.com/uc?id=1--7L-BtJwrQB7yXcfPgV9_zQfn9nK-S5
To: /kaggle/working/bills.tsv.gz
100%|██████████████████████████████████████| 6.17M/6.17M [00:00<00:00, 31.6MB/s]


Unnamed: 0,name,type,chunk,content
0,2010-10-16-2010_E.txt,bill,0,THE GAZETTE OF THE DEMOCRATIC SOCIALIST REPUBL...
1,2010-10-16-2010_E.txt,bill,1,Local Authorities Elections Ordinance (Cap. 26...
2,2010-10-16-2010_E.txt,bill,2,Order made under section 3C of the Local Autho...
3,2010-10-16-2010_E.txt,bill,3,beginning from the words “Where a budget or su...
4,2010-10-16-2010_E.txt,bill,4,"and fraction, the integer shall be deemed to b..."


In [4]:
from kaggle_secrets import UserSecretsClient

# The API key comes from Kaggle's secret store rather than being hard-coded.
tsv_path = "bills.tsv.gz"
openai_api_key = UserSecretsClient().get_secret("openai_api_key")

# Build (or load from cache) the index over the chunked bills.
rag = RAGSystem(tsv_path, openai_api_key)

# Run one example question through retrieval + generation.
query = "What are the main objectives of the Jayanthipura association in community welfare and environment?"
result = rag.query(query, k=3)

# Show the answer followed by a short preview of each supporting chunk.
print(
    f"Query: {result['query']}",
    f"Answer: {result['answer']}",
    "\nRetrieved Documents:",
    sep="\n",
)
for doc in result['retrieved_documents']:
    preview, meta, score = doc['content'][:100], doc['metadata'], doc['score']
    print(f"\nContent: {preview}...")
    print(f"Metadata: {meta}")
    print(f"Score: {score:.4f}")

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Batches:   0%|          | 0/383 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query: What are the main objectives of the Jayanthipura association in community welfare and environment?
Answer: The provided contexts do not contain specific information regarding the main objectives of the Jayanthipura association in community welfare and environment. Therefore, I cannot provide a definitive answer based on the given documents. 

In general, community welfare and environmental objectives for such associations typically include activities like social development, environmental conservation, promoting harmony among community members, and improving living conditions. For precise objectives, please refer to the official documents or statements directly related to the Jayanthipura association.

Retrieved Documents:

Content: Avamangalyadara Samithi affiliated to the Kotasara Piyangala Raja Maha Viharastha Sanwardena Sabhawa...
Metadata: {'name': '2015-12-75-2015_E.txt', 'type': 'bill'}
Score: 0.0495

Content: religious favour among Buddhists and the people of the area; (