In [2]:
!pip install openai qdrant-client tqdm

Collecting qdrant-client
  Downloading qdrant_client-1.13.2-py3-none-any.whl.metadata (10 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant-client)
  Downloading grpcio_tools-1.70.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-tools>=1.41.0->qdrant-client)
  Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading qdrant_client-1.13.2-py3-none-any.whl (306 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.6/306.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading grpcio_tools-1.70.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-any

In [10]:
import json
import os
import openai
from qdrant_client import QdrantClient
from qdrant_client.http import models
from tqdm import tqdm



# --- Configuration -----------------------------------------------------------
# Credentials are read from the environment so real keys never have to be pasted
# into the notebook. The original placeholder strings are kept as fallbacks, so
# the cell behaves exactly as before when the variables are not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "API-kEY")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "API-kEY")
QDRANT_URL = os.environ.get("QDRANT_URL", "API-kEY")
# NOTE(review): not referenced by the code visible in this notebook, which uses
# `collection_name` below instead - confirm before removing.
QDRANT_COLLECTION_NAME = "NPCI"

EMBEDDING_MODEL = "text-embedding-3-small"

# --- Input data --------------------------------------------------------------
# The file is parsed as JSON; later code reads the list stored under data['circulars'].
with open('trimmed_data.txt', 'r', encoding='utf-8') as file:
    data = json.load(file)

# --- Service clients ---------------------------------------------------------
client = openai.OpenAI(api_key=OPENAI_API_KEY)

qdrant_client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY
)

# Vector size the collection is created with; it is expected to equal the length
# of the vectors returned by EMBEDDING_MODEL.
embedding_dimension = 1536

# --- Collection bootstrap ----------------------------------------------------
collection_name = "rbi_circulars"
# Ask Qdrant directly whether the collection exists. Treating *any* exception
# from get_collection() as "missing" would also swallow authentication and
# network failures and then attempt to create the collection anyway.
if qdrant_client.collection_exists(collection_name):
    print(f"Collection '{collection_name}' already exists")
else:
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=embedding_dimension,
            distance=models.Distance.COSINE
        )
    )
    print(f"Created new collection: '{collection_name}'")

def process_circular(circular):
    """Turn one circular record into an embedding plus its Qdrant payload.

    The record's header fields and, when present, the titled sections under
    ``circular['details']['circular']['contentSections']`` are flattened into a
    single text, which is then embedded with ``EMBEDDING_MODEL``.

    Args:
        circular: Mapping with the keys 'Subject', 'Department',
            'Circular Number', 'Date Of Issue', 'Meant For' and 'link';
            'details' is optional.

    Returns:
        Tuple ``(embedding, metadata, content_text)`` where ``metadata`` holds
        the header fields, the link and the first 1000 characters of the text.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    pieces = [
        f"Title: {circular['Subject']}\n",
        f"Department: {circular['Department']}\n",
        f"Circular Number: {circular['Circular Number']}\n",
        f"Date: {circular['Date Of Issue']}\n",
        f"Meant For: {circular['Meant For']}\n\n",
    ]

    has_body = 'details' in circular and 'circular' in circular['details']
    if has_body:
        body = circular['details']['circular']
        sections = body['contentSections'] if 'contentSections' in body else []
        for part in sections:
            # Empty or missing titles/contents are skipped rather than rendered.
            if 'title' in part and part['title']:
                pieces.append(f"Section: {part['title']}\n")
            if 'content' in part and part['content']:
                pieces.append(f"{part['content']}\n\n")

    content_text = "".join(pieces)

    # A single input is sent, so the only embedding comes back at index 0.
    embedding = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=content_text,
    ).data[0].embedding

    metadata = dict(
        title=circular['Subject'],
        department=circular['Department'],
        circular_number=circular['Circular Number'],
        date=circular['Date Of Issue'],
        meant_for=circular['Meant For'],
        link=circular['link'],
        text=content_text[:1000],  # first 1000 chars, used as a preview
    )

    return embedding, metadata, content_text

print(f"Processing {len(data['circulars'])} circulars...")

batch_size = 100
points_to_upload = []


def _upload_batch(points):
    """Upsert `points` into the collection; return True on success.

    Failures are reported and swallowed (best effort) so that one bad batch
    does not abort the whole run. An empty batch is a no-op.
    """
    if not points:
        return True
    try:
        qdrant_client.upsert(
            collection_name=collection_name,
            points=points
        )
    except Exception as e:
        print(f"Error uploading batch of {len(points)} circulars: {str(e)}")
        return False
    print(f"Uploaded batch of {len(points)} circulars")
    return True


for i, circular in enumerate(tqdm(data['circulars'])):
    try:
        embedding, metadata, content_text = process_circular(circular)
        # The point id is the circular's position in the list, so re-running
        # the cell overwrites the same points instead of duplicating them.
        point = models.PointStruct(
            id=i,
            vector=embedding,
            payload=metadata
        )
    except Exception as e:
        print(f"Error processing circular {i}: {str(e)}")
        continue

    points_to_upload.append(point)

    # Keep the buffer when the upload fails so the next flush retries it.
    if len(points_to_upload) >= batch_size and _upload_batch(points_to_upload):
        points_to_upload = []

# Flush whatever is left AFTER the loop. Doing this inside the loop body only
# when `i` is the last index loses the final partial batch whenever the last
# circular itself fails to process.
if _upload_batch(points_to_upload):
    points_to_upload = []

print("Finished uploading all circulars to Qdrant Cloud")

def search_circulars(query, limit=5):
    """Return the stored circulars most similar to a free-text query.

    The query is embedded with ``EMBEDDING_MODEL`` and used for a vector search
    in the ``collection_name`` collection.

    Args:
        query: Natural-language search text.
        limit: Maximum number of hits to return.

    Returns:
        A list of dicts with the keys 'score', 'circular_number', 'title',
        'link' and 'preview'. Payload fields that are absent come back as None.
    """
    response = client.embeddings.create(
        input=query,
        model=EMBEDDING_MODEL
    )
    query_embedding = response.data[0].embedding

    # QdrantClient.search() is deprecated (it emitted a warning when this cell
    # was last run); query_points() is its replacement and returns the hits on
    # the `.points` attribute of the response.
    search_results = qdrant_client.query_points(
        collection_name=collection_name,
        query=query_embedding,
        limit=limit
    ).points

    results = []
    for result in search_results:
        payload = result.payload or {}  # payload is optional on a scored point
        results.append({
            "score": result.score,
            "circular_number": payload.get("circular_number"),
            "title": payload.get("title"),
            "link": payload.get("link"),
            "preview": payload.get("text")
        })

    return results

if __name__ == "__main__":
    # Smoke test: run one sample question against the index and print each hit.
    query = "What are the recent changes to prudential norms for urban cooperative banks?"
    results = search_circulars(query)

    print("\nSearch Results:")
    labelled_fields = (
        ("Circular", "circular_number"),
        ("Title", "title"),
        ("Link", "link"),
    )
    for rank, hit in enumerate(results, start=1):
        print(f"\n--- Result {rank} (Score: {hit['score']:.4f}) ---")
        for label, key in labelled_fields:
            print(f"{label}: {hit[key]}")
        # Only the first 200 characters of the stored preview are shown.
        print(f"Preview: {hit['preview'][:200]}...")

Collection 'rbi_circulars' already exists
Processing 50 circulars...


100%|██████████| 50/50 [00:17<00:00,  2.89it/s]

Uploaded batch of 50 circulars
Finished uploading all circulars to Qdrant Cloud

Search Results:

--- Result 1 (Score: 0.6374) ---
Circular: RBI/2024-2025/58DOR.CAP.REC.No.27/09.18.201/2024-25
Title: Prudential Treatment of Bad and Doubtful Debt Reserve by Co-operative Banks
Link: https://m.rbi.org.in//scripts/BS_CircularIndexDisplay.aspx?Id=12716
Preview: Title: Prudential Treatment of Bad and Doubtful Debt Reserve by Co-operative Banks
Department: Department of Regulation
Circular Number: RBI/2024-2025/58DOR.CAP.REC.No.27/09.18.201/2024-25
Date: 02.8....

--- Result 2 (Score: 0.6092) ---
Circular: RBI/2024-2025/53DOR.CRE.REC.28/07.10.002/2024-25
Title: Small Value Loans – Primary (Urban) Co-operative Banks (UCBs)
Link: https://m.rbi.org.in//scripts/BS_CircularIndexDisplay.aspx?Id=12709
Preview: Title: Small Value Loans – Primary (Urban) Co-operative Banks (UCBs)
Department: Department of Regulation
Circular Number: RBI/2024-2025/53DOR.CRE.REC.28/07.10.002/2024-25
Date: 25.7.2024
Mean


  search_results = qdrant_client.search(


In [5]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.18.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.9.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [7]:
!pip install markdown2

Collecting markdown2
  Downloading markdown2-2.5.3-py3-none-any.whl.metadata (2.1 kB)
Downloading markdown2-2.5.3-py3-none-any.whl (48 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: markdown2
Successfully installed markdown2-2.5.3


In [13]:
import json
import os
import openai
from qdrant_client import QdrantClient
from qdrant_client.http import models
from tqdm import tqdm
from typing import List, Dict, Any
import gradio as gr
import markdown2
import requests
from bs4 import BeautifulSoup


# Credentials are read from the environment so real keys never have to be pasted
# into the notebook. The original placeholder strings are kept as fallbacks, so
# the cell behaves exactly as before when the variables are not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "API-kEY")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "API-kEY")
QDRANT_URL = os.environ.get("QDRANT_URL", "API-kEY")
COLLECTION_NAME = "rbi_circulars"      # Qdrant collection that is searched
EMBEDDING_MODEL = "text-embedding-3-small"  # model used to embed queries
LLM_MODEL = "gpt-3.5-turbo"            # chat model used to write the answer

# Shared service clients used by the functions below.
client = openai.OpenAI(api_key=OPENAI_API_KEY)
qdrant_client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY
)

def get_embedding(text: str) -> List[float]:
    """Embed `text` with the OpenAI embeddings endpoint.

    Args:
        text: The string to embed.

    Returns:
        The embedding of the single submitted input, produced by
        ``EMBEDDING_MODEL``.
    """
    # One input is sent, so the only result sits at index 0.
    first_item = client.embeddings.create(model=EMBEDDING_MODEL, input=text).data[0]
    return first_item.embedding

def search_circulars(query: str, limit: int = 5) -> List[Dict[str, Any]]:
    """Return the stored circulars most similar to a free-text query.

    The query is embedded with ``get_embedding`` and used for a vector search
    in the ``COLLECTION_NAME`` collection.

    Args:
        query: Natural-language search text.
        limit: Maximum number of hits to return.

    Returns:
        A list of dicts with the keys 'score', 'circular_number', 'title',
        'department', 'date', 'meant_for', 'link' and 'preview'. Payload fields
        that are absent come back as None.
    """
    query_embedding = get_embedding(query)

    # QdrantClient.search() is deprecated; query_points() is its replacement
    # and returns the hits on the `.points` attribute of the response.
    search_results = qdrant_client.query_points(
        collection_name=COLLECTION_NAME,
        query=query_embedding,
        limit=limit
    ).points

    results = []
    for result in search_results:
        payload = result.payload or {}  # payload is optional on a scored point
        results.append({
            "score": result.score,
            "circular_number": payload.get("circular_number"),
            "title": payload.get("title"),
            "department": payload.get("department"),
            "date": payload.get("date"),
            "meant_for": payload.get("meant_for"),
            "link": payload.get("link"),
            "preview": payload.get("text")
        })

    return results

def fetch_full_circular_content(url: str, timeout: float = 10.0) -> str:
    """Fetch the full content of a circular from its URL.

    Args:
        url: Address of the circular's web page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The text of the page's ``<div class="content">`` element, a fixed
        fallback message when no such element exists, or a string starting
        with "Error fetching content:" when the request or parsing fails.
        This function never raises.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        content_div = soup.find('div', class_='content')
        if content_div:
            return content_div.get_text(strip=True)
        else:
            return "Full content could not be extracted. Please visit the original link."
    except Exception as e:
        # Deliberately broad: callers get a displayable message instead of an
        # exception, whatever went wrong (network, HTTP status, parsing).
        return f"Error fetching content: {str(e)}"

def generate_response(query: str, retrieved_docs: List[Dict[str, Any]]) -> str:
    """Ask the chat model to answer `query` from the retrieved circulars.

    Args:
        query: The user's question.
        retrieved_docs: Search hits; each must provide the keys 'title',
            'circular_number', 'department', 'date' and 'preview'.

    Returns:
        The text of the first completion choice returned by ``LLM_MODEL``.
    """
    # One numbered block per document, each terminated by a blank line.
    context = "".join(
        f"Document {position}:\n"
        f"Title: {doc['title']}\n"
        f"Circular Number: {doc['circular_number']}\n"
        f"Department: {doc['department']}\n"
        f"Date: {doc['date']}\n"
        f"Preview: {doc['preview']}\n\n"
        for position, doc in enumerate(retrieved_docs, start=1)
    )

    prompt = f"""You are an RBI policy expert. Use the following RBI circulars to answer the user's question.
If the information is not in the circulars, say you don't know.

User Query: {query}

Retrieved Circulars:
{context}

Please provide a comprehensive answer based on the information in these circulars.
"""

    conversation = [
        {"role": "system", "content": "You are a helpful assistant specializing in RBI policies and circulars."},
        {"role": "user", "content": prompt},
    ]

    completion = client.chat.completions.create(
        model=LLM_MODEL,
        messages=conversation,
        max_tokens=1000,
        temperature=0.3,
    )

    return completion.choices[0].message.content

def format_results_html(results: List[Dict[str, Any]]) -> str:
    """Render search hits as a block of styled HTML cards.

    All values taken from the hits are HTML-escaped before being inserted, so
    characters such as ``<``, ``&`` or quotes in a title, preview or link are
    displayed literally instead of breaking (or injecting) markup.

    Args:
        results: Search hits; each must provide 'score', 'title',
            'circular_number', 'department', 'date' and 'link'. 'preview' may
            be missing or None, in which case an empty preview is shown.

    Returns:
        One ``<div>`` wrapping a card per hit (an empty wrapper for no hits).
        Each card shows at most the first 300 characters of the preview.
    """
    # Imported under a distinct name so it cannot clash with the local `html`
    # buffer below.
    from html import escape as _escape

    def _esc(value: Any) -> str:
        """Return `value` as text safe for HTML content and quoted attributes."""
        return _escape(str(value), quote=True)

    html = "<div style='font-family: Arial, sans-serif;'>"

    for result in results:
        relevance = int(result["score"] * 100)
        # Truncate BEFORE escaping so an entity such as &amp; is never cut in half.
        preview = str(result.get("preview") or "")[:300]
        html += f"""
        <div style='margin-bottom: 20px; padding: 15px; border-radius: 8px; background-color: #f9f9f9; border-left: 5px solid #2c5282;'>
            <h3 style='color: #2c5282; margin-top: 0;'>{_esc(result["title"])}</h3>
            <div style='display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 10px;'>
                <span style='background-color: #e2e8f0; padding: 5px 10px; border-radius: 15px; font-size: 12px;'>
                    <strong>Circular:</strong> {_esc(result["circular_number"])}
                </span>
                <span style='background-color: #e2e8f0; padding: 5px 10px; border-radius: 15px; font-size: 12px;'>
                    <strong>Department:</strong> {_esc(result["department"])}
                </span>
                <span style='background-color: #e2e8f0; padding: 5px 10px; border-radius: 15px; font-size: 12px;'>
                    <strong>Date:</strong> {_esc(result["date"])}
                </span>
                <span style='background-color: #e2e8f0; padding: 5px 10px; border-radius: 15px; font-size: 12px;'>
                    <strong>Relevance:</strong> {relevance}%
                </span>
            </div>
            <div style='margin-bottom: 10px;'>
                <p style='margin: 0;'>{_esc(preview)}...</p>
            </div>
            <div>
                <a href='{_esc(result["link"])}' target='_blank' style='color: #3182ce; text-decoration: none;'>View Original Circular →</a>
            </div>
        </div>
        """

    html += "</div>"
    return html

def rag_query(query, num_results=5):
    """Main RAG function that handles the entire process.

    Retrieves the most similar circulars for `query`, asks the LLM to answer
    from them, and renders the hits as HTML.

    Args:
        query: The user's question. None, empty or whitespace-only input is
            rejected with a prompt to enter a query.
        num_results: How many circulars to retrieve. Coerced to int because
            the value comes from a UI slider and may presumably arrive as a
            float - TODO confirm against the Gradio version in use.

    Returns:
        Tuple ``(answer_text, results_html)``. ``results_html`` is an empty
        string when there is no query or nothing was found.
    """
    # `not query` also covers None, which has no .strip().
    if not query or not query.strip():
        return "Please enter a query.", ""

    retrieved_docs = search_circulars(query, limit=int(num_results))

    if not retrieved_docs:
        return "No relevant circulars found.", ""

    llm_response = generate_response(query, retrieved_docs)

    formatted_results = format_results_html(retrieved_docs)

    return llm_response, formatted_results

def create_interface():
    """Assemble the Gradio Blocks UI for the RAG demo.

    Layout, top to bottom: two heading lines, an input row (query box next to
    a result-count slider and the Search button), the AI answer, the retrieved
    circulars as HTML, and a list of clickable example queries.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    example_queries = [
        ["What are the recent changes to prudential norms for urban cooperative banks?"],
        ["Explain the guidelines for digital lending"],
        ["What are the regulations for NBFCs regarding loan recovery?"],
        ["Latest updates on UPI payment systems"],
    ]

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# RBI Circulars RAG System")
        gr.Markdown("Query the RBI Circulars database using natural language")

        # Input row: wide query column, narrow controls column.
        with gr.Row():
            with gr.Column(scale=4):
                query_input = gr.Textbox(
                    lines=2,
                    label="Your Query",
                    placeholder="E.g., What are the recent changes to prudential norms for urban cooperative banks?",
                )
            with gr.Column(scale=1):
                num_results = gr.Slider(
                    label="Number of Results",
                    minimum=1,
                    maximum=10,
                    step=1,
                    value=5,
                )
                submit_btn = gr.Button("Search", variant="primary")

        # Output rows: generated answer first, supporting circulars below it.
        with gr.Row():
            with gr.Column():
                response_output = gr.Markdown(label="AI Response")

        with gr.Row():
            results_output = gr.HTML(label="Retrieved Circulars")

        # rag_query returns (answer, html), matching the two output components.
        handler_inputs = [query_input, num_results]
        handler_outputs = [response_output, results_output]
        submit_btn.click(
            fn=rag_query,
            inputs=handler_inputs,
            outputs=handler_outputs,
        )

        gr.Examples(examples=example_queries, inputs=query_input)

    return demo

if __name__ == "__main__":
    # Build the UI and start serving it.
    demo = create_interface()
    # share=True additionally requests a temporary public URL; the recorded
    # output of this cell shows a *.gradio.live link that expires in 72 hours.
    # NOTE(review): anyone holding that link can presumably run queries that
    # use the API keys configured above - confirm this exposure is intended.
    demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d61553d8c125f3b561.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
