<a href="https://colab.research.google.com/github/NadineWaleed/chatbot/blob/main/merged.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers faiss-cpu --quiet
!pip install langchain langchain-community langchain-core chromadb groq pypdf mistralai

In [None]:
import os
import json
import re
import time
from datetime import datetime
from pathlib import Path
import faiss
import numpy as np
from groq import Groq
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from mistralai import Mistral
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
from mistralai.models import OCRResponse
from IPython.display import Markdown, display

In [None]:
# SECURITY: API credentials are read from the environment instead of being
# hard-coded in the notebook. Any key that was previously committed here must be
# treated as leaked and revoked with the provider.
_missing_keys = [
    name
    for name in ("MISTRAL_API_KEY", "GROQ_API_KEY", "HF_TOKEN")
    if not os.environ.get(name)
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

# Mistral client, used for file upload and OCR.
client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
# Groq client, used for chat completions.
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])
print("Groq API initialized.")
from huggingface_hub import HfApi
api = HfApi()
api_token = os.environ["HF_TOKEN"]
# whoami() fails if the Hugging Face token is invalid, so this doubles as a check.
me = api.whoami(token=api_token)
print(me)

In [None]:
# Folder holding the source PDFs to OCR.
folder_path = Path("/content/AllDocs")
# Sort so documents are always processed (and concatenated) in the same order;
# glob() yields entries in a filesystem-dependent order.
pdf_files = sorted(folder_path.glob("*.pdf"))
# Raise explicitly instead of using assert: asserts are stripped under `python -O`.
if not pdf_files:
    raise FileNotFoundError("No PDF files found in the specified folder.")


In [None]:
# Bare expression: the notebook shows the list of discovered PDF paths as the cell output.
pdf_files

In [None]:
# Markdown text of every OCR'd page, across all PDFs, in processing order.
all_markdowns = []

for pdf_file in pdf_files:
    print(f"Processing: {pdf_file.name}")

    # Upload the raw PDF bytes so the OCR endpoint can read the document.
    uploaded_file = client.files.upload(
        file={
            "file_name": pdf_file.stem,
            "content": pdf_file.read_bytes(),
        },
        purpose="ocr",
    )

    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

    pdf_response = client.ocr.process(
        document=DocumentURLChunk(document_url=signed_url.url),
        model="mistral-ocr-latest",
    )

    response_dict = json.loads(pdf_response.model_dump_json())
    json_string = json.dumps(response_dict, indent=4)
    # One JSON file per PDF: a single shared 'ocr_output.json' was overwritten on
    # every iteration, so only the last document's OCR result survived.
    with open(f"ocr_output_{pdf_file.stem}.json", 'w', encoding='utf-8') as f:
        f.write(json_string)

    print("OCR processing complete")
    for page in response_dict['pages']:
        content = page['markdown']
        # Normalise headings so the leading '#' run is followed by exactly one space.
        content = re.sub(r'^(#+)\s*', r'\1 ', content, flags=re.MULTILINE)
        all_markdowns.append(content)


# Pages are separated by a blank line in the combined document.
combined_markdown = "\n\n".join(all_markdowns)
with open('combined_markdown.txt', 'w', encoding='utf-8') as f:
    f.write(combined_markdown)

print("✅ All PDFs processed and combined into markdown.")


In [None]:
# Load the combined OCR markdown from disk into memory.
with open("combined_markdown.txt", "r", encoding="utf-8") as f:
    document_text = f.read()


In [None]:
# === Setup ===
import os
import re
import json
import numpy as np
import faiss
from pathlib import Path
from sentence_transformers import SentenceTransformer
from groq import Groq
from langchain.memory import ConversationBufferMemory

# === Config ===
# SECURITY: the Groq API key is read from the environment instead of being
# hard-coded here. Any key previously committed in this cell must be treated as
# leaked and revoked.
if not os.environ.get("GROQ_API_KEY"):
    raise RuntimeError("GROQ_API_KEY is not set; export it before running this cell.")
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])
# Sentence-embedding model used for both passages and queries.
embedder = SentenceTransformer("intfloat/e5-small-v2")
# Conversation memory object; created here but not referenced by the functions in this cell.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# === Load and preprocess document ===
# Read the combined OCR markdown that the retrieval pipeline will chunk and index.
with open("combined_markdown.txt", "r", encoding="utf-8") as f:
    document_text = f.read()

# === Chunk by headers ===
def chunk_text_with_headers(text, chunk_size=500, overlap=50):
    """Split markdown text into overlapping, line-aligned chunks tagged with a header.

    Lines are accumulated until the buffer holds at least ``chunk_size``
    whitespace-separated words. The buffer is then emitted as a chunk, and the
    trailing lines that fit within ``overlap`` words are carried into the next
    chunk.

    Args:
        text: Markdown document text.
        chunk_size: Minimum number of words a chunk must reach before it is emitted.
        overlap: Maximum number of words carried over from the end of one chunk
            to the start of the next. Values >= ``chunk_size`` are clamped to
            ``chunk_size - 1`` so every chunk contains new content.

    Returns:
        A list of ``{"text": ..., "header": ...}`` dicts. ``header`` is the most
        recent markdown heading seen up to the end of the chunk, or ``"Unknown"``
        if no heading has been seen yet.
    """
    # Both limits are measured in words. Slicing the buffer by *lines* let it stay
    # above chunk_size, which emitted a near-duplicate chunk for every new line.
    max_carry = max(0, min(overlap, chunk_size - 1))

    chunks = []
    current_header = "Unknown"
    buffer = []
    buffered_words = 0  # running word count of `buffer` (avoids re-joining per line)
    has_new_lines = False  # True once the buffer holds more than carried-over overlap

    for line in text.split('\n'):
        heading = re.match(r'^#+\s+(.*)', line.strip())
        if heading:
            # Drop an optional closing '#' run ("## Title ##") but keep '#'
            # characters that belong to the title itself ("C# Guide").
            current_header = re.sub(r'\s+#+$', '', heading.group(1)).strip()
        buffer.append(line)
        buffered_words += len(line.split())
        has_new_lines = True

        if buffered_words >= chunk_size:
            chunks.append({"text": "\n".join(buffer).strip(), "header": current_header})

            # Carry over as many trailing lines as fit in the overlap budget.
            carried = []
            carried_words = 0
            for previous in reversed(buffer):
                n_words = len(previous.split())
                if carried_words + n_words > max_carry:
                    break
                carried.append(previous)
                carried_words += n_words
            carried.reverse()

            buffer = carried
            buffered_words = carried_words
            has_new_lines = False

    # Emit the tail only if it contains lines not already present in the last chunk.
    if buffer and has_new_lines:
        chunks.append({"text": "\n".join(buffer).strip(), "header": current_header})

    return chunks

# Chunk the whole document using the function's default chunk_size / overlap.
headered_chunks = chunk_text_with_headers(document_text)

# === Embedding ===
def embed_chunks(chunks):
    """Embed chunk texts and return the vectors with their texts and headers.

    Each chunk's text is prefixed with ``"passage: "`` before encoding.

    Args:
        chunks: Iterable of dicts with ``"text"`` and ``"header"`` keys.

    Returns:
        A tuple ``(embeddings, prefixed_texts, headers)``: the encoder output
        wrapped in a NumPy array, the prefixed texts that were encoded, and the
        header of each chunk, all in input order.
    """
    prefixed_texts = []
    section_headers = []
    for entry in chunks:
        prefixed_texts.append(f"passage: {entry['text']}")
        section_headers.append(entry['header'])

    vectors = embedder.encode(prefixed_texts, show_progress_bar=True)
    return np.array(vectors), prefixed_texts, section_headers

def build_faiss_index(embeddings):
    """Build a flat L2 FAISS index containing every row of ``embeddings``.

    Args:
        embeddings: 2-D array; its second dimension sets the index dimensionality.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    vector_index = faiss.IndexFlatL2(embeddings.shape[1])
    vector_index.add(embeddings)
    return vector_index

# Embed every chunk, then index the vectors for nearest-neighbour search.
embeddings, chunk_texts, headers = embed_chunks(headered_chunks)
index = build_faiss_index(embeddings)

# === Smart Prompt Handling ===
def extract_section_from_query(query):
    """Guess which "... components" section a query refers to.

    Looks for one of the prepositions about/of/regarding/on (whole words only,
    case-insensitive) followed by text ending in "components", and returns that
    text with the first letter of every word upper-cased. The rest of each word
    is left untouched so acronyms such as "IQVIA" keep their casing.

    Args:
        query: The user's question.

    Returns:
        The extracted section name, or ``"Components"`` when no such phrase is found.
    """
    # \b stops the prepositions from matching inside longer words
    # (e.g. the "on" at the end of "information").
    match = re.search(
        r"\b(?:about|of|regarding|on)\s+(.*?components)", query, flags=re.IGNORECASE
    )
    if not match:
        return "Components"

    section = match.group(1).strip()
    # Capitalise word-initial letters only; str.title() would lower-case acronyms.
    return re.sub(r"(^|\s)(\S)", lambda m: m.group(1) + m.group(2).upper(), section)

def is_structural_component_question(query: str) -> bool:
    """Return True when the query looks like a "what/list ... components ... of" question.

    The lower-cased query must contain, in this order, "what" or "list", then one
    of "components", "modules", "parts" or "features", then "of". These are plain
    substring matches, so "of" inside a longer word also counts.
    """
    pattern = r"(what|list).*(components|modules|parts|features).*of"
    return re.search(pattern, query.lower()) is not None

def build_prompt(query, context):
    """Render the LLM prompt for ``query`` using the retrieved ``context``.

    Structural component questions (as decided by
    ``is_structural_component_question``) get an exhaustive section-extraction
    prompt naming the section returned by ``extract_section_from_query``; every
    other query gets a general context-only answering prompt.

    The rendered prompt is stripped of leading and trailing whitespace, which
    also drops any trailing whitespace in ``query``.
    """
    if not is_structural_component_question(query):
        general_prompt = f"""
You are a precise assistant. Use ONLY the context below to answer the question in a complete and well-explained way.

Rules:
- Include all relevant points.
- Use bullet points or paragraphs for structure.
- Do not make up any information.
- Be clear and detailed.
- Use clean formatting.

Context:
{context}

Question:
{query}
"""
        return general_prompt.strip()

    section_name = extract_section_from_query(query)
    structural_prompt = f"""
You are given structured text extracted from a document. It may contain headings and bullet points.

Your task is to extract **all detailed information related to the section titled \"{section_name}\"**.
This includes any sub-points or bullet entries listed under it.

Be exhaustive:
- Maintain structure and indentation
- Include every bullet, sub-point, and associated description
- Add brief explanations if available in context

Return a complete list using bullet points or numbered lists.

Only use the provided context. Do not make up information.

Context:
{context}

Question:
{query}
"""
    return structural_prompt.strip()

# === Token Safe Querying ===
def truncate_text(text, max_words=400):
    """Return at most the first ``max_words`` words of ``text``, layout intact.

    Words are maximal runs of non-whitespace characters. The whitespace between
    the kept words (newlines, indentation) is preserved, so markdown structure
    such as bullet lists survives. Leading whitespace, and anything after the
    last kept word, is dropped.

    Args:
        text: Text to truncate.
        max_words: Word limit, applied with list-slice semantics (``words[:max_words]``).

    Returns:
        The truncated text, or ``""`` if no words are kept.
    """
    # Previously this was " ".join(text.split()[:max_words]), which flattened
    # every newline into a space even when nothing had to be cut.
    spans = [m.span() for m in re.finditer(r"\S+", text)]
    kept = spans[:max_words]
    if not kept:
        return ""
    return text[kept[0][0]:kept[-1][1]]

def count_tokens(text):
    """Approximate the token count of ``text`` as its number of whitespace-separated words."""
    words = text.split()
    return len(words)

def query_groq(prompt: str, model: str = "llama3-70b-8192", temperature: float = 0.3, max_tokens: int = 4000) -> str:
    """Send ``prompt`` to the Groq chat-completions API and return the reply text.

    Uses the module-level ``groq_client``. A fixed system message instructs the
    model to answer only from the provided context.

    Args:
        prompt: Full user prompt (instructions, context and question).
        model: Groq model identifier.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion token limit passed to the API.

    Returns:
        The content of the first returned choice, stripped of surrounding whitespace.
    """
    system_message = {
        "role": "system",
        "content": "You are a precise and helpful assistant. Only answer using the context provided.",
    }
    user_message = {"role": "user", "content": prompt}

    completion = groq_client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def clean_response(response):
    """Normalise markdown in a model reply for plain-text display.

    - Heading markers (``#``, ``##``, ...) at the start of a line are removed.
    - Line-leading ``-`` / ``*`` bullet markers are replaced with ``•``.
    - Indentation is preserved; numbered list items are left unchanged.

    Only markers at the start of a line are touched, so inline text such as
    ``C# code``, ``**bold** text`` or ``a - b`` is not altered.

    Args:
        response: Raw reply text.

    Returns:
        The cleaned text.
    """
    # Anchor to line starts (MULTILINE) and match only spaces/tabs so the
    # patterns can neither fire mid-sentence nor swallow newlines.
    response = re.sub(r'^([ \t]*)#+[ \t]+', r'\1', response, flags=re.MULTILINE)
    response = re.sub(r'^([ \t]*)[-*][ \t]+', r'\1• ', response, flags=re.MULTILINE)
    return response

# === Final RAG Function ===
def ask_question_with_groq(query: str, index, chunk_texts, headers, top_k: int = 3, max_tokens_allowed: int = 5500):
    """Answer ``query`` from the indexed document using retrieval plus the Groq LLM.

    The query is embedded with a ``"query: "`` prefix and the ``top_k`` nearest
    chunks are fetched from ``index``. Chunks are truncated and added to the
    context until the word budget ``max_tokens_allowed`` would be exceeded. The
    model reply is cleaned and a list of source headers is appended.

    Args:
        query: The user's question.
        index: FAISS index built over the chunk embeddings.
        chunk_texts: Chunk texts, positionally aligned with the index.
        headers: Chunk headers, positionally aligned with ``chunk_texts``.
        top_k: Number of nearest chunks to retrieve.
        max_tokens_allowed: Context budget, measured with ``count_tokens``.

    Returns:
        The cleaned answer followed by a "Sources:" list naming the header of
        each chunk that was placed in the context.
    """
    query_embedding = embedder.encode(f"query: {query}")
    _, indices = index.search(np.array([query_embedding]), top_k)

    passage_prefix = "passage: "
    context_chunks = []
    used_indices = []
    total_tokens = 0

    for idx in indices[0]:
        # FAISS pads the result with -1 when the index holds fewer than top_k
        # vectors; used as a list index that would silently pick the last chunk.
        if idx < 0:
            continue
        chunk = chunk_texts[idx]
        # The "passage: " prefix exists only for the embedding model; keep it
        # out of the context shown to the LLM.
        if chunk.startswith(passage_prefix):
            chunk = chunk[len(passage_prefix):]
        chunk = truncate_text(chunk)
        tokens = count_tokens(chunk)
        if total_tokens + tokens > max_tokens_allowed:
            break
        context_chunks.append(chunk)
        used_indices.append(idx)
        total_tokens += tokens

    context = "\n\n".join(context_chunks)
    prompt = build_prompt(query, context)
    raw_response = query_groq(prompt)
    cleaned = clean_response(raw_response)

    sources = "\n\nSources:\n" + "\n".join(
        f"- Chunk {i+1}: {headers[idx]}" for i, idx in enumerate(used_indices)
    )

    return cleaned + sources

# === Example Usage ===
# Smoke test: runs when this code is executed as the main module (as a notebook
# cell or as a script). Each question triggers one retrieval and one Groq call.
if __name__ == "__main__":
    questions = [
        "What are the IQVIA components and their functions?",
        "What are the GAHAC components?",
        "Explain POS",
        "State to me all that has ASCII File",
        "What are the business rules for adding new family?",
        "What Main challenges do we face in Universal health coverage?"
    ]
    for q in questions:
        print(f"\n🔍 Question: {q}")
        print(ask_question_with_groq(q, index, chunk_texts, headers))


In [None]:
# Ask a single ad-hoc question against the index built in the previous cell.
print(ask_question_with_groq("State to me all that has ASCII File", index, chunk_texts, headers))

In [None]:
# === Setup ===
import os
import re
import json
import numpy as np
import faiss
from pathlib import Path
from sentence_transformers import SentenceTransformer
from groq import Groq
from langchain.memory import ConversationBufferMemory

# === Config ===
# SECURITY: the Groq API key is read from the environment instead of being
# hard-coded here. Any key previously committed in this cell must be treated as
# leaked and revoked.
if not os.environ.get("GROQ_API_KEY"):
    raise RuntimeError("GROQ_API_KEY is not set; export it before running this cell.")
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])
# Sentence-embedding model used for both passages and queries.
embedder = SentenceTransformer("intfloat/e5-small-v2")
# Conversation memory object; created here but not referenced by the functions in this cell.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# === Load and preprocess document ===
# Read the combined OCR markdown that the retrieval pipeline will chunk and index.
with open("combined_markdown.txt", "r", encoding="utf-8") as f:
    document_text = f.read()

# === Chunk by headers ===
def chunk_text_with_headers(text, chunk_size=300, overlap=100):
    """Split markdown text into overlapping, line-aligned chunks tagged with a header.

    Lines are accumulated until the buffer holds at least ``chunk_size``
    whitespace-separated words. The buffer is then emitted as a chunk, and the
    trailing lines that fit within ``overlap`` words are carried into the next
    chunk.

    Args:
        text: Markdown document text.
        chunk_size: Minimum number of words a chunk must reach before it is emitted.
        overlap: Maximum number of words carried over from the end of one chunk
            to the start of the next. Values >= ``chunk_size`` are clamped to
            ``chunk_size - 1`` so every chunk contains new content.

    Returns:
        A list of ``{"text": ..., "header": ...}`` dicts. ``header`` is the most
        recent markdown heading seen up to the end of the chunk, or ``"Unknown"``
        if no heading has been seen yet.
    """
    # Both limits are measured in words. Slicing the buffer by *lines* let it stay
    # above chunk_size, which emitted a near-duplicate chunk for every new line.
    max_carry = max(0, min(overlap, chunk_size - 1))

    chunks = []
    current_header = "Unknown"
    buffer = []
    buffered_words = 0  # running word count of `buffer` (avoids re-joining per line)
    has_new_lines = False  # True once the buffer holds more than carried-over overlap

    for line in text.split('\n'):
        heading = re.match(r'^#+\s+(.*)', line.strip())
        if heading:
            # Drop an optional closing '#' run ("## Title ##") but keep '#'
            # characters that belong to the title itself ("C# Guide").
            current_header = re.sub(r'\s+#+$', '', heading.group(1)).strip()
        buffer.append(line)
        buffered_words += len(line.split())
        has_new_lines = True

        if buffered_words >= chunk_size:
            chunks.append({"text": "\n".join(buffer).strip(), "header": current_header})

            # Carry over as many trailing lines as fit in the overlap budget.
            carried = []
            carried_words = 0
            for previous in reversed(buffer):
                n_words = len(previous.split())
                if carried_words + n_words > max_carry:
                    break
                carried.append(previous)
                carried_words += n_words
            carried.reverse()

            buffer = carried
            buffered_words = carried_words
            has_new_lines = False

    # Emit the tail only if it contains lines not already present in the last chunk.
    if buffer and has_new_lines:
        chunks.append({"text": "\n".join(buffer).strip(), "header": current_header})

    return chunks

# Chunk the whole document using the function's default chunk_size / overlap.
headered_chunks = chunk_text_with_headers(document_text)

# === Embedding ===
def embed_chunks(chunks):
    """Embed chunk texts and return the vectors with their texts and headers.

    Each chunk's text is prefixed with ``"passage: "`` before encoding.

    Args:
        chunks: Iterable of dicts with ``"text"`` and ``"header"`` keys.

    Returns:
        A tuple ``(embeddings, prefixed_texts, headers)``: the encoder output
        wrapped in a NumPy array, the prefixed texts that were encoded, and the
        header of each chunk, all in input order.
    """
    prefixed_texts = []
    section_headers = []
    for entry in chunks:
        prefixed_texts.append(f"passage: {entry['text']}")
        section_headers.append(entry['header'])

    vectors = embedder.encode(prefixed_texts, show_progress_bar=True)
    return np.array(vectors), prefixed_texts, section_headers

def build_faiss_index(embeddings):
    """Build a flat L2 FAISS index containing every row of ``embeddings``.

    Args:
        embeddings: 2-D array; its second dimension sets the index dimensionality.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    vector_index = faiss.IndexFlatL2(embeddings.shape[1])
    vector_index.add(embeddings)
    return vector_index

# Embed every chunk, then index the vectors for nearest-neighbour search.
embeddings, chunk_texts, headers = embed_chunks(headered_chunks)
index = build_faiss_index(embeddings)

# === Smart Prompt Handling ===
def extract_section_from_query(query):
    """Guess which "... components" section a query refers to.

    Looks for one of the prepositions about/of/regarding/on (whole words only,
    case-insensitive) followed by text ending in "components", and returns that
    text with the first letter of every word upper-cased. The rest of each word
    is left untouched so acronyms such as "IQVIA" keep their casing.

    Args:
        query: The user's question.

    Returns:
        The extracted section name, or ``"Components"`` when no such phrase is found.
    """
    # \b stops the prepositions from matching inside longer words
    # (e.g. the "on" at the end of "information").
    match = re.search(
        r"\b(?:about|of|regarding|on)\s+(.*?components)", query, flags=re.IGNORECASE
    )
    if not match:
        return "Components"

    section = match.group(1).strip()
    # Capitalise word-initial letters only; str.title() would lower-case acronyms.
    return re.sub(r"(^|\s)(\S)", lambda m: m.group(1) + m.group(2).upper(), section)

def is_structural_component_question(query: str) -> bool:
    """Return True when the query looks like a "what/list ... components ... of" question.

    The lower-cased query must contain, in this order, "what" or "list", then one
    of "components", "modules", "parts" or "features", then "of". These are plain
    substring matches, so "of" inside a longer word also counts.
    """
    pattern = r"(what|list).*(components|modules|parts|features).*of"
    return re.search(pattern, query.lower()) is not None

def build_prompt(query, context):
    """Render the LLM prompt for ``query`` using the retrieved ``context``.

    Structural component questions (as decided by
    ``is_structural_component_question``) get an exhaustive section-extraction
    prompt naming the section returned by ``extract_section_from_query``; every
    other query gets a general context-only answering prompt.

    The rendered prompt is stripped of leading and trailing whitespace, which
    also drops any trailing whitespace in ``query``.
    """
    if not is_structural_component_question(query):
        general_prompt = f"""
You are a precise assistant. Use ONLY the context below to answer the question in a complete and well-explained way.

Rules:
- Include all relevant points.
- Use bullet points or paragraphs for structure.
- Do not make up any information.
- Be clear and detailed.
- Use clean formatting.

Context:
{context}

Question:
{query}
"""
        return general_prompt.strip()

    section_name = extract_section_from_query(query)
    structural_prompt = f"""
You are given structured text extracted from a document. It may contain headings and bullet points.

Your task is to extract **all detailed information related to the section titled \"{section_name}\"**.
This includes any sub-points or bullet entries listed under it.

Be exhaustive:
- Maintain structure and indentation
- Include every bullet, sub-point, and associated description
- Add brief explanations if available in context

Return a complete list using bullet points or numbered lists.

Only use the provided context. Do not make up information.

Context:
{context}

Question:
{query}
"""
    return structural_prompt.strip()

# === Token Safe Querying ===
def truncate_text(text, max_words=400):
    """Return at most the first ``max_words`` words of ``text``, layout intact.

    Words are maximal runs of non-whitespace characters. The whitespace between
    the kept words (newlines, indentation) is preserved, so markdown structure
    such as bullet lists survives. Leading whitespace, and anything after the
    last kept word, is dropped.

    Args:
        text: Text to truncate.
        max_words: Word limit, applied with list-slice semantics (``words[:max_words]``).

    Returns:
        The truncated text, or ``""`` if no words are kept.
    """
    # Previously this was " ".join(text.split()[:max_words]), which flattened
    # every newline into a space even when nothing had to be cut.
    spans = [m.span() for m in re.finditer(r"\S+", text)]
    kept = spans[:max_words]
    if not kept:
        return ""
    return text[kept[0][0]:kept[-1][1]]

def count_tokens(text):
    """Approximate the token count of ``text`` as its number of whitespace-separated words."""
    words = text.split()
    return len(words)

def query_groq(prompt: str, model: str = "llama3-70b-8192", temperature: float = 0.3, max_tokens: int = 4000) -> str:
    """Send ``prompt`` to the Groq chat-completions API and return the reply text.

    Uses the module-level ``groq_client``. A fixed system message instructs the
    model to answer only from the provided context.

    Args:
        prompt: Full user prompt (instructions, context and question).
        model: Groq model identifier.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion token limit passed to the API.

    Returns:
        The content of the first returned choice, stripped of surrounding whitespace.
    """
    system_message = {
        "role": "system",
        "content": "You are a precise and helpful assistant. Only answer using the context provided.",
    }
    user_message = {"role": "user", "content": prompt}

    completion = groq_client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def clean_response(response):
    """Normalise markdown in a model reply for plain-text display.

    - Heading markers (``#``, ``##``, ...) at the start of a line are removed.
    - Line-leading ``-`` / ``*`` bullet markers are replaced with ``•``.
    - Indentation is preserved; numbered list items are left unchanged.

    Only markers at the start of a line are touched, so inline text such as
    ``C# code``, ``**bold** text`` or ``a - b`` is not altered.

    Args:
        response: Raw reply text.

    Returns:
        The cleaned text.
    """
    # Anchor to line starts (MULTILINE) and match only spaces/tabs so the
    # patterns can neither fire mid-sentence nor swallow newlines.
    response = re.sub(r'^([ \t]*)#+[ \t]+', r'\1', response, flags=re.MULTILINE)
    response = re.sub(r'^([ \t]*)[-*][ \t]+', r'\1• ', response, flags=re.MULTILINE)
    return response

# === Final RAG Function ===
def ask_question_with_groq(query: str, index, chunk_texts, headers, top_k: int = 8, max_tokens_allowed: int = 5500):
    """Answer ``query`` from the indexed document using retrieval plus the Groq LLM.

    The query is embedded with a ``"query: "`` prefix and the ``top_k`` nearest
    chunks are fetched from ``index``. Chunks are truncated and added to the
    context until the word budget ``max_tokens_allowed`` would be exceeded.

    Args:
        query: The user's question.
        index: FAISS index built over the chunk embeddings.
        chunk_texts: Chunk texts, positionally aligned with the index.
        headers: Unused by this version; kept so the call signature stays stable.
        top_k: Number of nearest chunks to retrieve.
        max_tokens_allowed: Context budget, measured with ``count_tokens``.

    Returns:
        The cleaned model answer.
    """
    query_embedding = embedder.encode(f"query: {query}")
    _, indices = index.search(np.array([query_embedding]), top_k)

    passage_prefix = "passage: "
    context_chunks = []
    total_tokens = 0

    for idx in indices[0]:
        # FAISS pads the result with -1 when the index holds fewer than top_k
        # vectors; used as a list index that would silently pick the last chunk.
        if idx < 0:
            continue
        chunk = chunk_texts[idx]
        # The "passage: " prefix exists only for the embedding model; keep it
        # out of the context shown to the LLM.
        if chunk.startswith(passage_prefix):
            chunk = chunk[len(passage_prefix):]
        chunk = truncate_text(chunk)
        tokens = count_tokens(chunk)
        if total_tokens + tokens > max_tokens_allowed:
            break
        context_chunks.append(chunk)
        total_tokens += tokens

    context = "\n\n".join(context_chunks)
    prompt = build_prompt(query, context)
    raw_response = query_groq(prompt)
    cleaned = clean_response(raw_response)

    return cleaned

# === Example Usage ===
# Smoke test: runs when this code is executed as the main module (as a notebook
# cell or as a script). Each question triggers one retrieval and one Groq call.
if __name__ == "__main__":
    questions = [
        "What are the IQVIA components and their functions?",
        "What are the GAHAC components?",
        "Explain POS",
        "State to me all that has ASCII File",
        "What are the business rules for adding new family?",
        "What Main challenges do we face in Universal health coverage?"
    ]
    for q in questions:
        print(f"\n🔍 Question: {q}")
        print(ask_question_with_groq(q, index, chunk_texts, headers))


In [None]:
import os
import re
import json
import numpy as np
import faiss
from pathlib import Path
from sentence_transformers import SentenceTransformer
from groq import Groq
from langchain.memory import ConversationBufferMemory

# SECURITY: the Groq API key is read from the environment instead of being
# hard-coded here. Any key previously committed in this cell must be treated as
# leaked and revoked.
if not os.environ.get("GROQ_API_KEY"):
    raise RuntimeError("GROQ_API_KEY is not set; export it before running this cell.")
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])
# Sentence-embedding model used for both passages and queries.
embedder = SentenceTransformer("intfloat/e5-small-v2")
# Conversation memory object; created here but not referenced by the functions in this cell.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Read the combined OCR markdown that the retrieval pipeline will chunk and index.
with open("combined_markdown.txt", "r", encoding="utf-8") as f:
    document_text = f.read()

def chunk_text_with_headers(text, chunk_size=300, overlap=100):
    """Split markdown text into overlapping, line-aligned chunks tagged with a header.

    Lines are accumulated until the buffer holds at least ``chunk_size``
    whitespace-separated words. The buffer is then emitted as a chunk, and the
    trailing lines that fit within ``overlap`` words are carried into the next
    chunk.

    Args:
        text: Markdown document text.
        chunk_size: Minimum number of words a chunk must reach before it is emitted.
        overlap: Maximum number of words carried over from the end of one chunk
            to the start of the next. Values >= ``chunk_size`` are clamped to
            ``chunk_size - 1`` so every chunk contains new content.

    Returns:
        A list of ``{"text": ..., "header": ...}`` dicts. ``header`` is the most
        recent markdown heading seen up to the end of the chunk, or ``"Unknown"``
        if no heading has been seen yet.
    """
    # Both limits are measured in words. Slicing the buffer by *lines* let it stay
    # above chunk_size, which emitted a near-duplicate chunk for every new line.
    max_carry = max(0, min(overlap, chunk_size - 1))

    chunks = []
    current_header = "Unknown"
    buffer = []
    buffered_words = 0  # running word count of `buffer` (avoids re-joining per line)
    has_new_lines = False  # True once the buffer holds more than carried-over overlap

    for line in text.split('\n'):
        heading = re.match(r'^#+\s+(.*)', line.strip())
        if heading:
            # Drop an optional closing '#' run ("## Title ##") but keep '#'
            # characters that belong to the title itself ("C# Guide").
            current_header = re.sub(r'\s+#+$', '', heading.group(1)).strip()
        buffer.append(line)
        buffered_words += len(line.split())
        has_new_lines = True

        if buffered_words >= chunk_size:
            chunks.append({"text": "\n".join(buffer).strip(), "header": current_header})

            # Carry over as many trailing lines as fit in the overlap budget.
            carried = []
            carried_words = 0
            for previous in reversed(buffer):
                n_words = len(previous.split())
                if carried_words + n_words > max_carry:
                    break
                carried.append(previous)
                carried_words += n_words
            carried.reverse()

            buffer = carried
            buffered_words = carried_words
            has_new_lines = False

    # Emit the tail only if it contains lines not already present in the last chunk.
    if buffer and has_new_lines:
        chunks.append({"text": "\n".join(buffer).strip(), "header": current_header})

    return chunks

# Chunk the whole document using the function's default chunk_size / overlap.
headered_chunks = chunk_text_with_headers(document_text)

def embed_chunks(chunks):
    """Embed chunk texts and return the vectors with their texts and headers.

    Each chunk's text is prefixed with ``"passage: "`` before encoding.

    Args:
        chunks: Iterable of dicts with ``"text"`` and ``"header"`` keys.

    Returns:
        A tuple ``(embeddings, prefixed_texts, headers)``: the encoder output
        wrapped in a NumPy array, the prefixed texts that were encoded, and the
        header of each chunk, all in input order.
    """
    prefixed_texts = []
    section_headers = []
    for entry in chunks:
        prefixed_texts.append(f"passage: {entry['text']}")
        section_headers.append(entry['header'])

    vectors = embedder.encode(prefixed_texts, show_progress_bar=True)
    return np.array(vectors), prefixed_texts, section_headers

def build_faiss_index(embeddings):
    """Build a flat L2 FAISS index containing every row of ``embeddings``.

    Args:
        embeddings: 2-D array; its second dimension sets the index dimensionality.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    vector_index = faiss.IndexFlatL2(embeddings.shape[1])
    vector_index.add(embeddings)
    return vector_index

# Embed every chunk, then index the vectors for nearest-neighbour search.
embeddings, chunk_texts, headers = embed_chunks(headered_chunks)
index = build_faiss_index(embeddings)

def extract_section_from_query(query):
    """Guess which "... components" section a query refers to.

    Looks for one of the prepositions about/of/regarding/on (whole words only,
    case-insensitive) followed by text ending in "components", and returns that
    text with the first letter of every word upper-cased. The rest of each word
    is left untouched so acronyms such as "IQVIA" keep their casing.

    Args:
        query: The user's question.

    Returns:
        The extracted section name, or ``"Components"`` when no such phrase is found.
    """
    # \b stops the prepositions from matching inside longer words
    # (e.g. the "on" at the end of "information").
    match = re.search(
        r"\b(?:about|of|regarding|on)\s+(.*?components)", query, flags=re.IGNORECASE
    )
    if not match:
        return "Components"

    section = match.group(1).strip()
    # Capitalise word-initial letters only; str.title() would lower-case acronyms.
    return re.sub(r"(^|\s)(\S)", lambda m: m.group(1) + m.group(2).upper(), section)

def is_structural_component_question(query: str) -> bool:
    """Return True when the query looks like a "what/list ... components ... of" question.

    The lower-cased query must contain, in this order, "what" or "list", then one
    of "components", "modules", "parts" or "features", then "of". These are plain
    substring matches, so "of" inside a longer word also counts.
    """
    pattern = r"(what|list).*(components|modules|parts|features).*of"
    return re.search(pattern, query.lower()) is not None

def build_prompt(query, context):
    """Render the LLM prompt for ``query`` using the retrieved ``context``.

    Structural component questions (as decided by
    ``is_structural_component_question``) get an exhaustive section-extraction
    prompt naming the section returned by ``extract_section_from_query``; every
    other query gets a general context-only answering prompt.

    The rendered prompt is stripped of leading and trailing whitespace, which
    also drops any trailing whitespace in ``query``.
    """
    if not is_structural_component_question(query):
        general_prompt = f"""
You are a precise assistant. Use ONLY the context below to answer the question in a complete and well-explained way.

Rules:
- Include all relevant points.
- Use bullet points or paragraphs for structure.
- Do not make up any information.
- Be clear and detailed.
- Use clean formatting.

Context:
{context}

Question:
{query}
"""
        return general_prompt.strip()

    section_name = extract_section_from_query(query)
    structural_prompt = f"""
You are given structured text extracted from a document. It may contain headings and bullet points.

Your task is to extract **all detailed information related to the section titled \"{section_name}\"**.
This includes any sub-points or bullet entries listed under it.

Be exhaustive:
- Maintain structure and indentation
- Include every bullet, sub-point, and associated description
- Add brief explanations if available in context

Return a complete list using bullet points or numbered lists.

Only use the provided context. Do not make up information.

Context:
{context}

Question:
{query}
"""
    return structural_prompt.strip()

def truncate_text(text, max_words=400):
    """Return at most the first ``max_words`` words of ``text``, layout intact.

    Words are maximal runs of non-whitespace characters. The whitespace between
    the kept words (newlines, indentation) is preserved, so markdown structure
    such as bullet lists survives. Leading whitespace, and anything after the
    last kept word, is dropped.

    Args:
        text: Text to truncate.
        max_words: Word limit, applied with list-slice semantics (``words[:max_words]``).

    Returns:
        The truncated text, or ``""`` if no words are kept.
    """
    # Previously this was " ".join(text.split()[:max_words]), which flattened
    # every newline into a space even when nothing had to be cut.
    spans = [m.span() for m in re.finditer(r"\S+", text)]
    kept = spans[:max_words]
    if not kept:
        return ""
    return text[kept[0][0]:kept[-1][1]]

def count_tokens(text):
    """Approximate the token count of ``text`` as its number of whitespace-separated words."""
    words = text.split()
    return len(words)

def query_groq(prompt: str, model: str = "llama3-70b-8192", temperature: float = 0.3, max_tokens: int = 4000) -> str:
    """Send ``prompt`` to the Groq chat-completions API and return the reply text.

    Uses the module-level ``groq_client``. A fixed system message instructs the
    model to answer only from the provided context.

    Args:
        prompt: Full user prompt (instructions, context and question).
        model: Groq model identifier.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion token limit passed to the API.

    Returns:
        The content of the first returned choice, stripped of surrounding whitespace.
    """
    system_message = {
        "role": "system",
        "content": "You are a precise and helpful assistant. Only answer using the context provided.",
    }
    user_message = {"role": "user", "content": prompt}

    completion = groq_client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def clean_response(response):
    """Normalise markdown in a model reply for plain-text display.

    - Heading markers (``#``, ``##``, ...) at the start of a line are removed.
    - Line-leading ``-`` / ``*`` bullet markers are replaced with ``•``.
    - Indentation is preserved; numbered list items are left unchanged.

    Only markers at the start of a line are touched, so inline text such as
    ``C# code``, ``**bold** text`` or ``a - b`` is not altered.

    Args:
        response: Raw reply text.

    Returns:
        The cleaned text.
    """
    # Anchor to line starts (MULTILINE) and match only spaces/tabs so the
    # patterns can neither fire mid-sentence nor swallow newlines.
    response = re.sub(r'^([ \t]*)#+[ \t]+', r'\1', response, flags=re.MULTILINE)
    response = re.sub(r'^([ \t]*)[-*][ \t]+', r'\1• ', response, flags=re.MULTILINE)
    return response

def ask_question_with_groq(query: str, index, chunk_texts, headers, top_k: int = 8, max_tokens_allowed: int = 5500):
    """Answer ``query`` from the indexed document using retrieval plus the Groq LLM.

    The query is embedded with a ``"query: "`` prefix and the ``top_k`` nearest
    chunks are fetched from ``index``. Chunks are truncated and added to the
    context until the word budget ``max_tokens_allowed`` would be exceeded.

    Args:
        query: The user's question.
        index: FAISS index built over the chunk embeddings.
        chunk_texts: Chunk texts, positionally aligned with the index.
        headers: Unused by this version; kept so the call signature stays stable.
        top_k: Number of nearest chunks to retrieve.
        max_tokens_allowed: Context budget, measured with ``count_tokens``.

    Returns:
        The cleaned model answer.
    """
    query_embedding = embedder.encode(f"query: {query}")
    _, indices = index.search(np.array([query_embedding]), top_k)

    passage_prefix = "passage: "
    context_chunks = []
    total_tokens = 0

    for idx in indices[0]:
        # FAISS pads the result with -1 when the index holds fewer than top_k
        # vectors; used as a list index that would silently pick the last chunk.
        if idx < 0:
            continue
        chunk = chunk_texts[idx]
        # The "passage: " prefix exists only for the embedding model; keep it
        # out of the context shown to the LLM.
        if chunk.startswith(passage_prefix):
            chunk = chunk[len(passage_prefix):]
        chunk = truncate_text(chunk)
        tokens = count_tokens(chunk)
        if total_tokens + tokens > max_tokens_allowed:
            break
        context_chunks.append(chunk)
        total_tokens += tokens

    context = "\n\n".join(context_chunks)
    prompt = build_prompt(query, context)
    raw_response = query_groq(prompt)
    cleaned = clean_response(raw_response)

    return cleaned

# === Example Usage ===
# Smoke test: runs when this code is executed as the main module (as a notebook
# cell or as a script). Each question triggers one retrieval and one Groq call.
if __name__ == "__main__":
    questions = [
        "What are the IQVIA components and their functions?",
        "What are the GAHAC components?",
        "Explain POS",
        "State to me all that has ASCII File",
        "What are the business rules for adding new family?",
        "What Main challenges do we face in Universal health coverage?"
    ]
    for q in questions:
        print(f"\n🔍 Question: {q}")
        print(ask_question_with_groq(q, index, chunk_texts, headers))
