In [2]:
!pip install qdrant-client sentence-transformers PyPDF2 python-docx gradio

Collecting qdrant-client
  Using cached qdrant_client-1.15.0-py3-none-any.whl.metadata (11 kB)
Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Using cached python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant-client)
  Using cached portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from

In [3]:
import os
import uuid
from PyPDF2 import PdfReader
from docx import Document
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
import gradio as gr

In [4]:
from google.colab import files
uploaded = files.upload()
os.makedirs("docs", exist_ok=True)
for name in uploaded:
    os.rename(name, f"docs/{name}")

Saving jina ai v4 embedding.pdf to jina ai v4 embedding.pdf


In [5]:
def chunk_text(text, chunk_size=300, overlap=50):
    chunks, start = [], 0
    while start < len(text):
        chunks.append(text[start:start+chunk_size])
        start += chunk_size - overlap
    return chunks
def extract_pdf_chunks(path):
    reader = PdfReader(path)
    chunks = []
    for i, page in enumerate(reader.pages):
        text = page.extract_text()
        if text:
            for j, chunk in enumerate(chunk_text(text)):
                chunks.append({
                    "text": chunk,
                    "metadata": {
                        "filename": os.path.basename(path),
                        "page": i+1,
                        "chunk_id": f"{i+1}-{j}"
                    }
                })
    return chunks
def extract_docx_chunks(path):
    doc = Document(path)
    text = "\n".join([p.text for p in doc.paragraphs])
    return [{
        "text": chunk,
        "metadata": {
            "filename": os.path.basename(path),
            "page": 1,
            "chunk_id": f"1-{i}"
        }
    } for i, chunk in enumerate(chunk_text(text))]

In [6]:
embedder = SentenceTransformer("jinaai/jina-embeddings-v2-small-en")
def embed_texts(texts):
    return embedder.encode(texts, normalize_embeddings=True)
client = QdrantClient(":memory:")
client.recreate_collection("docs", vectors_config=VectorParams(size=512, distance=Distance.COSINE))

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/65.4M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  client.recreate_collection("docs", vectors_config=VectorParams(size=512, distance=Distance.COSINE))


True

In [7]:
all_chunks = []
for fname in os.listdir("docs"):
    path = os.path.join("docs", fname)
    if fname.endswith(".pdf"):
        all_chunks += extract_pdf_chunks(path)
    elif fname.endswith(".docx"):
        all_chunks += extract_docx_chunks(path)
texts = [c["text"] for c in all_chunks]
embeddings = embed_texts(texts)
points = [
    PointStruct(id=uuid.uuid4().hex, vector=vec, payload=chunk)
    for vec, chunk in zip(embeddings, all_chunks)
]
client.upsert(collection_name="docs", points=points)



UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [8]:
def answer_question(query):
    vec = embed_texts([query])[0]
    results = client.search(collection_name="docs", query_vector=vec, limit=1)
    if not results:
        return "I couldn't find an answer in the uploaded documents."
    result = results[0].payload
    text = result["text"]
    meta = result["metadata"]
    source = f"Source: {meta['filename']} | Page: {meta['page']} | Chunk: {meta['chunk_id']}"
    return f"{text}\n\n{source}"

In [10]:
gr.Interface(fn=answer_question,
             inputs="text",
             outputs="text",
             title=" Document RAG Chatbot",
             description="Ask a question. The answer is retrieved directly from the closest matching text chunk.").launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f6411c56183c85fa69.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


