In [None]:
# vector_store_creator.py

import os
import sys
import uuid
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Configuration ---
# Module-level settings read by the functions below. Edit these values to
# point the script at a different corpus, model, or Qdrant instance.

# 1. DEFINE FOLDER WITH YOUR TEXT FILES
#    Only the top level of this folder is listed (no recursion into
#    subfolders); entries ending in .txt are read as UTF-8 text.
TEXT_FILES_FOLDER = "Documents_Cleaned_Editable"

# 2. DEFINE THE EMBEDDING MODEL
#    Name passed to SentenceTransformer. The vector size of a newly created
#    collection is taken from this model's embedding dimension.
#    Per its name, the model targets biomedical/clinical text.
MODEL_NAME = "NeuML/bioclinical-modernbert-base-embeddings"

# 3. DEFINE QDRANT CONFIGURATION
#    This script connects to a Qdrant instance running in Docker.
#    NOTE(review): Qdrant's default REST port is 6333; port 8000 presumably
#    reflects a custom Docker port mapping -- confirm against the container setup.
QDRANT_URL = "http://localhost:8000"
# Collection that receives the chunks; it is created when it does not exist yet.
COLLECTION_NAME = "rag_collection"

# 4. DEFINE TEXT CHUNKING PARAMETERS
#    Both values are forwarded to RecursiveCharacterTextSplitter.
CHUNK_SIZE = 1024  # Max characters per chunk
CHUNK_OVERLAP = 200 # Characters shared between consecutive chunks

# 5. DEFINE UPLOAD BATCH SIZE
#    Number of points sent per upsert request. Embeddings are computed for
#    all chunks first; only the upload to Qdrant is split into batches of
#    this size to avoid connection timeouts.
BATCH_SIZE = 64


def setup_vector_store():
    """
    Load the embedding model, connect to Qdrant, and ensure the collection exists.

    The collection named by COLLECTION_NAME is created only when the server
    reports that it is absent. An existing collection is never dropped or
    recreated, so previously stored points are preserved.

    Returns:
        A tuple containing the Qdrant client and the embedding model.

    Raises:
        Any exception raised by the Qdrant client (for example when the
        server is unreachable) is propagated to the caller instead of being
        interpreted as "collection not found".
    """
    print(f"Initializing embedding model: '{MODEL_NAME}'...")
    model = SentenceTransformer(MODEL_NAME)

    print(f"Connecting to Qdrant at: {QDRANT_URL}")
    client = QdrantClient(url=QDRANT_URL)

    # Ask the server explicitly whether the collection exists. Using a failed
    # get_collection() call as the signal would also treat timeouts or auth
    # errors as "missing" and could wipe data via a drop-and-recreate.
    if client.collection_exists(collection_name=COLLECTION_NAME):
        print(f"Collection '{COLLECTION_NAME}' already exists.")
        return client, model

    print(f"Collection '{COLLECTION_NAME}' not found. Creating new collection...")
    # create_collection (unlike the deprecated recreate_collection) never
    # deletes anything; the vector size must match the model's output.
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=model.get_sentence_embedding_dimension(),
            distance=models.Distance.COSINE,
        ),
    )
    print(f"Collection '{COLLECTION_NAME}' created successfully.")

    return client, model


def create_and_store_embeddings(client: QdrantClient, model: SentenceTransformer, folder_path: str):
    """
    Reads files, chunks their content, generates embeddings, and upserts them into Qdrant in batches.

    Every ``.txt`` file (case-insensitive) directly inside ``folder_path`` is
    read as UTF-8 and split with RecursiveCharacterTextSplitter. Files that
    cannot be read or chunked are reported and skipped; empty files are
    skipped silently.

    Point IDs are derived deterministically from the file path and the chunk's
    position in that file, so running the script again overwrites the
    existing points instead of adding duplicates.

    Args:
        client: The Qdrant client instance.
        model: The Sentence Transformer model instance.
        folder_path: The path to the folder containing .txt files.

    Returns:
        None. Progress is printed to stdout.
    """
    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    all_chunks = []
    all_metadata = []

    print(f"Scanning and chunking files in '{folder_path}'...")
    # os.listdir order is arbitrary; sort it so runs are reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            if not content.strip():
                continue
            # Split the document content into chunks
            chunks = text_splitter.split_text(content)
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the run.
            print(f"  - Error reading or chunking {filename}: {e}")
            continue

        for chunk_index, chunk in enumerate(chunks):
            all_chunks.append(chunk)
            all_metadata.append({"file_path": file_path, "chunk_index": chunk_index})
        print(f"  - Chunked '{filename}' into {len(chunks)} pieces.")

    if not all_chunks:
        print("No text content to process. Exiting.")
        return

    print(f"\nGenerating embeddings for {len(all_chunks)} chunks...")

    # One encode() call for the whole corpus; the model handles its own
    # internal batching.
    vectors = model.encode(all_chunks, show_progress_bar=True)

    print(f"Uploading {len(all_chunks)} chunks to Qdrant in batches of {BATCH_SIZE}...")

    total_batches = (len(all_chunks) - 1) // BATCH_SIZE + 1

    # Upload to Qdrant in smaller batches
    for batch_number, start in enumerate(range(0, len(all_chunks), BATCH_SIZE), start=1):
        end = start + BATCH_SIZE

        points_to_upsert = []
        for chunk, vector, metadata in zip(
            all_chunks[start:end], vectors[start:end], all_metadata[start:end]
        ):
            # uuid5 gives the same ID for the same (file, chunk position), so
            # a repeated run upserts over the old point rather than adding a
            # duplicate. Points of chunks that no longer exist (a file that
            # shrank or was removed) are not deleted here.
            point_key = f"{metadata['file_path']}#{metadata['chunk_index']}"
            points_to_upsert.append(
                models.PointStruct(
                    id=str(uuid.uuid5(uuid.NAMESPACE_URL, point_key)),
                    vector=vector.tolist(),
                    payload={
                        "file_path": metadata["file_path"],
                        "chunk_index": metadata["chunk_index"],
                        "content": chunk,
                    },
                )
            )

        # Upsert the batch to the collection; wait=True blocks until applied.
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=points_to_upsert,
            wait=True
        )
        print(f"  - Uploaded batch {batch_number}/{total_batches}")

    print(f"\nSuccessfully uploaded {len(all_chunks)} chunks to Qdrant collection '{COLLECTION_NAME}'.")


def main():
    """
    Main function to run the vector store creation pipeline.

    Validates the input folder, then sets up the model and Qdrant collection
    and ingests every text file found in TEXT_FILES_FOLDER.

    Exits with status 1 when the input folder does not exist and with
    status 0 when it contains no .txt files.
    """
    print("--- Starting Vector Store Creation (Docker with Chunking) ---")

    if not os.path.isdir(TEXT_FILES_FOLDER):
        print(f"❌ Error: Input folder '{TEXT_FILES_FOLDER}' not found.")
        sys.exit(1)

    # Match the extension case-insensitively, exactly like the ingestion step,
    # so folders containing only e.g. ".TXT" files are not rejected here.
    has_text_files = any(
        name.lower().endswith('.txt') for name in os.listdir(TEXT_FILES_FOLDER)
    )
    if not has_text_files:
        print(f"⚠️ Warning: No .txt files found in '{TEXT_FILES_FOLDER}'.")
        sys.exit(0)

    qdrant_client, embedding_model = setup_vector_store()
    create_and_store_embeddings(qdrant_client, embedding_model, TEXT_FILES_FOLDER)

    print("\n--- ✅ Vector Store Creation Finished! ---")
    print("Your vector data is stored in the Qdrant container.")
    print("Persistent data is saved in the 'qdrant_storage' folder.")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    # --- Installation ---
    # Third-party packages imported by this script:
    #   pip install qdrant-client sentence-transformers langchain
    main()


--- Starting Vector Store Creation (Docker with Chunking) ---
Initializing embedding model: 'answerdotai/ModernBERT-base'...


No sentence-transformers model found with name answerdotai/ModernBERT-base. Creating a new one with mean pooling.


Connecting to Qdrant at: http://localhost:8000
Collection 'my_text_collection' not found. Creating new collection...


  client.recreate_collection(


Collection 'my_text_collection' created successfully.
Scanning and chunking files in 'Documents_Cleaned_Editable'...
  - Chunked 'cleaned_Pharmacoepidemiology and Drug - 2022 - Girman - Real‐world data  Assessing electronic health records and medical claims.txt' into 22 pieces.
  - Chunked 'cleaned_ICH reflection paper on pursuing opportunities for RWD.txt' into 37 pieces.
  - Chunked 'cleaned_FDA’s Real-World Evidence Program Framework.txt' into 108 pieces.
  - Chunked 'cleaned_ICMRA Statement on International Collaboration (RWE).txt' into 8 pieces.

Generating embeddings for 175 chunks...


Batches: 100%|██████████| 6/6 [02:59<00:00, 29.89s/it]


Uploading 175 chunks to Qdrant in batches of 64...
  - Uploaded batch 1/3
  - Uploaded batch 2/3
  - Uploaded batch 3/3

Successfully uploaded 175 chunks to Qdrant collection 'my_text_collection'.

--- ✅ Vector Store Creation Finished! ---
Your vector data is stored in the Qdrant container.
Persistent data is saved in the 'qdrant_storage' folder.
