In [2]:
from pathlib import Path
from pypdf import PdfReader
import json
import uuid

# --- Config ---
PDF_PATH = Path("/Users/mason/Desktop/The Master and His Emissary_ The Divided Brain and the Making of the Western World ( PDFDrive ).pdf")
OUTPUT_JSON_PATH = Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/iain_mcgilchrist_master_and_emissary.json")
CHUNK_SIZE = 1000  # Number of characters per chunk

# --- Extract and Chunk ---
def chunk_text(text, chunk_size):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

reader = PdfReader(str(PDF_PATH))
full_text = " ".join(page.extract_text() or "" for page in reader.pages)
chunks = chunk_text(full_text, CHUNK_SIZE)

data = [{
    "id": str(uuid.uuid4()),
    "text": chunk,
    "metadata": {
        "filename": PDF_PATH.stem,
        "author": "Iain McGilchrist"
    }
} for chunk in chunks if chunk.strip()]

# --- Save ---
with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

print(f"✅ Saved {len(data)} chunks to {OUTPUT_JSON_PATH}")


✅ Saved 1835 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/iain_mcgilchrist_master_and_emissary.json


In [3]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# --- Config ---
CHUNKS_DIR = Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks")
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
BATCH_SIZE = 100

# --- Init ---
model = SentenceTransformer(EMBED_MODEL)
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

def batch_upsert(collection_name, points, batch_size=BATCH_SIZE):
    for i in range(0, len(points), batch_size):
        batch = points[i:i + batch_size]
        client.upsert(collection_name=collection_name, points=batch)

def process_json_file(json_path: Path):
    collection_name = json_path.stem.lower().replace(" ", "_")

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not data:
        print(f"⚠️ Skipping empty file: {json_path.name}")
        return

    # Ensure collection exists
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=model.get_sentence_embedding_dimension(),
            distance=Distance.COSINE,
        )
    )

    points = []
    for item in tqdm(data, desc=f"Embedding {json_path.name}"):
        vector = model.encode(item["text"]).tolist()
        points.append(PointStruct(
            id=item["id"],
            vector=vector,
            payload={
                "text": item["text"],
                **item["metadata"]
            }
        ))

    batch_upsert(collection_name, points)
    print(f"✅ Uploaded {len(points)} points to collection: {collection_name}")

process_json_file(Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/iain_mcgilchrist_master_and_emissary.json"))


  from .autonotebook import tqdm as notebook_tqdm
  client.recreate_collection(
Embedding iain_mcgilchrist_master_and_emissary.json: 100%|██████████| 1835/1835 [01:14<00:00, 24.67it/s]


✅ Uploaded 1835 points to collection: iain_mcgilchrist_master_and_emissary
