In [17]:
import json
import uuid
import re
from pathlib import Path
from tqdm import tqdm
from pypdf import PdfReader

# --- Config ---
CHUNK_SIZE = 750
CHUNK_OVERLAP = 100
DATA_DIR = Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/raw_data")
OUTPUT_DIR = Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks")

# --- Helpers ---
def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    chunks = []
    start = 0
    while start < len(text):
        end = min(len(text), start + chunk_size)
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

def clean_text(text: str) -> str:
    # Replace tabs, newlines, and carriage returns with single spaces
    cleaned = re.sub(r'[\t\n\r]+', ' ', text)
    return cleaned.strip()

def extract_text_from_pdf(pdf_path):
    try:
        reader = PdfReader(str(pdf_path))
        text = "\n".join([page.extract_text() or "" for page in reader.pages])
        return clean_text(text)
    except Exception as e:
        print(f"❌ Failed to read {pdf_path.name}: {e}")
        return ""

def process_author_folder(author_path: Path):
    author = author_path.name
    output_path = OUTPUT_DIR / f"{author}.json"
    results = []

    for pdf_file in tqdm(author_path.glob("*.pdf"), desc=f"Chunking {author}"):
        text = extract_text_from_pdf(pdf_file)
        if not text.strip():
            continue

        chunks = chunk_text(text)

        for chunk in chunks:
            results.append({
                "id": str(uuid.uuid4()),
                "text": chunk,
                "metadata": {
                    "filename": pdf_file.name,
                    "author": author
                }
            })

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print(f"✅ Saved {len(results)} chunks to {output_path}")

# Example usage:
# from chunk_author_to_json import process_author_folder, DATA_DIR
# process_author_folder(DATA_DIR / "Aristotle")


In [10]:
authorfolders = {
    1:"Byung-Chul Han",
    2:"Carl Schmitt",
    3:"Deleuze and Guattari",
    4:"Ernst Juenger",
    5:"Jacques Ellul",
    6:"Jean Baudrillard",
    7:"Lewis Mumford",
    8:"Marshall McLuhan",
    9:"Nick Land",
    10:"Paul Virilio",
    11:"Peter Sloterdijk",
    12:"Spengler",
    13:"Vilem Flusser",
    14:"Walter Benjamin",
    15:"Walter Ong"
}

In [9]:
authorfolders[3]

'Deleuze and Guattari'

In [18]:
process_author_folder(DATA_DIR / authorfolders[1])

Chunking Byung-Chul Han: 4it [00:09,  2.27s/it]

✅ Saved 798 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Byung-Chul Han.json





In [19]:
process_author_folder(DATA_DIR / authorfolders[2])
process_author_folder(DATA_DIR / authorfolders[3])
process_author_folder(DATA_DIR / authorfolders[4])
process_author_folder(DATA_DIR / authorfolders[5])
process_author_folder(DATA_DIR / authorfolders[6])

Chunking Carl Schmitt: 3it [00:02,  1.31it/s]


✅ Saved 129 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Carl Schmitt.json


Chunking Deleuze and Guattari: 1it [00:22, 22.81s/it]


✅ Saved 1739 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Deleuze and Guattari.json


Chunking Ernst Juenger: 0it [00:00, ?it/s]


✅ Saved 0 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Ernst Juenger.json


Chunking Jacques Ellul: 3it [00:49, 16.61s/it]


✅ Saved 1777 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Jacques Ellul.json


Chunking Jean Baudrillard: 5it [00:08,  1.72s/it]

✅ Saved 1822 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Jean Baudrillard.json





In [20]:
process_author_folder(DATA_DIR / authorfolders[7])
process_author_folder(DATA_DIR / authorfolders[8])
process_author_folder(DATA_DIR / authorfolders[9])
process_author_folder(DATA_DIR / authorfolders[10])
process_author_folder(DATA_DIR / authorfolders[11])

Chunking Lewis Mumford: 6it [01:34, 15.70s/it]


✅ Saved 5790 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Lewis Mumford.json


Chunking Marshall McLuhan: 0it [00:00, ?it/s]incorrect startxref pointer(3)
parsing for Object Streams
Chunking Marshall McLuhan: 5it [00:17,  3.55s/it]


✅ Saved 3179 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Marshall McLuhan.json


Chunking Nick Land: 5it [00:38,  7.70s/it]


✅ Saved 3724 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Nick Land.json


Chunking Paul Virilio: 7it [00:08,  1.28s/it]


✅ Saved 1199 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Paul Virilio.json


Chunking Peter Sloterdijk: 0it [00:00, ?it/s]Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Chunking Peter Sloterdijk: 9it [00:06,  1.66it/s]Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 42 0 (offset 0)
Ignoring wrong pointing object 96 0 (offset 0)
Ignoring wrong pointing object 98 0 (offset 0)
Chunking Peter Sloterdijk: 10it [00:08,  1.17it/s]

✅ Saved 948 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Peter Sloterdijk.json





In [21]:
process_author_folder(DATA_DIR / authorfolders[12])
process_author_folder(DATA_DIR / authorfolders[13])
process_author_folder(DATA_DIR / authorfolders[14])
process_author_folder(DATA_DIR / authorfolders[15])

Chunking Spengler: 1it [00:48, 48.28s/it]Overwriting cache for 0 166
Chunking Spengler: 5it [01:11, 14.30s/it]


✅ Saved 5065 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Spengler.json


Chunking Vilem Flusser: 4it [00:24,  6.02s/it]


✅ Saved 2376 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Vilem Flusser.json


Chunking Walter Benjamin: 2it [00:24, 12.26s/it]


✅ Saved 2204 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Walter Benjamin.json


Chunking Walter Ong: 5it [00:08,  1.65s/it]

✅ Saved 1033 chunks to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks/Walter Ong.json





In [None]:
# for i in range(len(authorfolders)):
#     process_author_folder(DATA_DIR / authorfolders[i])

In [1]:
import json
from pathlib import Path
from tqdm import tqdm
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from sentence_transformers import SentenceTransformer

# --- Config ---
CHUNKS_DIR = Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/ResearchAI/chunks")
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
BATCH_SIZE = 100

# --- Init ---
model = SentenceTransformer(EMBED_MODEL)
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

def batch_upsert(collection_name, points, batch_size=BATCH_SIZE):
    for i in range(0, len(points), batch_size):
        batch = points[i:i + batch_size]
        client.upsert(collection_name=collection_name, points=batch)

def process_json_file(json_path: Path):
    collection_name = json_path.stem.lower().replace(" ", "_")

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not data:
        print(f"⚠️ Skipping empty file: {json_path.name}")
        return

    # Ensure collection exists
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=model.get_sentence_embedding_dimension(),
            distance=Distance.COSINE,
        )
    )

    points = []
    for item in tqdm(data, desc=f"Embedding {json_path.name}"):
        vector = model.encode(item["text"]).tolist()
        points.append(PointStruct(
            id=item["id"],
            vector=vector,
            payload={
                "text": item["text"],
                **item["metadata"]
            }
        ))

    batch_upsert(collection_name, points)
    print(f"✅ Uploaded {len(points)} points to collection: {collection_name}")

# --- Run all .json files ---
def upload_all_json_chunks():
    existing_collections = [c.name for c in client.get_collections().collections]

    for json_file in CHUNKS_DIR.glob("*.json"):
        collection_name = json_file.stem.lower().replace(" ", "_")
        if collection_name in existing_collections:
            print(f"⏩ Skipping already uploaded collection: {collection_name}")
            continue

        process_json_file(json_file)

# --- Example usage ---
# from upload_chunks_to_qdrant import upload_all_json_chunks
# upload_all_json_chunks()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
upload_all_json_chunks()

⏩ Skipping already uploaded collection: spengler
⏩ Skipping already uploaded collection: ernst_juenger
⏩ Skipping already uploaded collection: byung-chul_han
⏩ Skipping already uploaded collection: walter_benjamin
⏩ Skipping already uploaded collection: jean_baudrillard
⏩ Skipping already uploaded collection: peter_sloterdijk
⏩ Skipping already uploaded collection: jacques_ellul
⏩ Skipping already uploaded collection: marshall_mcluhan
⏩ Skipping already uploaded collection: walter_ong
⏩ Skipping already uploaded collection: nick_land
⏩ Skipping already uploaded collection: deleuze_and_guattari
⏩ Skipping already uploaded collection: carl_schmitt
⏩ Skipping already uploaded collection: paul_virilio
⏩ Skipping already uploaded collection: vilem_flusser
⏩ Skipping already uploaded collection: lewis_mumford
