5 Books, all by Phyllis Schlafly

Phyllis Schlafly Speaks Vol 1: Her Favorite Speeches
Pub: 2016
/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/PSSpeaksVol1_HerFavoriteSpeeches_Schlafly_2016_final.pdf

Phyllis Schlafly Speaks Vol 2: On Donald Trump
Pub: 2017
/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/PSSpeaksVol2_OnDonaldTrump_Schlafly_2017_final.pdf

Phyllis Schlafly Speaks Vol 3: How the Republican Party Became Pro-Life
Pub: 2018
/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/PSSpeaksVol3_HowTheRepublicanParty_Schlafly_2018_final.pdf

Phyllis Schlafly Speaks Vol 4: Patents and Inventions
Pub: 2018
/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/PSSpeaksVol4_PatentsAndInvention_Schlafly_2018_final.pdf

Phyllis Schlafly Speaks Vol 5: Stopping the ERA
Pub: 2019
/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/PSSpeaksVol5_StoppingTheERA_Schlafly_2019_final.pdf

Store in: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch3

Each output file is a JSON array of chunk objects in the following format:

[
  {
    "author": "Phyllis Schlafly",
    "book_title": "TITLE",
    "publication_year": YEAR,
    "text": "EXAMPLE TEXT"
  }
]

Chunk size: 1000 characters

Environment credentials for QDRANT in: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/.env


In [1]:
# Imports

import os
import json
import uuid
import fitz
from pathlib import Path
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams, Distance

In [2]:
# Config

# Each entry is a (book title, publication year, absolute PDF path) tuple;
# the processing loop unpacks them in that order.
BOOKS = [
    ("Phyllis Schlafly Speaks Vol 1: Her Favorite Speeches", 2016, "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/PSSpeaksVol1_HerFavoriteSpeeches_Schlafly_2016_final.pdf"),
    ("Phyllis Schlafly Speaks Vol 2: On Donald Trump", 2017, "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/PSSpeaksVol2_OnDonaldTrump_Schlafly_2017_final.pdf"),
    ("Phyllis Schlafly Speaks Vol 3: How the Republican Party Became Pro-Life", 2018, "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/PSSpeaksVol3_HowTheRepublicanParty_Schlafly_2018_final.pdf"),
    ("Phyllis Schlafly Speaks Vol 4: Patents and Inventions", 2018, "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/PSSpeaksVol4_PatentsAndInvention_Schlafly_2018_final.pdf"),
    ("Phyllis Schlafly Speaks Vol 5: Stopping the ERA", 2019, "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/PSSpeaksVol5_StoppingTheERA_Schlafly_2019_final.pdf"),
]

# Directory that receives one JSON file of chunks per book.
# NOTE(review): absolute, machine-specific paths; consider a configurable base dir.
OUTPUT_DIR = Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch3")
# Target chunk length in characters, passed to chunk_text.
CHUNK_SIZE = 1000
# Name of the Qdrant collection the chunks are uploaded to.
COLLECTION_NAME = "book_chunks"
# Load environment variables from the project's .env file; QDRANT_URL and
# QDRANT_API_KEY are read from the environment later via os.getenv.
load_dotenv("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/.env")

True

In [3]:
# Helper Functions

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in a PDF, joined with newlines.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A single string containing each page's ``get_text()`` output in page
        order, with pages separated by a newline.
    """
    # Open the document as a context manager so it (and its underlying file
    # handle) is closed even if extraction raises partway through.
    with fitz.open(pdf_path) as doc:
        # The join consumes the generator before the document is closed.
        return "\n".join(page.get_text() for page in doc)

def chunk_text(text, chunk_size = 1000):
    """Group the non-blank lines of ``text`` into chunks of roughly ``chunk_size`` characters.

    The text is split on newlines; blank or whitespace-only lines are skipped.
    Each remaining line is stripped and appended (space-separated) to the
    current chunk while the chunk stays under ``chunk_size``; otherwise the
    current chunk is flushed and a new one is started with that line.

    A single line that is itself ``chunk_size`` characters or longer is not
    split: it becomes its own, oversized chunk.

    Args:
        text: The full text to split.
        chunk_size: Soft upper bound on chunk length, in characters.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings. Empty input
        (or input with only blank lines) yields an empty list.
    """
    chunks, current_chunk = [], ""
    for para in text.split("\n"):
        if not para.strip():
            continue
        if len(current_chunk) + len(para) < chunk_size:
            current_chunk += para.strip() + " "
        else:
            # Only flush when something has accumulated; otherwise a first
            # line that is already >= chunk_size would emit an empty chunk.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = para.strip() + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

def build_json_chunks(chunks, title, year):
    """Wrap each text chunk in a metadata record ready for JSON serialisation.

    Args:
        chunks: Iterable of chunk strings.
        title: Book title stored under ``"book_title"``.
        year: Publication year stored under ``"publication_year"``.

    Returns:
        A list with one dict per chunk, carrying the keys ``"author"``
        (always "Phyllis Schlafly"), ``"book_title"``, ``"publication_year"``
        and ``"text"``, in that order.
    """
    records = []
    for piece in chunks:
        # Keys are inserted in the same order as the documented JSON format.
        record = {"author": "Phyllis Schlafly"}
        record["book_title"] = title
        record["publication_year"] = year
        record["text"] = piece
        records.append(record)
    return records

def save_chunks_to_json(data, output_path):
    """Write ``data`` to ``output_path`` as indented JSON.

    Missing parent directories are created first, so saving into a new batch
    folder does not fail with FileNotFoundError. An existing file at
    ``output_path`` is overwritten.

    Args:
        data: JSON-serialisable object (here, a list of chunk dicts).
        output_path: Destination file path (``str`` or ``Path``).
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding so the file does not depend on the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

In [4]:
# Process each book

# Create the output directory up front so a fresh machine or a new batch
# folder does not fail with FileNotFoundError on the first save.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Reset on every run so re-executing this cell does not accumulate duplicates.
all_chunks = []
for title, year, path in BOOKS:
    text = extract_text_from_pdf(path)
    chunks = chunk_text(text, CHUNK_SIZE)
    json_chunks = build_json_chunks(chunks, title, year)
    # One JSON file per book, named after the source PDF's file stem.
    outpath = OUTPUT_DIR / (Path(path).stem + ".json")
    save_chunks_to_json(json_chunks, outpath)
    # Keep every book's records together for the embedding/upload step.
    all_chunks.extend(json_chunks)

print(f"Total chunks prepared: {len(all_chunks)}")

Total chunks prepared: 1180


In [6]:
# Embedding and upload to Qdrant

# Sentence-embedding model used to turn chunk text into vectors.
# NOTE(review): the collection is created with a 384-dimensional vector
# config; presumably that matches this model's output size. Confirm before
# swapping in a different model.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Qdrant client. The URL and API key are read from environment variables.
# os.getenv returns None for an unset variable, so a missing entry is not
# reported at this point.
qdrant = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY")
)

def ensure_collection(name, vector_size=384):
    """Create the Qdrant collection ``name`` if it does not already exist.

    An existing collection is left untouched.

    Args:
        name: Collection name to look up or create.
        vector_size: Dimensionality of the stored vectors. It must match the
            embedding model's output size; the default of 384 is the value
            this notebook has always used.
    """
    existing = qdrant.get_collections().collections
    if any(c.name == name for c in existing):
        return
    # Use create_collection rather than the deprecated recreate_collection:
    # this branch only runs when the collection is absent, and
    # recreate_collection would drop a collection that appeared in between.
    qdrant.create_collection(
        collection_name=name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
    )

def embed_and_upload(chunks, collection_name):
    """Embed chunk texts and upload them to a Qdrant collection.

    The collection is created first if it does not exist. Each chunk dict is
    stored as the point's payload, and its ``"text"`` value is what gets
    embedded.

    Point IDs are derived deterministically from the payload content, so
    re-running the upload overwrites the same points instead of inserting a
    second copy of every chunk. Chunks with byte-identical payloads share an
    ID and are therefore uploaded once.

    Args:
        chunks: Iterable of dicts, each containing at least a ``"text"`` key.
        collection_name: Name of the target Qdrant collection.
    """
    ensure_collection(collection_name)

    # Map deterministic point ID -> payload; the first occurrence wins.
    payload_by_id = {}
    for payload in chunks:
        fingerprint = json.dumps(payload, sort_keys=True, default=str)
        point_id = str(uuid.uuid5(uuid.NAMESPACE_OID, fingerprint))
        payload_by_id.setdefault(point_id, payload)

    if not payload_by_id:
        # Nothing to embed; skip the model and network calls entirely.
        print(f"Uploaded 0 points to '{collection_name}'")
        return

    texts = [payload["text"] for payload in payload_by_id.values()]
    vectors = model.encode(texts).tolist()
    points = [
        PointStruct(id=point_id, vector=vec, payload=payload)
        for (point_id, payload), vec in zip(payload_by_id.items(), vectors)
    ]
    qdrant.upload_points(collection_name=collection_name, points=points)
    print(f"Uploaded {len(points)} points to '{collection_name}'")

# Embed every chunk prepared by the processing cell and push it to the
# configured collection. Requires all_chunks from that earlier cell.
embed_and_upload(all_chunks, COLLECTION_NAME)
print("Done!")


Uploaded 1180 points to 'book_chunks
Done!
