# TIF to PDF Conversion

In [1]:
import os
from PIL import Image

# Base directory where the book subfolders are
base_dir = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books/VOL_002"

# Where to save the resulting PDFs
output_dir = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books"
os.makedirs(output_dir, exist_ok=True)

# Loop through each subfolder (each book)
for folder in sorted(os.listdir(base_dir)):
    folder_path = os.path.join(base_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    print(f"Processing: {folder}")

    # Load JPEGs first (assumed to be front/back covers, spine, etc.)
    jpgs = sorted([f for f in os.listdir(folder_path) if f.lower().endswith(('.jpg', '.jpeg'))])

    # Then load TIFF pages
    tiffs = sorted([f for f in os.listdir(folder_path) if f.lower().endswith(('.tif', '.tiff'))])

    # Combine both, keeping order: covers first, then pages
    image_files = jpgs + tiffs
    if not image_files:
        print(f"No image files found in {folder_path}, skipping.")
        continue

    images = []
    for img_file in image_files:
        img_path = os.path.join(folder_path, img_file)
        img = Image.open(img_path).convert("RGB")
        images.append(img)

    # Save the images into a single PDF
    pdf_filename = f"{folder}.pdf"
    pdf_path = os.path.join(output_dir, pdf_filename)

    images[0].save(pdf_path, save_all=True, append_images=images[1:])
    print(f"Saved PDF: {pdf_path}")


Processing: AMBUSH AT VLADIVOSTOK
Saved PDF: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books/AMBUSH AT VLADIVOSTOK.pdf
Processing: GRAVE DIGGERS
Saved PDF: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books/GRAVE DIGGERS.pdf
Processing: MINDSZENTY THE MAN
Saved PDF: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books/MINDSZENTY THE MAN.pdf
Processing: SAFE NOT SORRY
Saved PDF: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books/SAFE NOT SORRY.pdf
Processing: STRIKE FROM SPACE
Saved PDF: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books/STRIKE FROM SPACE.pdf
Processing: THE BETRAYERS
Saved PDF: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books/THE BETRAYERS.pdf


# DAR PDF Splitting

Below is the filepath for a large PDF that contains multiple articles.

/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/DAR/DAR National Defender articles (DAR magazine).pdf

This is the path to a JSON that has an index of the articles in the PDF:

/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/DAR/DAR_full_index.json

First thing that is needed: split the PDF into inidivual articles and place them in this folder:

/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/DAR/NatDef_Individual_Articles

The JSON has the following structure:

[
    {
        "start_page":9,
        "end_page":19,
        "title":"What's Wrong With U.S. Intelligence?",
        "author":"Major General George J. Keegan, Jr.",
        "date":"June-July 1977"
    }
]

This repeats for each article in the PDF.

The split PDF's should be named:

DAR_NatDef_Year_Month.pdf

Some of them have multiple months for the same article. That can be detected by selecing everything before the year in the date part of the JSON file.
Start and end pages are INCLUSIVE.

In [1]:
import json
import os
import re
from PyPDF2 import PdfReader, PdfWriter

# File paths
pdf_path = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/DAR/DAR National Defender articles (DAR magazine).pdf"
json_path = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/DAR/DAR_full_index.json"
output_dir = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/DAR/NatDef_Individual_Articles"

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Load the JSON index
with open(json_path, 'r') as f:
    articles = json.load(f)

# Load the PDF
reader = PdfReader(pdf_path)
total_pages = len(reader.pages)
print(f"Total pages in PDF: {total_pages}")

def parse_date_for_filename(date_str):
    """
    Parse date string to extract year and month(s) for filename.
    Examples: "June-July 1977" -> "1977_June-July"
             "January 1980" -> "1980_January"
    """
    # Extract year (last 4 digits)
    year_match = re.search(r'\b(\d{4})\b', date_str)
    if not year_match:
        return "Unknown_Date"
    
    year = year_match.group(1)
    
    # Extract everything before the year as the month(s)
    month_part = date_str[:year_match.start()].strip()
    
    # Clean up month part (remove extra spaces, trailing punctuation)
    month_part = re.sub(r'\s+', ' ', month_part)  # normalize spaces
    month_part = month_part.strip(' -,')  # remove trailing punctuation
    
    return f"{year}_{month_part}"

# Process each article
for i, article in enumerate(articles):
    start_page = article["start_page"]
    end_page = article["end_page"]
    title = article["title"]
    author = article["author"]
    date = article["date"]
    
    print(f"\nProcessing article {i+1}/{len(articles)}: {title}")
    print(f"  Pages: {start_page}-{end_page}")
    print(f"  Date: {date}")
    
    # Validate page numbers
    if start_page < 1 or end_page > total_pages or start_page > end_page:
        print(f"  WARNING: Invalid page range {start_page}-{end_page} (PDF has {total_pages} pages)")
        continue
    
    # Create a new PDF writer
    writer = PdfWriter()
    
    # Add pages (convert to 0-based indexing, and end_page is inclusive)
    for page_num in range(start_page - 1, end_page):
        if page_num < total_pages:
            writer.add_page(reader.pages[page_num])
    
    # Generate filename
    date_formatted = parse_date_for_filename(date)
    filename = f"DAR_NatDef_{date_formatted}.pdf"
    output_path = os.path.join(output_dir, filename)
    
    # Save the extracted pages
    try:
        with open(output_path, 'wb') as output_file:
            writer.write(output_file)
        print(f"  ✓ Saved: {filename}")
    except Exception as e:
        print(f"  ✗ Error saving {filename}: {e}")

print(f"\n✓ Processing complete! Individual articles saved to: {output_dir}")


Total pages in PDF: 641

Processing article 1/148: What's Wrong With U.S. Intelligence?
  Pages: 9-19
  Date: June-July 1977
  ✓ Saved: DAR_NatDef_1977_June-July.pdf

Processing article 2/148: In the Name of Education
  Pages: 20-24
  Date: August-September 1977
  ✓ Saved: DAR_NatDef_1977_August-September.pdf

Processing article 3/148: Weapons Versus Theories and Treaties
  Pages: 25-29
  Date: October 1977
  ✓ Saved: DAR_NatDef_1977_October.pdf

Processing article 4/148: What's at Stake in the U. S. Canal at Panama?
  Pages: 30-36
  Date: November 1977
  ✓ Saved: DAR_NatDef_1977_November.pdf

Processing article 5/148: Election "Reform" and Your Right to Vote
  Pages: 37-40
  Date: December 1977
  ✓ Saved: DAR_NatDef_1977_December.pdf

Processing article 6/148: The Equal Rights Amendment
  Pages: 41-44
  Date: January 1978
  ✓ Saved: DAR_NatDef_1978_January.pdf

Processing article 7/148: The Lessons of Title IX
  Pages: 45-50
  Date: February 1978
  ✓ Saved: DAR_NatDef_1978_February.pd

# Chunking DAR PDF's and Book PDF's

The next thing that needs to be done is to convert the individual articles into chunks to be placed into Qdrant. This will include the DAR articles as well as five books. The first thing that needs to be done is to put the chunks into JSON files that will be saved here, and then embed the chunks and put them into Qdrant.

The books are:
- /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books/AMBUSH_AT_VLADIVOSTOK.pdf
- /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books/MINDSZENTY_THE_MAN.pdf
- /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books/SAFE_NOT_SORRY.pdf
- /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books/STRIKE_FROM_SPACE.pdf
- /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books/THE_BETRAYERS.pdf

Below is a JavaScript cell to show the JSON format for the book chunks, as well as listing the authors and publication years.

Each book should get its own JSON file in this folder: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch4/Books

Once that is done, move on to doing the DAR articles. all of the DAR articles are in the same PDF: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/DAR/DAR National Defender articles (DAR magazine).pdf

The index for the DAR articles is in this JSON file: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/DAR/DAR_full_index.json

There is a javascript cell below to show the JSON format for the DAR articles. The metadata should be filled in using the index for reference. The text should match the text in the page range listed in the index, so that the correct text is extracted for each article.

The chunks for the DAR articles should be placed in this folder: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch4/DAR

In [2]:
import json
import os
from PyPDF2 import PdfReader

def create_chunks(text, chunk_size=1000, overlap=100):
    """
    Create overlapping chunks from text.
    """
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        if chunk.strip():  # Only add non-empty chunks
            chunks.append(chunk.strip())
        start = end - overlap
    return chunks

def extract_pdf_text(pdf_path):
    """
    Extract all text from a PDF file.
    """
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text

def extract_pdf_pages_text(pdf_path, start_page, end_page):
    """
    Extract text from specific pages of a PDF (1-based indexing, inclusive).
    """
    reader = PdfReader(pdf_path)
    text = ""
    for page_num in range(start_page - 1, end_page):
        if page_num < len(reader.pages):
            text += reader.pages[page_num].extract_text() + "\n"
    return text

# Book metadata
books = [
    {
        "filename": "STRIKE_FROM_SPACE.pdf",
        "author": "Phyllis Schlafly and Chester Ward",
        "book_title": "Strike From Space",
        "publication_year": 1966
    },
    {
        "filename": "AMBUSH_AT_VLADIVOSTOK.pdf",
        "author": "Phyllis Schlafly and Chester Ward", 
        "book_title": "Ambush at Vladivostok",
        "publication_year": 1976
    },
    {
        "filename": "MINDSZENTY_THE_MAN.pdf",
        "author": "Phyllis Schlafly and Joseph Vecsey",
        "book_title": "Mindszenty the Man",
        "publication_year": 1972
    },
    {
        "filename": "SAFE_NOT_SORRY.pdf",
        "author": "Phyllis Schlafly",
        "book_title": "Safe Not Sorry",
        "publication_year": 1967
    },
    {
        "filename": "THE_BETRAYERS.pdf",
        "author": "Phyllis Schlafly and Chester Ward",
        "book_title": "The Betrayers", 
        "publication_year": 1968
    }
]

# Create output directories
books_output_dir = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch4/Books"
dar_output_dir = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch4/DAR"
os.makedirs(books_output_dir, exist_ok=True)
os.makedirs(dar_output_dir, exist_ok=True)

# Process books
books_base_dir = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Books"

print("Processing Books:")
print("=" * 50)

for book in books:
    pdf_path = os.path.join(books_base_dir, book["filename"])
    
    if not os.path.exists(pdf_path):
        print(f"❌ File not found: {pdf_path}")
        continue
        
    print(f"📖 Processing: {book['book_title']}")
    
    # Extract text from PDF
    try:
        text = extract_pdf_text(pdf_path)
        print(f"   Extracted {len(text)} characters")
        
        # Create chunks
        chunks = create_chunks(text)
        print(f"   Created {len(chunks)} chunks")
        
        # Create JSON data
        json_data = []
        for chunk_text in chunks:
            json_data.append({
                "author": book["author"],
                "book_title": book["book_title"],
                "publication_year": book["publication_year"],
                "text": chunk_text
            })
        
        # Save to JSON file
        # Clean filename for JSON (replace spaces with underscores, remove .pdf)
        json_filename = book["filename"].replace(".pdf", ".json").replace(" ", "_")
        json_path = os.path.join(books_output_dir, json_filename)
        
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2, ensure_ascii=False)
        
        print(f"   ✅ Saved: {json_filename}")
        
    except Exception as e:
        print(f"   ❌ Error processing {book['book_title']}: {e}")

print(f"\n📚 Books processing complete! Files saved to: {books_output_dir}")


Processing Books:
📖 Processing: Strike From Space
   Extracted 340884 characters
   Created 379 chunks
   ✅ Saved: STRIKE_FROM_SPACE.json
📖 Processing: Ambush at Vladivostok
   Extracted 300419 characters
   Created 334 chunks
   ✅ Saved: AMBUSH_AT_VLADIVOSTOK.json
📖 Processing: Mindszenty the Man
   Extracted 423805 characters
   Created 471 chunks
   ✅ Saved: MINDSZENTY_THE_MAN.json
📖 Processing: Safe Not Sorry
   Extracted 325264 characters
   Created 362 chunks
   ✅ Saved: SAFE_NOT_SORRY.json
📖 Processing: The Betrayers
   Extracted 227832 characters
   Created 254 chunks
   ✅ Saved: THE_BETRAYERS.json

📚 Books processing complete! Files saved to: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch4/Books


In [3]:
# Process DAR articles
dar_pdf_path = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/DAR/DAR National Defender articles (DAR magazine).pdf"
dar_index_path = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/DAR/DAR_full_index.json"

print("\nProcessing DAR Articles:")
print("=" * 50)

# Load the DAR index
try:
    with open(dar_index_path, 'r') as f:
        dar_articles = json.load(f)
    print(f"📋 Loaded index with {len(dar_articles)} articles")
except Exception as e:
    print(f"❌ Error loading DAR index: {e}")
    dar_articles = []

# Load the DAR PDF
try:
    dar_reader = PdfReader(dar_pdf_path)
    print(f"📄 Loaded DAR PDF with {len(dar_reader.pages)} pages")
except Exception as e:
    print(f"❌ Error loading DAR PDF: {e}")
    dar_reader = None

if dar_reader and dar_articles:
    for i, article in enumerate(dar_articles):
        start_page = article["start_page"]
        end_page = article["end_page"]
        title = article["title"]
        author = article["author"]
        date = article["date"]
        
        print(f"\n📰 Processing article {i+1}/{len(dar_articles)}: {title}")
        print(f"   Author: {author}")
        print(f"   Date: {date}")
        print(f"   Pages: {start_page}-{end_page}")
        
        try:
            # Extract text from the specified page range
            article_text = extract_pdf_pages_text(dar_pdf_path, start_page, end_page)
            print(f"   Extracted {len(article_text)} characters")
            
            # Create chunks
            chunks = create_chunks(article_text)
            print(f"   Created {len(chunks)} chunks")
            
            # Create JSON data for this article
            json_data = []
            for chunk_text in chunks:
                json_data.append({
                    "author": author,
                    "article_title": title,
                    "date": date,
                    "text": chunk_text
                })
            
            # Create filename from title (clean it up for filesystem)
            # Remove/replace problematic characters
            clean_title = title.replace("/", "-").replace(":", "-").replace("?", "").replace("\"", "").replace("*", "").replace("<", "").replace(">", "").replace("|", "")
            clean_title = clean_title.replace("  ", " ").strip()
            
            # Create filename: use article number and cleaned title
            json_filename = f"DAR_Article_{i+1:03d}_{clean_title[:50]}.json"
            json_path = os.path.join(dar_output_dir, json_filename)
            
            # Save to JSON file
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(json_data, f, indent=2, ensure_ascii=False)
            
            print(f"   ✅ Saved: {json_filename}")
            
        except Exception as e:
            print(f"   ❌ Error processing article '{title}': {e}")

print(f"\n📰 DAR articles processing complete! Files saved to: {dar_output_dir}")
print(f"\n🎉 All chunking complete!")
print(f"📚 Books: {books_output_dir}")
print(f"📰 DAR Articles: {dar_output_dir}")



Processing DAR Articles:
📋 Loaded index with 148 articles
📄 Loaded DAR PDF with 641 pages

📰 Processing article 1/148: What's Wrong With U.S. Intelligence?
   Author: Major General George J. Keegan, Jr.
   Date: June-July 1977
   Pages: 9-19
   Extracted 73780 characters
   Created 82 chunks
   ✅ Saved: DAR_Article_001_What's Wrong With U.S. Intelligence.json

📰 Processing article 2/148: In the Name of Education
   Author: Jo-Ann K. Abrigg
   Date: August-September 1977
   Pages: 20-24
   Extracted 30935 characters
   Created 35 chunks
   ✅ Saved: DAR_Article_002_In the Name of Education.json

📰 Processing article 3/148: Weapons Versus Theories and Treaties
   Author: Phyllis Schlafly
   Date: October 1977
   Pages: 25-29
   Extracted 27987 characters
   Created 32 chunks
   ✅ Saved: DAR_Article_003_Weapons Versus Theories and Treaties.json

📰 Processing article 4/148: What's at Stake in the U. S. Canal at Panama?
   Author: Phyllis Schlafly
   Date: November 1977
   Pages: 30-36
   E

## Book Chunk Format:

In [4]:
[
    {
        "author":"Phyllis Schlafly and Chester Ward",
        "book_title":"Strike From Space",
        "publication_year":1966,
        "text":"TEXT of CHUNK"
    },
    {
        "author":"Phyllis Schlafly and Chester Ward",
        "book_title":"Ambush at Vladivostok",
        "publication_year":1976,
        "text":"TEXT of CHUNK"
    },
    {
        "author":"Phyllis Schlafly and Joseph Vecsey",
        "book_title":"Mindsenty the Man",
        "publication_year":1972,
        "text":"TEXT of CHUNK"
    },
    {
        "author":"Phyllis Schlafly",
        "book_title": "Safe Not Sorry",
        "publication_year":1967,
        "text":"TEXT of CHUNK"
    },
    {
        "author":"Phyllis Schlafly and Chester Ward",
        "book_title": "The Betrayers",
        "publication_year":1968,
        "text":"TEXT of CHUNK"
    }
]

[{'author': 'Phyllis Schlafly and Chester Ward',
  'book_title': 'Strike From Space',
  'publication_year': 1966,
  'text': 'TEXT of CHUNK'},
 {'author': 'Phyllis Schlafly and Chester Ward',
  'book_title': 'Ambush at Vladivostok',
  'publication_year': 1976,
  'text': 'TEXT of CHUNK'},
 {'author': 'Phyllis Schlafly and Joseph Vecsey',
  'book_title': 'Mindsenty the Man',
  'publication_year': 1972,
  'text': 'TEXT of CHUNK'},
 {'author': 'Phyllis Schlafly',
  'book_title': 'Safe Not Sorry',
  'publication_year': 1967,
  'text': 'TEXT of CHUNK'},
 {'author': 'Phyllis Schlafly and Chester Ward',
  'book_title': 'The Betrayers',
  'publication_year': 1968,
  'text': 'TEXT of CHUNK'}]

## DAR Chunk Format:

In [5]:
[
    {
        "author":"author name",
        "article_title":"article title",
        "date":"date of article",
        "text":"TEXT of CHUNK"
    }
]

[{'author': 'author name',
  'article_title': 'article title',
  'date': 'date of article',
  'text': 'TEXT of CHUNK'}]

# Embeddings and Upload to Qdrant

Books go into book collection.

DAR articles go into new collection that will be called DAR.

In [6]:
import os
import json
import uuid
from pathlib import Path
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams, Distance

# Load environment variables
load_dotenv("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/.env")

# Configuration
BATCH4_DIR = Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch4")
BOOKS_DIR = BATCH4_DIR / "Books"
DAR_DIR = BATCH4_DIR / "DAR"
BOOK_COLLECTION_NAME = "book_chunks"  # Same as batch3
DAR_COLLECTION_NAME = "DAR"  # New collection for DAR articles

print("🚀 Starting embedding and upload process...")
print(f"📁 Books directory: {BOOKS_DIR}")
print(f"📁 DAR directory: {DAR_DIR}")

# Initialize embedding model (same as batch3)
print("🤖 Loading embedding model...")
model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize Qdrant client (same as batch3)
print("🗄️ Connecting to Qdrant...")
qdrant = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY")
)

def ensure_collection(name):
    """Create collection if it doesn't exist (same as batch3)"""
    collections = qdrant.get_collections().collections
    if not any(c.name == name for c in collections):
        qdrant.recreate_collection(
            collection_name=name,
            vectors_config=VectorParams(size=384, distance=Distance.COSINE)
        )
        print(f"✅ Created new collection: {name}")
    else:
        print(f"✅ Collection already exists: {name}")

def embed_and_upload(chunks, collection_name, batch_name):
    """Embed and upload chunks to Qdrant (same as batch3)"""
    if not chunks:
        print(f"⚠️ No chunks to upload for {batch_name}")
        return
        
    ensure_collection(collection_name)
    
    print(f"🔄 Processing {len(chunks)} chunks for {batch_name}...")
    texts = [c["text"] for c in chunks]
    
    print("🧮 Generating embeddings...")
    vectors = model.encode(texts).tolist()
    
    payloads = chunks
    points = [
        PointStruct(id=str(uuid.uuid4()), vector=vec, payload=payload)
        for vec, payload in zip(vectors, payloads)
    ]
    
    print(f"⬆️ Uploading {len(points)} points to '{collection_name}'...")
    qdrant.upload_points(collection_name=collection_name, points=points)
    print(f"✅ Successfully uploaded {len(points)} points for {batch_name}")

def load_json_files_from_directory(directory, file_type):
    """Load all JSON files from a directory"""
    chunks = []
    json_files = list(directory.glob("*.json"))
    
    print(f"📋 Found {len(json_files)} {file_type} JSON files")
    
    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                file_chunks = json.load(f)
                chunks.extend(file_chunks)
                print(f"   ✅ Loaded {len(file_chunks)} chunks from {json_file.name}")
        except Exception as e:
            print(f"   ❌ Error loading {json_file.name}: {e}")
    
    return chunks

# Process Books
print("\n📚 Processing Books...")
book_chunks = load_json_files_from_directory(BOOKS_DIR, "book")
embed_and_upload(book_chunks, BOOK_COLLECTION_NAME, "Books")

# Process DAR Articles  
print("\n📰 Processing DAR Articles...")
dar_chunks = load_json_files_from_directory(DAR_DIR, "DAR article")
embed_and_upload(dar_chunks, DAR_COLLECTION_NAME, "DAR Articles")

# Summary
print(f"\n🎉 Upload complete!")
print(f"📚 Books: {len(book_chunks)} chunks uploaded to '{BOOK_COLLECTION_NAME}' collection")
print(f"📰 DAR Articles: {len(dar_chunks)} chunks uploaded to '{DAR_COLLECTION_NAME}' collection")
print(f"📊 Total: {len(book_chunks) + len(dar_chunks)} chunks uploaded")


  from .autonotebook import tqdm as notebook_tqdm


🚀 Starting embedding and upload process...
📁 Books directory: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch4/Books
📁 DAR directory: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch4/DAR
🤖 Loading embedding model...
🗄️ Connecting to Qdrant...

📚 Processing Books...
📋 Found 5 book JSON files
   ✅ Loaded 254 chunks from THE_BETRAYERS.json
   ✅ Loaded 362 chunks from SAFE_NOT_SORRY.json
   ✅ Loaded 471 chunks from MINDSZENTY_THE_MAN.json
   ✅ Loaded 379 chunks from STRIKE_FROM_SPACE.json
   ✅ Loaded 334 chunks from AMBUSH_AT_VLADIVOSTOK.json
✅ Collection already exists: book_chunks
🔄 Processing 1800 chunks for Books...
🧮 Generating embeddings...
⬆️ Uploading 1800 points to 'book_chunks'...
✅ Successfully uploaded 1800 points for Books

📰 Processing DAR Articles...
📋 Found 148 DAR article JSON files
   ✅ Loaded 26 chunks from DAR_Article_065_The United Nation's Rip-Off.json
   ✅ Loaded 33 chunks from DAR_Article_040_Freeze or Anti-Freeze-

  qdrant.recreate_collection(


✅ Created new collection: DAR
🔄 Processing 4001 chunks for DAR Articles...
🧮 Generating embeddings...
⬆️ Uploading 4001 points to 'DAR'...
✅ Successfully uploaded 4001 points for DAR Articles

🎉 Upload complete!
📚 Books: 1800 chunks uploaded to 'book_chunks' collection
📰 DAR Articles: 4001 chunks uploaded to 'DAR' collection
📊 Total: 5801 chunks uploaded
