In [1]:
from pathlib import Path
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from uuid import uuid4
import os
import json
import sys
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables
try:
    load_dotenv("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/.env")
    
    QDRANT_URL = os.getenv("QDRANT_URL")
    QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
    
    if not QDRANT_URL or not QDRANT_API_KEY:
        raise ValueError("QDRANT_URL or QDRANT_API_KEY environment variables not found")
    
    # Initialize Qdrant client and embedding model
    client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
    print(f"✅ Successfully connected to Qdrant at {QDRANT_URL}")
    
    model = SentenceTransformer("all-MiniLM-L6-v2")
    print(f"✅ Loaded embedding model: all-MiniLM-L6-v2 (output dim: {model.get_sentence_embedding_dimension()})")
except Exception as e:
    print(f"❌ Error during initialization: {e}")
    sys.exit(1)


✅ Successfully connected to Qdrant at https://3031677a-6463-44f9-ba66-42977581720e.us-east-1-0.aws.cloud.qdrant.io
✅ Loaded embedding model: all-MiniLM-L6-v2 (output dim: 384)


In [3]:
def ensure_collection(name):
    """Create collection if it doesn't exist."""
    try:
        collections = [c.name for c in client.get_collections().collections]
        if name not in collections:
            client.recreate_collection(
                collection_name=name,
                vectors_config=VectorParams(size=model.get_sentence_embedding_dimension(), distance=Distance.COSINE)
            )
            print(f"✅ Created collection: {name}")
        else:
            print(f"ℹ️ Collection {name} already exists")
    except Exception as e:
        print(f"❌ Error ensuring collection {name}: {e}")
        raise

def embed_and_upload(chunks, collection_name):
    """Embed text chunks and upload them to Qdrant."""
    if not chunks:
        print(f"⚠️ Warning: No chunks to upload to {collection_name}")
        return
    
    print(f"📦 Processing {len(chunks)} chunks for collection {collection_name}...")
    ensure_collection(collection_name)
    
    batch_size = 100
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]
        texts = [chunk["text"] for chunk in batch]
        
        try:
            embeddings = model.encode(texts, show_progress_bar=True).tolist()
        except Exception as e:
            print(f"❌ Error embedding batch starting at index {i}: {e}")
            continue
        
        points = []
        for chunk, embedding in zip(batch, embeddings):
            payload = chunk.get("metadata", {}).copy()
            payload["text"] = chunk["text"]
            points.append(
                PointStruct(
                    id=str(uuid4()),
                    vector=embedding,
                    payload=payload
                )
            )
        
        try:
            client.upload_points(collection_name=collection_name, points=points)
            print(f"✅ Uploaded batch {i//batch_size + 1}/{(len(chunks)-1)//batch_size + 1} to {collection_name}")
        except Exception as e:
            print(f"❌ Error uploading batch to {collection_name}: {e}")


In [4]:
def load_commentaries(commentaries_path, net_tv_path):
    """Load commentary and NET-TV chunks into a single list."""
    chunks = []
    
    # Load Commentaries (normal)
    with open(commentaries_path, "r", encoding="utf-8") as f:
        commentary_data = json.load(f)
        for entry in commentary_data:
            chunks.append({
                "text": entry["text"],
                "metadata": entry["metadata"]
            })
    
    # Load NET-TV (full_text remapping)
    with open(net_tv_path, "r", encoding="utf-8") as f:
        net_tv_data = json.load(f)
        for entry in net_tv_data:
            chunks.append({
                "text": entry["full_text"],
                "metadata": {
                    "author": entry.get("metadata", {}).get("author", "Unknown"),
                    "title": entry.get("metadata", {}).get("title", "Unknown"),
                    "publication": entry.get("metadata", {}).get("publication", "NET-TV"),
                    "date_recorded": entry.get("metadata", {}).get("date_recorded", "Unknown")
                }
            })
    
    print(f"✅ Loaded {len(chunks)} commentary + NET-TV chunks")
    return chunks

def load_books(book_paths):
    """Load book chunks from multiple JSON files."""
    chunks = []
    
    for book_path in book_paths:
        with open(book_path, "r", encoding="utf-8") as f:
            book_data = json.load(f)
            for entry in book_data:
                chunks.append({
                    "text": entry["text"],
                    "metadata": {
                        "author": entry.get("author", "Unknown"),
                        "book_title": entry.get("book_title", "Unknown"),
                        "publication_year": entry.get("publication_year", "Unknown"),
                        "doc_type": "Phyllis Schlafly Book",
                        "source_file": Path(book_path).name
                    }
                })
    
    print(f"✅ Loaded {len(chunks)} book chunks")
    return chunks

def load_columns(columns_path):
    """Load othercolumns.json into chunks."""
    with open(columns_path, "r", encoding="utf-8") as f:
        columns_data = json.load(f)
    
    chunks = []
    for entry in columns_data:
        chunks.append({
            "text": entry["text"],
            "metadata": entry["metadata"]
        })
    
    print(f"✅ Loaded {len(chunks)} columns chunks")
    return chunks


In [5]:
# Define your file paths
commentaries_path = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch2/commentaries/2002.json"
net_tv_path = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch2/NET-TV.json"

book_paths = [
    "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch2/allegiance.json",
    "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch2/choice_not_echo_2014.json",
    "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch2/how_mass_immigration.json"
]

columns_path = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch2/othercolumns.json"

# Load the chunks
commentary_chunks = load_commentaries(commentaries_path, net_tv_path)
book_chunks = load_books(book_paths)
columns_chunks = load_columns(columns_path)

# Upload
embed_and_upload(commentary_chunks, "commentaries")
embed_and_upload(book_chunks, "book_chunks")
embed_and_upload(columns_chunks, "columns_chunks")


✅ Loaded 323 commentary + NET-TV chunks
✅ Loaded 866 book chunks
✅ Loaded 20 columns chunks
📦 Processing 323 chunks for collection commentaries...


  client.recreate_collection(


✅ Created collection: commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.09it/s]


✅ Uploaded batch 1/4 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.33it/s]


✅ Uploaded batch 2/4 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.23it/s]


✅ Uploaded batch 3/4 to commentaries


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]


✅ Uploaded batch 4/4 to commentaries
📦 Processing 866 chunks for collection book_chunks...
ℹ️ Collection book_chunks already exists


Batches: 100%|██████████| 4/4 [00:04<00:00,  1.10s/it]


✅ Uploaded batch 1/9 to book_chunks


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.05it/s]


✅ Uploaded batch 2/9 to book_chunks


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.51it/s]


✅ Uploaded batch 3/9 to book_chunks


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.34it/s]


✅ Uploaded batch 4/9 to book_chunks


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.27it/s]


✅ Uploaded batch 5/9 to book_chunks


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.50it/s]


✅ Uploaded batch 6/9 to book_chunks


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.26it/s]


✅ Uploaded batch 7/9 to book_chunks


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.36it/s]


✅ Uploaded batch 8/9 to book_chunks


Batches: 100%|██████████| 3/3 [00:02<00:00,  1.07it/s]


✅ Uploaded batch 9/9 to book_chunks
📦 Processing 20 chunks for collection columns_chunks...
✅ Created collection: columns_chunks


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.21it/s]


✅ Uploaded batch 1/1 to columns_chunks


In [6]:
def load_remaining_commentaries(commentaries_dir, skip_files=["2002.json"]):
    """Load only the remaining commentary JSONs (2003–2024)."""
    chunks = []
    
    commentaries_dir = Path(commentaries_dir)
    if not commentaries_dir.exists():
        print(f"⚠️ Commentary folder {commentaries_dir} does not exist")
        return chunks
    
    json_files = [f for f in commentaries_dir.glob("*.json") if f.name not in skip_files]
    print(f"📂 Found {len(json_files)} remaining commentary JSON files")
    
    for file in tqdm(json_files, desc="Loading remaining commentaries"):
        try:
            with open(file, "r", encoding="utf-8") as f:
                commentary_data = json.load(f)
                for entry in commentary_data:
                    chunks.append({
                        "text": entry["text"],
                        "metadata": entry["metadata"]
                    })
        except Exception as e:
            print(f"❌ Error loading {file.name}: {e}")
    
    print(f"✅ Loaded {len(chunks)} remaining commentary chunks")
    return chunks

# 🔁 Run this to upload the rest
remaining_commentary_chunks = load_remaining_commentaries(
    "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch2/commentaries/"
)

embed_and_upload(remaining_commentary_chunks, "commentaries")


📂 Found 22 remaining commentary JSON files


Loading remaining commentaries: 100%|██████████| 22/22 [00:00<00:00, 112.26it/s]

✅ Loaded 5700 remaining commentary chunks
📦 Processing 5700 chunks for collection commentaries...





ℹ️ Collection commentaries already exists


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]


✅ Uploaded batch 1/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.18it/s]


✅ Uploaded batch 2/57 to commentaries


Batches: 100%|██████████| 4/4 [00:04<00:00,  1.10s/it]


✅ Uploaded batch 3/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.03it/s]


✅ Uploaded batch 4/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.14it/s]


✅ Uploaded batch 5/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.33it/s]


✅ Uploaded batch 6/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.28it/s]


✅ Uploaded batch 7/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.28it/s]


✅ Uploaded batch 8/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.20it/s]


✅ Uploaded batch 9/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.30it/s]


✅ Uploaded batch 10/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.31it/s]


✅ Uploaded batch 11/57 to commentaries


Batches: 100%|██████████| 4/4 [00:04<00:00,  1.01s/it]


✅ Uploaded batch 12/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.19it/s]


✅ Uploaded batch 13/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.13it/s]


✅ Uploaded batch 14/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.25it/s]


✅ Uploaded batch 15/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.24it/s]


✅ Uploaded batch 16/57 to commentaries


Batches: 100%|██████████| 4/4 [00:04<00:00,  1.01s/it]


✅ Uploaded batch 17/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.15it/s]


✅ Uploaded batch 18/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.25it/s]


✅ Uploaded batch 19/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.32it/s]


✅ Uploaded batch 20/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.30it/s]


✅ Uploaded batch 21/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.25it/s]


✅ Uploaded batch 22/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.19it/s]


✅ Uploaded batch 23/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.12it/s]


✅ Uploaded batch 24/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.16it/s]


✅ Uploaded batch 25/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.19it/s]


✅ Uploaded batch 26/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.18it/s]


✅ Uploaded batch 27/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]


✅ Uploaded batch 28/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.02it/s]


✅ Uploaded batch 29/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.01it/s]


✅ Uploaded batch 30/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.23it/s]


✅ Uploaded batch 31/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.16it/s]


✅ Uploaded batch 32/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.30it/s]


✅ Uploaded batch 33/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]


✅ Uploaded batch 34/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.36it/s]


✅ Uploaded batch 35/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.38it/s]


✅ Uploaded batch 36/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.38it/s]


✅ Uploaded batch 37/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.35it/s]


✅ Uploaded batch 38/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.36it/s]


✅ Uploaded batch 39/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.38it/s]


✅ Uploaded batch 40/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.28it/s]


✅ Uploaded batch 41/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.36it/s]


✅ Uploaded batch 42/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.35it/s]


✅ Uploaded batch 43/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.37it/s]


✅ Uploaded batch 44/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.38it/s]


✅ Uploaded batch 45/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.37it/s]


✅ Uploaded batch 46/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.31it/s]


✅ Uploaded batch 47/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.39it/s]


✅ Uploaded batch 48/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.35it/s]


✅ Uploaded batch 49/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.28it/s]


✅ Uploaded batch 50/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.38it/s]


✅ Uploaded batch 51/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]


✅ Uploaded batch 52/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.44it/s]


✅ Uploaded batch 53/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.35it/s]


✅ Uploaded batch 54/57 to commentaries


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.37it/s]


✅ Uploaded batch 55/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.33it/s]


✅ Uploaded batch 56/57 to commentaries


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.19it/s]


✅ Uploaded batch 57/57 to commentaries


In [7]:
def load_interviews(interview_path):
    """Load interview chunks from JSON file with flat structure."""
    chunks = []
    
    path = Path(interview_path)
    if not path.exists():
        print(f"⚠️ Interview file {path} does not exist")
        return chunks
    
    try:
        with open(path, "r", encoding="utf-8") as f:
            interview_data = json.load(f)
            for entry in interview_data:
                metadata = entry.copy()
                text = metadata.pop("text")  # remove text from metadata
                chunks.append({
                    "text": text,
                    "metadata": metadata
                })
        print(f"✅ Loaded {len(chunks)} interview chunks")
        return chunks
    except Exception as e:
        print(f"❌ Error loading interview chunks: {e}")
        return []


In [8]:
interview_path = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/batch2/interview.json"

interview_chunks = load_interviews(interview_path)
embed_and_upload(interview_chunks, "interviews")


✅ Loaded 770 interview chunks
📦 Processing 770 chunks for collection interviews...


  client.recreate_collection(


✅ Created collection: interviews


Batches: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]


✅ Uploaded batch 1/8 to interviews


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.03it/s]


✅ Uploaded batch 2/8 to interviews


Batches: 100%|██████████| 4/4 [00:04<00:00,  1.08s/it]


✅ Uploaded batch 3/8 to interviews


Batches: 100%|██████████| 4/4 [00:04<00:00,  1.06s/it]


✅ Uploaded batch 4/8 to interviews


Batches: 100%|██████████| 4/4 [00:06<00:00,  1.61s/it]


✅ Uploaded batch 5/8 to interviews


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.08it/s]


✅ Uploaded batch 6/8 to interviews


Batches: 100%|██████████| 4/4 [00:04<00:00,  1.01s/it]


✅ Uploaded batch 7/8 to interviews


Batches: 100%|██████████| 3/3 [00:03<00:00,  1.06s/it]


✅ Uploaded batch 8/8 to interviews
