In [1]:
import os
import json
import re

# Root directory (a relative path) that is walked recursively for *.json chunk
# files; it is passed to clean_all_json_files() at the bottom of this cell.
ROOT_DIR = "data/scraped_data"

def clean_json_file(json_path):
    """Remove chunks whose "text" consists solely of '=' characters.

    The file at ``json_path`` is expected to hold a JSON list of chunk
    objects (dicts). A chunk is dropped when its "text" value, after
    stripping surrounding whitespace, is one or more '=' and nothing else.
    The file is rewritten in place only when at least one chunk was dropped.

    Entries that cannot be judged (non-dict items, or a "text" value that is
    missing, null, or not a string) are kept untouched instead of crashing
    the whole run.

    Args:
        json_path: Path to the JSON file to clean.

    Returns:
        None. Progress and skip messages are printed.
    """
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            chunks = json.load(f)
    except json.JSONDecodeError:
        print(f"⚠️ Skipping {json_path}: Invalid JSON format.")
        return

    # Iterating a dict/str/number as if it were the chunk list would either
    # raise or silently rewrite the file into something else, so skip it.
    if not isinstance(chunks, list):
        print(f"⚠️ Skipping {json_path}: expected a JSON list of chunks.")
        return

    def _is_separator(chunk):
        """Return True only for dict chunks whose text is purely '=' characters."""
        if not isinstance(chunk, dict):
            return False
        text = chunk.get("text", "")
        # "text": null (or any non-string) has no .strip(); treat as not a separator.
        if not isinstance(text, str):
            return False
        return re.fullmatch(r"=+", text.strip()) is not None

    cleaned_chunks = [chunk for chunk in chunks if not _is_separator(chunk)]

    if len(cleaned_chunks) != len(chunks):
        print(f"🧹 Cleaned {len(chunks) - len(cleaned_chunks)} chunks from {json_path}")

        # Save the cleaned data back to the same JSON file
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_chunks, f, indent=2, ensure_ascii=False)

def clean_all_json_files(root_dir):
    """Walk ``root_dir`` recursively and run clean_json_file on every *.json file.

    Args:
        root_dir: Top-level directory whose whole subtree is scanned.
    """
    for folder, _dirnames, filenames in os.walk(root_dir):
        # Only names ending in ".json" are candidates; everything else is ignored.
        json_names = (name for name in filenames if name.endswith(".json"))
        for name in json_names:
            clean_json_file(os.path.join(folder, name))

# Run the cleaning process.
# The guard keeps the walk from firing if this code is imported as a module;
# when executed directly it cleans everything under ROOT_DIR.
# NOTE: this rewrites the JSON files in place (no backup copy is made).
if __name__ == "__main__":
    clean_all_json_files(ROOT_DIR)
    print("✅ Finished cleaning all JSON files.")


🧹 Cleaned 10 chunks from data/scraped_data/brand.vanderbilt.edu/brand.vanderbilt.edu_part1.txt.json
🧹 Cleaned 10 chunks from data/scraped_data/giving.vanderbilt.edu/giving.vanderbilt.edu_part1.txt.json
🧹 Cleaned 13 chunks from data/scraped_data/gradschool.vanderbilt.edu/gradschool.vanderbilt.edu_part1.txt.json
🧹 Cleaned 13 chunks from data/scraped_data/hr.vanderbilt.edu/hr.vanderbilt.edu_part1.txt.json
🧹 Cleaned 15 chunks from data/scraped_data/medschool.vanderbilt.edu/medschool.vanderbilt.edu_part1.txt.json
🧹 Cleaned 10 chunks from data/scraped_data/peabody.vanderbilt.edu/peabody.vanderbilt.edu_part1.txt.json
🧹 Cleaned 17 chunks from data/scraped_data/business.vanderbilt.edu/business.vanderbilt.edu_part1.txt.json
🧹 Cleaned 33 chunks from data/scraped_data/news.vanderbilt.edu/news.vanderbilt.edu_part1.txt.json
🧹 Cleaned 2 chunks from data/scraped_data/studentorg.vanderbilt.edu/studentorg.vanderbilt.edu_part1.txt.json
🧹 Cleaned 20 chunks from data/scraped_data/law.vanderbilt.edu/law.van

In [6]:
import os
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
import uuid

# Define the directory where ChromaDB is stored
CHROMA_DB_DIR = "../chroma_db"
COLLECTION_NAME = "WebsiteData"  # Adjust based on your setup

# Initialize the vector store
print("🔍 Connecting to ChromaDB...")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = Chroma(collection_name=COLLECTION_NAME, embedding_function=embeddings, persist_directory=CHROMA_DB_DIR)

# Create a dummy document whose content is only '=' characters, used to
# exercise the "remove '='-only chunks" clean-up.
dummy_text = "==="  # Content with only '=' characters
# Generate the id once so the SAME value is used both in the metadata and as
# the record's id in Chroma. Without passing ids=..., the store assigns its own
# id and a later delete(ids=[metadata["id"]]) would not match this record.
dummy_id = str(uuid.uuid4())
dummy_doc = Document(
    page_content=dummy_text,
    metadata={"id": dummy_id, "source": "dummy_test", "chunk_number": 999}
)

# Add to vector store
print(f"📦 Adding dummy chunk: {dummy_doc.page_content}")
vector_store.add_documents([dummy_doc], ids=[dummy_id])

print("✅ Dummy chunk added successfully.")



🔍 Connecting to ChromaDB...
📦 Adding dummy chunk: ===
✅ Dummy chunk added successfully.


In [11]:
import os
import re
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the directory where ChromaDB is stored
CHROMA_DB_DIR = "../chroma_db"  # Adjust based on your actual path
COLLECTION_NAME = "WebsiteData"  # Ensure this matches your collection name

# Initialize the vector store
print("🔍 Connecting to ChromaDB...")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = Chroma(collection_name=COLLECTION_NAME, embedding_function=embeddings, persist_directory=CHROMA_DB_DIR)

# Retrieve all documents and their metadata. "ids" is not a valid `include`
# option because the result always carries an "ids" list, positionally
# aligned with "documents" and "metadatas".
print(f"📥 Retrieving documents from '{COLLECTION_NAME}' collection...")
all_docs = vector_store.get(include=["documents", "metadatas"])

documents = all_docs.get("documents") or []
metadatas = all_docs.get("metadatas") or []

print(f"📌 Found {len(documents)} documents in vector store.")

# Use the record ids Chroma itself returned. These are the ids delete()
# expects, and they line up one-to-one with `documents`. (Building the list
# from metadata["id"] and skipping records without that key shifts the
# positions, so index i would point at a different record's id.)
stored_ids = all_docs.get("ids") or []

# Filter documents that contain only '=' characters
to_remove_ids = [
    doc_id
    for doc_id, doc in zip(stored_ids, documents)
    if isinstance(doc, str) and re.fullmatch(r"=+", doc.strip())
]

if not to_remove_ids:
    print("✅ No invalid documents found. Nothing to remove.")
else:
    print(f"🗑️ Removing {len(to_remove_ids)} invalid documents...")

    # NOTE(review): a recorded run of this cell failed here with
    # "OperationalError: attempt to write a readonly database". That looks
    # environmental (e.g. write permissions on CHROMA_DB_DIR, or another
    # kernel/process holding the database) rather than a logic error; confirm
    # before re-running.
    vector_store.delete(ids=to_remove_ids)
    print(f"✅ Successfully removed {len(to_remove_ids)} documents.")


🔍 Connecting to ChromaDB...
📥 Retrieving documents from 'WebsiteData' collection...
📌 Found 6760 documents in vector store.
🗑️ Removing 674 invalid documents...


OperationalError: attempt to write a readonly database