In [21]:
import os
import json
import re

# Root directory that clean_all_json_files() walks recursively for ".json" files.
# The path is relative, so it is resolved against the current working directory.
ROOT_DIR = "data/scraped_data"

def _is_separator_chunk(chunk):
    """Return True when ``chunk`` is a dict whose "text" is a string made only of '='."""
    if not isinstance(chunk, dict):
        return False
    text = chunk.get("text")
    # A missing key, null, or any non-string value is never treated as a separator;
    # calling .strip() on such values is what used to raise AttributeError.
    return isinstance(text, str) and re.fullmatch(r"=+", text.strip()) is not None


def clean_json_file(json_path):
    """Removes chunks that contain only '=' characters from the JSON file.

    The file is expected to hold a JSON list of chunk objects. A chunk is
    dropped when its "text" value, after stripping surrounding whitespace,
    consists solely of one or more '=' characters. All other elements
    (including chunks without a string "text", or elements that are not
    objects) are kept untouched.

    The file is rewritten in place only when at least one chunk was removed.

    Args:
        json_path: Path of the JSON file to clean.

    Returns:
        None. Progress and skip notices are printed to stdout.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
    """
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            chunks = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Files that are not valid UTF-8 cannot be valid JSON for this reader either,
        # so they are skipped the same way instead of aborting the whole run.
        print(f"⚠️ Skipping {json_path}: Invalid JSON format.")
        return

    # Iterating a JSON object would yield its keys, not chunks, so refuse to guess.
    if not isinstance(chunks, list):
        print(f"⚠️ Skipping {json_path}: Expected a JSON list of chunks.")
        return

    cleaned_chunks = [chunk for chunk in chunks if not _is_separator_chunk(chunk)]
    removed_count = len(chunks) - len(cleaned_chunks)

    # Leave the file untouched when there is nothing to remove.
    if removed_count == 0:
        return

    print(f"🧹 Cleaned {removed_count} chunks from {json_path}")

    # Save the cleaned data back to the same JSON file
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(cleaned_chunks, f, indent=2, ensure_ascii=False)

def clean_all_json_files(root_dir):
    """Walk ``root_dir`` recursively and run ``clean_json_file`` on every ``.json`` file.

    Args:
        root_dir: Directory whose tree is searched for files ending in ".json".
    """
    for current_dir, _dirnames, filenames in os.walk(root_dir):
        # Only names with the exact ".json" suffix are handed to the cleaner.
        json_names = (name for name in filenames if name.endswith(".json"))
        for name in json_names:
            clean_json_file(os.path.join(current_dir, name))

# Entry point: clean every JSON file found under ROOT_DIR, then report completion.
if __name__ == "__main__":
    clean_all_json_files(root_dir=ROOT_DIR)
    print("✅ Finished cleaning all JSON files.")


🧹 Cleaned 10 chunks from data/scraped_data/brand.vanderbilt.edu/brand.vanderbilt.edu_part1.txt.json
🧹 Cleaned 10 chunks from data/scraped_data/giving.vanderbilt.edu/giving.vanderbilt.edu_part1.txt.json
🧹 Cleaned 13 chunks from data/scraped_data/gradschool.vanderbilt.edu/gradschool.vanderbilt.edu_part1.txt.json
🧹 Cleaned 13 chunks from data/scraped_data/hr.vanderbilt.edu/hr.vanderbilt.edu_part1.txt.json
🧹 Cleaned 15 chunks from data/scraped_data/medschool.vanderbilt.edu/medschool.vanderbilt.edu_part1.txt.json
🧹 Cleaned 10 chunks from data/scraped_data/peabody.vanderbilt.edu/peabody.vanderbilt.edu_part1.txt.json
🧹 Cleaned 17 chunks from data/scraped_data/business.vanderbilt.edu/business.vanderbilt.edu_part1.txt.json
🧹 Cleaned 33 chunks from data/scraped_data/news.vanderbilt.edu/news.vanderbilt.edu_part1.txt.json
🧹 Cleaned 2 chunks from data/scraped_data/studentorg.vanderbilt.edu/studentorg.vanderbilt.edu_part1.txt.json
🧹 Cleaned 20 chunks from data/scraped_data/law.vanderbilt.edu/law.van

In [2]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Location of the persisted Chroma store and the collection to inspect.
CHROMA_DB_DIR = "../chroma_db"
COLLECTION_NAME = "WebsiteData"

# Open the collection with the embedding model named below.
print("🔍 Connecting to ChromaDB...")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=CHROMA_DB_DIR,
)

# Report how many entries the collection holds.
# NOTE(review): `_collection` is a private attribute of the wrapper (leading
# underscore) and may change between library versions — confirm on upgrade.
doc_count = vector_store._collection.count()
print(f"📌 Total documents in '{COLLECTION_NAME}': {doc_count}")


🔍 Connecting to ChromaDB...
📌 Total documents in 'WebsiteData': 6086
