In [1]:
import pandas as pd
df = pd.read_csv("/Users/adithyakatari/Desktop/suchitra/cdc_health_topics_cleaned.csv")

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
import os
os.environ["CHROMA_TELEMETRY"] = "FALSE"
import chromadb
from chromadb.config import Settings

In [4]:
def chunk_text(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []

    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        if len(chunk.strip()) > 50:
            chunks.append(chunk)

    return chunks


In [5]:
import chromadb
from chromadb.config import Settings

PERSIST_DIR = "/Users/adithyakatari/Desktop/suchitra/chroma_db"

client = chromadb.PersistentClient(
    path=PERSIST_DIR,
    settings=Settings(anonymized_telemetry=False),
    tenant="default_tenant",
    database="default_database",
)

from chromadb.utils import embedding_functions

embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

collection = client.get_or_create_collection(
    name="cdc_diseases",
    embedding_function=embedding_fn
)

# Add a small test first (to confirm it persists)
collection.add(
    ids=["__persist_test__"],
    documents=["persist test"]
)

# flush if supported by your version
if hasattr(client, "persist"):
    client.persist()

print("Collections:", [c.name for c in client.list_collections()])
print("Count:", collection.count())




Collections: ['cdc_diseases']
Count: 1


In [6]:
def detect_sections(text: str, url: str):
    text = text.lower()
    url = url.lower()

    has_symptoms = (
        "symptom" in text
        or "/symptom" in url
    )

    has_prevention = (
        "prevention" in text
        or "prevent" in url
    )

    has_risk_factors = (
        "risk factor" in text
        or "risk factors" in text
        or "/risk-factor" in url
        or "/risk-factors" in url
    )

    return {
        "has_symptoms": has_symptoms,
        "has_prevention": has_prevention,
        "has_risk_factors": has_risk_factors
    }



In [7]:
df = pd.read_csv("cdc_health_topics_cleaned.csv")

documents, metadatas, ids = [], [], []

for idx, row in df.iterrows():
    disease = str(row["disease_name"]).strip()
    url = str(row["url"]).strip()

    if not url.startswith("http"):
        continue

    try:
        response = requests.get(url, timeout=15)
        soup = BeautifulSoup(response.text, "html.parser")

        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        text = soup.get_text(separator=" ", strip=True)
        section_flags = detect_sections(text, url)
        chunks = chunk_text(text)

        for c_idx, chunk in enumerate(chunks):
            documents.append(chunk)
            metadatas.append({
                "disease": disease,
                "url": url,
                **section_flags
            })
            ids.append(f"{disease.lower().replace(' ', '_')}_{idx}_{c_idx}")

    except Exception as e:
        print(f"❌ Failed for {disease}: {e}")


In [8]:
documents

["Additive Manufacturing (3D Printing) | Manufacturing | CDC Skip directly to site content Skip directly to search An official website of the United States government Here's how you know Official websites use .gov A .gov website belongs to an official government organization in the United States. Secure .gov websites use HTTPS A lock ( ) or https:// means you've safely connected to the .gov website. Share sensitive information only on official, secure websites. National Institute for Occupational Safety and Health (NIOSH) Manufacturing Explore This Topic Search Search Clear Search For Everyone Manufacturing 3D Printing (Additive Manufacturing) Biomanufacturing and Synthetic Biology Hazardous Energy Control View all Home search clear search National Institute for Occupational Safety and Health (NIOSH) Manufacturing Menu clear search For Everyone Manufacturing 3D Printing (Additive Manufacturing) Biomanufacturing and Synthetic Biology Hazardous Energy Control View All Manufacturing April

In [9]:
len(documents)

3693

In [10]:
documents[0]

"Additive Manufacturing (3D Printing) | Manufacturing | CDC Skip directly to site content Skip directly to search An official website of the United States government Here's how you know Official websites use .gov A .gov website belongs to an official government organization in the United States. Secure .gov websites use HTTPS A lock ( ) or https:// means you've safely connected to the .gov website. Share sensitive information only on official, secure websites. National Institute for Occupational Safety and Health (NIOSH) Manufacturing Explore This Topic Search Search Clear Search For Everyone Manufacturing 3D Printing (Additive Manufacturing) Biomanufacturing and Synthetic Biology Hazardous Energy Control View all Home search clear search National Institute for Occupational Safety and Health (NIOSH) Manufacturing Menu clear search For Everyone Manufacturing 3D Printing (Additive Manufacturing) Biomanufacturing and Synthetic Biology Hazardous Energy Control View All Manufacturing April 

In [11]:
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)


In [12]:
results = collection.query(
    query_texts=["how to prevent cancer"],
    where={"has_prevention": True},
    n_results=10
)


In [13]:
results

{'ids': [['cancer_clusters_183_2',
   'obesity_and_cancer_933_1',
   'liver_cancer_774_1',
   'occupational_cancer_934_0',
   'neutropenia_916_0',
   'preventing_infections_in_cancer_patients_1059_0',
   'tobacco_and_cancer_1297_0',
   'cancer_182_0',
   'comprehensive_cancer_control_275_0',
   'national_comprehensive_cancer_control_program_888_0']],
 'embeddings': None,
 'documents': [["cancer, number of cases, and the age, sex, race, address, occupation, and age at diagnosis of the people with cancer. CDC/ATSDR has created a decision-making tool that can be used by health departments for determining the need to further assess an unusual pattern of cancer. The tool contains a set of criteria that can be used to assess the cancer(s) of concern and/or related environmental risk factors. These criteria promote further assessment of unusual patterns of cancer. Reducing your risk of developing cancer In general, cancers have a variety of risk factors related to behavioral, lifestyle, occup

In [14]:
for i in range(len(results["ids"][0])):
    print("=" * 80)
    print(f"Result {i+1}")
    print("ID:", results["ids"][0][i])
    print("Disease:", results["metadatas"][0][i]["disease"])
    print("URL:", results["metadatas"][0][i]["url"])
    print("Distance:", results["distances"][0][i])
    print("\nTEXT:\n", results["documents"][0][i])

Result 1
ID: cancer_clusters_183_2
Disease: Cancer Clusters
URL: https://www.cdc.gov/cancer-environment/about/
Distance: 0.4205291271209717

TEXT:
 cancer, number of cases, and the age, sex, race, address, occupation, and age at diagnosis of the people with cancer. CDC/ATSDR has created a decision-making tool that can be used by health departments for determining the need to further assess an unusual pattern of cancer. The tool contains a set of criteria that can be used to assess the cancer(s) of concern and/or related environmental risk factors. These criteria promote further assessment of unusual patterns of cancer. Reducing your risk of developing cancer In general, cancers have a variety of risk factors related to behavioral, lifestyle, occupational, and environmental risks. For many cancers, you can reduce your risk of developing cancer by adopting a healthy lifestyle that includes avoiding tobacco use, excessive alcohol consumption, and sun exposure; increasing physical activity

In [15]:
import re

def rerank_chunks(query, documents, metadatas, top_k=5):
    query = query.lower()
    keywords = set(re.findall(r"\w+", query))

    scored = []

    for doc, meta in zip(documents, metadatas):
        text = doc.lower()
        text_words = set(re.findall(r"\w+", text))

        # keyword overlap score
        keyword_score = len(keywords.intersection(text_words))

        # boost if prevention-related
        boost = 2 if meta.get("has_prevention") else 0

        final_score = keyword_score + boost
        scored.append((final_score, doc, meta))

    # sort by score (descending)
    scored.sort(key=lambda x: x[0], reverse=True)

    return scored[:top_k]


In [16]:
reranked = rerank_chunks(
    query="how to prevent cancer",
    documents=results["documents"][0],
    metadatas=results["metadatas"][0],
    top_k=5
)


In [17]:
reranked

[(6,
  'The risk of cancer increases with the more excess weight a person gains and the longer a person is overweight. How to lower your risk To reduce your risk of obesity-associated cancer, you can follow a healthy eating plan and get regular physical activity . If you are concerned about your weight or your child\'s weight, talk to your doctor. "Talk to Someone" Simulation Talk to Someone: Physical Activity and Nutrition Talk to Someone: Physical Activity and Nutrition gives tips for cancer survivors to improve physical activity and healthy eating. How doctors can help their patients prevent or treat obesity Health care providers can help patients reach a healthy weight in several ways. For example, they can: Measure patients\' weight, height, and BMI. Explain that keeping a healthy weight can lower a person\'s cancer risk. Connect patients and families with community services that provide healthy food and ways to be active. Refer patients with a high BMI who want to lose weight to 

In [18]:
import os
os.listdir(".")


['cdc_health_topics_cleaned.csv',
 'requirements.txt',
 'sources links.docx',
 'cdc_health_topics.csv',
 'datapreprocessing.ipynb',
 'vector_database.ipynb',
 'chroma_db',
 'Web_Extraction.ipynb',
 'context_builder.ipynb']

In [19]:
print("Collections:", [c.name for c in client.list_collections()])


Collections: ['cdc_diseases']


In [20]:
import os
print("CWD:", os.getcwd())
print("Persist dir absolute:", os.path.abspath("./chroma_db"))
print("Files:", os.listdir("."))


CWD: /Users/adithyakatari/Desktop/suchitra
Persist dir absolute: /Users/adithyakatari/Desktop/suchitra/chroma_db
Files: ['cdc_health_topics_cleaned.csv', 'requirements.txt', 'sources links.docx', 'cdc_health_topics.csv', 'datapreprocessing.ipynb', 'vector_database.ipynb', 'chroma_db', 'Web_Extraction.ipynb', 'context_builder.ipynb']


In [21]:
#client.delete_collection("cdc_diseases")