In [None]:
import json
import re
from tqdm import tqdm


In [None]:
# Locations of the raw dataset and of the cleaned dataset written by the last cell.
INPUT_PATH = "medical_rag_dataset.json"
OUTPUT_PATH = "medical_final_dataset.json"

# JSON text is UTF-8 by specification. Pass the encoding explicitly so decoding
# does not depend on the platform's locale default (e.g. cp1252 on Windows),
# which would garble or reject non-ASCII characters in the text.
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    # Later cells iterate over `data` and read the "instruction" / "response"
    # keys of each element.
    data = json.load(f)

print("Loaded samples:", len(data))


In [None]:
def normalize_question(q):
    """Tidy up a raw question string.

    Steps, in order: drop the literal "(are)" marker (case-insensitive), remove
    whitespace in front of "?", collapse runs of "?" into one, collapse runs of
    whitespace into a single space, trim the ends, and upper-case the first
    character.

    Args:
        q: Raw question text. ``None`` or an empty string is tolerated.

    Returns:
        The normalized question, or ``""`` when nothing is left after cleaning.
    """
    if not q:
        return ""

    # Remove "(are)" and similar junk
    q = re.sub(r"\(are\)", "", q, flags=re.IGNORECASE)

    # Remove extra spaces before punctuation
    q = re.sub(r"\s+\?", "?", q)

    # Collapse multiple question marks
    q = re.sub(r"\?+", "?", q)

    # Normalize spaces. Trimming happens here, after the substitutions, because
    # removing a leading "(are)" leaves whitespace at the start that would
    # otherwise survive and defeat the capitalization below.
    q = re.sub(r"\s+", " ", q).strip()

    # Nothing left (input was blank or only junk): avoid indexing q[0].
    if not q:
        return ""

    # Capitalize first letter
    return q[0].upper() + q[1:]


In [None]:
def is_filler(sentence):
    """Tell whether a sentence contains one of the known boilerplate phrases.

    The check is a case-insensitive substring match against a fixed phrase
    list, so a phrase appearing anywhere inside the sentence counts.

    Args:
        sentence: The sentence text to inspect.

    Returns:
        True if any boilerplate phrase occurs in the sentence, else False.
    """
    boilerplate_phrases = (
        "anything that increases your chance",
        "is called a risk factor",
        "does not mean that you will",
        "talk to your doctor",
        "you may",
        "your chance of getting",
    )
    lowered = sentence.lower()
    for phrase in boilerplate_phrases:
        if phrase in lowered:
            return True
    return False


In [None]:
def clean_answer(answer, question, max_sentences=3):
    """Reduce an answer to its first few informative sentences.

    The answer is split on ". ", filler sentences (as judged by ``is_filler``)
    are dropped, and a leading "... is a disease ..." definition sentence is
    dropped when the question is about symptoms, risks, treatments or signs.
    At most ``max_sentences`` of the remaining sentences are kept, in order.

    Note: splitting on the literal ". " is a simple heuristic; abbreviations
    followed by a space (e.g. "Dr. ") are treated as sentence boundaries too.

    Args:
        answer: Raw answer text. ``None`` or an empty string yields ``""``.
        question: The question text, used only to decide whether a leading
            definition sentence is off-topic. ``None`` is treated as ``""``.
        max_sentences: Maximum number of sentences to keep; values <= 0
            yield ``""``.

    Returns:
        The kept sentences joined with ". " and ending in terminal
        punctuation, or ``""`` when nothing is kept.
    """
    if not answer or max_sentences <= 0:
        return ""

    sentences = [s.strip() for s in answer.split(". ") if s.strip()]
    q = (question or "").lower()

    # Loop-invariant: does the question ask about something other than a definition?
    non_definition_question = any(
        k in q for k in ["symptom", "risk", "treatment", "sign"]
    )

    cleaned = []

    for i, s in enumerate(sentences):
        # Remove filler sentences
        if is_filler(s):
            continue

        # Remove definition sentence for non-definition questions
        if i == 0 and non_definition_question and "is a disease" in s.lower():
            continue

        cleaned.append(s)

        if len(cleaned) >= max_sentences:
            break

    result = ". ".join(cleaned).strip()

    # Splitting on ". " consumes the period of every sentence except the last
    # one, so an answer that was truncated or had its tail filtered out would
    # otherwise end without punctuation. Restore a terminal period if needed.
    if result and result[-1] not in ".!?":
        result += "."

    return result


In [None]:
# Build the cleaned dataset: normalize each question, trim each answer, drop
# unusable records and exact (case-insensitive) question/answer duplicates.
final_data = []
seen = set()
skipped_invalid = 0  # records with a missing, non-string or blank question/answer

for item in tqdm(data):
    raw_question = item.get("instruction")
    raw_answer = item.get("response")

    # Validate before cleaning: the helpers index into / split their input, so
    # a missing, null or blank field would abort the whole run before the
    # emptiness check below is ever reached.
    if not isinstance(raw_question, str) or not raw_question.strip():
        skipped_invalid += 1
        continue
    if not isinstance(raw_answer, str) or not raw_answer.strip():
        skipped_invalid += 1
        continue

    question = normalize_question(raw_question)
    answer = clean_answer(raw_answer, question)

    # Cleaning can still leave nothing behind (e.g. every sentence was filler).
    if not question or not answer:
        continue

    # Case-insensitive duplicate detection on the cleaned pair.
    key = (question.lower(), answer.lower())
    if key in seen:
        continue
    seen.add(key)

    final_data.append({
        "instruction": question,
        "context": item.get("context", ""),
        "response": answer,
        "metadata": item.get("metadata", {})
    })

if skipped_invalid:
    print("Skipped malformed samples:", skipped_invalid)


In [None]:
# Write the cleaned dataset as pretty-printed JSON. The encoding is given
# explicitly so the file does not depend on the platform's locale default.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(final_data, f, indent=2)

# The check mark was previously stored as mojibake (its UTF-8 bytes decoded as cp1252).
print("✅ Final dataset saved")
print("Final samples:", len(final_data))
print("Path:", OUTPUT_PATH)
