In [2]:
# 03_relational_and_embedding_check.ipynb

# ✅ Notebook Goal:
# Validate that foreign key relationships (SQLite) and embedded structures (MongoDB) are correct

# --- Imports (grouped in one place: stdlib first, then third-party) ---
import sqlite3
from pathlib import Path
from pprint import pprint

import pandas as pd
import pymongo

# --- MongoDB Setup ---
# client -> database -> collection handle used by the MongoDB checks.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["openfda_converted"]
collection = db["full_reports"]

# --- SQLite Setup ---
# sqlite3.connect() silently creates a new, empty database when the file does
# not exist, so a wrong relative path would only surface later as confusing
# "no such table" errors. Fail fast instead, and open the file read-only:
# this notebook only validates data and must never modify it.
SQLITE_DB_PATH = Path("../../sql/openfda_base_updated.db")
if not SQLITE_DB_PATH.is_file():
    raise FileNotFoundError(f"SQLite database not found: {SQLITE_DB_PATH.resolve()}")

# as_uri() needs an absolute path and percent-encodes special characters;
# "?mode=ro" is the SQLite URI parameter for a read-only connection.
conn = sqlite3.connect(f"{SQLITE_DB_PATH.resolve().as_uri()}?mode=ro", uri=True)

In [10]:
# --- 1. SQLite Foreign Key Consistency ---
print("\n✅ Checking SQLite foreign key relationships:")

# Each entry: (child table, foreign-key column in the child, parent table)
table_fk_pairs = [
    ("reaction", "safetyreportid", "report"),
    ("drug", "safetyreportid", "report"),
    ("drug_optional", "drug_id", "drug"),
    ("drug_openfda", "drug_id", "drug"),
    ("drug_activesubstance", "drug_id", "drug"),
    ("report_duplicate", "safetyreportid", "report"),
    ("summary", "safetyreportid", "report"),
    ("patient_optional", "safetyreportid", "report"),
    ("primarysource_literature_reference", "safetyreportid", "report"),
]

# Primary-key column of every parent table referenced above
primary_keys = {"report": "safetyreportid", "drug": "id"}

# Anti-join: keep child rows for which the LEFT JOIN found no parent row
# (this also returns child rows whose foreign key is NULL).
# LIMIT 5 means only a small sample of offending keys is fetched.
ORPHAN_QUERY_TEMPLATE = """
    SELECT {child}.{fk}
    FROM {child}
    LEFT JOIN {parent}
    ON {child}.{fk} = {parent}.{pk}
    WHERE {parent}.{pk} IS NULL
    LIMIT 5;
    """

for child, fk, parent in table_fk_pairs:
    pk = primary_keys[parent]
    orphans = pd.read_sql_query(
        ORPHAN_QUERY_TEMPLATE.format(child=child, fk=fk, parent=parent, pk=pk),
        conn,
    )
    if not orphans.empty:
        print(f"❌ Foreign key error in {child}: Orphan records detected")
        print(orphans)
        continue
    print(f"✅ {child}.{fk} → {parent}.{pk}: OK")


✅ Checking SQLite foreign key relationships:
✅ reaction.safetyreportid → report.safetyreportid: OK
✅ drug.safetyreportid → report.safetyreportid: OK
✅ drug_optional.drug_id → drug.id: OK
✅ drug_openfda.drug_id → drug.id: OK
✅ drug_activesubstance.drug_id → drug.id: OK
✅ report_duplicate.safetyreportid → report.safetyreportid: OK
✅ summary.safetyreportid → report.safetyreportid: OK
✅ patient_optional.safetyreportid → report.safetyreportid: OK
✅ primarysource_literature_reference.safetyreportid → report.safetyreportid: OK


In [7]:
# --- 2. MongoDB Embedded Object Check ---
print("\n✅ Checking embedded fields in MongoDB:")


def count_exists(path):
    """Return how many documents in `collection` have a field at `path`.

    `path` is a MongoDB dot-notation field path; documents are matched with
    an `$exists: True` filter on that path.
    """
    exists_filter = {path: {"$exists": True}}
    return collection.count_documents(exists_filter)


# Dot-notation paths of the embedded structures to verify
embedded_paths = [
    "patient.reaction",
    "patient.drug",
    "patient.drug.activesubstance",
    "patient.drug.openfda",
    "patient.summary",
    "patient.summary.case_event_date_extracted",
    "reportduplicate",
]

for path in embedded_paths:
    print(f"Documents with {path}: {count_exists(path)}")


✅ Checking embedded fields in MongoDB:
Documents with patient.reaction: 35999
Documents with patient.drug: 35999
Documents with patient.drug.activesubstance: 35999
Documents with patient.drug.openfda: 35670
Documents with patient.summary: 16948
Documents with patient.summary.case_event_date_extracted: 16948
Documents with reportduplicate: 11561


In [9]:
# --- Optional: Check nested list structure lengths ---
# Both averages are taken over ALL reports, counting a report without any
# reaction as 0, so the MongoDB and SQLite figures are directly comparable.
print("\n📊 Avg number of reactions per report (MongoDB):")
pipeline = [
    # A missing/null patient.reaction is treated as an empty list (0 reactions).
    {"$project": {"reaction_count": {"$size": {"$ifNull": ["$patient.reaction", []]}}}},
    {"$group": {"_id": None, "avg": {"$avg": "$reaction_count"}}}
]
result = list(collection.aggregate(pipeline))
if result:
    print(f"Average reactions per report: {result[0]['avg']:.2f}")
else:
    # $group emits no document for an empty collection, so result[0] would raise.
    print("Average reactions per report: n/a (no documents)")

print("\n📊 Avg number of reactions per report (SQLite):")
# Start from `report` and LEFT JOIN `reaction` so that reports without
# reactions are kept; COUNT(column) ignores the NULLs produced by the join,
# giving those reports a count of 0 (grouping `reaction` alone would drop them
# and inflate the average relative to the MongoDB figure).
query = """
SELECT COUNT(reaction.safetyreportid) AS reaction_count
FROM report
LEFT JOIN reaction ON reaction.safetyreportid = report.safetyreportid
GROUP BY report.safetyreportid
"""
df = pd.read_sql_query(query, conn)
avg_reaction_count = df["reaction_count"].mean()
if df.empty:
    # mean() of an empty column is NaN; report that explicitly instead of "nan".
    print("Average reactions per report: n/a (no reports)")
else:
    print(f"Average reactions per report: {avg_reaction_count:.2f}")


📊 Avg number of reactions per report (MongoDB):
Average reactions per report: 3.50

📊 Avg number of reactions per report (SQLite):
Average reactions per report: 3.50
