## 01_structure_check.ipynb

✅ Notebook Goal:
Verify structural consistency between MongoDB and SQLite databases
- Compare total report counts (MongoDB vs. SQLite)
- Check that `safetyreportid` values are unique (SQLite)
- Confirm that the major SQLite tables contain records

In [1]:
# --- Imports (stdlib first, then third-party) ---
import sqlite3
from pathlib import Path
from pprint import pprint

import pandas as pd
import pymongo

# --- MongoDB Setup ---
# Handles for the local MongoDB server, the converted openFDA database,
# and the collection that the later cells query.
mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")
mongo_db = mongo_client["openfda_converted"]
mongo_collection = mongo_db["full_reports"]

# --- SQLite Setup ---
sqlite_path = "../../sql/openfda_base_updated.db"

# sqlite3.connect() creates a brand-new, empty database file when the path
# does not exist, so a wrong path would only surface later as a confusing
# "no such table" error. Fail early with a clear message instead.
if not Path(sqlite_path).is_file():
    raise FileNotFoundError(f"SQLite database not found: {sqlite_path}")

conn = sqlite3.connect(sqlite_path)

In [2]:
# --- 1. Total Report Count ---
# MongoDB side: an empty filter counts every document in the collection.
mongo_total = mongo_collection.count_documents({})
print(f"MongoDB - Total reports: {mongo_total}")

# SQLite side: the query returns a one-row frame; pull the single `total` value.
report_count_frame = pd.read_sql_query("SELECT COUNT(*) as total FROM report", conn)
sqlite_total = report_count_frame["total"].iloc[0]
print(f"SQLite - Total reports: {sqlite_total}")

MongoDB - Total reports: 35999
SQLite - Total reports: 36000


In [3]:
# --- 2. Unique Report ID Check ---
sqlite_duplicates = pd.read_sql_query("""
    SELECT safetyreportid, COUNT(*) as cnt
    FROM report
    GROUP BY safetyreportid
    HAVING cnt > 1
""", conn)

if sqlite_duplicates.empty:
    print("SQLite - All safetyreportid values are unique ✅")
else:
    print("SQLite - Duplicate safetyreportid values found ❌")
    display(sqlite_duplicates)

SQLite - All safetyreportid values are unique ✅


In [4]:
# --- 3. Count of Records in Major Tables ---
tables = [
    "reaction", "drug", "drug_optional", "drug_openfda", "drug_activesubstance",
    "report_duplicate", "summary", "patient_optional", "primarysource_literature_reference"
]

for table in tables:
    count = pd.read_sql_query(f"SELECT COUNT(*) as total FROM {table}", conn).iloc[0]['total']
    print(f"SQLite - {table}: {count} rows")


SQLite - reaction: 126049 rows
SQLite - drug: 188847 rows
SQLite - drug_optional: 188847 rows
SQLite - drug_openfda: 188847 rows
SQLite - drug_activesubstance: 185151 rows
SQLite - report_duplicate: 26353 rows
SQLite - summary: 16948 rows
SQLite - patient_optional: 36000 rows
SQLite - primarysource_literature_reference: 3309 rows


In [5]:
# --- 4. MongoDB Sample Report Structure ---
print("\nSample MongoDB report structure:")

# Fetch one document with no filter and show only its top two nesting
# levels; anything deeper is abbreviated by pprint as "...".
sample_doc = mongo_collection.find_one()
pprint(sample_doc, depth=2)

# Cleanup: release both database handles (SQLite first, then MongoDB).
for handle in (conn, mongo_client):
    handle.close()


Sample MongoDB report structure:
{'_id': ObjectId('68246feda116cc7161242716'),
 'companynumb': 'US-SECURA BIO, INC.-2021US002331',
 'fulfillexpeditecriteria': 2,
 'occurcountry': 'US',
 'patient': {'drug': [...], 'reaction': [...]},
 'primarysource': {'qualification': 5, 'reportercountry': 'US'},
 'primarysourcecountry': 'US',
 'receiptdate': datetime.datetime(2024, 3, 15, 0, 0),
 'receiptdateformat': 102,
 'receivedate': datetime.datetime(2021, 7, 12, 0, 0),
 'receivedateformat': 102,
 'receiver': {'receiverorganization': 'FDA', 'receivertype': 6},
 'reporttype': 1,
 'safetyreportid': 19520083,
 'safetyreportversion': 2,
 'sender': {'senderorganization': 'FDA-Public Use', 'sendertype': 2},
 'serious': 2,
 'seriousnesscongenitalanomali': 2,
 'seriousnessdeath': 2,
 'seriousnessdisabling': 2,
 'seriousnesshospitalization': 2,
 'seriousnesslifethreatening': 2,
 'seriousnessother': 2,
 'transmissiondate': datetime.datetime(2024, 4, 10, 0, 0),
 'transmissiondateformat': 102}
