In [3]:
# ✅ Notebook Goal:
# Use the field presence CSVs to verify which fields are populated in both SQLite and MongoDB

import pandas as pd
import pymongo
import sqlite3

# --- Load CSV Files ---
# Field-presence reports: one CSV for value fields, one for object fields.
value_fields = pd.read_csv("../../reports/evaluation_results/value_fields_presence.csv")
object_fields = pd.read_csv("../../reports/evaluation_results/object_fields_presence.csv")

# --- MongoDB Setup ---
mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")
mongo_collection = mongo_client["openfda_converted"]["full_reports"]

# --- SQLite Setup ---
sqlite_path = "../../sql/openfda_base_updated.db"
# Open through a URI with mode=rw (read-write, but never create). A plain
# sqlite3.connect(path) silently creates an empty database when the relative
# path is wrong, and the mistake only shows up later as "no such table".
conn = sqlite3.connect(f"file:{sqlite_path}?mode=rw", uri=True)



In [37]:
# Collect the ids of every `drug` row whose medicinalproduct equals the product
# named below. The name is bound as a SQL parameter, so checking another product
# means editing one value rather than the query text.
medicinalproduct = "VENOFER"
query = """
SELECT DISTINCT id FROM drug WHERE medicinalproduct = ?
"""
results = conn.execute(query, (medicinalproduct,)).fetchall()
# Each result row is a 1-tuple; unpack it into a flat list.
# NOTE: despite the variable name, these are values of drug.id as returned by
# the query above, not safety report ids.
safetyreportids = [row[0] for row in results]

print(safetyreportids)



[34, 35, 36, 56815, 61507, 63240, 79105, 106574, 117790, 142958, 145559]


In [38]:
# Distinct openfda metadata tuples found for the drug ids selected above.
# If every drug row of the product carries the same metadata, the set ends up
# with exactly one element.
openfda_variants = set()

# Parameterized query instead of an f-string. The id is bound as text because
# the original query compared drug_id against a quoted literal.
query = """
    SELECT *
    FROM drug_openfda
    WHERE drug_id = ?
    """

# `drug_id` rather than `id`, so the builtin id() is not shadowed for the rest
# of the kernel session.
for drug_id in safetyreportids:
    first_row = conn.execute(query, (str(drug_id),)).fetchone()
    if first_row is None:
        # This drug has no drug_openfda row; nothing to compare.
        continue
    # Drop the first two columns before comparing
    # (presumably the row's own key and drug_id -- confirm against the schema).
    openfda_variants.add(first_row[2:])

# Original variable name kept as an alias in case other cells still refer to it.
dupa = openfda_variants

print(openfda_variants)


{('NDA021135', 'VENOFER', 'IRON SUCROSE', 'Fresenius Medical Care Holdings, Inc., American Regent, Inc.', 'M0011718, N0000177913', '49230-530-01, 49230-530-10, 49230-530-25, 49230-534-01, 49230-534-10, 49230-534-25, 0517-2325-01, 0517-2325-10, 0517-2340-01, 0517-2340-10, 0517-2340-25, 0517-2310-01, 0517-2310-05, 0517-2340-99', 'Iron [CS]', 'Parenteral Iron Replacement [EPC]', None, None, '49230-530, 49230-534, 0517-2310, 0517-2325, 0517-2340', 'HUMAN PRESCRIPTION DRUG', 'INTRAVENOUS', '1741267, 1741268, 1741270, 1741271, 1741261, 1741263', 'd169adb0-d1dc-4c19-99b4-f1dce90db380, 1cb19ad9-7a23-43bd-ab28-955cd2fa3750, 91f50fe2-b85d-460d-86a8-469cc16f6a99', 'f1ab1a22-2b99-4d27-8b5a-9c3bcd5e3040, 626dc9e5-c6b4-4f9c-9bf4-774fd3ae619a, b998ef36-81fa-40a1-9cd1-6d6156f91564', 'IRON SUCROSE', 'FZ7NYF5N8L')}


## Check openFDA metadata for all drugs

In [36]:
# How many distinct application_number values does drug_openfda contain?
query = """
SELECT DISTINCT application_number
FROM drug_openfda
"""
distinct_cursor = conn.execute(query)
results = distinct_cursor.fetchall()
# One row per distinct value, so the row count is the answer.
print(len(results))

2601


In [None]:
from pymongo import MongoClient
import json
from collections import defaultdict

# Handle on the full_reports collection of the openfda_converted database.
client = MongoClient("mongodb://localhost:27017")
database = client["openfda_converted"]
collection = database["full_reports"]

def serialize_openfda(openfda):
    """Build a canonical JSON fingerprint of an openfda sub-document.

    Entries whose value is None, "", [], {} or the string "null" are dropped,
    and keys are emitted in sorted order, so two dicts carrying the same
    populated fields produce the same string.

    Returns None when ``openfda`` is not a dict.
    """
    if not isinstance(openfda, dict):
        return None

    empty_markers = [None, "", [], {}, "null"]
    populated = {}
    for key, value in sorted(openfda.items()):
        if value in empty_markers:
            continue
        populated[key] = value

    return json.dumps(populated, sort_keys=True)

# medicinalproduct -> set of distinct openfda fingerprints seen for that name
variant_tracker = defaultdict(set)

# Project only patient.drug; no other part of the report is read here.
cursor = collection.find({}, {"patient.drug": 1})

for doc in cursor:
    # The `or` fallbacks also cover keys that are present but null.
    # .get(key, default) only applies the default to *missing* keys, so a null
    # "patient" or "drug" would otherwise break the next step.
    drugs = (doc.get("patient") or {}).get("drug") or []
    for drug in drugs:
        name = drug.get("medicinalproduct")
        if not name:
            continue
        serialized = serialize_openfda(drug.get("openfda"))
        # serialize_openfda returns None for a non-dict openfda value.
        if serialized:
            variant_tracker[name].add(serialized)

# Check for drugs with more than one unique metadata fingerprint
conflicting = {name: variants for name, variants in variant_tracker.items() if len(variants) > 1}

print(f"🧪 Drugs with multiple openfda variants: {len(conflicting)}")
# Show at most the first 10 conflicting names.
for name in list(conflicting)[:10]:
    print(f" - {name}: {len(conflicting[name])} variants")


In [41]:
from pymongo import MongoClient
import json
from collections import defaultdict

# Handle on the full_reports collection of the openfda_converted database.
client = MongoClient("mongodb://localhost:27017")
database = client["openfda_converted"]
collection = database["full_reports"]

def serialize_openfda(openfda):
    """Return a canonical JSON string for ``openfda``.

    Keys are written in sorted order, so dicts that differ only in key order
    give the same string. No filtering and no type check happen here.

    Note: this redefines the ``serialize_openfda`` from the earlier cell, which
    additionally dropped empty values and returned None for non-dict input.
    """
    canonical = json.dumps(openfda, sort_keys=True)
    return canonical

# medicinalproduct -> set of distinct openfda fingerprints seen for that name
variant_tracker = defaultdict(set)
# Number of drug entries that have both a name and a dict-valued openfda.
total_drugs = 0

# Project only patient.drug and fetch the documents in batches of 1000.
cursor = collection.find({}, {"patient.drug": 1}, batch_size=1000)

# Print the first few raw drug entries as a sanity check on the document shape.
PREVIEW_COUNT = 4
entries_seen = 0

for doc in cursor:
    # The `or` fallbacks also cover keys that are present but null.
    # .get(key, default) only applies the default to *missing* keys.
    for drug in (doc.get("patient") or {}).get("drug") or []:
        entries_seen += 1
        if entries_seen <= PREVIEW_COUNT:
            print(drug)
        name = drug.get("medicinalproduct")
        openfda = drug.get("openfda", None)
        # Skip entries without a name or without openfda metadata.
        if not name or not isinstance(openfda, dict):
            continue
        variant_tracker[name].add(serialize_openfda(openfda))
        total_drugs += 1

# Extract products with >1 unique fingerprint
conflicting = {name: variants for name, variants in variant_tracker.items() if len(variants) > 1}

print(f"🔍 Total drug entries with openfda: {total_drugs}")
print(f"🧪 Drugs with multiple openfda variants: {len(conflicting)}")
# Show at most the first 10 conflicting names (prints nothing when there are none).
for name, variants in list(conflicting.items())[:10]:
    print(f" - {name}: {len(variants)} variants")


{'drugcharacterization': 1, 'medicinalproduct': 'COPIKTRA', 'drugbatchnumb': '1697765A', 'drugauthorizationnumb': 211155, 'drugstructuredosagenumb': 75.0, 'drugstructuredosageunit': 3, 'drugseparatedosagenumb': 2.0, 'drugintervaldosageunitnumb': 1.0, 'drugintervaldosagedefinition': 804, 'drugdosagetext': '75 MG, BID', 'drugdosageform': 'Capsule', 'drugadministrationroute': 48, 'drugindication': 'Non-Hodgkin^s lymphoma', 'actiondrug': 5, 'drugrecurreadministration': 3, 'drugadditional': 3, 'activesubstance': {'activesubstancename': 'DUVELISIB'}, 'openfda': {'application_number': ['NDA211155'], 'brand_name': ['COPIKTRA'], 'generic_name': ['DUVELISIB'], 'manufacturer_name': ['Secura Bio, Inc'], 'product_ndc': ['73116-215', '73116-225'], 'product_type': ['HUMAN PRESCRIPTION DRUG'], 'route': ['ORAL'], 'substance_name': ['DUVELISIB'], 'rxcui': ['2058514', '2058520', '2058523', '2058525'], 'spl_id': ['de5c746e-bd30-4b3c-a473-b8fe1f70153c'], 'spl_set_id': ['e3c5ac56-e1f6-473a-bc73-687836534780