In [53]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import pymongo
from pprint import pprint

# --- Connect to SQLite ---
conn = sqlite3.connect("../../sql/openfda_base_redesigned_gpt_fullrun.db")
cursor = conn.cursor()

In [54]:
client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["openfda_converted"]
collection = db["full_reports"]

In [55]:
query = """
SELECT COUNT(DISTINCT medicinalproduct) AS unique_medicinalproducts
FROM drug_catalog ;
"""
cursor.execute(query)
result = cursor.fetchone()
print(f"Unique medicinal products: {result[0]}")

Unique medicinal products: 8156


In [73]:
unique_toolarge = {'DULOXETINE', 'SULFAMETHOXAZOLE', 'DULOXETINE HYDROCHLORIDE', 'HYDROMORPHONE', 'BACLOFEN', 'CELLCEPT', 'RIBOFLAVIN', 'BROMHEXINE HYDROCHLORIDE', 'HUMAN IMMUNOGLOBULIN G', 'OMEGA-3 FATTY ACIDS', 'RANITIDINE', 'HORSE CHESTNUT', 'THIAMINE HYDROCHLORIDE', 'SULFADIAZINE', 'RETINOL', 'PYRIDOXINE HYDROCHLORIDE', 'TIZANIDINE', 'WARFARIN', 'TRIMETHOPRIM', 'LORATADINE', 'GLICLAZIDE', 'CYMBALTA', 'FUROSEMIDE', 'MOMETASONE FUROATE', 'DOCONEXENT', 'LORAZEPAM', 'POTASSIUM CHLORIDE', 'HYDROCHLOROTHIAZIDE', 'ALKA-SELTZER', 'VITAMIN B', 'DEXPANTHENOL', 'SULFAMETHOXAZOLE\\TRIMETHOPRIM', 'CHOLECALCIFEROL', 'FISH OIL', 'BIFIDOBACTERIUM BIFIDUM', 'ALEVE', 'NORTRIPTYLINE HYDROCHLORIDE', 'ERGOCALCIFEROL', 'TRIAMTERENE', 'BIFIDOBACTERIUM ANIMALIS LACTIS', 'ORYCTOLAGUS CUNICULUS SKIN', 'ALKA-SELTZER HEARTBURN RELIEFCHEWS', 'FRUCTOSE', 'LYRICA', 'VITAMIN B12', 'NIACINAMIDE\\PYRIDOXINE HYDROCHLORIDE\\RIBOFLAVIN\\THIAMINE HYDROCHLORIDE', 'THIAMINE', 'MYCOPHENOLATE MOFETIL', 'CALCIUM CARBONATE', 'SULFALENE', 'VITAMIN C', 'VITAMIN D NOS', 'ATLANTIC SALMON OIL', 'RIBOFLAVIN 5^-PHOSPHATE SODIUM ANHYDROUS', 'GAMMAGARD', 'CREATININE', 'ASCORBIC ACID', 'AMINOBENZOIC ACID', 'SODIUM CHLORIDE', 'INSULIN NOS', 'THYMOCYTE IMMUNE GLOBULIN NOS', 'SENNOSIDES A AND B', 'COBAMAMIDE', 'CHOLINE BITARTRATE', 'MAGNESIUM CITRATE', 'SENOKOT', 'FOSAMAX', 'LORATADINE\\PSEUDOEPHEDRINE SULFATE', 'LOPERAMIDE', 'IMODIUM', 'CLORPRENALINE HYDROCHLORIDE', 'NIACINAMIDE', 'ALFACALCIDOL', 'BIFIDOBACTERIUM LONGUM\\LACTOBACILLUS ACIDOPHILUS', 'LACTOBACILLUS RHAMNOSUS', 'NORTRIPTYLINE', 'CYANOCOBALAMIN', 'VITAMIN B COMPLEX', '.ALPHA.-TOCOPHEROL', 'ASPIRIN', 'ATIVAN', 'SENNA', 'DOCONEXENT\\ICOSAPENT', 'FOLIC ACID', 'SULFADIAZINE\\TRIMETHOPRIM', 'LACTOBACILLUS CASEI', 'CALCIUM GLUCONATE', 'ALENDRONATE SODIUM', 'INOSITOL', 'CALCIUM', 'GUAIFENESIN', 'HYDROMORPHONE HYDROCHLORIDE', 'DILPACIMAB', 'NASONEX', 'OMEGA-3-ACID ETHYL ESTERS', 'ZOPICLONE', 'SENNA LEAF', 'NICOTINAMIDE', 'INULIN', 'ZANTAC', 'CREATINE', '.ALPHA.-TOCOPHEROL, D-', 'MELATONIN', 'DOCUSATE SODIUM', 'PREDNISONE', 'TIZANIDINE HYDROCHLORIDE'}

# Stage 1: SQLite Data Integrity Checks

## Foreign keys check

In [14]:
query = """
SELECT safetyreportid 
FROM patient_age 
WHERE safetyreportid NOT IN (SELECT safetyreportid FROM report);
"""
cursor.execute(query)
result = cursor.fetchone()
print(f"Unique safety reports: {result}")

query = """
SELECT safetyreportid FROM reaction WHERE safetyreportid NOT IN (SELECT safetyreportid FROM report);
"""
cursor.execute(query)
result = cursor.fetchone()
print(f"Unique safety reports: {result}")

query = """
SELECT safetyreportid FROM patient_drug_history WHERE safetyreportid NOT IN (SELECT safetyreportid FROM report);
"""
cursor.execute(query)
result = cursor.fetchone()
print(f"Unique safety reports: {result}")

Unique safety reports: None
Unique safety reports: None
Unique safety reports: None


## Reports with no patient_drug_history

In [56]:
query = """
SELECT r.safetyreportid
FROM report r
LEFT JOIN patient_drug_history p ON r.safetyreportid = p.safetyreportid
WHERE p.safetyreportid IS NULL;
"""
cursor.execute(query)
result = cursor.fetchall()
print(f"Unique safety reports: {len(result)}")

Unique safety reports: 0


## NULLs in primary drug identifiers

In [16]:
query = """
SELECT * FROM drug_catalog WHERE medicinalproduct IS NULL;
"""
cursor.execute(query)
result = cursor.fetchall()
print(f"Unique safety reports: {len(result)}")

query = """
SELECT * FROM patient_drug_history WHERE drug_id IS NULL;
"""
cursor.execute(query)
result = cursor.fetchall()
print(f"Unique safety reports: {len(result)}")

Unique safety reports: 0
Unique safety reports: 0


# Stage 2: Count-Based comparizon with mongo

### report count

In [20]:
query = """
SELECT COUNT(*) FROM report;
"""
cursor.execute(query)
result = cursor.fetchone()
print(f"Total number of reports: {result[0]}")

mongo_total = collection.count_documents({})
print(f"Total number of reports in MongoDB: {mongo_total}")


Total number of reports: 36000
Total number of reports in MongoDB: 35999


### Drug entities

In [22]:
query = """
SELECT COUNT(*) FROM patient_drug_history;
"""
cursor.execute(query)
result = cursor.fetchone()
print(f"Total number of patient drug history: {result[0]}")

mongo_total = collection.aggregate([
    {"$project": {"n_drugs": {"$size": {"$ifNull": ["$patient.drug", []]}}}},
    {"$group": {"_id": None, "total": {"$sum": "$n_drugs"}}}
])

for doc in mongo_total:
    print(f"Total number of patient drug history in MongoDB: {doc['total']}")

print(f"difference: {result[0] - doc['total']}")

Total number of patient drug history: 188847
Total number of patient drug history in MongoDB: 188286
difference: 561


### Unique drug count

In [23]:
query = """
SELECT COUNT(*) FROM drug_catalog;
"""
cursor.execute(query)
result = cursor.fetchone()
print(f"Total number of drug catalog: {result[0]}")

Total number of drug catalog: 8301


In [None]:
def normalize_key(drug):
    if not isinstance(drug, dict):
        print(f"Warning: drug is not a dict: {drug}")
        return None

    def safe_lower(val):
        return val.strip().lower() if isinstance(val, str) else str(val).lower()

    actives = drug.get("activesubstance", [])
    if isinstance(actives, dict):
        actives = [actives]
    elif not isinstance(actives, list):
        actives = []

    activesubstance_names = []
    for s in actives:
        if isinstance(s, dict):
            name = s.get("activesubstancename")
            if isinstance(name, list):  # Defensive fix: flatten nested lists
                print(f"Warning: activesubstancename is a list: {name}")
                activesubstance_names.extend([safe_lower(n) for n in name if isinstance(n, str)])
            elif isinstance(name, str):
                activesubstance_names.append(safe_lower(name))

    return (
        safe_lower(drug.get("medicinalproduct", "")),
        safe_lower(drug.get("openfda", {}).get("application_number", "")) if isinstance(drug.get("openfda"), dict) else None,
        tuple(sorted(activesubstance_names))
    )

unique_keys = set()
for doc in collection.find({}, {"patient.drug": 1}):
    for drug in doc.get("patient", {}).get("drug", []):
        key = normalize_key(drug)
        if key:
            unique_keys.add(key)

print("Unique drug keys:", len(unique_keys))

Unique drug keys: 8292


In [67]:
medicinalproducts = []

for doc in collection.find({}, {"patient.drug.medicinalproduct": 1}):
    for drug in doc.get("patient", {}).get("drug", []):
        mp = drug.get("medicinalproduct")
        if isinstance(mp, str):
            medicinalproducts.append(mp)

print("Total medicinalproduct entries:", len(medicinalproducts))
print("Sample:", medicinalproducts[:10])

Total medicinalproduct entries: 188286
Sample: ['COPIKTRA', 'AMLODIPINE', 'BEVACIZUMAB', 'ATEZOLIZUMAB', 'DUPIXENT', 'DUPIXENT', 'NERLYNX', 'INFLECTRA', 'INFLECTRA', 'INFLECTRA']


In [89]:
### compare unique keys with SQLite
query = """
SELECT medicinalproduct
FROM drug_catalog;
"""
cursor.execute(query)
result = cursor.fetchall()
print("Drug entries in SQLite:", len(result))

unique_meds = set()
print(f"Unique drugs in mongoDB {len(set(medicinalproducts))}")
for row in result:
    unique_meds.add(row[0])
print(f"Unique drugs in SQLite {len(unique_meds)}")
print(unique_meds - set(medicinalproducts))
print(len(unique_meds - set(medicinalproducts)))

Drug entries in SQLite: 8301
Unique drugs in mongoDB 8214
Unique drugs in SQLite 8156
{'ORYCTOLAGUS CUNICULUS SKIN', 'BIFIDOBACTERIUM BIFIDUM', 'BIFIDOBACTERIUM LONGUM\\LACTOBACILLUS ACIDOPHILUS', 'BROMHEXINE HYDROCHLORIDE', 'DILPACIMAB', 'ALKA-SELTZER HEARTBURN RELIEFCHEWS', 'RIBOFLAVIN 5^-PHOSPHATE SODIUM ANHYDROUS', 'CLORPRENALINE HYDROCHLORIDE', 'SULFALENE'}
9


In [82]:
print((unique_meds - set(medicinalproducts)) - unique_toolarge)
print(unique_toolarge - set(medicinalproducts))
print(len(unique_toolarge - set(medicinalproducts)))
print(len(unique_toolarge - unique_meds))

set()
{'ORYCTOLAGUS CUNICULUS SKIN', 'BIFIDOBACTERIUM BIFIDUM', 'BIFIDOBACTERIUM LONGUM\\LACTOBACILLUS ACIDOPHILUS', 'BROMHEXINE HYDROCHLORIDE', 'DILPACIMAB', 'ALKA-SELTZER HEARTBURN RELIEFCHEWS', 'RIBOFLAVIN 5^-PHOSPHATE SODIUM ANHYDROUS', 'CLORPRENALINE HYDROCHLORIDE', 'SULFALENE'}
9
0


### Reactions

In [83]:
query = """
SELECT COUNT(*) FROM reaction;
"""
cursor.execute(query)
result = cursor.fetchone()
print(f"Total number of reactions: {result[0]}")

mongo_total = collection.aggregate([
    {"$project": {"n_reactions": {"$size": {"$ifNull": ["$patient.reaction", []]}}}},
    {"$group": {"_id": None, "total": {"$sum": "$n_reactions"}}}
])
for doc in mongo_total:
    print(f"Total number of reactions in MongoDB: {doc['total']}")
    print(result[0] - doc['total'])


Total number of reactions: 126049
Total number of reactions in MongoDB: 126041
8


In [109]:
# find duplicates in drug catalog
query = """
SELECT drug_id
FROM drug_catalog
GROUP BY medicinalproduct
HAVING COUNT(*) > 1;
"""
cursor.execute(query)
duplicates = cursor.fetchall()
print(f"Duplicates in drug catalog: {len(duplicates)}")
dup_ids = [row[0] for row in duplicates]


Duplicates in drug catalog: 143


In [101]:
print(duplicates[:10])

[(8, 'CALCIUM'), (9, 'FERROUS SULFATE'), (15, 'METFORMIN'), (21, 'OXYCODONE'), (23, 'TRIKAFTA'), (27, 'SODIUM CHLORIDE'), (29, 'FLUCONAZOLE'), (42, 'NOREPINEPHRINE'), (43, 'ACETAMINOPHEN'), (49, 'POTASSIUM CHLORIDE')]


In [93]:
query = """
SELECT *
FROM drug_openfda
GROUP BY drug_id
HAVING COUNT(*) > 1;
"""
cursor.execute(query)
duplicates = cursor.fetchall()
print(f"Duplicates in drug_openfda: {len(duplicates)}")
for row in duplicates:
    print(f"Medicinal product: {row[0]}, Count: {row[1]}")


Duplicates in drug_openfda: 0


In [111]:
def get_fda(drug_id):
    query = f"""
    SELECT *
    FROM drug_openfda
    WHERE drug_id = {drug_id};
    """
    cursor.execute(query)
    result = cursor.fetchall()
    return result



In [116]:
u_fda = set()
for id in dup_ids:
    fda = get_fda(id)
    # print(fda)
    # u_fda.add(fda[0][0])

print(f"Unique FDA IDs: {len(u_fda)}")

Unique FDA IDs: 0


### Mongo drug-fda

In [118]:
missing_openfda_count = 0
drug_namesss = set()

for doc in collection.find({}, {"patient.drug": 1}):
    drugs = doc.get("patient", {}).get("drug", [])
    for drug in drugs:
        if not isinstance(drug, dict):
            continue
        if "openfda" not in drug or not isinstance(drug.get("openfda"), dict):
            missing_openfda_count += 1
            drug_name = drug.get("medicinalproduct")
            if isinstance(drug_name, str):
                drug_namesss.add(drug_name)
            else:
                print(f"Warning: medicinalproduct is not a string: {drug_name}")

print("💊 Drug entries WITHOUT openfda:", missing_openfda_count)

💊 Drug entries WITHOUT openfda: 28007


In [120]:
print(len(drug_namesss))
print(drug_namesss)

5030
{'Ponaris', 'CHONDROITIN SULFATE A\\GLUCOSAMINE', 'SANCTURA', 'ALLEGRA D-12 HOUR', 'BENFOTIAMINE\\CYANOCOBALAMIN', 'TILIDINE HYDROCHLORIDE', 'CALCIUM CARBONATE\\ERGOCALCIFEROL', 'ADETPHOS', 'MUSHROOM COMPLEX [ALLIUM SATIVUM;GANODERMA LUCIDUM;HERICIUM ERINACEUS;', 'LIPOSTATINE', 'ALIZAPRIDE', 'PROACTIV SOLUTION BLACKHEAD DISSOLVING', 'GAMMAGARD LIQUID', 'KALIUM HAUSMANN EFFERVETTES', 'Setron', 'LOVASTAT', 'SORIATANE', 'PANTOPRAZOLE BLUEFISH [PANTOPRAZOLE SODIUM SESQUIHYDRATE]', 'Paxil tablet 40 mg', '^Konakion mm', 'GEMZAR', 'DAPSON SCANPHARM', 'DOPADEX SR', 'SPAGULAX', 'IMITREX                            /01044801/', 'ALLERGY MEDICATION (NON-DROWSY) OTC', 'TEMESTA', 'B12 [MECOBALAMIN]', 'ELOBIXIBAT', 'Huons moxyfloxacin', 'BUNAVAIL', 'ALOGLIPTIN BENZOATE', 'FLUMETHASONE', 'DAFLON', 'ACETAMINOPHEN\\BELLADONNA LEAF\\CAFFEINE\\ERGOTAMINE TARTRATE', 'VITAMIN D NOS', 'ADOXA [DOXYCYCLINE]', 'JAMP FER FC', 'LIDOCAINE/PRILOCAINE CRM', 'BOOSTRIX', 'OMEGA 3 KRILL OIL', 'MITOCORE', 'GRAFALON

In [122]:
missing_activesubstance_count = 0

drug_namess = set()



for doc in collection.find({}, {"patient.drug": 1}):
    drugs = doc.get("patient", {}).get("drug", [])
    for drug in drugs:
        if not isinstance(drug, dict):
            continue
        # Count if missing or invalid type
        if "activesubstance" not in drug or not isinstance(drug.get("activesubstance"), (dict, list)):
            missing_activesubstance_count += 1
            drug_name = drug.get("medicinalproduct")
            if isinstance(drug_name, str):
                drug_namess.add(drug_name)
            else:
                print(f"Warning: medicinalproduct is not a string: {drug_name}")

print("💊 Drug entries WITHOUT activesubstance:", missing_activesubstance_count)

💊 Drug entries WITHOUT activesubstance: 3696


In [126]:
# find drugs with multiple activesubstance using mongoDB
multiple_activesubstance_count = 0
multiple_activesubstance_drugs = set()
for doc in collection.find({}, {"patient.drug": 1}):
    drugs = doc.get("patient", {}).get("drug", [])
    for drug in drugs:
        if not isinstance(drug, dict):
            continue
        actives = drug.get("activesubstance", [])
        if isinstance(actives, dict):
            actives = [actives]
        elif not isinstance(actives, list):
            actives = []

        if len(actives) > 1:
            multiple_activesubstance_count += 1
            drug_name = drug.get("medicinalproduct")
            if isinstance(drug_name, str):
                multiple_activesubstance_drugs.add(drug_name)
            else:
                print(f"Warning: medicinalproduct is not a string: {drug_name}")
                

In [127]:
print(multiple_activesubstance_count)

0


In [123]:
print(len(drug_namess))

2634


In [125]:
print(drug_namess)

{'Premens', 'BROMFENAC SODIUM HYDRATE', 'LEUPROLIDE ACET 2-WEEK', 'Neupro 6mg/24 hr TD Patch', 'EQL Glucosamine Chondroitin', 'ZYRTEC', 'Ponaris', 'CENTRAVIT', 'Tamsulosin (Zydus)', 'PRAM [CITALOPRAM HYDROBROMIDE]', 'PANTOPRAZOLE', 'DIGESAN [BROMOPRIDE]', 'VIBRAMYCIN [DOXYCYCLINE HYDROCHLORIDE]', 'OMEPRAZOLE', '0mega3', 'CALCIUM POLYCARBOPHIL', 'NERIZA', 'ROSUCARD COMBI', 'NASACORT ALLERGO', 'Nature Made multi for women', 'GLIZID', 'BLOOD PRESSURE MONITOR', 'ACETYLSALICYLIC ACID/CODEINE', 'UPTRAVI', 'LIBERATOR', 'BUTALBITAL;PARACETAMOL', 'Ciclopirox top soln', 'MUSHROOM COMPLEX [ALLIUM SATIVUM;GANODERMA LUCIDUM;HERICIUM ERINACEUS;', 'Calcium + vit D 3 500mg/d', 'MIGRANOL', 'MAXIPIME BORYUNG', 'PROBIOTIC BLEND', 'LIPOSTATINE', 'KALIUM HAUSMANN EFFERVETTES', 'Setron', 'SUNVITE D3', 'Semglee inj 100u/ml', 'EPINEPHRINE', 'TETRALYSAL', 'Upasita', 'DICHLOZID', 'FLUVASTATIN [FLUVASTATIN SODIUM]', 'NEXIUM                             /01479303/', 'fluoxetine. PRN: Albuterol inhaler', 'MICROZIDE

In [132]:
# get duplicates in the activesubstance table
query = """
SELECT drug_id, activesubstancename
FROM drug_activesubstance
GROUP BY activesubstancename
HAVING COUNT(*) > 1;
"""
cursor.execute(query)
duplicates = cursor.fetchall()
print(f"Duplicates in activesubstance: {len(duplicates)}")
# for row in duplicates:
#     print(f"Activesubstance ID: {row[0]}, Activesubstancename: {row[1]}")


Duplicates in activesubstance: 1163


In [131]:
query = """
SELECT drug_id, COUNT(*) AS n_substances
FROM drug_activesubstance
GROUP BY drug_id
HAVING n_substances > 1
ORDER BY n_substances DESC
LIMIT 10;
"""
cursor.execute(query)
duplicates = cursor.fetchall()
print(f"Duplicates in activesubstance: {len(duplicates)}")

Duplicates in activesubstance: 0
