# ✅ Notebook Goal:
- Explore the converted MongoDB dataset to derive structural and semantic insights
- that will guide improvements to the existing SQLite database design

In [1]:
from pymongo import MongoClient
import pandas as pd
from pprint import pprint
from collections import Counter, defaultdict
import matplotlib.pyplot as plt

# --- MongoDB Connection to Converted Data ---
client = MongoClient("mongodb://localhost:27017")
db = client["openfda_converted"]
collection = db["full_reports"]

In [2]:
# --- 1. Basic Overview ---
report_count = collection.count_documents({})
print(f"Total reports in converted MongoDB: {report_count}")

Total reports in converted MongoDB: 35999


In [5]:
# --- 3. Distribution of Nested Structures ---
def count_nested_items(path):
    pipeline = [
        {
            "$project": {
                "count": {
                    "$cond": [
                        { "$isArray": f"${path}" },
                        { "$size": f"${path}" },
                        0
                    ]
                }
            }
        },
        { "$group": { "_id": "$count", "count": { "$sum": 1 } } },
        { "$sort": { "_id": 1 } }
    ]
    return list(collection.aggregate(pipeline))

nested_fields = [
    "patient.drug",
    "patient.reaction",
    "patient.drug.openfda",
    "patient.drug.activesubstance",
    "reportduplicate"
]

for field in nested_fields:
    print(f"\n📊 Distribution for {field}:")
    pprint(count_nested_items(field))


📊 Distribution for patient.drug:
[{'_id': 1, 'count': 13270},
 {'_id': 2, 'count': 5927},
 {'_id': 3, 'count': 3374},
 {'_id': 4, 'count': 2705},
 {'_id': 5, 'count': 1867},
 {'_id': 6, 'count': 1334},
 {'_id': 7, 'count': 952},
 {'_id': 8, 'count': 1048},
 {'_id': 9, 'count': 678},
 {'_id': 10, 'count': 759},
 {'_id': 11, 'count': 495},
 {'_id': 12, 'count': 494},
 {'_id': 13, 'count': 399},
 {'_id': 14, 'count': 323},
 {'_id': 15, 'count': 283},
 {'_id': 16, 'count': 259},
 {'_id': 17, 'count': 208},
 {'_id': 18, 'count': 185},
 {'_id': 19, 'count': 149},
 {'_id': 20, 'count': 139},
 {'_id': 21, 'count': 116},
 {'_id': 22, 'count': 104},
 {'_id': 23, 'count': 75},
 {'_id': 24, 'count': 82},
 {'_id': 25, 'count': 67},
 {'_id': 26, 'count': 50},
 {'_id': 27, 'count': 61},
 {'_id': 28, 'count': 47},
 {'_id': 29, 'count': 33},
 {'_id': 30, 'count': 38},
 {'_id': 31, 'count': 27},
 {'_id': 32, 'count': 28},
 {'_id': 33, 'count': 22},
 {'_id': 34, 'count': 23},
 {'_id': 35, 'count': 27},


## Patient Age fields analysis

In [6]:
# --- 4. Age-related Field Relationship Analysis ---
print("\n🔍 Analyzing patient age relationships...")
age_cursor = collection.find({"patient.patientonsetage": {"$exists": True}}, {
    "safetyreportid": 1,
    "patient.patientonsetage": 1,
    "patient.patientonsetageunit": 1,
    "patient.patientagegroup": 1
})

agegroup_map = defaultdict(set)
ageunit_map = defaultdict(set)

for doc in age_cursor:
    onset_age = doc.get("patient", {}).get("patientonsetage")
    age_group = doc.get("patient", {}).get("patientagegroup")
    age_unit = doc.get("patient", {}).get("patientonsetageunit")

    if onset_age is not None:
        if age_group is not None:
            agegroup_map[str(age_group)].add(str(onset_age))
        if age_unit is not None:
            ageunit_map[str(age_unit)].add(str(onset_age))

print("\n🧠 Unique patientonsetage values per patientagegroup:")
for group, ages in agegroup_map.items():
    print(f"Age Group {group}: {sorted(list(ages))[:10]}... (total: {len(ages)})")

print("\n🧠 Unique patientonsetage values per patientonsetageunit:")
for unit, ages in ageunit_map.items():
    print(f"Unit {unit}: {sorted(list(ages))[:10]}... (total: {len(ages)})")


🔍 Analyzing patient age relationships...

🧠 Unique patientonsetage values per patientagegroup:
Age Group 6: ['10', '100', '102', '58', '6', '61', '62', '63', '64', '65']... (total: 47)
Age Group 5: ['18', '19', '2', '20', '21', '22', '23', '24', '25', '26']... (total: 57)
Age Group 4: ['12', '13', '14', '15', '16', '17', '18', '19', '213']... (total: 9)
Age Group 3: ['10', '11', '12', '13', '2', '26', '28', '3', '31', '35']... (total: 16)
Age Group 2: ['0', '1', '10', '11', '13', '2', '22', '4', '5', '7']... (total: 11)
Age Group 1: ['1', '20', '21', '26']... (total: 4)

🧠 Unique patientonsetage values per patientonsetageunit:
Unit 801: ['0', '1', '10', '100', '102', '11', '12', '13', '14', '15']... (total: 102)
Unit 804: ['1', '10', '10034', '10075', '10481', '13634', '1516', '15292', '15647', '18']... (total: 80)
Unit 802: ['1', '10', '100', '1095', '11', '12', '13', '133', '14', '15']... (total: 57)
Unit 800: ['1', '10', '2', '3', '4', '5', '6', '7', '8', '9']... (total: 10)
Unit 8

In [7]:
# Find reports with 'duplicate' set
with_duplicate_flag = db.full_reports.count_documents({"duplicate": {"$exists": True}})

# Find reports that have at least one 'reportduplicate' entry
with_duplicate_links = db.full_reports.count_documents({"reportduplicate": {"$exists": True, "$ne": []}})

# Subtract to get mismatches
only_flagged = with_duplicate_flag - with_duplicate_links

print(f"MongoDB verification:")
print(f"Reports with 'duplicate' field set: {with_duplicate_flag}")
print(f"Reports with 'reportduplicate' entries: {with_duplicate_links}")
print(f"Reports with 'duplicate' flag only (no link details): {only_flagged}")


MongoDB verification:
Reports with 'duplicate' field set: 11714
Reports with 'reportduplicate' entries: 11561
Reports with 'duplicate' flag only (no link details): 153


## Drug related fields

In [10]:
# count unique medicinalproducts
medicinal_products = collection.distinct("patient.drug.medicinalproduct")
print(f"\nUnique medicinal products: {len(medicinal_products)}")


Unique medicinal products: 8214


In [9]:
# --- 1. Field-Variability Check for Drug Fields ---
candidate_fields = [
    "drugauthorizationnumb",
    "drugcharacterization",
    "drugadministrationroute",
    "drugrecurreadministration",
    "drugcumulativedosageunit",
    "drugseparatedosagenumb"
]

summary_data = []

for field in candidate_fields:
    print(f"\n🔍 Checking variability of: {field}")
    pipeline = [
        {"$unwind": "$patient.drug"},
        {"$match": {f"patient.drug.{field}": {"$exists": True}}},
        {"$group": {
            "_id": "$patient.drug.medicinalproduct",
            "distinct_values": {"$addToSet": f"$patient.drug.{field}"}
        }},
        {"$project": {
            "num_distinct": {"$size": "$distinct_values"}
        }},
        {"$group": {
            "_id": None,
            "total_products": {"$sum": 1},
            "invariant": {"$sum": {"$cond": [{"$eq": ["$num_distinct", 1]}, 1, 0]}},
            "variable": {"$sum": {"$cond": [{"$gt": ["$num_distinct", 1]}, 1, 0]}}
        }}
    ]
    result = list(db.full_reports.aggregate(pipeline))
    if result:
        summary_data.append({
            "field": field,
            "total_products": result[0]["total_products"],
            "invariant": result[0]["invariant"],
            "variable": result[0]["variable"]
        })

# Display results in table format
summary_df = pd.DataFrame(summary_data)
display(summary_df.sort_values("variable", ascending=False).reset_index(drop=True))


🔍 Checking variability of: drugauthorizationnumb

🔍 Checking variability of: drugcharacterization

🔍 Checking variability of: drugadministrationroute

🔍 Checking variability of: drugrecurreadministration

🔍 Checking variability of: drugcumulativedosageunit

🔍 Checking variability of: drugseparatedosagenumb


Unnamed: 0,field,total_products,invariant,variable
0,drugadministrationroute,5945,3386,2559
1,drugcharacterization,8214,5741,2473
2,drugseparatedosagenumb,3941,3025,916
3,drugauthorizationnumb,2137,1469,668
4,drugrecurreadministration,1783,1485,298
5,drugcumulativedosageunit,827,671,156


In [11]:
# --- 1. Inspect unique values for stable drug-catalog candidate fields ---
stable_fields = [
    "drugcumulativedosageunit",
    "drugrecurreadministration",
    "drugseparatedosagenumb"
]

for field in stable_fields:
    print(f"\n📊 Unique values for: {field}")
    pipeline = [
        {"$unwind": "$patient.drug"},
        {"$match": {f"patient.drug.{field}": {"$exists": True}}},
        {"$group": {"_id": f"$patient.drug.{field}", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}}
    ]
    results = list(db.full_reports.aggregate(pipeline))
    for r in results:
        print(f"Value: {r['_id']}, Count: {r['count']}")


📊 Unique values for: drugcumulativedosageunit
Value: 3, Count: 3351
Value: 32, Count: 407
Value: 2, Count: 276
Value: 7, Count: 137
Value: 9, Count: 116
Value: 12, Count: 84
Value: 25, Count: 66
Value: 4, Count: 40
Value: 31, Count: 14
Value: 16, Count: 8
Value: 23, Count: 6
Value: 8, Count: 4
Value: 10, Count: 4
Value: 30, Count: 3

📊 Unique values for: drugrecurreadministration
Value: 3, Count: 12588
Value: 2, Count: 585
Value: 1, Count: 382

📊 Unique values for: drugseparatedosagenumb
Value: 1.0, Count: 47138
Value: 2.0, Count: 4002
Value: 3.0, Count: 1034
Value: 4.0, Count: 516
Value: 6.0, Count: 29
Value: 5.0, Count: 10
Value: -1.0, Count: 8
Value: 21.0, Count: 2
Value: 200.0, Count: 2
Value: 75.0, Count: 1
Value: 11.0, Count: 1
Value: 150.0, Count: 1
Value: 600.0, Count: 1
Value: 0.8, Count: 1
Value: 0.4, Count: 1
Value: 1.5, Count: 1
Value: 105.0, Count: 1
Value: 7.0, Count: 1
Value: 20.0, Count: 1
Value: 8.0, Count: 1


In [20]:
# --- 1. Inspect value consistency per drug for stable catalog candidates ---
stable_fields = [
    "drugcumulativedosageunit",
    "drugrecurreadministration",
    "drugseparatedosagenumb"
]

noisiness = []

for field in stable_fields:
    print(f"\n📊 Consistency check for field: {field}")
    pipeline = [
        {"$unwind": "$patient.drug"},
        {"$match": {f"patient.drug.{field}": {"$exists": True}}},
        {"$group": {
            "_id": "$patient.drug.medicinalproduct",
            "unique_values": {"$addToSet": f"$patient.drug.{field}"}
        }},
        {"$project": {
            "num_unique": {"$size": "$unique_values"},
            "unique_values": 1
        }},
        {"$match": {"num_unique": {"$gt": 0}}},
        {"$sort": {"num_unique": -1}}
    ]
    results = list(db.full_reports.aggregate(pipeline))
    noise = Counter()
    for r in results:
        # noisiness.add(r["num_unique"])
        # print(f"Drug: {r['_id']}, Unique Count: {r['num_unique']}, Values: {r['unique_values']}")
        noise[r["num_unique"]] += 1
    
    noisiness.append(noise)



📊 Consistency check for field: drugcumulativedosageunit

📊 Consistency check for field: drugrecurreadministration

📊 Consistency check for field: drugseparatedosagenumb


In [21]:
print(noisiness)

[Counter({1: 671, 2: 125, 3: 25, 4: 6}), Counter({1: 1485, 2: 264, 3: 34}), Counter({1: 3025, 2: 698, 3: 168, 4: 44, 5: 5, 8: 1})]


In [27]:
for i, field in enumerate(stable_fields):
    print(f"\n📊 Noisiness for field: {field}")
    print(noisiness[i])
    for k, v in noisiness[i].items():
        # print(f"Unique Count: {k}, Frequency: {v}")
        total = sum(noisiness[i].values())
        # print(f"Unique Count: {k}, Frequency: {v}, Percentage: {v/total:.2%}")
    print(total)
        


📊 Noisiness for field: drugcumulativedosageunit
Counter({1: 671, 2: 125, 3: 25, 4: 6})
827

📊 Noisiness for field: drugrecurreadministration
Counter({1: 1485, 2: 264, 3: 34})
1783

📊 Noisiness for field: drugseparatedosagenumb
Counter({1: 3025, 2: 698, 3: 168, 4: 44, 5: 5, 8: 1})
3941


In [32]:
# count the number of distinct madicinal products where drugrecurreadministration is present
pipeline = [
    {"$unwind": "$patient.drug"},
    {"$match": {"patient.drug.drugrecurreadministration": {"$exists": True}}},
    {"$group": {
        "_id": "$patient.drug.medicinalproduct",
        "count": {"$sum": 1}
    }}
]
results = list(db.full_reports.aggregate(pipeline))
print(f"\nTotal distinct medicinal products with drugrecurreadministration: {len(results)}")


Total distinct medicinal products with drugrecurreadministration: 1783


In [34]:
# count the number of distinct madicinal products where drugseparatedosagenumb is present
pipeline = [
    {"$unwind": "$patient.drug"},
    {"$match": {"patient.drug.drugseparatedosagenumb": {"$exists": True}}},
    {"$group": {
        "_id": "$patient.drug.medicinalproduct",
        "count": {"$sum": 1}
    }}
]
results = list(db.full_reports.aggregate(pipeline))
print(f"\nTotal distinct medicinal products with drugseparatedosagenumb: {len(results)}")


Total distinct medicinal products with drugseparatedosagenumb: 3941


In [None]:

pipeline = [
    {"$unwind": "$patient.drug"},
    {"$match": {"patient.drug.drugcumulativedosageunit": {"$exists": True}}},
    {"$group": {
        "_id": "$patient.drug.medicinalproduct",
        "distinct_values": {"$addToSet": "$patient.drug.drugcumulativedosageunit"}
    }},
    {"$project": {"num_unique": {"$size": "$distinct_values"}}},
    {"$match": {"num_unique": {"$gt": 1}}}
]
results = list(db.full_reports.aggregate(pipeline))
print(f"\nTotal distinct medicinal products with multiple drugcumulativedosageunit values: {len(results)}")


Total distinct medicinal products with multiple drugcumulativedosageunit values: 156


In [40]:
pipeline = [
    {"$unwind": "$patient.drug"},
    {"$match": {"patient.drug.drugrecurreadministration": {"$exists": True}}},
    {"$group": {
        "_id": "$patient.drug.medicinalproduct",
        "distinct_values": {"$addToSet": "$patient.drug.drugrecurreadministration"}
    }},
    {"$project": {"num_unique": {"$size": "$distinct_values"}}},
    {"$match": {"num_unique": {"$gt": 1}}}
]

results = list(db.full_reports.aggregate(pipeline))
print(f"\nTotal distinct medicinal products with multiple drugrecurreadministration values: {len(results)}")


Total distinct medicinal products with multiple drugrecurreadministration values: 298
