# MongoDB Evaluation Queries
This notebook evaluates query expressiveness and performance using the MongoDB version of the OpenFDA dataset.

In [1]:
from pymongo import MongoClient
import time

# Connection settings, kept in one place so they are easy to spot and change.
MONGO_URI = 'mongodb://localhost:27017'
DB_NAME = 'openfda'
COLLECTION_NAME = 'full_reports'

# Open the client, then drill down to the collection the queries below run against.
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

## Query 1: Find all reports where `medicinalproduct = 'INFLECTRA'`
We match nested drug names using `$elemMatch`.

In [4]:
# Condition that a single element of the patient.drug array must satisfy.
drug_condition = {"medicinalproduct": "INFLECTRA"}
query = {"patient.drug": {"$elemMatch": drug_condition}}

# Pull every matching document into memory so the matches can be counted
# (and inspected afterwards if needed).
matching_cursor = collection.find(query)
results = list(matching_cursor)

print(f"Found {len(results)} reports.")
# To look at a sample match, uncomment:
# if results:
#     from pprint import pprint
#     pprint(results[0])

Found 177 reports.


## Query 2: Reports where `patient.patientsex = '2'` and a `reaction.reactionoutcome = '5'` exists

In [6]:
# Both top-level keys of the filter must hold for a document to match:
# the patientsex value, and at least one reaction element with outcome "5".
reaction_condition = {"reactionoutcome": "5"}
query = {
    "patient.patientsex": "2",
    "patient.reaction": {"$elemMatch": reaction_condition},
}

# Materialize the matches, then report how many there are.
matching_cursor = collection.find(query)
results = list(matching_cursor)
print(f"Found {len(results)} reports.")
# To look at a sample match, uncomment:
# if results:
#     from pprint import pprint
#     pprint(results[0])

Found 1094 reports.


## Query 3: Count of serious reports grouped by `medicinalproduct`
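This uses `$unwind` and `$group` for nested aggregation. Note that `$unwind` emits one document per element of `patient.drug`, so `count` is the number of drug entries in serious reports: a report that lists the same product in several drug entries contributes once per entry.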

In [7]:
from pprint import pprint

# Tally drug entries per medicinalproduct across reports whose "serious" is "1".
#
# Stage order matters for cost, not for the result: "serious" is a report-level
# field that $unwind does not touch, so filtering on it first yields the same
# groups while sparing the server from unwinding reports that would be
# discarded anyway.
#
# NOTE(review): "count" is the number of unwound drug entries, not the number
# of distinct reports -- a report listing the same product in several drug
# entries is counted once per entry. Confirm that this is the intended metric.
pipeline = [
    { "$match": { "serious": "1" }},
    { "$unwind": "$patient.drug" },
    { "$group": {
        "_id": "$patient.drug.medicinalproduct",
        "count": { "$sum": 1 }
    }},
    # "_id" (the product name, unique after $group) breaks ties between equal
    # counts so that the ordering is the same on every run.
    { "$sort": { "count": -1, "_id": 1 }}
]

results = list(collection.aggregate(pipeline))
print(f"Found {len(results)} medicinal products with serious reports.")
pprint(results[:5])  # Preview top 5


Found 7198 medicinal products with serious reports.
[{'_id': 'INFLECTRA', 'count': 2559},
 {'_id': 'PREDNISONE', 'count': 2148},
 {'_id': 'METHOTREXATE', 'count': 1982},
 {'_id': 'RITUXIMAB', 'count': 1388},
 {'_id': 'ACETAMINOPHEN', 'count': 1351}]


## Query 4: Find reports flagged with `duplicate = '1'` (at least one duplicate report reference)

In [12]:
# Select documents whose "duplicate" field equals the string "1".
duplicate_filter = {"duplicate": "1"}
cursor = collection.find(duplicate_filter)

# Drain the cursor into a list so the matches can be counted.
results = list(cursor)
print(f"Found {len(results)} reports marked as duplicates.")


Found 11714 reports marked as duplicates.


## Query 5: Reports with `activesubstancename = 'INFLIXIMAB'`

In [9]:
# Dotted path from the report root down to the active substance name;
# the filter is a plain equality match on that path.
substance_filter = {
    "patient.drug.activesubstance.activesubstancename": "INFLIXIMAB",
}
cursor = collection.find(substance_filter)

# Drain the cursor into a list so the matches can be counted.
results = list(cursor)
print(f"Found {len(results)} reports with activesubstancename = 'INFLIXIMAB'.")


Found 524 reports with activesubstancename = 'INFLIXIMAB'.


## Query 6: Count the number of reactions per report

In [10]:
from pprint import pprint

# For every report, keep safetyreportid and compute how many elements the
# patient.reaction array holds, then order reports from most to fewest reactions.
pipeline = [
    {
        "$project": {
            "safetyreportid": 1,
            # $ifNull substitutes an empty array when patient.reaction is
            # missing or null, so $size yields 0 instead of raising an error.
            "reaction_count": { "$size": { "$ifNull": ["$patient.reaction", []] } }
        }
    },
    # Many reports share the same reaction_count; the unique "_id" breaks those
    # ties so that the ordering is the same on every run.
    { "$sort": { "reaction_count": -1, "_id": 1 }}
]

results = list(collection.aggregate(pipeline))
print(f"Found reaction counts for {len(results)} reports.")
pprint(results[:5])  # Top 5 reports with most reactions


Found reaction counts for 35999 reports.
[{'_id': ObjectId('6819f2407894e637cc611f9a'),
  'reaction_count': 147,
  'safetyreportid': '23604512'},
 {'_id': ObjectId('6819f1c47894e637cc60e40f'),
  'reaction_count': 146,
  'safetyreportid': '18422156'},
 {'_id': ObjectId('6819f2987894e637cc614dc8'),
  'reaction_count': 131,
  'safetyreportid': '17530620'},
 {'_id': ObjectId('6819f29a7894e637cc614eab'),
  'reaction_count': 116,
  'safetyreportid': '15462312'},
 {'_id': ObjectId('6819f2a87894e637cc6155b2'),
  'reaction_count': 111,
  'safetyreportid': '22537856'}]
