# MongoDB Data Verification – Converted OpenFDA Database
This notebook verifies the field conversions and data integrity in the MongoDB collection `full_reports`.

In [1]:
import pymongo
from bson import json_util
from pprint import pprint

# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["openfda_converted"]
collection = db["full_reports"]

## 1. Check total number of inserted reports

In [2]:
total_reports = collection.count_documents({})
print(f"Total inserted reports: {total_reports}")

Total inserted reports: 35999


## 2. Verify integer fields

In [3]:
query = {"safetyreportversion": {"$type": "int"}}
docs = list(collection.find(query).limit(5))
for doc in docs:
    pprint(doc)

{'_id': ObjectId('68246feda116cc7161242716'),
 'companynumb': 'US-SECURA BIO, INC.-2021US002331',
 'fulfillexpeditecriteria': 2,
 'occurcountry': 'US',
 'patient': {'drug': [{'actiondrug': 5,
                       'activesubstance': {'activesubstancename': 'DUVELISIB'},
                       'drugadditional': 3,
                       'drugadministrationroute': 48,
                       'drugauthorizationnumb': 211155,
                       'drugbatchnumb': '1697765A',
                       'drugcharacterization': 1,
                       'drugdosageform': 'Capsule',
                       'drugdosagetext': '75 MG, BID',
                       'drugindication': 'Non-Hodgkin^s lymphoma',
                       'drugintervaldosagedefinition': 804,
                       'drugintervaldosageunitnumb': 1.0,
                       'drugrecurreadministration': 3,
                       'drugseparatedosagenumb': 2.0,
                       'drugstructuredosagenumb': 75.0,
               

## 3. Verify date normalization

In [5]:
query = {"drug.drugstartdate": {"$type": "date"}}
docs = list(collection.find(query).limit(5))
for doc in docs:
    pprint(doc["drug"][0]["drugstartdate"])

## 4. Check that oversized reports were skipped (logged to file)

In [6]:
with open("skipped_reports.log", "r") as f:
    skipped = f.readlines()
print(f"Total oversized reports skipped: {len(skipped)}")
print("First few entries:")
print("".join(skipped[:3]))

FileNotFoundError: [Errno 2] No such file or directory: 'skipped_reports.log'