In [1]:
# ✅ Notebook Goal:
# Use the field presence CSVs to verify which fields are populated in both SQLite and MongoDB

import pandas as pd
import pymongo
import sqlite3

# --- Input locations, gathered in one place so they are easy to retarget ---
VALUE_FIELDS_CSV = "../../reports/evaluation_results/value_fields_presence.csv"
OBJECT_FIELDS_CSV = "../../reports/evaluation_results/object_fields_presence.csv"
MONGO_URI = "mongodb://localhost:27017/"
SQLITE_DB_PATH = "../../sql/openfda_base_updated.db"

# --- Field-presence reports ---
# Later cells iterate over the "field_path" column of each frame.
value_fields = pd.read_csv(VALUE_FIELDS_CSV)
object_fields = pd.read_csv(OBJECT_FIELDS_CSV)

# --- MongoDB: keep the client (closed in the cleanup cell) and the collection ---
mongo_client = pymongo.MongoClient(MONGO_URI)
mongo_database = mongo_client["openfda_converted"]
mongo_collection = mongo_database["full_reports"]

# --- SQLite: open the database file (closed in the cleanup cell) ---
sqlite_conn = sqlite3.connect(SQLITE_DB_PATH)


In [3]:
# --- 1. Field Presence in MongoDB ---
def count_field_presence_mongo(field, collection=None):
    """Count the documents in which ``field`` exists.

    Parameters
    ----------
    field : str
        Field name or dotted path used as the query key,
        e.g. ``"patient.drug.actiondrug"``.
    collection : optional
        Any object exposing ``count_documents(filter)``. When omitted, the
        notebook-level ``mongo_collection`` is used, which keeps existing
        one-argument calls working unchanged.

    Returns
    -------
    The value returned by ``collection.count_documents`` for the filter
    ``{field: {"$exists": True}}``.

    Notes
    -----
    ``$exists: True`` tests for presence only: a document whose field is
    present but holds ``null`` is counted as well.
    """
    if collection is None:
        # Looked up at call time so a re-run of the setup cell is honoured.
        collection = mongo_collection
    return collection.count_documents({field: {"$exists": True}})

# Both field lists are reported the same way, so drive them from one table of
# (heading, frame) pairs instead of two copies of the same loop.
presence_sections = (
    ("\n🔍 MongoDB - Value Field Presence", value_fields),
    ("\n📦 MongoDB - Object Field Presence", object_fields),
)

for heading, presence_frame in presence_sections:
    print(heading)
    for field in presence_frame["field_path"]:
        count = count_field_presence_mongo(field)
        print(f"{field}: {count} documents")


🔍 MongoDB - Value Field Presence
authoritynumb: 3793 documents
companynumb: 32084 documents
duplicate: 11714 documents
fulfillexpeditecriteria: 35999 documents
occurcountry: 33456 documents
patient.drug.actiondrug: 34484 documents
patient.drug.activesubstance.activesubstancename: 35999 documents
patient.drug.drugadditional: 28116 documents
patient.drug.drugadministrationroute: 30832 documents
patient.drug.drugauthorizationnumb: 33368 documents
patient.drug.drugbatchnumb: 15417 documents
patient.drug.drugcharacterization: 35999 documents
patient.drug.drugcumulativedosagenumb: 1348 documents
patient.drug.drugcumulativedosageunit: 1348 documents
patient.drug.drugdosageform: 28649 documents
patient.drug.drugdosagetext: 29863 documents
patient.drug.drugenddate: 8429 documents
patient.drug.drugenddateformat: 8429 documents
patient.drug.drugindication: 34014 documents
patient.drug.drugintervaldosagedefinition: 17594 documents
patient.drug.drugintervaldosageunitnumb: 17594 documents
patient.d

In [7]:
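# # --- 2. Field Presence in SQLite ---
# NOTE: this whole cell is disabled (every line below is commented out).
# Do not re-enable it as-is: the regex below assumes "table.column" paths,
# but the field paths printed by the MongoDB cell are dotted document paths.
# "patient.drug.actiondrug" would be split into table="patient",
# column="drug" (first match only), and undotted paths such as
# "authoritynumb" would not match at all, giving NaN for both parts.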
# print("\n🧱 SQLite - Table Column Coverage")
# def count_not_null(conn, table, column):
#     try:
#         df = pd.read_sql_query(f"SELECT COUNT(*) as cnt FROM {table} WHERE {column} IS NOT NULL", conn)
#         return df["cnt"].iloc[0]
#     except Exception as e:
#         return f"Error: {str(e)}"

# # Value fields are stored as table.column format
# print("\nSQLite - Checking column presence")
# value_fields_split = value_fields["field_path"].str.extract(r"(?P<table>\w+)\.(?P<column>\w+)")
# value_fields = pd.concat([value_fields, value_fields_split], axis=1)

# for _, row in value_fields.iterrows():
#     table, column = row["table"], row["column"]
#     result = count_not_null(sqlite_conn, table, column)
#     print(f"{table}.{column}: {result} non-null rows")

In [8]:
# Cleanup: release the SQLite connection first, then the MongoDB client.
for open_handle in (sqlite_conn, mongo_client):
    open_handle.close()