In [7]:
# 02_type_and_format_check.ipynb

# ✅ Notebook Goal:
# Verify that numeric fields and date fields were properly converted
# in both MongoDB and SQLite

# --- MongoDB Setup ---
import pymongo
from pprint import pprint

# MongoClient does not contact the server when it is constructed, so an
# unreachable MongoDB would otherwise only surface in a later cell after the
# driver's default server-selection timeout. Use a short timeout and an
# explicit ping so a missing server fails here, in the setup cell.
client = pymongo.MongoClient(
    "mongodb://localhost:27017/",
    serverSelectionTimeoutMS=5000,
)
client.admin.command("ping")  # raises if no server can be selected

# Handles used by the MongoDB checks below.
db = client["openfda_converted"]
collection = db["full_reports"]

# --- SQLite Setup ---
import re
import sqlite3
from pathlib import Path

import pandas as pd

# Path is relative to the notebook's working directory.
SQLITE_DB_PATH = Path("../../sql/openfda_base_updated.db")

# sqlite3.connect() creates a new, empty database file when the path does not
# exist. For a verification notebook that would hide a wrong path behind a
# later "no such table" error, so check for the file explicitly first.
if not SQLITE_DB_PATH.is_file():
    raise FileNotFoundError(
        f"SQLite database not found: {SQLITE_DB_PATH.resolve()}"
    )

conn = sqlite3.connect(str(SQLITE_DB_PATH))

In [3]:
# --- 1. MongoDB Type Checks ---
# Each check counts the documents in `collection` whose field is stored with
# the given BSON type. Only matching documents are counted; documents where
# the field has another type, or is absent, are simply not included.


def _bson_type_filter(field_path, bson_type):
    """Return a query filter matching documents whose `field_path` has BSON type `bson_type`."""
    return {field_path: {"$type": bson_type}}


# Integer check on the top-level 'safetyreportversion' field.
print("MongoDB - Check integer fields:\n")
int_query = _bson_type_filter("safetyreportversion", "int")
int_count = collection.count_documents(int_query)
print(f"Reports with integer 'safetyreportversion': {int_count}")

# Double check on the nested 'patient.patientweight' field.
print("\nMongoDB - Check float fields (e.g., patient.patientweight):\n")
float_query = _bson_type_filter("patient.patientweight", "double")
float_count = collection.count_documents(float_query)
print(f"Reports with float 'patient.patientweight': {float_count}")

# Date check on the top-level 'receivedate' field.
print("\nMongoDB - Check ISODate fields (e.g., receivedate):\n")
date_query = _bson_type_filter("receivedate", "date")
date_count = collection.count_documents(date_query)
print(f"Reports with date-typed 'receivedate': {date_count}")

MongoDB - Check integer fields:

Reports with integer 'safetyreportversion': 35999

MongoDB - Check float fields (e.g., patient.patientweight):

Reports with float 'patient.patientweight': 6874

MongoDB - Check ISODate fields (e.g., receivedate):

Reports with date-typed 'receivedate': 35999


In [4]:
# --- 2. SQLite Type & Format Checks ---
# A value is accepted when it consists solely of 8, 6 or 4 ASCII digits
# (presumably YYYYMMDD / YYYYMM / YYYY -- confirm against the conversion step).
# fullmatch() is used instead of match() with "$" because "$" also matches just
# before a trailing newline, and re.ASCII keeps \d from accepting non-ASCII
# digit characters.
DATE_FORMAT = re.compile(r"\d{8}|\d{6}|\d{4}", re.ASCII)

print("\nSQLite - Check format of date strings (e.g., receivedate):")
df = pd.read_sql_query("SELECT receivedate FROM report LIMIT 10", conn)
print(df)

print("\nSQLite - Check that padded date format is respected:")
df_all_dates = pd.read_sql_query("SELECT receivedate FROM report", conn)
# Values are stringified first, so a NULL becomes "None" and is reported as invalid.
df_all_dates["valid_format"] = df_all_dates["receivedate"].apply(
    lambda x: DATE_FORMAT.fullmatch(str(x)) is not None
)
invalid_dates = df_all_dates[~df_all_dates["valid_format"]]
print(f"Invalid format count: {len(invalid_dates)}")

# When something is wrong, show the offending values rather than only a count.
if not invalid_dates.empty:
    print(invalid_dates["receivedate"].head(10))


SQLite - Check format of date strings (e.g., receivedate):
  receivedate
0    20070320
1    20070406
2    20080129
3    20080414
4    20100217
5    20100331
6    20100616
7    20120117
8    20120120
9    20130626

SQLite - Check that padded date format is respected:
Invalid format count: 0


In [None]:
# --- 3. Additional Numeric Field Checks ---
# (table, column) pairs to inspect in the SQLite database.
numeric_fields = [
    ("report", "serious"),
    ("reaction", "reactionoutcome"),
    ("drug_optional", "drugstructuredosagenumb"),
    ("drug_optional", "drugseparatedosagenumb"),
]

# NOTE: this is a sample-based check. Only the first 10 non-NULL values of each
# column are read, and the dtype shown is the one pandas infers for that sample.
for table, field in numeric_fields:
    print(f"\nSQLite - Checking type for {field} in {table}:")
    sample_sql = f"SELECT {field} FROM {table} WHERE {field} IS NOT NULL LIMIT 10"
    sample = pd.read_sql_query(sample_sql, conn)
    # Inferred dtype first, then the first five sampled rows.
    print(sample.dtypes, sample.head(), sep="\n")




SQLite - Checking type for serious in report:
serious    int64
dtype: object
   serious
0        1
1        1
2        1
3        1
4        1

SQLite - Checking type for reactionoutcome in reaction:
reactionoutcome    int64
dtype: object
   reactionoutcome
0                6
1                6
2                6
3                6
4                6

SQLite - Checking type for drugstructuredosagenumb in drug_optional:
drugstructuredosagenumb    float64
dtype: object
   drugstructuredosagenumb
0                     75.0
1                     15.0
2                   1200.0
3                    300.0
4                    300.0

SQLite - Checking type for drugseparatedosagenumb in drug_optional:
drugseparatedosagenumb    float64
dtype: object
   drugseparatedosagenumb
0                     2.0
1                     1.0
2                     1.0
3                     1.0
4                     1.0


In [8]:
# SQLite: count the rows of `report` whose safetyreportversion is not NULL.
query = """
SELECT COUNT(*) AS count
FROM report
WHERE safetyreportversion IS NOT NULL
"""
df = pd.read_sql_query(query, conn)
# The aggregate query returns a single row; read its "count" cell.
non_null_versions = df.at[0, "count"]
print(f"Count of non-null safetyreportversion: {non_null_versions}")



Count of non-null safetyreportversion: 36000


In [9]:
# Cleanup: close the SQLite connection first, then the MongoDB client.
for handle in (conn, client):
    handle.close()