# 🧪 Field Validation: SQLite Numeric & Date Conversion
This notebook validates all candidate fields across the normalized SQLite database to check whether they can be safely converted to `INTEGER`, `REAL`, or `DATE` types.

## 🔧 Setup & Connection

In [11]:
import sqlite3
import pandas as pd
import re
from datetime import datetime
from collections import defaultdict, Counter

# Location of the SQLite database file (a relative path).
db_path = "../sql/openfda_base.db"
# Shared connection used by the query cells in this notebook.
# NOTE(review): sqlite3.connect() creates an empty database file when the path
# does not exist, so a wrong path only surfaces later as "no such table" errors.
conn = sqlite3.connect(db_path)

## 🗂️ Define Field Validation Schema

In [19]:
# Candidate columns per table, each paired with the target type it should be
# validated against: 'int', 'real', 'date' or 'list'.
# Layout: {table_name: [(column_name, target_type), ...]}
field_specs = {
    'drug': [
        ('drugcharacterization', 'int'),
        ('drugauthorizationnumb', 'int'),
        ('drugstartdate', 'date'),
        ('drugenddate', 'date'),
        ('drugadministrationroute', 'int'),
    ],
    'drug_openfda': [
        ('package_ndc', 'list'),
        ('product_ndc', 'list'),
        ('rxcui', 'list'),
    ],
    'drug_optional': [
        ('actiondrug', 'int'),
        ('drugadditional', 'int'),
        ('drugintervaldosagedefinition', 'int'),
        ('drugcumulativedosagenumb', 'real'),
        ('drugcumulativedosageunit', 'int'),
        ('drugenddateformat', 'int'),
        ('drugintervaldosageunitnumb', 'real'),
        ('drugrecurreadministration', 'int'),
        ('drugseparatedosagenumb', 'real'),
        ('drugstartdateformat', 'int'),
        ('drugstructuredosagenumb', 'real'),
        ('drugstructuredosageunit', 'int'),
        ('drugtreatmentduration', 'real'),
        ('drugtreatmentdurationunit', 'int'),
    ],
    'patient_optional': [
        ('patientagegroup', 'int'),
        ('patientonsetage', 'int'),
        ('patientonsetageunit', 'int'),
        ('patientsex', 'int'),
        ('patientweight', 'real'),
    ],
    'reaction': [
        ('reactionmeddraversionpt', 'real'),
        ('reactionoutcome', 'int'),
    ],
    'report': [
        ('safetyreportversion', 'int'),
        ('receivedateformat', 'int'),
        ('receivedate', 'date'),
        ('receiptdateformat', 'int'),
        ('receiptdate', 'date'),
        ('transmissiondateformat', 'int'),
        ('transmissiondate', 'date'),
        ('reporttype', 'int'),
        ('fulfillexpeditecriteria', 'int'),
        ('serious', 'int'),
        ('seriousnessdeath', 'int'),
        ('seriousnesslifethreatening', 'int'),
        ('seriousnesshospitalization', 'int'),
        ('seriousnessdisabling', 'int'),
        ('seriousnesscongenitalanomali', 'int'),
        ('seriousnessother', 'int'),
        ('sender_sendertype', 'int'),
        ('receiver_receivertype', 'int'),
        ('primarysource_qualification', 'int'),
        ('duplicate', 'int'),
    ],
    'summary': [
        ('narrativeincludeclinical', 'date'),
    ],
}

## 🔍 Run Validation

In [22]:
# Target types the validator knows how to check.
SUPPORTED_TYPES = ('int', 'real', 'date', 'list')
# Free-text fields yield a distinct invalid value for almost every row, so only
# a handful of examples are kept for them; other fields keep every distinct value.
NOISY_FIELDS = {('summary', 'narrativeincludeclinical')}
MAX_NOISY_SAMPLES = 5


def _is_convertible(value, ftype):
    """Return True when ``value`` can be converted to ``ftype`` without loss.

    Rules per target type:
      * 'int'  -- ``int(value)`` succeeds; float inputs must be whole numbers,
                  because ``int(1.5)`` would silently truncate to 1.
      * 'real' -- ``float(value)`` succeeds and the result is finite
                  (``float('nan')`` / ``float('inf')`` parse but are rejected).
      * 'date' -- exactly eight ASCII digits forming a real YYYYMMDD date.
      * 'list' -- a string containing '-' or ',' or consisting only of digits.

    Raises:
        ValueError: if ``ftype`` is not one of SUPPORTED_TYPES.
    """
    try:
        if ftype == 'int':
            if isinstance(value, float):
                return value.is_integer()
            int(value)
            return True
        if ftype == 'real':
            number = float(value)
            # NaN is the only float that differs from itself.
            return number == number and abs(number) != float('inf')
        if ftype == 'date':
            text = str(value)
            # fullmatch + [0-9] rejects a trailing newline and non-ASCII digits.
            if re.fullmatch(r'[0-9]{8}', text) is None:
                return False
            datetime.strptime(text, '%Y%m%d')
            return True
        if ftype == 'list':
            return isinstance(value, str) and ('-' in value or ',' in value or value.isdigit())
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures mean "invalid"; anything else should surface.
        return False
    raise ValueError(f'Unknown field type: {ftype!r}')


def _error_row(table, field, ftype, message):
    """Build the result row recorded when a field could not be validated at all."""
    return {
        'table': table, 'field': field, 'suggested_type': ftype,
        'non_null_count': 0, 'valid_count': 0, 'invalid_count': 0,
        'invalid_sample': [f'Error: {message}']
    }


results = []
for table, fields in field_specs.items():
    for field, ftype in fields:
        if ftype not in SUPPORTED_TYPES:
            # An unknown type used to match no branch, so every value counted as valid.
            results.append(_error_row(table, field, ftype, f'unsupported type {ftype!r}'))
            continue

        try:
            # Table and column names come from the hard-coded field_specs dict.
            query = f"SELECT {field} FROM {table} WHERE {field} IS NOT NULL"
            values = [row[0] for row in conn.execute(query)]
        except sqlite3.Error as e:
            # Best effort: a missing table/column becomes a result row instead of
            # aborting the whole run.
            results.append(_error_row(table, field, ftype, e))
            continue

        sample_cap = MAX_NOISY_SAMPLES if (table, field) in NOISY_FIELDS else None
        invalid_count = 0
        invalid = set()
        for v in values:
            if _is_convertible(v, ftype):
                continue
            invalid_count += 1
            if sample_cap is None or len(invalid) < sample_cap:
                invalid.add(v)

        results.append({
            'table': table, 'field': field, 'suggested_type': ftype,
            'non_null_count': len(values),
            'valid_count': len(values) - invalid_count,
            'invalid_count': invalid_count,
            'invalid_sample': invalid
        })

validation_df = pd.DataFrame(results)
# Share of non-null values that passed; NaN when a field has no non-null values
# (including the error rows, which report zero counts).
validation_df['valid_pct'] = (validation_df['valid_count'] / validation_df['non_null_count']).round(3)
# Worst fields first.
validation_df.sort_values(by='valid_pct', ascending=True)

Unnamed: 0,table,field,suggested_type,non_null_count,valid_count,invalid_count,invalid_sample,valid_pct
49,summary,narrativeincludeclinical,date,16948,0,16948,"{CASE EVENT DATE: 20211209, CASE EVENT DATE: 2...",0.0
2,drug,drugstartdate,date,53836,40224,13612,"{201208, 202209, 202104, 200701, 201901, 20120...",0.747
3,drug,drugenddate,date,23151,19162,3989,"{201208, 202209, 202004, 201906, 202101, 20180...",0.828
28,reaction,reactionoutcome,int,122182,122182,0,{},1.0
29,report,safetyreportversion,int,36000,36000,0,{},1.0
30,report,receivedateformat,int,36000,36000,0,{},1.0
31,report,receivedate,date,36000,36000,0,{},1.0
32,report,receiptdateformat,int,36000,36000,0,{},1.0
33,report,receiptdate,date,36000,36000,0,{},1.0
34,report,transmissiondateformat,int,36000,36000,0,{},1.0


In [24]:
# One row per (table, field) pair in field_specs; the columns are the seven result keys plus valid_pct.
validation_df.shape

(50, 8)

## 💾 Export to CSV

In [23]:
# Persist the validation summary (without the DataFrame index) for later review.
summary_csv_path = '../reports/evaluation_results/field_validation_summary.csv'
validation_df.to_csv(summary_csv_path, index=False)

# Additional

In [12]:
# List the tables present in the database.
# Reuses the connection opened in the setup cell: opening a second connection to
# the same file here left the first handle unclosed and duplicated the path literal.
tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
print("Available tables:")
for t in tables:
    print("-", t[0])


Available tables:
- report
- reaction
- reportduplicate
- patient_optional
- drug
- drug_openfda
- drug_activesubstance
- drug_optional
- summary


In [26]:
def get_columns(table_name, connection=None):
    """Return the column names of a table in declaration order.

    Args:
        table_name: Name of the table to inspect, e.g. ``'drug_optional'``.
        connection: Optional ``sqlite3.Connection`` to query. When omitted, the
            notebook-level ``conn`` is used.

    Returns:
        A list of column-name strings; empty when the table does not exist.
    """
    db = conn if connection is None else connection
    # pragma_table_info() is the table-valued form of PRAGMA table_info. Unlike
    # the PRAGMA statement it accepts the table name as a bound parameter, so
    # names that need quoting (spaces, keywords) no longer break the query.
    rows = db.execute(
        "SELECT name FROM pragma_table_info(?) ORDER BY cid",
        (table_name,),
    )
    return [row[0] for row in rows]



['id',
 'drug_id',
 'actiondrug',
 'drugadditional',
 'drugbatchnumb',
 'drugcumulativedosagenumb',
 'drugcumulativedosageunit',
 'drugenddateformat',
 'drugintervaldosagedefinition',
 'drugintervaldosageunitnumb',
 'drugrecurreadministration',
 'drugseparatedosagenumb',
 'drugstartdateformat',
 'drugstructuredosagenumb',
 'drugstructuredosageunit',
 'drugtreatmentduration',
 'drugtreatmentdurationunit']

## Inspecting edge cases

In [40]:
def _invalid_rows(field, table='drug'):
    """Return the validation_df rows for ``table.field`` that reported invalid values.

    Only the 'table', 'field' and 'invalid_sample' columns are kept.
    """
    has_invalid = (
        (validation_df['table'] == table) &
        (validation_df['field'] == field) &
        (validation_df['invalid_count'] > 0)
    )
    matched = validation_df[has_invalid].copy()
    return matched[['table', 'field', 'invalid_sample']]


# Invalid-value rows for the three drug fields under inspection.
invalid_one = _invalid_rows('drugstartdate')
invalid_two = _invalid_rows('drugenddate')
invalid_three = _invalid_rows('drugauthorizationnumb')


In [47]:
def _invalid_sample_for(table, field):
    """Return the invalid values recorded for ``table.field``.

    Reads straight from validation_df so the cell can be re-run safely: the
    result no longer depends on what ``invalid_two`` / ``invalid_three``
    currently hold. Returns an empty set when the field has no invalid rows
    (``.iloc[0]`` on the empty selection would raise IndexError).
    """
    rows = validation_df[
        (validation_df['table'] == table) &
        (validation_df['field'] == field) &
        (validation_df['invalid_count'] > 0)
    ]
    if rows.empty:
        return set()
    return rows.iloc[0]['invalid_sample']


# Access the invalid samples from the validation results.
# invalid_one is left as the filtered DataFrame; its extraction is disabled:
# invalid_one = invalid_one.iloc[0]['invalid_sample']
invalid_two = _invalid_sample_for('drug', 'drugenddate')
invalid_three = _invalid_sample_for('drug', 'drugauthorizationnumb')

## Investigate the correlation between dateformat and date in the drug/drug_optional table.

In [52]:


# Pair every drug start date with its declared format code.
# drug and drug_optional are joined on drug.id = drug_optional.drug_id.
query = """
SELECT 
    d.safetyreportid,
    d.drugstartdate,
    do.drugstartdateformat
FROM drug d
JOIN drug_optional do
    ON d.id = do.drug_id
WHERE d.drugstartdate IS NOT NULL AND do.drugstartdateformat IS NOT NULL
"""

start_group_keys = ["drugstartdateformat", "date_length"]

# Load the pairs and record the string length of each start date.
df_start = pd.read_sql_query(query, conn).assign(
    date_length=lambda frame: frame["drugstartdate"].astype(str).str.len()
)

# Count rows for every (format code, date length) combination.
summary_start = (
    df_start
    .groupby(start_group_keys)
    .size()
    .reset_index(name="count")
)
summary_start.sort_values(by=start_group_keys, ascending=True)


Unnamed: 0,drugstartdateformat,date_length,count
0,102,8,40224
1,602,4,4551
2,610,6,9061


In [53]:
# Pair every drug end date with its declared format code.
# drug and drug_optional are joined on drug.id = drug_optional.drug_id.
query = """
SELECT 
    d.safetyreportid,
    d.drugenddate,
    do.drugenddateformat
FROM drug d
JOIN drug_optional do
    ON d.id = do.drug_id
WHERE d.drugenddate IS NOT NULL AND do.drugenddateformat IS NOT NULL
"""

end_group_keys = ["drugenddateformat", "date_length"]

# Load the pairs and record the string length of each end date.
df_end = pd.read_sql_query(query, conn).assign(
    date_length=lambda frame: frame["drugenddate"].astype(str).str.len()
)

# Count rows for every (format code, date length) combination.
summary_end = (
    df_end
    .groupby(end_group_keys)
    .size()
    .reset_index(name="count")
)
summary_end.sort_values(by=end_group_keys, ascending=True)


Unnamed: 0,drugenddateformat,date_length,count
0,102,8,19162
1,602,4,1289
2,610,6,2700


In [59]:
# Count drug rows that have an end date but no end-date format code.
# The LEFT JOIN keeps drugs without a matching drug_optional row; for those the
# format column is NULL, so they are counted as well.
# NOTE(review): the result is stored in df_start_null although the query checks
# the end-date columns, not the start-date ones.
query = """
SELECT COUNT(*) 
FROM drug d
LEFT JOIN drug_optional do ON d.id = do.drug_id
WHERE d.drugenddate IS NOT NULL AND do.drugenddateformat IS NULL;
"""
df_start_null = pd.read_sql_query(query, conn)

# Single-row frame holding the count.
df_start_null.head()

Unnamed: 0,COUNT(*)
0,0


In [60]:
# Reverse check: count drug rows that carry an end-date format code but no end date.
# The IS NOT NULL filter on do.drugenddateformat discards unmatched LEFT JOIN rows.
# NOTE(review): the result is stored in df_start_not_null although the query
# checks the end-date columns, not the start-date ones.
query = """
SELECT COUNT(*) FROM drug d
LEFT JOIN drug_optional do ON d.id = do.drug_id
WHERE do.drugenddateformat IS NOT NULL AND d.drugenddate IS NULL;
"""
df_start_not_null = pd.read_sql_query(query, conn)
# Single-row frame holding the count.
df_start_not_null.head()

Unnamed: 0,COUNT(*)
0,0


## Inspecting narrativeincludeclinical from summary table

In [61]:
# Load every non-null narrativeincludeclinical value from the summary table
# into a one-column DataFrame for the date extraction below.
query = """
SELECT narrativeincludeclinical
FROM summary
WHERE narrativeincludeclinical IS NOT NULL
"""
df = pd.read_sql_query(query, conn)

# Captures the first eight digits that follow the "CASE EVENT DATE" label,
# allowing any mix of colons and whitespace between the label and the digits.
pattern = re.compile(r'CASE EVENT DATE[:\s]*?(\d{8})')


def extract_case_event_date(text):
    """Extract the case event date from a narrative value.

    The input is converted with ``str()`` before searching.

    Returns:
        The captured eight-digit string when it parses as a YYYYMMDD date,
        "INVALID" when the digits are present but are not a real calendar
        date, and None when the label with digits is not found.
    """
    found = pattern.search(str(text))
    if found is None:
        return None

    candidate = found.group(1)
    try:
        datetime.strptime(candidate, "%Y%m%d")
    except ValueError:
        return "INVALID"
    return candidate

# Tag each narrative with its extracted date ("INVALID" or None when unusable).
narrative_dates = df["narrativeincludeclinical"].apply(extract_case_event_date)
df["extracted_date"] = narrative_dates

# Frequency of every extracted value, missing ones included.
date_counts = df["extracted_date"].value_counts(dropna=False)
summary = date_counts.reset_index()
summary.columns = ["extracted_date", "count"]
summary

Unnamed: 0,extracted_date,count
0,20230101,1279
1,20240101,744
2,20231201,521
3,20220101,503
4,20240201,355
...,...,...
1683,20201230,1
1684,20070705,1
1685,20181025,1
1686,20191122,1


In [62]:
# Collect the distinct string lengths found among the extracted dates.
summary_date_lenghts = {len(str(value)) for value in summary["extracted_date"]}

print("Unique lengths of extracted dates:", summary_date_lenghts)



Unique lengths of extracted dates: {8}


In [64]:
# Check whether every report.safetyreportid consists solely of digits.
# GLOB '*[^0-9]*' is true for values containing at least one non-digit
# character, so non_numeric_count is the number of such ids.
query = """
SELECT COUNT(*) AS total_rows,
SUM(CASE WHEN safetyreportid GLOB '*[^0-9]*' THEN 1 ELSE 0 END) AS non_numeric_count
FROM report;
"""
# NOTE(review): this rebinds df, replacing the narrative frame loaded earlier.
df = pd.read_sql_query(query, conn)
df.head()

Unnamed: 0,total_rows,non_numeric_count
0,36000,0
