# SQLite Evaluation Queries
This notebook evaluates the equivalent queries against the structured SQLite schema.

In [9]:
import sqlite3
conn = sqlite3.connect("../sql/openfda_base.db")
cursor = conn.cursor()

## Query 1: All reports where `medicinalproduct = 'INFLECTRA'`

In [2]:
query = """
SELECT r.safetyreportid
FROM report r
JOIN drug d ON r.safetyreportid = d.safetyreportid
WHERE d.medicinalproduct = 'INFLECTRA';
"""
cursor.execute(query)
results = cursor.fetchall()
print(f"Found {len(results)} reports.")
what = set()
for row in results:
    what.add(row[0])
print(f"Found {len(what)} unique reports.")

Found 2569 reports.
Found 177 unique reports.


## Query 2: Reports where `patientsex = '2'` and one `reactionoutcome = '5'` exists

In [20]:
query = """
SELECT DISTINCT r.safetyreportid
FROM report r
JOIN patient_optional p ON r.safetyreportid = p.safetyreportid
JOIN reaction re ON r.safetyreportid = re.safetyreportid
WHERE p.patientsex = '2' AND re.reactionoutcome = '5';
"""

cursor.execute(query)
results = cursor.fetchall()
print(f"Found {len(results)} reports.")
what = set()
for row in results:
    what.add(row[0])

Found 1094 reports.


## Query 3: Number of serious reports by `medicinalproduct`

In [17]:
query = """
SELECT d.medicinalproduct, COUNT(*) AS serious_count
FROM report r
JOIN drug d ON r.safetyreportid = d.safetyreportid
WHERE r.serious = '1'
GROUP BY d.medicinalproduct
ORDER BY serious_count DESC;
"""

cursor.execute(query)
results = cursor.fetchall()
print(f"Found {len(results)} rows.")
from pprint import pprint
pprint(results[:5])


Found 7209 rows.
[('INFLECTRA', 2559),
 ('PREDNISONE', 2149),
 ('METHOTREXATE', 1982),
 ('RITUXIMAB', 1388),
 ('ACETAMINOPHEN', 1351)]


## Query 4: Reports that have duplicate references

In [19]:
query = """
SELECT DISTINCT r.safetyreportid
FROM report r
WHERE r.duplicate = '1';
"""

cursor.execute(query)
results = cursor.fetchall()
print(f"Found {len(results)} reports with duplicates.")


Found 11715 reports with duplicates.


## Query 5: Reports with `activesubstancename = 'INFLIXIMAB'`

In [12]:
query = """
SELECT DISTINCT r.safetyreportid
FROM report r
JOIN drug d ON r.safetyreportid = d.safetyreportid
JOIN drug_activesubstance a ON d.id = a.drug_id
WHERE a.activesubstancename = 'INFLIXIMAB';
"""

cursor.execute(query)
results = cursor.fetchall()
print(f"Found {len(results)} reports with active substance INFLIXIMAB.")


Found 524 reports with active substance INFLIXIMAB.


## Query 6: Count of reactions per report

In [18]:
query = """
SELECT r.safetyreportid, COUNT(re.id) AS reaction_count
FROM report r
JOIN reaction re ON r.safetyreportid = re.safetyreportid
GROUP BY r.safetyreportid
ORDER BY reaction_count DESC;
"""

cursor.execute(query)
results = cursor.fetchall()
print(f"Top report has {results[0][1]} reactions.")
print(f"Found reaction counts for {len(results)} reports.")


Top report has 147 reactions.
Found reaction counts for 36000 reports.


In [21]:
# get all the unique values found in the fields drugcharacterization in the drug table

query = """
SELECT DISTINCT drugstartdate
FROM drug;
""" 
cursor.execute(query)
results = cursor.fetchall()
print(f"Found {len(results)} unique drugcharacterization values.")
# for row in results:
#     print(row[0])

dupa = set()
dupa8 = set()
dupa4 = set()
dupa6 = set()
for row in results:
    if row[0] is None:
        continue
    x = len(row[0])
    if x == 8:
        dupa8.add(row[0])
    elif x == 4:
        dupa4.add(row[0])
    elif x == 6:
        dupa6.add(row[0])
    else:
        print(f"Unknown length {x} for {row[0]}")
    dupa.add(x)
print(f"Found {len(dupa)} unique drugcharacterization values.")
print(dupa)
print(f"8 digit: {len(dupa8)}")
print(dupa8)
print(f"4 digit: {len(dupa4)}")
print(dupa4)
print(f"6 digit: {len(dupa6)}")
print(dupa6)

Found 3971 unique drugcharacterization values.
Found 3 unique drugcharacterization values.
{8, 4, 6}
8 digit: 3662
{'20220621', '20140219', '20161209', '20230426', '20150127', '20060301', '20200401', '20230204', '20061023', '20230316', '20000820', '20211226', '20230112', '20210705', '20081008', '20231226', '20190727', '20191120', '20110515', '20100127', '20180722', '20140312', '20171025', '20191122', '20160218', '20240109', '20191205', '20210830', '20221230', '20131120', '20230715', '20231128', '20191016', '20190318', '20070504', '20211217', '20180922', '20220814', '20240102', '20090311', '20210917', '20181022', '20111114', '20210812', '20120309', '20190316', '20140114', '20121214', '20050602', '20191102', '20160308', '20190117', '20160609', '20180519', '20191031', '20210630', '20160125', '20210504', '20061007', '20020621', '20180428', '20150815', '20230227', '20151020', '20211112', '20150925', '20160206', '20191226', '20080423', '20230503', '20210128', '20220912', '20130107', '2023072

In [7]:
# check the count of reports with different reportversion values
query = """
SELECT safetyreportversion, COUNT(*) AS report_count
FROM report
GROUP BY safetyreportversion
ORDER BY report_count DESC;
""" 
cursor.execute(query)
results = cursor.fetchall()
print(f"Found {len(results)} unique reportversion values.")
for row in results:
    print(row[0], row[1])

Found 46 unique reportversion values.
1 22248
2 7848
3 2727
4 1184
5 635
6 357
7 260
8 155
9 137
10 92
11 59
12 57
15 30
14 30
13 26
17 20
16 20
19 15
20 13
18 11
22 8
21 8
29 7
26 7
23 6
24 5
30 4
33 3
32 3
46 2
41 2
40 2
36 2
34 2
31 2
27 2
25 2
68 1
62 1
61 1
48 1
47 1
44 1
38 1
37 1
35 1
