forked from trevor-smith/FDA_OpenData
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fda_exploratory_data_analysis.py
127 lines (100 loc) · 3.33 KB
/
fda_exploratory_data_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
##### THIS FILE IS FOR EXPLORATORY ANALYSIS #####
### IMPORTS ###
from pymongo import MongoClient
import pandas as pd
import numpy as np
import pprint
from collections import Counter
import matplotlib.pyplot as plt
### SETTING UP ENVIRONMENT ###
# make sure mongod is running
client = MongoClient()
labels = client.drugs.drug_labeling
events = client.drugs.adverse_events
### QUERYING THE DATA ###
# this would find all data...I have not given it any filters
cursor_labels = labels.find({})
cursor_events = events.find({})
# this is just pulling the mongodb documents out of the database
# and then appending it to a list called 'documents'
# 'documents' will be a list of dictionaries
documents_labels = []
for i in cursor_labels[:20]:
documents_labels.append(i['results'])
documents_events = []
for i in cursor_events[:20]:
documents_events.append(i['results
'])
# the elements of the 'documents' list are in fact dictionaries
# need to split them
data_labels = []
for i in documents_labels:
for j in i:
data_labels.append(j)
data_events = []
for i in documents_events:
for j in i:
data_events.append(j)
### CLEANING THE DATA ###
# flattening the nested dicts
import collections
def flatten(d, parent_key='', sep='_'):
items = []
for k, v in d.items():
new_key = parent_key + sep + k if parent_key else k
if isinstance(v, collections.MutableMapping):
items.extend(flatten(v, new_key, sep=sep).items())
else:
items.append((new_key, v))
return dict(items)
for i in data_events[:5]:
print "this person's sex is: " + str((i['patient']['patientsex']))
for j in i['patient']['reaction']:
print "this person had this type of reaction: " + str(j['reactionmeddrapt'])
cnt = Counter()
for i in data_events[:5]:
for j in i['patient']['reaction']:
cnt[j['reactionmeddrapt']] += 1
print cnt.most_common(10)
# we can do some basic plotting
# need to get the data out of the counter
top_10 = cnt.most_common(10)
names = []
counts = []
for i in top_10:
names.append(i[0])
for i in top_10:
counts.append(i[1])
# ok now to plot
ind = np.arange(10)
plt.bar(ind, counts)
# need to figure out how to rotate the labels 90 degrees
plt.xticks(ind+width/2., names)
plt.show()
### ADVERSE EVENTS EXPLORATION ###
# this gives us the count of the top x number of events for a drug
# there are a lot of if statements because not ever event has an openfda section
# not every openfda section has a generic_name section
def generic_name_counter(num):
generics_count = Counter()
for i in data_events:
for j in i['patient']['drug']:
if 'openfda' in j:
if 'generic_name' in j['openfda']:
for k in j['openfda']['generic_name']:
generics_count[k] += 1
return generics_count.most_common(num)
### DRUG RECALLS EXPLORATION ###
# let's look at all Class recall distributions
# I = most severe, II = moderately severe, III = not severe
def class_divider():
class_one = []
class_two = []
class_three = []
for i in data_events:
if i['classification'] == 'Class I':
class_one.append(i)
if i['classification'] == 'Class II':
class_two.append(i)
if i['classification'] == 'Class III':
class_three.append(i)