-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdetect_phi.sas
275 lines (246 loc) · 10.9 KB
/
detect_phi.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
%macro check_dataset(dset =, obs_lim = max, eldest_age = 89) ;
%local i ;
%local inset_name ;
%let inset_name = &dset ;
%if %lowcase(&obs_lim) = max %then %do ;
%** Nothing ;
%end ;
%else %do ;
proc surveyselect
data = &inset_name
out = __sub_dset
method = srs
sampsize = &obs_lim SELECTALL
seed = 1234567
noprint
;
run;
%let dset = __sub_dset ;
%end ;
%macro check_varname(regx, msg) ;
create table possible_bad_vars as
select name, label
from these_vars
where prxmatch(compress("/(®x)/i"), name)
;
%if &sqlobs > 0 %then %do ;
insert into phi_warnings(dset, variable, label, warning)
select "&inset_name" as dset, name, label, "&msg"
from possible_bad_vars
;
%end ;
%mend check_varname ;
%macro check_vars_for_mrn(length_limit = 6, obs_lim = max) ;
%local char ;
%let char = 2 ;
proc sql noprint ;
select name
into :mrn_array separated by ' '
from these_vars
where type = &char and length ge &length_limit
;
quit ;
%if &sqlobs > 0 %then %do ;
%put Checking these vars for possible MRN contents: &mrn_array ;
data __gnu ;
retain
mrn_regex_handle
badcount
;
set &inset_name (obs = &obs_lim keep = &mrn_array) ;
if _n_ = 1 then do ;
mrn_regex_handle = prxparse("/&mrn_regex/") ;
badcount = 0 ;
end ;
array p &mrn_array ;
do i = 1 to dim(p) ;
if prxmatch(mrn_regex_handle, p{i}) then do ;
badvar = vname(p{i}) ;
badvalue = p{i} ;
badcount = _n_ ;
output ;
end ;
keep badvar badvalue badcount ;
end ;
run ;
proc sql noprint ;
select compress(put(max(badcount), best.))
into :badcount
from __gnu
;
insert into phi_warnings(dset, variable, warning)
select distinct "&inset_name", badvar, "Could this var hold MRN values? Contents of %trim(&badcount) records match the pattern given for MRN values. MRNs should never move across sites."
from __gnu ;
drop table __gnu ;
quit ;
%end ;
%mend check_vars_for_mrn ;
%macro check_vars_for_oldsters(eldest_age = 89, obs_lim = max) ;
%local dtfmts ;
%let dtfmts = 'B8601DA','B8601DN','B8601DT','B8601DZ','B8601LZ','B8601TM','B8601TZ','DATE','DATEAMPM','DATETIME','DAY','DDMMYY',
'DDMMYYB','DDMMYYC','DDMMYYD','DDMMYYN','DDMMYYP','DDMMYYS','DOWNAME','DTDATE','DTMONYY','DTWKDATX','DTYEAR',
'DTYYQC','E8601DA','E8601DN','E8601DT','E8601DZ','E8601LZ','E8601TM','E8601TZ','HHMM','HOUR','JULDAY','JULIAN',
'MMDDYY','MMDDYYB','MMDDYYC','MMDDYYD','MMDDYYN','MMDDYYP','MMDDYYS','MMSS','MMYY','MMYY','MONNAME','MONTH','MONYY',
'PDJULG','PDJULI','QTR','QTRR','WEEKDATE','WEEKDATX','WEEKDAY','WEEKU','WEEKV','WEEKW','WORDDATE','WORDDATX',
'YEAR','YYMM','YYMMC','YYMMD','YYMMN','YYMMP','YYMMS','YYMMDD','YYMMDDB','YYMMDDC','YYMMDDD','YYMMDDN','YYMMDDP',
'YYMMDDS','YYMON','YYQ','YYQC','YYQD','YYQN','YYQP','YYQS','YYQR','YYQRC','YYQRD','YYQRN','YYQRP','YYQRS' ;
%local num ;
%let num = 1 ;
proc sql noprint ;
select name
into :dat_array separated by ' '
from these_vars
where type = &num and (format in (&dtfmts) or lowcase(name) like '%date%')
;
/* added by cb to shorten the process of looking at all dates */
%if &sqlobs > 0 %then %do ;
%put Checking these vars for possible DOB contents: &dat_array ;
select 'min(' || trim(name) || ') as ' || name into :var_list separated by ','
from these_vars
where type = &num and (format in (&dtfmts) or lowcase(name) like '%date%')
;
create table __gnu as
select &var_list from &inset_name
;
/* end cb additions */
quit ;
data __gnu ;
set __gnu (obs = &obs_lim keep = &dat_array) ;
array d &dat_array ;
do i = 1 to dim(d) ;
if n(d{i}) then maybe_age = %calcage(bdtvar = d{i}, refdate = "&sysdate9."d) ;
if maybe_age ge &eldest_age then do ;
badvar = vname(d{i}) ;
badvalue = d{i} ;
output ;
end ;
keep badvar badvalue maybe_age ;
end ;
run ;
proc sql outobs = 30 nowarn ;
insert into phi_warnings(dset, variable, warning)
select distinct "&inset_name", badvar, "If this is a date, at least one value is " || compress(put(maybe_age, best.)) || " years ago, which is older than &eldest_age.. " ||
"If this date applies to a person, the record is probably PHI."
from __gnu ;
drop table __gnu ;
quit ;
%end ;
%else %do ;
%put No obvious date variables found in &inset_name.--skipping age checks. ;
%end ;
%mend check_vars_for_oldsters ;
proc contents noprint data = &inset_name out = these_vars ;
run ;
proc sql noprint ;
create table phi_warnings (dset char(50), variable char(256), label char(256), warning char(200)) ;
%check_varname(regx = mrn|hrn , msg = %str(Name suggests this var may be an MRN, which should never move across sites.)) ;
%check_varname(regx = birth_date|BirthDate|DOB|BDate , msg = %str(Name suggests this var may be a date of birth.)) ;
%check_varname(regx = SSN|SocialSecurityNumber|social_security_number|socsec, msg = %str(Name suggests this var may be a social security number.)) ;
%if %symexist(locally_forbidden_varnames) %then %do ;
%check_varname(regx = &locally_forbidden_varnames, msg = %str(May be on the locally defined list of variables not allowed to be sent to other sites.)) ;
%end ;
quit ;
%check_vars_for_mrn(obs_lim = &obs_lim) ;
%check_vars_for_oldsters(obs_lim = &obs_lim, eldest_age = &eldest_age) ;
title3 "WARNINGS for dataset &inset_name:" ;
proc sql noprint ;
select count(*) as num_warns into :num_warns from phi_warnings ;
%if &num_warns = 0 %then %do ;
reset print outobs = 5 NOWARN ;
select "No obvious PHI-like data elements in &inset_name--BUT PLEASE INSPECT THE CONTENTS AND PRINTs TO FOLLOW" as x label = "No warnings for &inset_name"
from &inset_name
;
%do i = 1 %to 5 ;
%put No obvious phi-like data elements in &inset_name. BUT PLEASE INSPECT THE CONTENTS AND PRINTs CAREFULLY TO MAKE SURE OF THIS! ;
%end ;
%end ;
%else %do ;
reset print ;
select variable, warning from phi_warnings
order by variable, warning
;
quit ;
%end ;
title3 "Dataset &inset_name" ;
proc contents data = &inset_name varnum ;
run ;
/*
proc print data = &inset_name (obs = 20) ;
run ;
*/
** TODO: make the print print out recs that trip the value warnings. ;
proc sql number ;
select *
from &inset_name (obs = 20)
;
quit ;
quit ;
%RemoveDset(dset = __sub_dset) ;
%RemoveDset(dset = possible_bad_vars) ;
%RemoveDset(dset = phi_warnings) ;
%RemoveDset(dset = these_vars) ;
%mend check_dataset ;
%macro detect_phi(transfer_lib, obs_lim = max, eldest_age = 89) ;
%put ;
%put ;
%put ============================================================== ;
%put ;
%put Macro detect_phi: ;
%put ;
%put Checking all datasets found in %sysfunc(pathname(&transfer_lib)) for the following signs of PHI: ;
%put - Variable names signifying sensitive items like 'MRN', 'birth_date', 'SSN' and so forth. ;
%if %symexist(locally_forbidden_varnames) %then %do ;
%put - Variable names on the list defined in the standard macro variable locally_forbidden_varnames (here those names are: &locally_forbidden_varnames). ;
%end ;
%put - Contents of CHARACTER variables that match the pattern given in the standard macro variable mrn_regex (here that var is &mrn_regex) ;
%put Please note that numeric variables ARE NOT CHECKED FOR MRN-LIKE CONTENT. ;
%put - The contents of date variables (as divined by their formats) for values that, if they were DOBs, would indicate a person older than &eldest_age years. ;
%put ;
%put THIS IS BETA SOFTWARE-PLEASE SCRUTINIZE THE RESULTS AND REPORT PROBLEMS TO pardee.r@ghc.org. ;
%put ;
%put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ;
%put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ;
%put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ;
%put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ;
%put ;
%put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ;
%put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ;
%put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ;
%put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ;
%put ;
%put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ;
%put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ;
%put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ;
%put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ;
%put ;
%put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ;
%put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ;
%put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ;
%put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ;
%put ;
%put ;
%put ============================================================== ;
%put ;
%put ;
title1 "PHI-Detection Report for the datasets in %sysfunc(pathname(&transfer_lib))." ;
title2 "please inspect all output carefully to make sure it comports with your data sharing agreement!!!" ;
proc sql noprint ;
** describe table dictionary.tables ;
select trim(libname) || '.' || memname as dset
into :d1-:d999
from dictionary.tables
where libname = "%upcase(&transfer_lib)" AND
memtype = 'DATA'
;
%local num_dsets ;
%let num_dsets = &sqlobs ;
quit ;
%local i ;
%if &num_dsets = 0 %then %do i = 1 %to 10 ;
%put ERROR: NO DATASETS FOUND IN &transfer_lib!!!! ;
%end ;
%do i = 1 %to &num_dsets ;
%put about to check &&d&i ;
%check_dataset(dset = &&d&i, obs_lim = &obs_lim, eldest_age = &eldest_age) ;
%end ;
%mend detect_phi ;