In [1]:
from pathlib import Path
from collections import defaultdict
import random
import re

#### 1) Get all subjects with a single study AND a single scan in that study

In [13]:
# Root directories of the two MIMIC-CXR copies on disk, plus the CSV that lists
# one study per row (columns: subject id, study id, report location).
mimic_cxr_path = Path('/vol/biodata/data/chest_xray/mimic-cxr')
mimic_cxr_jpg_path = Path('/vol/biodata/data/chest_xray/mimic-cxr-jpg')
mimic_cxr_study_list =  Path("/vol/biodata/data/chest_xray/mimic-cxr/cxr-study-list.csv")

# Tally how many rows (i.e. studies) the study list holds for each subject.
subject_id_to_study_freq = defaultdict(int)
with open(mimic_cxr_study_list) as study_list_file:
    next(study_list_file)  # the first row is the column header
    for row in study_list_file:
        # Unpacking into exactly three names also checks that the row has three fields.
        subject_id, study_id, report_location = row.strip().split(',')
        subject_id_to_study_freq[subject_id] += 1

# Keep only the subjects that appear in exactly one study.
single_study_subjects = [
    subject_id
    for subject_id, freq in subject_id_to_study_freq.items()
    if freq == 1
]

print(len(single_study_subjects))

### this is slow due to DISK I/O
# for each subject check how many images are in its folder
# subjects_with_single_scan = []
# for subject_id in single_study_subjects:
#     subject_path = mimic_cxr_path/"files"/ f"p{subject_id[:2]}/p{subject_id}"

#     sub = list(subject_path.glob('*'))
#     for s in sub:
#         if s.is_dir():
#             # print(list(s.glob('*')))
#             if len(list(s.glob('*'))) == 1:
#                 subjects_with_single_scan.append(subject_id)

32695


#### 2) Inspect the reports to decide how to filter down from 5500+ reports

In [26]:
subjects_with_single_scan_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/mimic-cxr-reports/subjects_with_single_scan.txt")

# The file holds one subject id per line; strip the newline and any padding.
with open(subjects_with_single_scan_path) as subjects_file:
    subjects_with_single_scan = [entry.strip() for entry in subjects_file]

# Destination for the concatenated report dump (only used by the disabled code below).
reports_subjects_with_single_scan_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/mimic-cxr-reports/reports_subjects_with_single_scan.txt")
# with open(reports_subjects_with_single_scan_path, 'w') as output_f:
#     for subject in subjects_with_single_scan:
#         # print(f"Subject {subject}")
#         output_f.write(f"Subject {subject}\n")
#         subject_path = mimic_cxr_path/"files"/ f"p{subject[:2]}/p{subject}"
#         txt_files = list(subject_path.glob('*.txt'))

#         with open(txt_files[0]) as input_f:
#             for line in input_f:
#                 output_f.write(line)

#         output_f.write("--------------------------------\n")



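#### 3) Select only the reports with no prior based on REGEX: `COMPARISON:\s*N(O|o)(\s+.+)?`
i.e. the label "COMPARISON:", followed by any amount of whitespace (including line breaks), then "No" in any letter case (so "None" and "NONE" match as well), optionally followed by the rest of that line. Reports that contain the phrase "lateral view" are also excluded.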

In [42]:
# Sample COMPARISON sections that all state there is no prior study. The last
# sample puts the statement on the line after the label.
test_strings = [
    "COMPARISON: None",
    "COMPARISON:  NONE available.",
    "COMPARISON:  No relevant comparisons available.",
    " COMPARISON:  No prior chest radiographs available for review.",
    " COMPARISON: \n None available.",
]

# An earlier variant spelled the alternatives out as (NONE|None|NO|No).
# NOTE(review): only the prefix "No" is required after the label, so words such
# as "November" or "Non-contrast" are accepted too - confirm this is acceptable.
comparison_regex = re.compile(r"COMPARISON:\s*N(O|o)(\s+.+)?", re.IGNORECASE)

# Echo every sample the pattern finds; anything it misses shows up as "No match".
for sample in test_strings:
    print(sample if comparison_regex.search(sample) else "No match")

COMPARISON: None
COMPARISON:  NONE available.
COMPARISON:  No relevant comparisons available.
 COMPARISON:  No prior chest radiographs available for review.
 COMPARISON: 
 None available.


In [57]:
# Filter the single-scan subjects down to those whose report says there is no
# prior study, and dump the selected reports into one text file for inspection.
single_scan_and_study_subjects_with_no_prior_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/mimic-cxr-reports/reports_single_scan_no_prior.txt")
single_scan_and_study_subjects_with_no_prior_list = []
comparison_regex = re.compile(r"COMPARISON:\s*N(O|o)(\s+.+)?", re.IGNORECASE)

# Open with 'w', not 'a': the subject list above is rebuilt from scratch on every
# run, so the dump must be too. Appending duplicated every report on each re-run.
with open(single_scan_and_study_subjects_with_no_prior_path, 'w') as output_f:
    for subject in subjects_with_single_scan:
        subject_path = mimic_cxr_path/"files"/ f"p{subject[:2]}/p{subject}"
        # Sorted so the chosen report does not depend on directory listing order.
        txt_files = sorted(subject_path.glob('*.txt'))
        content = txt_files[0].read_text()

        # search() scans the whole report, so the COMPARISON section may appear
        # anywhere. Reports that mention a lateral view are dropped as well.
        if comparison_regex.search(content) and "lateral view" not in content:
            single_scan_and_study_subjects_with_no_prior_list.append(subject)
            output_f.write(f"Subject {subject}\n")
            output_f.write(content)
            output_f.write("--------------------------------\n")


# with open("subjects_with_single_scan_no_prior.txt", 'w') as f:
#     for subject in single_scan_and_study_subjects_with_no_prior_list:
#         f.write(f"{subject}\n")

In [26]:
# Build two lookups keyed by subject id: the report text and the path of the
# subject's JPG image, for every subject in subjects_with_single_scan_no_prior.txt.
#
# This cell used to run inside open("subject_single_scan_and_no_prior_report", 'w')
# without ever writing to it, which only truncated that file to empty on each run.
# The handle is removed until the export is actually implemented.
with open("subjects_with_single_scan_no_prior.txt") as f_input:
    # Skip blank lines: an empty subject id would resolve to a non-existent folder.
    subjects_with_single_scan_no_prior = [line.strip() for line in f_input if line.strip()]

subject_to_image_path = {}
subject_to_report = {}

for subject in subjects_with_single_scan_no_prior:
    subject_path = mimic_cxr_path/"files"/ f"p{subject[:2]}/p{subject}"
    # Sorted so the pick does not depend on directory listing order.
    report_path = sorted(subject_path.glob('*.txt'))[0]
    report = report_path.read_text()

    subject_path_jpg = mimic_cxr_jpg_path/"files"/ f"p{subject[:2]}/p{subject}"

    # Only directories can be study folders; ignore any stray files next to them.
    study_folder = sorted(p for p in subject_path_jpg.glob('*') if p.is_dir())[0]
    jpg_file = sorted(study_folder.glob('*.jpg'))[0]

    subject_to_report[subject] = report
    subject_to_image_path[subject] = jpg_file

# TODO: export the (image path, report) pairs. Reports contain commas, quotes and
# newlines, so write them with the csv module rather than a hand-built f-string.
            
