In [1]:
from pathlib import Path
from collections import defaultdict
import random
import re

#### 1) get all subjects with a single study AND a single scan in that single study

In [2]:
# Dataset roots: later cells glob *.txt reports under the first and *.jpg files under the second.
mimic_cxr_path = Path('/vol/biodata/data/chest_xray/mimic-cxr')
mimic_cxr_jpg_path = Path('/vol/biodata/data/chest_xray/mimic-cxr-jpg')
# CSV with one row per study (three comma-separated fields, header on the first line).
mimic_cxr_study_list =  Path("/vol/biodata/data/chest_xray/mimic-cxr/cxr-study-list.csv")


# Count how many rows (studies) the list contains for every subject.
subject_id_to_study_freq = defaultdict(int)
with open(mimic_cxr_study_list) as study_list_file:
    next(study_list_file)  # the first line is the header, not data
    for row in study_list_file:
        # Unpacking into exactly three names also rejects malformed rows.
        subject_id, study_id, report_location = row.strip().split(',')
        subject_id_to_study_freq[subject_id] += 1

# Keep only the subjects that appear in exactly one row.
single_study_subjects = [
    subject_id
    for subject_id, freq in subject_id_to_study_freq.items()
    if freq == 1
]

print(len(single_study_subjects))

### this is slow due to DISK I/O
# for each subject check how many images are in its folder
# subjects_with_single_scan = []
# for subject_id in single_study_subjects:
#     subject_path = mimic_cxr_path/"files"/ f"p{subject_id[:2]}/p{subject_id}"

#     sub = list(subject_path.glob('*'))
#     for s in sub:
#         if s.is_dir():
#             # print(list(s.glob('*')))
#             if len(list(s.glob('*'))) == 1:
#                 subjects_with_single_scan.append(subject_id)

32695


#### 2) inspect reports to see how to filter down from 5500+ reports

In [3]:
# Text file that is read as one subject id per line.
subjects_with_single_scan_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/mimic-cxr-reports/subjects_with_single_scan.txt")

# Start from an empty list so the name exists even if the file cannot be opened.
subjects_with_single_scan = []
with open(subjects_with_single_scan_path) as subject_file:
    # Iterating the handle yields the same lines as readlines().
    subjects_with_single_scan = [entry.strip() for entry in subject_file]

# Target file of the commented-out report dump that follows this cell's live code.
reports_subjects_with_single_scan_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/mimic-cxr-reports/reports_subjects_with_single_scan.txt")
# with open(reports_subjects_with_single_scan_path, 'w') as output_f:
#     for subject in subjects_with_single_scan:
#         # print(f"Subject {subject}")
#         output_f.write(f"Subject {subject}\n")
#         subject_path = mimic_cxr_path/"files"/ f"p{subject[:2]}/p{subject}"
#         txt_files = list(subject_path.glob('*.txt'))

#         with open(txt_files[0]) as input_f:
#             for line in input_f:
#                 output_f.write(line)

#         output_f.write("--------------------------------\n")



FileNotFoundError: [Errno 2] No such file or directory: '/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/mimic-cxr-reports/subjects_with_single_scan.txt'

#### 3) Select only the reports with no prior based on REGEX: `COMPARISON:\s*N(O|o)(\s+.+)?`
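i.e. the literal text "COMPARISON:", followed by any amount of whitespace (including line breaks), then "No" (matched case-insensitively, so "None" and "NONE" also match), optionally followed by whitespace and the rest of that line.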

In [42]:
# Sample COMPARISON sections that all state that no prior study is available.
test_strings = ["COMPARISON: None", "COMPARISON:  NONE available.", "COMPARISON:  No relevant comparisons available."," COMPARISON:  No prior chest radiographs available for review."," COMPARISON: \n None available."] 


# Match "COMPARISON:" followed by the whole word "No", "None" or "Not" (any case).
# A bare prefix pattern such as N(O|o) also accepts every word that merely starts
# with "No" ("Non-contrast ...", "November ...", "Normal ..."), which may well
# introduce a description of a prior study; the trailing \b rules those out.
# \s* also spans line breaks, so a value on the line after the heading still matches.
comparison_regex = re.compile(r"COMPARISON:\s*(?:none|no|not)\b", re.IGNORECASE)

for test_string in test_strings:
    match = comparison_regex.search(test_string)
    if match:
        print(test_string)
    else:
        print("No match")

COMPARISON: None
COMPARISON:  NONE available.
COMPARISON:  No relevant comparisons available.
 COMPARISON:  No prior chest radiographs available for review.
 COMPARISON: 
 None available.


In [57]:
# Keep the subjects from `subjects_with_single_scan` whose report says that no prior
# study was available for comparison and that never mentions a "lateral view", and
# dump those reports into a single text file.
single_scan_and_study_subjects_with_no_prior_path = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/mimic-cxr-reports/reports_single_scan_no_prior.txt")
single_scan_and_study_subjects_with_no_prior_list = []
# Whole-word match on "No" / "None" / "Not" (any case). A bare N(O|o) prefix would
# also accept "Non-contrast ...", "November ..." or "Normal ..." after the heading.
comparison_regex = re.compile(r"COMPARISON:\s*(?:none|no|not)\b", re.IGNORECASE)

# 'w' rather than 'a': the list above is rebuilt from scratch on every run, so the
# file has to be as well, otherwise re-running the cell duplicates every report.
with open(single_scan_and_study_subjects_with_no_prior_path, 'w') as output_f:
    for subject in subjects_with_single_scan:
        subject_path = mimic_cxr_path/"files"/ f"p{subject[:2]}/p{subject}"
        # Sorted so that the chosen report is deterministic if several exist.
        txt_files = sorted(subject_path.glob('*.txt'))
        if not txt_files:
            # Report the gap instead of failing with an IndexError on txt_files[0].
            print(f"No report found for subject {subject}, skipping")
            continue

        content = txt_files[0].read_text()
        # search() scans the whole report; the heading is not at the start of the text.
        if comparison_regex.search(content) and "lateral view" not in content:
            single_scan_and_study_subjects_with_no_prior_list.append(subject)
            output_f.write(f"Subject {subject}\n")
            output_f.write(content)
            output_f.write("--------------------------------\n")


# with open("subjects_with_single_scan_no_prior.txt", 'w') as f:
#     for subject in single_scan_and_study_subjects_with_no_prior_list:
#         f.write(f"{subject}\n")

In [21]:
# NOTE: the output file is opened for writing (and therefore created/truncated),
# but the only write call in this cell is commented out, so it stays empty.
with open("subject_single_scan_and_no_prior_report", 'w') as f_output:

    # Load the subject ids, one per line.
    with open("subjects_with_single_scan_no_prior.txt") as f_input:
        subjects_with_single_scan_no_prior = [entry.strip() for entry in f_input]

    # subject id -> first .jpg found / subject id -> full report text
    subject_to_image_path, subject_to_report = {}, {}

    for subject in subjects_with_single_scan_no_prior:
        # Report: first *.txt file directly inside the subject folder.
        subject_path = mimic_cxr_path/"files"/ f"p{subject[:2]}/p{subject}"
        report_candidates = list(subject_path.glob('*.txt'))
        report_path = report_candidates[0]
        report = report_path.read_text()

        # Image: first *.jpg inside the first entry of the subject's JPG folder.
        subject_path_jpg = mimic_cxr_jpg_path/"files"/ f"p{subject[:2]}/p{subject}"
        jpg_folder_entries = list(subject_path_jpg.glob('*'))
        study_folder = jpg_folder_entries[0]
        jpg_candidates = list(study_folder.glob('*.jpg'))
        jpg_file = jpg_candidates[0]

        subject_to_report[subject] = report
        subject_to_image_path[subject] = jpg_file
        # f_output.write(f'{jpg_file}, "{report}"')
            


KeyboardInterrupt: 

#### Take the intersection of the test/validate subjects (from the MIMIC-CXR split file) and subjects_with_single_scan_no_prior

In [24]:
# Split file: one row per image with the columns dicom_id, study_id, subject_id, split.
mimic_cxr_split_path = Path('/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/evaluation_datasets/mimic-cxr-2.0.0-split.csv')
# subject id -> set of study ids belonging to the held-out splits
mimic_cxr_test_subjects = defaultdict(set)

with open(mimic_cxr_split_path) as split_file:
    next(split_file)  # header row
    for row in split_file:
        dicom_id, study_id, subject_id, split = row.strip().split(',')
        # Both held-out splits are collected; narrow the set to use only one of them.
        if split in {"test", "validate"}:
            mimic_cxr_test_subjects[subject_id].add(study_id)

In [5]:
# Load the candidate subject ids (one per line) as a set.
with open("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/evaluation_datasets/MIMIC-CXR/subjects_with_single_scan_no_prior.txt") as id_file:
    subjects_with_single_scan_no_prior = {entry.strip() for entry in id_file}

# Candidates that are also keys of mimic_cxr_test_subjects: show the set, then each id.
shared_subjects = subjects_with_single_scan_no_prior.intersection(mimic_cxr_test_subjects.keys())
print(shared_subjects)
for subject in shared_subjects:
    print(subject)


{'13914812'}
13914812


In [26]:
# Print the id, the report file list and the first report of every candidate
# subject that is also a key of mimic_cxr_test_subjects.
for subject in subjects_with_single_scan_no_prior.intersection(mimic_cxr_test_subjects.keys()):
    subject_path = mimic_cxr_path/"files"/ f"p{subject[:2]}/p{subject}"
    print(subject)

    # Glob once and reuse the listing for both the printout and the lookup.
    report_candidates = list(subject_path.glob('*.txt'))
    print(report_candidates)

    report_path = report_candidates[0]
    report = report_path.read_text()
    print(report)
    print("=====================================")
    

18007398
[PosixPath('/vol/biodata/data/chest_xray/mimic-cxr/files/p18/p18007398/s50340409.txt')]
                                 FINAL REPORT
 EXAMINATION:  CHEST (PORTABLE AP)
 
 INDICATION:  ___ year old woman with hemorrhagic shock  // CT placement
 
 COMPARISON:  No comparison
 
 IMPRESSION: 
 
 The patient is intubated, the tip of the endotracheal tube projects 4 cm above
 the carina.  To right and 1 left-sided chest tube are in place.  Nasogastric
 tube shows a normal course.  Left retrocardiac atelectasis.  Minimal right
 pleural effusion.  No visible pneumothorax.  Non displaced rib fractures
 described on the CT examination from ___, 19:33, are not visualized
 on the radiograph.

14473057
[PosixPath('/vol/biodata/data/chest_xray/mimic-cxr/files/p14/p14473057/s56003480.txt')]
                                 FINAL REPORT
 EXAMINATION:  CHEST (PORTABLE AP)
 
 INDICATION:  History: ___M with narcan status post suspected clonidine use  //
 ? edema
 
 TECHNIQUE:  Supine AP view of

##### generate test reports and manually pick the best ones based on reports:
- with no longitudinal language 
- only frontal views?

In [25]:
# A report is dropped if it contains any of these substrings (the check is
# case-sensitive, which is why "lateral" is listed in two spellings).
words_to_exclude = {"lateral","Lateral","compared", "unchanged", "stable", "improved", "remains","increased",
                     "continued", "decreased" ,"previous", "persists","prior", "study","interval","tube",
                     "two","multiple"}
reports_test_split_pruned = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/evaluation_datasets/MIMIC-CXR/reports_test_split_pruned.txt")
with open(reports_test_split_pruned, 'w') as output_f:
    for index,(subject,studies) in enumerate(mimic_cxr_test_subjects.items()):
        subject_path = mimic_cxr_path/"files"/ f"p{subject[:2]}/p{subject}"
        txt_files = list(subject_path.glob('*.txt'))
        for txt_file in txt_files:
            # Study ids of this subject whose id occurs in the report's file name.
            matching_studies = [candidate for candidate in studies if candidate in txt_file.name]
            for study in matching_studies:
                report = txt_file.read_text()
                # Skip the report as soon as one excluded substring is present.
                if not all(word not in report for word in words_to_exclude):
                    continue
                output_f.write(f"p{subject}/s{study}\n")
                output_f.write(report)
                output_f.write("--------------------------------\n")


Collect reports for Eval

In [25]:
import os
mimic_cxr_path = Path('/vol/biodata/data/chest_xray/mimic-cxr')
subjects_for_eval = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/cxr-agent/evaluation_datasets/MIMIC-CXR/subjects_for_eval.txt")


def _parse_eval_entry(line):
    """Split one line of the eval list into ``(subject, study)``.

    A line is either ``p<subject>/s<study>`` or a bare ``<subject>``; for a bare
    subject (or a ``p...`` entry without a study part) ``study`` is ``None``.
    """
    entry = line.strip()
    if not entry.startswith('p'):
        return entry, None
    subject_part, _, study_part = entry.partition('/')
    # [1:] drops the leading 'p' / 's' marker.
    return subject_part[1:], (study_part[1:] or None)


def _select_study_folder(subject_path, study):
    """Return the study folder to inspect inside ``subject_path``.

    With a single study folder that folder is returned regardless of ``study``;
    with several, the first (in sorted order) whose name contains ``study``.

    Raises:
        FileNotFoundError: no study folder exists, or none matches ``study``.
        ValueError: several study folders exist but no study id was given.
    """
    # Keep directories only. A glob such as '*[!.txt]' is a character class
    # ("last character is not '.', 't' or 'x'"), not a "does not end in .txt" test,
    # so it would also drop or keep unrelated entries depending on their last letter.
    studies = sorted(entry for entry in subject_path.iterdir() if entry.is_dir())
    if not studies:
        raise FileNotFoundError(f"No study folders found in {subject_path}")
    if len(studies) == 1:
        return studies[0]
    if study is None:
        raise ValueError(
            f"{subject_path} contains {len(studies)} study folders; "
            "list the entry as p<subject>/s<study> to pick one"
        )
    matching = [folder for folder in studies if study in folder.name]
    if not matching:
        raise FileNotFoundError(f"No study folder matching {study} in {subject_path}")
    return matching[0]


with open(subjects_for_eval, 'r') as input_f:
    for line in input_f:
        if not line.strip():
            continue  # tolerate blank lines in the list

        subject, study = _parse_eval_entry(line)
        subject_path = mimic_cxr_path/"files"/f"p{subject[:2]}/p{subject}/"
        study_folder = _select_study_folder(subject_path, study)

        # `line` still carries its newline, so a blank line follows it in the output.
        print(line)
        # Counts every entry of the study folder, whatever its type.
        print(f"Subject {subject} has {len(list(study_folder.glob('*')))} images")
            
        
        # if study is not None:
        #     txt_files = list(subject_path.glob(f'*{study}*.txt'))
        #     print(txt_files[0].read_text())
        # else:
        #     txt_files = list(subject_path.glob('*.txt'))
        #     print(txt_files[0].read_text())


18007398

Subject 18007398 has 1 images
14473057

Subject 14473057 has 1 images
18437840

Subject 18437840 has 1 images
12937037

Subject 12937037 has 1 images
18552428

Subject 18552428 has 1 images
12386201

Subject 12386201 has 1 images
17561996

Subject 17561996 has 1 images
16306599

Subject 16306599 has 1 images
11644926

Subject 11644926 has 1 images
15452636

Subject 15452636 has 1 images
13947388

Subject 13947388 has 1 images
18113771

Subject 18113771 has 1 images
12503812

Subject 12503812 has 1 images
19680874

Subject 19680874 has 1 images
16865871

Subject 16865871 has 1 images
11197890

Subject 11197890 has 1 images
11500818

Subject 11500818 has 1 images
18038196

Subject 18038196 has 1 images
19079797

Subject 19079797 has 1 images
16957065

Subject 16957065 has 1 images
12085050

Subject 12085050 has 1 images
12536467

Subject 12536467 has 1 images
13914812

Subject 13914812 has 1 images
p10046166/s51738740

Subject 10046166 has 3 images
p10183775/s52835225

Subject 