In [1]:
from fastai.vision.all import *

In [2]:
mimic_path = Path("/home/code-base/scratch_space/extra/")

In [3]:
Path.ls_names = lambda o: o.ls().map(lambda o: o.name)

In [4]:
mimic_path.ls_names()

(#5) ['wget-log','image_filenames.pqt','cxr_reports_preprocessed.pqt','physionet.org','mimic-cxr-reports']

In [142]:
image_df = pd.read_parquet(mimic_path/'image_filenames.pqt')
cxr_reports_df = pd.read_parquet(mimic_path/'cxr_reports_preprocessed.pqt')

In [143]:
image_df.head()

Unnamed: 0,Folder,PatientID,StudyID,filename
0,p18,p18635245,s57591038,p18/p18635245/s57591038/485014fa-f44727de-24b13f26-d2e9a979-086e5167.jpg
1,p18,p18635245,s57591038,p18/p18635245/s57591038/dc407ff9-10bc70a2-a442c521-db54ef56-4b238127.jpg
2,p18,p18562317,s51459612,p18/p18562317/s51459612/381cf034-75ce05c8-c29df027-9cc72659-9a92ddc5.jpg
3,p18,p18562317,s50002358,p18/p18562317/s50002358/0a671e62-7ce9430f-85fcb721-c7c61ba1-6692fc01.jpg
4,p18,p18562317,s50002358,p18/p18562317/s50002358/e686b4cc-9332850f-b76e3ab2-ccb21de0-e7efed0b.jpg


In [144]:
cxr_reports_df.head(1)

Unnamed: 0,Folder,PatientID,StudyID,EXAMINATION,INDICATION,TECHNIQUE,COMPARISON,FINDINGS,IMPRESSION,WET READ,...,ARDS,SUPINE AP,SEMIERECT PORTABLE RADIOGRAPH OF THE CHEST,ERECT AP AND PA CHEST RADIOGRAPH,PORTABLE AP CHEST RADIOGRAPHS,SEMIERECT AP VIEW OF THE CHEST,OMR,SINGLE AP UPRIGHT PORTABLE CHEST RADIOGRAPH,TWO IMAGES,PA AND AP CHEST RADIOGRAPH
0,p18,p18635245,s57591038,CHEST (PA AND LAT),"___F with L shoulder injury, + distal clavicle ttp // eval for fx, pnx.",Chest PA and lateral,None.,"The heart size, mediastinal, and hilar contours are normal. The lungs are clear without pleural effusion, focal consolidation, or pneumothorax.",No acute cardiopulmonary process.,,...,,,,,,,,,,


#### Data Preprocessing 

**Images**

- Resize all jpeg files to 256 on the larger size

**Text**

- Tokenize all reports with english tokenizer from [CoreNLP](https://stanfordnlp.github.io/CoreNLP/)
- Keep only **Findings** and **Impression** section
- Remove image-text pair samples where number of tokens < 3
- Should have 217k image-text pairs

#### Pretraining ([MIMIC-CXR-II](https://physionet.org/content/mimic-cxr-jpg/2.0.0/))

- **Image Encoder:** ResNet-50
- **Text Encoder:** [ClinicalBERT](https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT). During contrastive training first 6 layers were frozen and only last 6 layers were trained.
    - https://allenai.github.io/scispacy/
- **Image Data Augmentations:** Random resized crop (0.6,1), hflip, affine tfms: rotation -+20 deg, horizontal-vertical translation 0.1, scaling (0.95,1.05), color jittering brightness and contrast from (0.6, 1.4) and gaussian blue (0.1,3), image size = 224
- **Text Data Augmentations:** Radomly sample a sentence from input document (probably after preprocessing).
- Hyperparameters; embed dim=512, tem=0.1 and loss weight=0.75 for pretraining selected by linear evaluation on RSNA image classification task with pretrained ResNet-50 weights.
- 5k validation during pretraining.
- bs = 32.

#### Downstream Tasks

- [RSNA](https://www.kaggle.com/c/rsna-pneumonia-detection-challenge/overview) - used as a binary classification task
- [Chexpert](https://stanfordmlgroup.github.io/competitions/chexpert/) - used expert labelled dataset as test set and a random 5,000 samples as validation 218414/5000/234 - only 234 samples in test
- [COVID-X](https://github.com/lindawangg/COVID-Net) - latest version
- [COVID-19 Radiography Database Kaggle](https://www.kaggle.com/tawsifurrahman/covid19-radiography-database)

#### Ablation Ideas

- With/Without MLP during pretraining
- Use full text during pretraining
- Learnable temperature
- Linear evaluation or knn, which one is better and faster?

### CONVIRT Paper Implementation

We will only `FINDINGS` and `IMPRESSION` sections from the radiology reports as per paper.

Also, following the model used https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT

*We remove all image-text pairings from the dataset where the text section is empty or has less than 3 tokens*

### Prepare Text

In [9]:
import scispacy
import spacy
from transformers import AutoTokenizer, AutoModel

In [11]:
# remove rows if either FINDINGS or IMPRESSION is missing
reports_df = reports_df[['Folder', 'PatientID', 'StudyID', 'FINDINGS', 'IMPRESSION']]
reports_df = reports_df[~(reports_df['FINDINGS'].isna() & reports_df['IMPRESSION'].isna())]
reports_df['TEXT'] = reports_df['FINDINGS'].fillna("")+reports_df['IMPRESSION'].fillna("")

In [13]:
reports_df.head(2)

Unnamed: 0,Folder,PatientID,StudyID,FINDINGS,IMPRESSION,TEXT
0,p18,p18635245,s57591038,"The heart size, mediastinal, and hilar contours are normal. The lungs are clear without pleural effusion, focal consolidation, or pneumothorax.",No acute cardiopulmonary process.,"The heart size, mediastinal, and hilar contours are normal. The lungs are clear without pleural effusion, focal consolidation, or pneumothorax.No acute cardiopulmonary process."
1,p18,p18562317,s50002358,,"In comparison with the study of ___, there again are low lung volumes with increased opacification at the bases consistent with pleural effusion and underlying compressive atelectasis. Mild elevation of pulmonary venous pressure is again seen. Left IJ catheter again extends to the mid portion of the SVC. In the absence of a lateral view, superimposed pneumonia would have to be considered in the appropriate clinical setting.","In comparison with the study of ___, there again are low lung volumes with increased opacification at the bases consistent with pleural effusion and underlying compressive atelectasis. Mild elevation of pulmonary venous pressure is again seen. Left IJ catheter again extends to the mid portion of the SVC. In the absence of a lateral view, superimposed pneumonia would have to be considered in the appropriate clinical setting."


In [14]:
reports_df.shape

(216221, 6)

In [14]:
def extract_sents(text): return list(text.sents)

In [16]:
nlp = spacy.load("en_core_sci_md")

In [17]:
texts = reports_df['TEXT']
docs = nlp.pipe(texts, batch_size=32, n_process=16)
sents = [list(doc.sents) for doc in progress_bar(docs, total=len(texts))]

In [18]:
len(sents)

216221

In [60]:
final_sents = []
for r,ss in zip(array(reports_df[['Folder', 'PatientID', 'StudyID']]), sents):
    for s in ss:
        final_sents.append(list(r)+[str(s)])

In [62]:
sent_df = pd.DataFrame(final_sents, columns=['Folder', 'PatientID', 'StudyID', 'sentence'])

In [21]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [70]:
def get_num_tokens(sent): return len(tokenizer.encode(sent))

In [100]:
# exclude CLS and SEP token
num_tokens = array(num_tokens)-2
sent_df['num_tokens'] = num_tokens
sent_df.to_parquet(mimic_path/'cxr_reports_sentences.pqt')
sent_df

### Prepare Images

In [109]:
images_path = Path("/home/code-base/scratch_space/extra/physionet.org/files/mimic-cxr-jpg/2.0.0/files/")

In [176]:
def resize_and_save(fns):
    "Multiple fns"
    # resize
    for fn in fns:
        img = PILImage.create(images_path/fn)
        targsz = resize_to(img, targ_sz=256, use_min=False)
        new_img = img.resize(targsz)
        # save
        main_dir = Path("/home/code-base/scratch_space/extra/mimic_256")
        folder, patient_dir, study_dir, fname =  fn.split("/")
        if not (main_dir/folder).exists():                       (main_dir/folder).mkdir()
        if not (main_dir/folder/patient_dir).exists():           (main_dir/folder/patient_dir).mkdir()
        if not (main_dir/folder/patient_dir/study_dir).exists(): (main_dir/folder/patient_dir/study_dir).mkdir()
        new_img.save(main_dir/folder/patient_dir/study_dir/fname)

In [177]:
# feed each folder to a single process so that folder creation doesn't require Lock
folders = image_df['Folder'].unique()
fns_chunks = [image_df.query(f"Folder == '{f}'")['filename'].values for f in folders]

In [None]:
_ = parallel(resize_and_save, fns_chunks, n_workers=4, progress=True)