In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

# Register tqdm's pandas integration; the `progress_apply` calls later in this notebook rely on it.
tqdm.pandas()

# Dataset root: the CSV tables are read from here and the final pickle is written here.
root_dir = Path("/data/healthy-ml/gobi1/data/MIMIC-CXR-JPG")
# Directory under which the per-study report text files are looked up.
report_path = Path("/data/healthy-ml/gobi1/data/mimic-cxr-reports/files/")

In [None]:
# Compressed CSV tables, each resolved against the dataset root.
split_path = os.path.join(root_dir, 'mimic-cxr-2.0.0-split.csv.gz')
label_path = os.path.join(root_dir, 'mimic-cxr-2.0.0-chexpert.csv.gz')
metadata = os.path.join(root_dir, 'mimic-cxr-2.0.0-metadata.csv.gz')

In [None]:
# Load the CheXpert label table and the split-assignment table.
# The metadata table is intentionally not read; nothing in this notebook uses it.
df_label = pd.read_csv(label_path)
df_split = pd.read_csv(split_path)

In [None]:
# Label columns of the CheXpert table that are turned into tags below.
labels = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Enlarged Cardiomediastinum',
    'Fracture',
    'Lung Lesion',
    'Lung Opacity',
    'No Finding',
    'Pleural Effusion',
    'Pleural Other',
    'Pneumonia',
    'Pneumothorax',
]

In [None]:
def get_label_set(x):
    """Turn one row of CheXpert labels into a list of descriptive tags.

    The MIMIC-CXR-JPG CheXpert table encodes each finding as ``1.0``
    (positive mention), ``0.0`` (negative mention), ``-1.0`` (uncertain
    mention) or blank (not mentioned). Earlier revisions of this function
    had the ``0`` and ``-1`` meanings swapped.

    Args:
        x: A row (pandas Series indexed by column name) that contains every
            name in the module-level ``labels`` list.

    Returns:
        list[str]: ``certain_yes_<label>`` tags first, then
        ``certain_no_<label>``, then ``uncertain_<label>``; within each group
        the tags follow the order of ``labels``. Labels whose value is
        missing (NaN) produce no tag.
    """
    values = x[labels]
    certain_yes = values[values == 1].index
    # 0 is an explicit negative mention; -1 marks an uncertain mention.
    certain_no = values[values == 0].index
    uncertain = values[values == -1].index
    return (
        [f'certain_yes_{i}' for i in certain_yes]
        + [f'certain_no_{i}' for i in certain_no]
        + [f'uncertain_{i}' for i in uncertain]
    )

In [None]:
# Build the list of textual tags for every row (progress shown via tqdm).
df_label['cat_labels_text'] = df_label.progress_apply(get_label_set, axis=1)

In [None]:
# Full tag vocabulary: every certain_yes_* tag, then every certain_no_*, then every uncertain_*.
all_labels = [
    prefix + finding
    for prefix in ('certain_yes_', 'certain_no_', 'uncertain_')
    for finding in labels
]

In [None]:
# Tag text -> integer id, where the id is the tag's position in `all_labels`.
mapping = dict(zip(all_labels, range(len(all_labels))))

In [None]:
# Translate each row's textual tags into their integer ids.
df_label['cat_labels'] = df_label['cat_labels_text'].progress_apply(
    lambda tags: [mapping[tag] for tag in tags]
)

In [None]:
# Report location: <report_path>/p<first two chars of subject_id>/p<subject_id>/s<study_id>.txt
df_label["reportfilename"] = df_label.progress_apply(
    lambda row: os.path.join(
        report_path,
        f'p{str(row.subject_id)[:2]}/p{row.subject_id}/s{row.study_id}.txt',
    ),
    axis=1,
)

In [None]:
# Read the full text of each report, stripped of surrounding whitespace.
# `Path.read_text` opens and closes the file itself, so no file handle is left
# dangling per report (the previous `.open('r').read()` never closed it).
df_label['sentence_long'] = df_label['reportfilename'].progress_apply(
    lambda report_file: Path(report_file).read_text().strip()
)

In [None]:
# Attach the split assignment; the inner join keeps only (subject_id, study_id) pairs present in both tables.
# NOTE: this rebinds `df_label`, so re-running the cell merges the split columns a second time.
df_label = pd.merge(
    df_label,
    df_split,
    how='inner',
    on=['subject_id', 'study_id'],
)

In [None]:
# Shorten the split name 'validate' to 'val'.
is_validation_row = df_label['split'] == 'validate'
df_label.loc[is_validation_row, 'split'] = 'val'

In [None]:
import section_parser as sp
# Per-study overrides consumed by `parse` below: `custom_section_names` is looked up by
# study id to obtain a section name, and `custom_indices` to obtain a pair of offsets
# used to slice the report text directly.
custom_section_names, custom_indices = sp.custom_mimic_cxr_rules()

def list_rindex(l, s):
    """Return the index of the *last* element of ``l`` that equals ``s``.

    Raises:
        ValueError: if ``s`` does not occur in ``l``.
    """
    # Searching the reversed sequence finds the last match first; its offset
    # from the end is then converted back to a forward index.
    distance_from_end = l[::-1].index(s)
    return len(l) - 1 - distance_from_end

def parse(study_id, text):
    """Reduce a full report to its findings/impression text.

    Resolution order:
      1. If ``study_id`` is in ``custom_indices``, the report is sliced with
         the stored pair of offsets and returned as the impression.
      2. Otherwise the report is split with ``sp.section_text``; if
         ``study_id`` is in ``custom_section_names``, the last section with
         that name is returned as the impression.
      3. Otherwise the last ``findings`` section (if any) is emitted, followed
         by the last ``impression`` section, or by ``last_paragraph`` when no
         impression section exists.

    Args:
        study_id: Key used for the custom-rule lookups.
        text: Full report text.

    Returns:
        str: The assembled text; empty when none of the sections is found.
    """
    # NOTE(review): the two custom-rule branches emit 'IMPRESSION: \n' (with a
    # space before the newline) while the generic branch emits 'IMPRESSION:\n'.
    # Kept as-is to preserve output; confirm whether the difference is intended.
    if study_id in custom_indices:
        span = custom_indices[study_id]
        return 'IMPRESSION: \n' + text[span[0]:span[1]] + '\n'

    sections, section_names, _ = sp.section_text(text)

    def last_section(name):
        # Text of the last section called `name`, without surrounding whitespace.
        return sections[list_rindex(section_names, name)].strip()

    if study_id in custom_section_names:
        return 'IMPRESSION: \n' + last_section(custom_section_names[study_id]) + '\n'

    pieces = []
    if 'findings' in section_names:
        pieces.append('FINDINGS:\n' + last_section('findings') + '\n\n')

    if 'impression' in section_names:
        pieces.append('IMPRESSION:\n' + last_section('impression') + '\n')
    elif 'last_paragraph' in section_names:
        # Fall back to the last paragraph only when no impression section exists.
        pieces.append('IMPRESSION:\n' + last_section('last_paragraph') + '\n')

    return ''.join(pieces)

In [None]:
# Condense every report with `parse`; the lookup key is the study id prefixed with 's'.
df_label['sentence'] = df_label.progress_apply(
    lambda row: parse(f"s{row['study_id']}", row['sentence_long']),
    axis=1,
)

In [None]:
# Persist the assembled table inside the dataset root.
output_file = root_dir / 'multimodal_mislabel_split.pkl'
df_label.to_pickle(output_file)