In [5]:
import os
import re
import copy

import pandas as pd
import numpy as np
from sklearn import metrics

# Load dataframes

In [9]:
# The 14 label columns, in the order every table below is arranged.
chexpert_categories = [
    "No Finding",
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Lesion",
    "Lung Opacity",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
]

# Report text: the CSV has no header row, so name the two columns by hand.
val = pd.read_csv('mimic_cxr_validation_reports.csv', header=None)
val.columns = ['id', 'text']

# NegBio labels, indexed by report id and restricted to the label columns.
df_nih = pd.read_csv('mimic_cxr_validation_negbio_labeled.csv').set_index('id')
df_nih = df_nih[chexpert_categories]

# CheXpert labels: this file carries the report text ('Reports') rather than
# an id, so the id is recovered by joining on the text of the reports table.
df_chexpert = (
    pd.read_csv('mimic_cxr_validation_chexpert_labeled.csv')
    .merge(val, how='inner', left_on='Reports', right_on='text')
    .drop_duplicates()  # remove fully duplicated rows left after the join
    .set_index('id')
    .rename(columns={'Airspace Opacity': 'Lung Opacity'})
)
df_chexpert = df_chexpert[chexpert_categories]

# Ground truth annotations: first CSV column is the index, renamed to 'id'.
gs = (
    pd.read_csv('groundtruth.csv', header=0, index_col=0)
    .rename_axis('id')
    .rename(columns={'Airspace Opacity': 'Lung Opacity'})
)
gs = gs[chexpert_categories]

print(gs.shape)

# Put all three tables on the same rows in the same order; .loc raises a
# KeyError if a ground-truth id is missing from either labeler's output.
gs = gs.sort_index()
df_chexpert = df_chexpert.loc[gs.index]
df_nih = df_nih.loc[gs.index]
gs.head()

(687, 14)


Unnamed: 0_level_0,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Lesion,Lung Opacity,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
s0,,,,,,,,1.0,,,,,,
s1,,1.0,1.0,,,0.0,,0.0,1.0,,,,,
s1000,,,,,1.0,,,,,-1.0,,,,1.0
s101,,,1.0,,,,,,,,,,,
s1017,,1.0,,,1.0,,,-1.0,-1.0,,,,,


# Evaluation function

Define helper functions that score the labeler outputs against the ground truth on three tasks: (1) detecting mentions, (2) detecting uncertainty, and (3) detecting negation.

In [18]:
def evaluate_label(tar, pred, ignore_nan=False):
    """
    Return precision, recall, f1, and the number of positives for a single label.

    Parameters
    ----------
    tar : numpy array
        Binary targets for one label (1 = positive, 0 = negative).
    pred : numpy array
        Binary predictions for the same label, aligned with `tar`.
    ignore_nan : bool, default False
        If True, positions where either the target or the prediction is NaN
        are dropped before scoring.

    Returns
    -------
    dict
        Keys 'precision', 'recall', 'f1' and 'positives'. 'positives' is the
        integer sum of the (filtered) targets. The three scores are NaN when
        there are no positive targets; 'f1' is also NaN when precision and
        recall are both zero, because the harmonic mean is then undefined.
    """

    if ignore_nan:
        keep = ~(np.isnan(tar) | np.isnan(pred))
        pred = pred[keep]
        tar = tar[keep]

    results = {
        'precision': np.nan,
        'recall': np.nan,
        'f1': np.nan,
        'positives': int(tar.sum())
    }

    if results['positives'] == 0:
        # return NaN if no positive labels
        return results

    precision = metrics.precision_score(tar, pred)
    recall = metrics.recall_score(tar, pred)
    results['precision'] = precision
    results['recall'] = recall

    # Guard the harmonic mean: with precision == recall == 0 the unguarded
    # expression is 0/0, which warns (numpy floats) or raises
    # ZeroDivisionError (plain Python floats). Leave f1 as NaN in that case.
    denominator = precision + recall
    if denominator > 0:
        results['f1'] = 2 * (precision * recall) / denominator

    return results
    

def get_scores(target, prediction, categories, ignore_nan=False):
    """
    Score every label column and collect the results in one dataframe.

    Parameters
    ----------
    target : 2-D numpy array
        Binary targets; column i belongs to categories[i].
    prediction : 2-D numpy array
        Binary predictions, same layout as `target`.
    categories : sequence of str
        Label names, one per column; they become the index of the result.
    ignore_nan : bool, default False
        Forwarded to `evaluate_label`, which drops positions where the target
        or the prediction is NaN before scoring.

    Returns
    -------
    pandas.DataFrame
        One row per category, one column per key of the dict returned by
        `evaluate_label`.
    """
    # ignore_nan must be passed through; otherwise the caller's choice is
    # silently discarded and NaN entries reach the scorer.
    results = {
        c: evaluate_label(target[:, i], prediction[:, i], ignore_nan=ignore_nan)
        for i, c in enumerate(categories)
    }

    # convert to dataframe
    return pd.DataFrame.from_dict(results, orient='index')

def evaluate_labels(df_truth, df_label, method='mention'):
    """
    Recode two label tables into 0/1 matrices for one task and score them.

    Parameters
    ----------
    df_truth : pandas.DataFrame
        Reference labels; its columns give the category names.
    df_label : pandas.DataFrame
        Labels to evaluate, aligned row- and column-wise with `df_truth`.
    method : {'mention', 'negation', 'uncertain'}
        'mention'   : -1, 0 and 1 become 1; NaN becomes 0.
        'negation'  : 0 becomes 1; everything else (NaN included) becomes 0.
        'uncertain' : -1 becomes 1; everything else (NaN included) becomes 0.

    Returns
    -------
    Whatever `get_scores` returns for the recoded matrices.

    Raises
    ------
    ValueError
        If `method` is not one of the three names above.
    """
    categories = list(df_truth.columns)

    # Recode copies so the caller's dataframes are left untouched.
    preds = df_label.values.copy()
    targets = df_truth.values.copy()

    def _mention_to_binary(arr):
        # any mention is a 1, no mention (NaN) is a 0
        arr[np.isin(arr, [-1, 0, 1])] = 1
        arr[np.isnan(arr)] = 0

    def _negation_to_binary(arr):
        # take the mask before writing: the two assignments would otherwise
        # feed into each other
        negated = arr == 0
        arr[~negated] = 0
        arr[negated] = 1

    def _uncertain_to_binary(arr):
        uncertain = arr == -1
        arr[~uncertain] = 0
        arr[uncertain] = 1

    # NOTE(review): for 'negation' and 'uncertain' the recoding has already
    # turned NaN into 0 by the time scores are computed, so ignore_nan=True
    # finds no NaN left to drop. Confirm whether unmentioned labels were meant
    # to be excluded from these two evaluations.
    if method == 'mention':
        recode, ignore_nan = _mention_to_binary, False
    elif method == 'negation':
        recode, ignore_nan = _negation_to_binary, True
    elif method == 'uncertain':
        recode, ignore_nan = _uncertain_to_binary, True
    else:
        raise ValueError(f'Unrecognized method {method}')

    recode(preds)
    recode(targets)

    return get_scores(targets, preds, categories, ignore_nan=ignore_nan)

In [21]:
# NegBio vs. ground truth on the mention task; the table is also written to LaTeX.
df = evaluate_labels(df_truth=gs, df_label=df_nih, method='mention')
df.to_latex(buf='nih_mention.tex')
df

Unnamed: 0,precision,recall,f1,positives
No Finding,0.382353,0.866667,0.530612,30
Enlarged Cardiomediastinum,0.375,0.6,0.461538,70
Cardiomegaly,0.813688,0.910638,0.859438,235
Lung Lesion,0.861538,0.848485,0.854962,66
Lung Opacity,0.715447,0.907216,0.8,194
Edema,0.799296,1.0,0.888454,227
Consolidation,0.885714,0.978947,0.93,95
Pneumonia,0.92827,0.986547,0.956522,223
Atelectasis,0.893443,1.0,0.943723,218
Pneumothorax,0.945378,0.995575,0.969828,226


In [22]:
# CheXpert vs. ground truth on the mention task; the table is also written to LaTeX.
df = evaluate_labels(df_truth=gs, df_label=df_chexpert, method='mention')
df.to_latex(buf='chexpert_mention.tex')
df

Unnamed: 0,precision,recall,f1,positives
No Finding,0.403226,0.833333,0.543478,30
Enlarged Cardiomediastinum,0.375,0.6,0.461538,70
Cardiomegaly,0.813688,0.910638,0.859438,235
Lung Lesion,0.861538,0.848485,0.854962,66
Lung Opacity,0.715447,0.907216,0.8,194
Edema,0.799296,1.0,0.888454,227
Consolidation,0.885714,0.978947,0.93,95
Pneumonia,0.92827,0.986547,0.956522,223
Atelectasis,0.893443,1.0,0.943723,218
Pneumothorax,0.945378,0.995575,0.969828,226


In [24]:
# NegBio vs. ground truth on the uncertainty task; the table is also written to LaTeX.
df = evaluate_labels(df_truth=gs, df_label=df_nih, method='uncertain')
df.to_latex(buf='nih_uncertain.tex')
df



Unnamed: 0,precision,recall,f1,positives
No Finding,,,,0
Enlarged Cardiomediastinum,0.033333,1.0,0.064516,1
Cardiomegaly,0.155556,0.5,0.237288,14
Lung Lesion,0.0,0.0,,8
Lung Opacity,,,,0
Edema,0.102041,0.5,0.169492,10
Consolidation,0.529412,0.529412,0.529412,17
Pneumonia,0.431818,0.612903,0.506667,62
Atelectasis,0.333333,0.705882,0.45283,17
Pneumothorax,0.375,0.375,0.375,8


In [25]:
# CheXpert vs. ground truth on the uncertainty task; the table is also written to LaTeX.
df = evaluate_labels(df_truth=gs, df_label=df_chexpert, method='uncertain')
df.to_latex(buf='chexpert_uncertain.tex')
df



Unnamed: 0,precision,recall,f1,positives
No Finding,,,,0
Enlarged Cardiomediastinum,0.035714,1.0,0.068966,1
Cardiomegaly,0.0,0.0,,14
Lung Lesion,0.0,0.0,,8
Lung Opacity,,,,0
Edema,0.125,0.6,0.206897,10
Consolidation,0.272727,0.176471,0.214286,17
Pneumonia,0.406977,0.564516,0.472973,62
Atelectasis,0.289474,0.647059,0.4,17
Pneumothorax,0.25,0.125,0.166667,8


In [26]:
# NegBio vs. ground truth on the negation task; the table is also written to LaTeX.
df = evaluate_labels(df_truth=gs, df_label=df_nih, method='negation')
df.to_latex(buf='nih_negation.tex')
df



Unnamed: 0,precision,recall,f1,positives
No Finding,,,,0
Enlarged Cardiomediastinum,0.653846,0.607143,0.62963,28
Cardiomegaly,0.855072,0.719512,0.781457,82
Lung Lesion,0.5,0.5,0.5,4
Lung Opacity,0.428571,0.391304,0.409091,23
Edema,0.712871,0.847059,0.774194,85
Consolidation,0.916667,0.956522,0.93617,23
Pneumonia,0.835616,0.73494,0.782051,83
Atelectasis,0.333333,0.75,0.461538,4
Pneumothorax,0.919355,0.955307,0.936986,179


In [27]:
# CheXpert vs. ground truth on the negation task; the table is also written to LaTeX.
df = evaluate_labels(df_truth=gs, df_label=df_chexpert, method='negation')
df.to_latex(buf='chexpert_negation.tex')
df



Unnamed: 0,precision,recall,f1,positives
No Finding,,,,0
Enlarged Cardiomediastinum,0.653846,0.607143,0.62963,28
Cardiomegaly,0.855072,0.719512,0.781457,82
Lung Lesion,0.5,0.5,0.5,4
Lung Opacity,0.533333,0.347826,0.421053,23
Edema,0.714286,0.823529,0.765027,85
Consolidation,0.916667,0.956522,0.93617,23
Pneumonia,0.867647,0.710843,0.781457,83
Atelectasis,0.3,0.75,0.428571,4
Pneumothorax,0.926136,0.910615,0.91831,179
