In [1]:
import os
import re
import copy

import pandas as pd
import numpy as np
from sklearn import metrics

# Load dataframes

In [2]:
# The 14 CheXpert observation labels; every frame below is restricted to
# these columns, in this order.
chexpert_categories = [
    "No Finding", "Enlarged Cardiomediastinum", "Cardiomegaly",
    "Lung Lesion", "Lung Opacity", "Edema", "Consolidation",
    "Pneumonia", "Atelectasis", "Pneumothorax", "Pleural Effusion",
    "Pleural Other", "Fracture", "Support Devices",
]

# Report text keyed by report id (the CSV has no header row).
val = pd.read_csv('mimic_cxr_validation_reports.csv', header=None)
val.columns = ['id', 'text']

# NegBio labels, indexed by report id.
df_nih = (
    pd.read_csv('mimic_cxr_validation_negbio_labeled.csv')
    .set_index('id')
)[chexpert_categories]

# CheXpert labels: this file carries the report text rather than an id, so
# the id is recovered by joining on the text, after which exact duplicate
# rows are dropped.
df_chexpert = (
    pd.read_csv('mimic_cxr_validation_chexpert_labeled.csv')
    .merge(val, how='inner', left_on='Reports', right_on='text')
    .drop_duplicates()
    .set_index('id')
    .rename(columns={'Airspace Opacity': 'Lung Opacity'})
)[chexpert_categories]

# Ground truth annotations; the first CSV column is used as the index.
gs = pd.read_csv('groundtruth.csv', header=0, index_col=0)
gs.index.name = 'id'
gs = gs.rename(columns={'Airspace Opacity': 'Lung Opacity'})[chexpert_categories]

print(gs.shape)

# Put all three frames on the same (sorted) ground-truth index so that
# row i refers to the same report everywhere. `.loc` raises if a labeler
# frame is missing one of the ground-truth ids.
gs = gs.sort_index()
df_chexpert = df_chexpert.loc[gs.index]
df_nih = df_nih.loc[gs.index]
gs.head()

(687, 14)


Unnamed: 0_level_0,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Lesion,Lung Opacity,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
s0,,,,,,,,1.0,,,,,,
s1,,1.0,1.0,,,0.0,,0.0,1.0,,,,,
s1000,,,,,1.0,,,,,-1.0,,,,1.0
s101,,,1.0,,,,,,,,,,,
s1017,,1.0,,,1.0,,,-1.0,-1.0,,,,,


# Evaluation function

Define helper functions to evaluate the labeler outputs on three tasks: (1) mention detection, (2) uncertainty detection, and (3) negation detection.

In [3]:
def evaluate_label(tar, pred, ignore_nan=False):
    """
    Return precision, recall, f1, and the number of positives for a single label.

    Parameters
    ----------
    tar : numpy.ndarray
        Ground-truth vector for one label (expected to be 0/1 valued).
    pred : numpy.ndarray
        Predicted vector for the same label, aligned with `tar`.
    ignore_nan : bool, optional
        If True, positions where either `tar` or `pred` is NaN are removed
        from both vectors before scoring.

    Returns
    -------
    dict
        'precision', 'recall', 'f1' : float, NaN when undefined.
        'positives' : int, the sum of `tar` after the optional NaN filter.

        All three scores are NaN when there are no positives in `tar`.
        'f1' is also NaN when precision and recall are both zero.
    """

    if ignore_nan:
        keep = ~(np.isnan(tar) | np.isnan(pred))
        pred = pred[keep]
        tar = tar[keep]

    results = {
        'precision': np.nan,
        'recall': np.nan,
        'f1': np.nan,
        'positives': int(tar.sum())
    }

    if results['positives'] == 0:
        # scores are undefined without any positive targets
        return results

    precision = metrics.precision_score(tar, pred)
    recall = metrics.recall_score(tar, pred)
    results['precision'] = precision
    results['recall'] = recall

    # F1 is the harmonic mean of precision and recall. Guard the division so
    # that a 0/0 neither raises ZeroDivisionError (plain Python floats) nor
    # emits a RuntimeWarning (NumPy scalars); f1 simply stays NaN.
    denominator = precision + recall
    if denominator > 0:
        results['f1'] = 2 * (precision * recall) / denominator

    return results
    

def get_scores(target, prediction, categories, ignore_nan=False):
    """
    Score every category column and collect the results in one table.

    Parameters
    ----------
    target : numpy.ndarray
        2-D array of ground-truth values; column i belongs to categories[i].
    prediction : numpy.ndarray
        2-D array of predicted values with the same column layout.
    categories : sequence of str
        Category names, one per column.
    ignore_nan : bool, optional
        Forwarded to `evaluate_label` for every column.

    Returns
    -------
    pandas.DataFrame
        One row per category, one column per key returned by
        `evaluate_label`.
    """
    # `ignore_nan` was previously accepted but never passed on, so the
    # flag had no effect; forward it explicitly.
    results = {
        c: evaluate_label(target[:, i], prediction[:, i], ignore_nan=ignore_nan)
        for i, c in enumerate(categories)
    }

    # convert to dataframe
    return pd.DataFrame.from_dict(results, orient='index')

def evaluate_labels(df_truth, df_label, method='mention'):
    """
    Binarize truth and predicted labels for one task and score each category.

    Label values are read as 1, 0, -1 or NaN. The comparisons below treat
    0 as "negated", -1 as "uncertain" and NaN as "not mentioned".

    Parameters
    ----------
    df_truth : pandas.DataFrame
        Ground-truth labels; its columns supply the category names.
    df_label : pandas.DataFrame
        Labels to evaluate, with the same row and column layout.
    method : {'mention', 'negation', 'uncertain'}
        'mention'   : 1 where the value is -1, 0 or 1; 0 where it is NaN.
        'negation'  : 1 where the value equals 0; 0 everywhere else.
        'uncertain' : 1 where the value equals -1; 0 everywhere else.

    Returns
    -------
    pandas.DataFrame
        The table produced by `get_scores` for the binarized arrays.

    Raises
    ------
    ValueError
        If `method` is not one of the three recognized names.
    """
    categories = list(df_truth.columns)

    # Binarize copies so the caller's frames are left untouched.
    preds = copy.copy(df_label.values)
    targets = copy.copy(df_truth.values)

    if method == 'mention':
        for labels in (preds, targets):
            # any mention becomes 1 ...
            labels[np.isin(labels, [-1, 0, 1])] = 1
            # ... and a missing value (no mention) becomes 0
            labels[np.isnan(labels)] = 0

        # nothing to ignore: every NaN was just replaced by 0
        ignore_nan = False
    elif method == 'negation':
        for labels in (preds, targets):
            # both masks are taken before either assignment
            not_negated = labels != 0
            negated = labels == 0
            labels[not_negated] = 0
            labels[negated] = 1

        # NOTE(review): `labels != 0` is also True for NaN, so NaN entries
        # have already been set to 0 here and nothing is left for
        # ignore_nan to drop. Confirm this is the intended treatment.
        ignore_nan = True
    elif method == 'uncertain':
        for labels in (preds, targets):
            # everything that is not uncertain becomes 0 first, so the
            # remaining -1 entries are then the only ones flipped to 1
            labels[labels != -1] = 0
            labels[labels == -1] = 1

        # NOTE(review): as in the negation branch, NaN satisfies `!= -1`
        # and is therefore set to 0 before scoring. Confirm intent.
        ignore_nan = True
    else:
        raise ValueError(f'Unrecognized method {method}')

    return get_scores(targets, preds, categories, ignore_nan=ignore_nan)

# Mention

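You'll note that the mention scores for NegBio and CheXpert are approximately identical, so below we show only the `No Finding` row for NegBio and the full table for CheXpert.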

* NegBio uses the CheXpert patterns for mention detection
* NegBio does not use the same post-processing filter for `No Finding` that CheXpert does

In [4]:

# NegBio: only the `No Finding` row is displayed.
df = evaluate_labels(gs, df_nih, method='mention')
print('NegBio No Finding:')
display(df.loc['No Finding'])

# CheXpert: full per-category table, float columns rounded to 3 decimals
# (the integer `positives` column is left as is).
print('CheXpert mention:')
df = evaluate_labels(gs, df_chexpert, method='mention').round(3)
df

NegBio No Finding:


precision     0.382353
recall        0.866667
f1            0.530612
positives    30.000000
Name: No Finding, dtype: float64

CheXpert mention:


Unnamed: 0,precision,recall,f1,positives
No Finding,0.403,0.833,0.543,30
Enlarged Cardiomediastinum,0.375,0.6,0.462,70
Cardiomegaly,0.814,0.911,0.859,235
Lung Lesion,0.862,0.848,0.855,66
Lung Opacity,0.715,0.907,0.8,194
Edema,0.799,1.0,0.888,227
Consolidation,0.886,0.979,0.93,95
Pneumonia,0.928,0.987,0.957,223
Atelectasis,0.893,1.0,0.944,218
Pneumothorax,0.945,0.996,0.97,226


# Uncertain

In [5]:
# Score uncertainty detection for each labeler and tag every metric column
# with the labeler it came from.
df = evaluate_labels(gs, df_nih, method='uncertain')
df.columns = pd.MultiIndex.from_product([['NegBio'], df.columns])

cx = evaluate_labels(gs, df_chexpert, method='uncertain')
cx.columns = pd.MultiIndex.from_product([['CheXpert'], cx.columns])

# Join on the category index, then put the metric on the outer column level
# and the labeler on the inner one.
df = df.merge(cx, how='inner', left_index=True, right_index=True).swaplevel(0, 1, axis=1)

# Group the columns by metric and round the float columns.
df = df[['precision', 'recall', 'f1', 'positives']].round(3)

# Drop the unnecessary final column.
df = df.drop(('positives', 'CheXpert'), axis=1)

# Output to LaTeX.
df.index.name = 'Uncertainty'
df.to_latex('uncertainty.tex')

df

  'precision', 'predicted', average, warn_for)


Unnamed: 0_level_0,precision,precision,recall,recall,f1,f1,positives
Unnamed: 0_level_1,NegBio,CheXpert,NegBio,CheXpert,NegBio,CheXpert,NegBio
Uncertainty,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
No Finding,,,,,,,0
Enlarged Cardiomediastinum,0.033,0.036,1.0,1.0,0.065,0.069,1
Cardiomegaly,0.156,0.0,0.5,0.0,0.237,,14
Lung Lesion,0.0,0.0,0.0,0.0,,,8
Lung Opacity,,,,,,,0
Edema,0.102,0.125,0.5,0.6,0.169,0.207,10
Consolidation,0.529,0.273,0.529,0.176,0.529,0.214,17
Pneumonia,0.432,0.407,0.613,0.565,0.507,0.473,62
Atelectasis,0.333,0.289,0.706,0.647,0.453,0.4,17
Pneumothorax,0.375,0.25,0.375,0.125,0.375,0.167,8


# Negation

In [6]:
# Score negation detection for each labeler and tag every metric column
# with the labeler it came from.
df = evaluate_labels(gs, df_nih, method='negation')
df.columns = pd.MultiIndex.from_product([['NegBio'], df.columns])

cx = evaluate_labels(gs, df_chexpert, method='negation')
cx.columns = pd.MultiIndex.from_product([['CheXpert'], cx.columns])

# Join on the category index, then put the metric on the outer column level
# and the labeler on the inner one.
df = df.merge(cx, how='inner', left_index=True, right_index=True).swaplevel(0, 1, axis=1)

# Group the columns by metric and round the float columns.
df = df[['precision', 'recall', 'f1', 'positives']].round(3)

# Drop the unnecessary final column (redundant).
df = df.drop(('positives', 'CheXpert'), axis=1)

# Output to LaTeX.
df.index.name = 'Negation'
df.to_latex('negation.tex')

df



Unnamed: 0_level_0,precision,precision,recall,recall,f1,f1,positives
Unnamed: 0_level_1,NegBio,CheXpert,NegBio,CheXpert,NegBio,CheXpert,NegBio
Negation,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
No Finding,,,,,,,0
Enlarged Cardiomediastinum,0.654,0.654,0.607,0.607,0.63,0.63,28
Cardiomegaly,0.855,0.855,0.72,0.72,0.781,0.781,82
Lung Lesion,0.5,0.5,0.5,0.5,0.5,0.5,4
Lung Opacity,0.429,0.533,0.391,0.348,0.409,0.421,23
Edema,0.713,0.714,0.847,0.824,0.774,0.765,85
Consolidation,0.917,0.917,0.957,0.957,0.936,0.936,23
Pneumonia,0.836,0.868,0.735,0.711,0.782,0.781,83
Atelectasis,0.333,0.3,0.75,0.75,0.462,0.429,4
Pneumothorax,0.919,0.926,0.955,0.911,0.937,0.918,179
