In [1]:

import os
import glob
import sys
import sklearn
import pip
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from PIL import Image

pd.options.plotting.backend = "plotly"

In [2]:
xray_data = pd.read_csv('nih-chest/Data_Entry_2017.csv')

def get_all_image_paths(base_dir):
    """
    Retrieves all image paths within the specified base directory.

    Args:
    - base_dir (str): The base directory containing the image folders.

    Returns:
    - dict: A dictionary mapping image filenames to their full paths.
    """
    # Pattern to match all PNG images in nested 'images' directories
    pattern = os.path.join(base_dir, 'images*', 'images', '*.png')
    
    # Use glob to find all matching image paths
    image_paths = glob.glob(pattern)
    
    # Create a dictionary mapping from basename to full path
    image_paths = {os.path.basename(x): x for x in image_paths}
    
    return image_paths

image_paths = get_all_image_paths('nih-chest')

assert len(image_paths) > 0, 'No images found!'
assert len(image_paths) == len(xray_data), 'Number of images does not match number of entries in the data file!'

print(f'Scans found: {len(image_paths)} | Total Headers: {len(xray_data)} \nexample path: {list(image_paths.items())[:1]}')

Scans found: 112120 | Total Headers: 112120 
example path: [('00000796_007.png', 'nih-chest/images_001/images/00000796_007.png')]


In [3]:
NUM_CLASSES = 15
ALL_LABELS = ['No Finding', 'Hernia', 'Emphysema', 'Nodule', 'Pneumonia', 'Consolidation', 'Cardiomegaly', 'Effusion', 'Mass', 'Pleural_Thickening', 'Atelectasis', 'Pneumothorax', 'Fibrosis', 'Infiltration', 'Edema']

xray_data = xray_data.iloc[:, :2]

for label in ALL_LABELS:
        xray_data[label] = xray_data['Finding Labels'].map(lambda finding: 1.0 if label in finding else 0)
        
xray_data.head()

Unnamed: 0,Image Index,Finding Labels,No Finding,Hernia,Emphysema,Nodule,Pneumonia,Consolidation,Cardiomegaly,Effusion,Mass,Pleural_Thickening,Atelectasis,Pneumothorax,Fibrosis,Infiltration,Edema
0,00000001_000.png,Cardiomegaly,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000001_001.png,Cardiomegaly|Emphysema,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00000001_002.png,Cardiomegaly|Effusion,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00000002_000.png,No Finding,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00000003_000.png,Hernia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Delete 'weight' column if it exists
if 'weight' in xray_data.columns:
    xray_data = xray_data.drop('weight', axis=1)


# Assign weights based on conditions
conditions = [
    xray_data['Finding Labels'].str.contains('Hernia'),
    xray_data['Finding Labels'].str.contains('Pneumonia'),
    xray_data['Finding Labels'].str.contains('Fibrosis') | xray_data['Finding Labels'].str.contains('Edema'),
    xray_data['Finding Labels'].str.contains('Emphysema'),
]

choices = [20, 4, 3, 2]
default_weight = 1

xray_data.insert(2, 'weight', np.select(conditions, choices, default=default_weight))

old_xray_data = xray_data.copy()
xray_data = xray_data.sample(frac=1, weights='weight', replace=True)
xray_data.drop('weight', axis=1, inplace=True)
xray_data.head()


Unnamed: 0,Image Index,Finding Labels,No Finding,Hernia,Emphysema,Nodule,Pneumonia,Consolidation,Cardiomegaly,Effusion,Mass,Pleural_Thickening,Atelectasis,Pneumothorax,Fibrosis,Infiltration,Edema
82753,00020326_103.png,Infiltration,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15099,00003966_000.png,No Finding,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91443,00022815_064.png,No Finding,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
442,00000111_000.png,Hernia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54824,00013716_002.png,Hernia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
raw = old_xray_data[ALL_LABELS].sum()
sampled = xray_data[ALL_LABELS].sum()

compare = pd.DataFrame({
    'Raw': raw,
    'Sampled': sampled
}).sort_values('Raw', ascending=False)

compare.plot.bar(barmode='group', text_auto='.2s')

In [16]:
test_list_path = 'nih-chest/test_list.txt'
train_list_path = 'nih-chest/train_val_list.txt'

with open(test_list_path, 'r') as file:
    test_list = set(file.read().splitlines())
with open(train_list_path, 'r') as file:
    train_list = set(file.read().splitlines())
    
print(f'Num items in train list: {len(train_list)} | Num items in test list: {len(test_list)}')


train_df = xray_data[xray_data['Image Index'].isin(train_list)]
test_df = xray_data[xray_data['Image Index'].isin(test_list)]

Num items in train list: 86524 | Num items in test list: 25596


In [17]:
train_df.head()

Unnamed: 0,Image Index,Finding Labels,No Finding,Hernia,Emphysema,Nodule,Pneumonia,Consolidation,Cardiomegaly,Effusion,Mass,Pleural_Thickening,Atelectasis,Pneumothorax,Fibrosis,Infiltration,Edema
82753,00020326_103.png,Infiltration,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15099,00003966_000.png,No Finding,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
442,00000111_000.png,Hernia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54824,00013716_002.png,Hernia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77458,00019027_005.png,Hernia|Infiltration|Pneumothorax,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [14]:
train_df.to_pickle('nih-chest/train_df.pkl')
test_df.to_pickle('nih-chest/test_df.pkl')

In [None]:
# Get Images paths
def get_all_image_paths(base_dir):
    """
    Retrieves all image paths within the specified base directory.

    Args:
    - base_dir (str): The base directory containing the image folders.

    Returns:
    - dict: A dictionary mapping image filenames to their full paths.
    """
    # Pattern to match all PNG images in nested 'images' directories
    pattern = os.path.join(base_dir, 'images*', 'images', '*.png')
    
    # Use glob to find all matching image paths
    image_paths = glob.glob(pattern)
    
    # Create a dictionary mapping from basename to full path
    image_paths = {os.path.basename(x): x for x in image_paths}
    
    return image_paths

image_paths = get_all_image_paths('nih-chest')

image_paths

In [9]:
# train_df['Image Index'] = train_df['Image Index'].map(lambda index: image_paths[index])
# test_df['Image Index'] = test_df['Image Index'].map(lambda index: image_paths[index])

train_df_copy = train_df.copy()
test_df_copy = test_df.copy()

train_df_copy['Image Index'] = train_df_copy['Image Index'].map(lambda index: image_paths[index])
test_df_copy['Image Index'] = test_df_copy['Image Index'].map(lambda index: image_paths[index])

train_df.head()


KeyError: nan

In [91]:
train_df.head()

Unnamed: 0,Image Index,Finding Labels,Hernia,Emphysema,Nodule,Pneumonia,Consolidation,Cardiomegaly,Effusion,Mass,Pleural_Thickening,Atelectasis,Pneumothorax,Fibrosis,Infiltration,Edema
0,nih-chest/images_001/images/00000001_000.png,Cardiomegaly,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,nih-chest/images_001/images/00000001_001.png,Cardiomegaly|Emphysema,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,nih-chest/images_001/images/00000001_002.png,Cardiomegaly|Effusion,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,nih-chest/images_001/images/00000004_000.png,Mass|Nodule,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
19,nih-chest/images_001/images/00000005_006.png,Infiltration,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [67]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=24)

train_data = train_df['Image Index']
train_labels = train_df[train_df.columns[2:]].values

for train_index, val_index in msss.split(np.zeros(train_data.shape[0]), train_labels):
    X_train, X_val = train_data.iloc[train_index], train_data.iloc[val_index]
    y_train, y_val = train_labels[train_index], train_labels[val_index]
    
X_test = test_df['Image Index']
y_test = test_df[test_df.columns[2:]].values

(X_train.shape, y_train.shape), (X_val.shape, y_val.shape), (X_test.shape, y_test.shape)

(((30620,), (30620, 14)), ((5404,), (5404, 14)), ((15735,), (15735, 14)))

In [68]:
print(f'Train size: X: {len(X_train)}, y: {len(y_train)} | '
      f'Validation size: X: {len(X_val)}, y: {len(y_val)} | '
      f'Test size: X: {len(X_test)}, y: {len(y_test)} | '
      f'Total size: X: {len(X_train) + len(X_val) + len(X_test)}'
)


Train size: X: 30620, y: 30620 | Validation size: X: 5404, y: 5404 | Test size: X: 15735, y: 15735 | Total size: X: 51759


In [69]:
label_counts = test_df[test_df.columns[2:]].sum().sort_values(ascending=False)
label_counts.plot.bar(text_auto='.2s')
label_counts

Infiltration          6112.0
Effusion              4658.0
Atelectasis           3279.0
Pneumothorax          2665.0
Consolidation         1815.0
Mass                  1748.0
Nodule                1623.0
Pleural_Thickening    1143.0
Emphysema             1093.0
Cardiomegaly          1069.0
Edema                  925.0
Pneumonia              555.0
Fibrosis               435.0
Hernia                  86.0
dtype: float64

In [70]:
df = pd.read_pickle('nih-chest/processed_data.pkl')

In [71]:
label_cols = df.columns[2:]
label_cols

Index(['Hernia', 'Emphysema', 'Nodule', 'Pneumonia', 'Consolidation',
       'Cardiomegaly', 'Effusion', 'Mass', 'Pleural_Thickening', 'Atelectasis',
       'Pneumothorax', 'Fibrosis', 'Infiltration', 'Edema'],
      dtype='object')

In [72]:
from sklearn.model_selection import train_test_split

In [73]:
labels = df[label_cols].values
labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [74]:
# Load data
train_df = pd.read_pickle('nih-chest/train_df.pkl')
test_df = pd.read_pickle('nih-chest/test_df.pkl')

from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=24)


In [87]:
print(len(['Hernia', 'Emphysema', 'Nodule', 'Pneumonia', 'Consolidation', 'Cardiomegaly', 'Effusion', 'Mass', 'Pleural_Thickening', 'Atelectasis', 'Pneumothorax', 'Fibrosis', 'Infiltration', 'Edema']))

14


In [75]:
train_df.head()

Unnamed: 0,Image Index,Finding Labels,Hernia,Emphysema,Nodule,Pneumonia,Consolidation,Cardiomegaly,Effusion,Mass,Pleural_Thickening,Atelectasis,Pneumothorax,Fibrosis,Infiltration,Edema
0,00000001_000.png,Cardiomegaly,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000001_001.png,Cardiomegaly|Emphysema,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00000001_002.png,Cardiomegaly|Effusion,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,00000004_000.png,Mass|Nodule,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
19,00000005_006.png,Infiltration,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [93]:
train_data = train_df['Image Index']
train_labels = train_df[train_df.columns[2:]].valuestrain_df['Image Index'][:2], train_df[train_df.columns[2:]].values[:2]

(0    nih-chest/images_001/images/00000001_000.png
 1    nih-chest/images_001/images/00000001_001.png
 Name: Image Index, dtype: object,
 array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]]))

In [95]:
train_data = train_df['Image Index']
train_labels = train_df[train_df.columns[2:]].values

X_train, X_val, y_train, y_val = train_test_split(train_data,
                                     train_labels,
                                     test_size=0.2, 
                                     random_state=14)


print(f'Train size: X: {len(X_train)}, y: {len(y_train)}')
print(f'Validation size: X: {len(X_val)}, y: {len(y_val)}')

y_train[:5], type(y_train)

Train size: X: 28819, y: 28819
Validation size: X: 7205, y: 7205


(array([[0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]]),
 numpy.ndarray)

In [77]:
train_data = train_df['Image Index']
train_labels = train_df[train_df.columns[2:]].values

for train_index, val_index in msss.split(np.zeros(train_data.shape[0]), train_labels):
    X_train, X_val = train_data.iloc[train_index], train_data.iloc[val_index]
    y_train, y_val = train_labels[train_index], train_labels[val_index]
    
X_test = test_df['Image Index']
y_test = test_df[test_df.columns[2:]].values

print(f'Train size: X: {len(X_train)}, y: {len(y_train)} | '
      f'Validation size: X: {len(X_val)}, y: {len(y_val)} | '
      f'Test size: X: {len(X_test)}, y: {len(y_test)} | '
      f'Total size: X: {len(X_train) + len(X_val) + len(X_test)}'
)

print(f'X_train: {X_train.shape}, y_train: {y_train.shape} | '
      f'X_val: {X_val.shape}, y_val: {y_val.shape} | '
      f'X_test: {X_test.shape}, y_test: {y_test.shape}'
)   

Train size: X: 30620, y: 30620 | Validation size: X: 5404, y: 5404 | Test size: X: 15735, y: 15735 | Total size: X: 51759
X_train: (30620,), y_train: (30620, 14) | X_val: (5404,), y_val: (5404, 14) | X_test: (15735,), y_test: (15735, 14)


In [86]:
y_train[:1]


array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]])