In [2]:

import os
import glob
import sys
import sklearn
import pip
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from PIL import Image

pd.options.plotting.backend = "plotly"

In [3]:
xray_data = pd.read_csv('nih-chest/Data_Entry_2017.csv')

def get_all_image_paths(base_dir):
    """
    Retrieves all image paths within the specified base directory.

    Args:
    - base_dir (str): The base directory containing the image folders.

    Returns:
    - dict: A dictionary mapping image filenames to their full paths.
    """
    # Pattern to match all PNG images in nested 'images' directories
    pattern = os.path.join(base_dir, 'images*', 'images', '*.png')
    
    # Use glob to find all matching image paths
    image_paths = glob.glob(pattern)
    
    # Create a dictionary mapping from basename to full path
    image_paths = {os.path.basename(x): x for x in image_paths}
    
    return image_paths

image_paths = get_all_image_paths('nih-chest')

assert len(image_paths) > 0, 'No images found!'
assert len(image_paths) == len(xray_data), 'Number of images does not match number of entries in the data file!'

print(f'Scans found: {len(image_paths)} | Total Headers: {len(xray_data)} \nexample path: {list(image_paths.items())[:1]}')

Scans found: 112120 | Total Headers: 112120 
example path: [('00000796_007.png', 'nih-chest/images_001/images/00000796_007.png')]


In [4]:
NUM_CLASSES = 15
ALL_LABELS = ['No Finding', 'Hernia', 'Emphysema', 'Nodule', 'Pneumonia', 'Consolidation', 'Cardiomegaly', 'Effusion', 'Mass', 'Pleural_Thickening', 'Atelectasis', 'Pneumothorax', 'Fibrosis', 'Infiltration', 'Edema']

xray_data = xray_data.iloc[:, :2]

for label in ALL_LABELS:
        xray_data[label] = xray_data['Finding Labels'].map(lambda finding: 1.0 if label in finding else 0)
        
#xray_data = xray_data[xray_data['Finding Labels'] != 'No Finding']
#xray_data = xray_data.drop(columns=['No Finding'])
xray_data.head()

Unnamed: 0,Image Index,Finding Labels,No Finding,Hernia,Emphysema,Nodule,Pneumonia,Consolidation,Cardiomegaly,Effusion,Mass,Pleural_Thickening,Atelectasis,Pneumothorax,Fibrosis,Infiltration,Edema
0,00000001_000.png,Cardiomegaly,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000001_001.png,Cardiomegaly|Emphysema,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00000001_002.png,Cardiomegaly|Effusion,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00000002_000.png,No Finding,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00000003_000.png,Hernia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# save the data
xray_data.to_pickle('nih-chest/processed_data.pkl')

In [6]:
label_counts = xray_data[xray_data.columns[2:]].sum().sort_values(ascending=False)
label_counts.plot.bar(text_auto='.2s')
label_counts

No Finding            60361.0
Infiltration          19894.0
Effusion              13317.0
Atelectasis           11559.0
Nodule                 6331.0
Mass                   5782.0
Pneumothorax           5302.0
Consolidation          4667.0
Pleural_Thickening     3385.0
Cardiomegaly           2776.0
Emphysema              2516.0
Edema                  2303.0
Fibrosis               1686.0
Pneumonia              1431.0
Hernia                  227.0
dtype: float64

In [7]:
test_list_path = 'nih-chest/test_list.txt'
train_list_path = 'nih-chest/train_val_list.txt'

with open(test_list_path, 'r') as file:
    test_list = set(file.read().splitlines())
with open(train_list_path, 'r') as file:
    train_list = set(file.read().splitlines())

train_df = xray_data[xray_data['Image Index'].isin(train_list)]
test_df = xray_data[xray_data['Image Index'].isin(test_list)]



In [8]:
train_df.to_pickle('nih-chest/train_df.pkl')
test_df.to_pickle('nih-chest/test_df.pkl')

In [9]:
# Get Images paths
def get_all_image_paths(base_dir):
    """
    Retrieves all image paths within the specified base directory.

    Args:
    - base_dir (str): The base directory containing the image folders.

    Returns:
    - dict: A dictionary mapping image filenames to their full paths.
    """
    # Pattern to match all PNG images in nested 'images' directories
    pattern = os.path.join(base_dir, 'images*', 'images', '*.png')
    
    # Use glob to find all matching image paths
    image_paths = glob.glob(pattern)
    
    # Create a dictionary mapping from basename to full path
    image_paths = {os.path.basename(x): x for x in image_paths}
    
    return image_paths

image_paths = get_all_image_paths('nih-chest')

image_paths

{'00000796_007.png': 'nih-chest/images_001/images/00000796_007.png',
 '00000181_020.png': 'nih-chest/images_001/images/00000181_020.png',
 '00000561_003.png': 'nih-chest/images_001/images/00000561_003.png',
 '00000390_001.png': 'nih-chest/images_001/images/00000390_001.png',
 '00000764_002.png': 'nih-chest/images_001/images/00000764_002.png',
 '00000037_000.png': 'nih-chest/images_001/images/00000037_000.png',
 '00000779_000.png': 'nih-chest/images_001/images/00000779_000.png',
 '00000133_000.png': 'nih-chest/images_001/images/00000133_000.png',
 '00000525_000.png': 'nih-chest/images_001/images/00000525_000.png',
 '00000612_000.png': 'nih-chest/images_001/images/00000612_000.png',
 '00000870_005.png': 'nih-chest/images_001/images/00000870_005.png',
 '00000003_005.png': 'nih-chest/images_001/images/00000003_005.png',
 '00000377_000.png': 'nih-chest/images_001/images/00000377_000.png',
 '00000212_000.png': 'nih-chest/images_001/images/00000212_000.png',
 '00000870_002.png': 'nih-chest/im

In [10]:
train_df['Image Index'] = train_df['Image Index'].map(lambda index: image_paths[index])
test_df['Image Index'] = test_df['Image Index'].map(lambda index: image_paths[index])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [11]:
train_df.head()

Unnamed: 0,Image Index,Finding Labels,No Finding,Hernia,Emphysema,Nodule,Pneumonia,Consolidation,Cardiomegaly,Effusion,Mass,Pleural_Thickening,Atelectasis,Pneumothorax,Fibrosis,Infiltration,Edema
0,nih-chest/images_001/images/00000001_000.png,Cardiomegaly,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,nih-chest/images_001/images/00000001_001.png,Cardiomegaly|Emphysema,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,nih-chest/images_001/images/00000001_002.png,Cardiomegaly|Effusion,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,nih-chest/images_001/images/00000002_000.png,No Finding,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,nih-chest/images_001/images/00000004_000.png,Mass|Nodule,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=24)

train_data = train_df['Image Index']
train_labels = train_df[train_df.columns[3:]].values

for train_index, val_index in msss.split(np.zeros(train_data.shape[0]), train_labels):
    X_train, X_val = train_data.iloc[train_index], train_data.iloc[val_index]
    y_train, y_val = train_labels[train_index], train_labels[val_index]
    
X_test = test_df['Image Index']
y_test = test_df[test_df.columns[2:]].values

(X_train.shape, y_train.shape), (X_val.shape, y_val.shape), (X_test.shape, y_test.shape)

(((73545,), (73545, 14)), ((12979,), (12979, 14)), ((25596,), (25596, 15)))

In [13]:
print(f'Train size: X: {len(X_train)}, y: {len(y_train)} | '
      f'Validation size: X: {len(X_val)}, y: {len(y_val)} | '
      f'Test size: X: {len(X_test)}, y: {len(y_test)} | '
      f'Total size: X: {len(X_train) + len(X_val) + len(X_test)}'
)


Train size: X: 73545, y: 73545 | Validation size: X: 12979, y: 12979 | Test size: X: 25596, y: 25596 | Total size: X: 112120


In [14]:
label_counts = test_df[test_df.columns[2:]].sum().sort_values(ascending=False)
label_counts.plot.bar(text_auto='.2s')
label_counts

No Finding            9861.0
Infiltration          6112.0
Effusion              4658.0
Atelectasis           3279.0
Pneumothorax          2665.0
Consolidation         1815.0
Mass                  1748.0
Nodule                1623.0
Pleural_Thickening    1143.0
Emphysema             1093.0
Cardiomegaly          1069.0
Edema                  925.0
Pneumonia              555.0
Fibrosis               435.0
Hernia                  86.0
dtype: float64

In [None]:
df = pd.read_pickle('nih-chest/processed_data.pkl')

In [None]:
label_cols = df.columns[2:]
label_cols

Index(['Hernia', 'Emphysema', 'Nodule', 'Pneumonia', 'Consolidation',
       'Cardiomegaly', 'Effusion', 'Mass', 'Pleural_Thickening', 'Atelectasis',
       'Pneumothorax', 'Fibrosis', 'Infiltration', 'Edema'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
labels = df[label_cols].values
labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Load data
train_df = pd.read_pickle('nih-chest/train_df.pkl')
test_df = pd.read_pickle('nih-chest/test_df.pkl')

from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=24)


In [None]:
print(len(['Hernia', 'Emphysema', 'Nodule', 'Pneumonia', 'Consolidation', 'Cardiomegaly', 'Effusion', 'Mass', 'Pleural_Thickening', 'Atelectasis', 'Pneumothorax', 'Fibrosis', 'Infiltration', 'Edema']))

14


In [None]:
train_df.head()

Unnamed: 0,Image Index,Finding Labels,Hernia,Emphysema,Nodule,Pneumonia,Consolidation,Cardiomegaly,Effusion,Mass,Pleural_Thickening,Atelectasis,Pneumothorax,Fibrosis,Infiltration,Edema
0,00000001_000.png,Cardiomegaly,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000001_001.png,Cardiomegaly|Emphysema,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00000001_002.png,Cardiomegaly|Effusion,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,00000004_000.png,Mass|Nodule,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
19,00000005_006.png,Infiltration,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
train_data = train_df['Image Index']
train_labels = train_df[train_df.columns[2:]].values, train_df[train_df.columns[2:]].values[:2]

In [None]:
train_data = train_df['Image Index']
train_labels = train_df[train_df.columns[2:]].values

X_train, X_val, y_train, y_val = train_test_split(train_data,
                                     train_labels,
                                     test_size=0.2, 
                                     random_state=14)


print(f'Train size: X: {len(X_train)}, y: {len(y_train)}')
print(f'Validation size: X: {len(X_val)}, y: {len(y_val)}')

y_train[:5], type(y_train)

Train size: X: 28819, y: 28819
Validation size: X: 7205, y: 7205


(array([[0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]]),
 numpy.ndarray)

In [None]:
train_data = train_df['Image Index']
train_labels = train_df[train_df.columns[2:]].values

for train_index, val_index in msss.split(np.zeros(train_data.shape[0]), train_labels):
    X_train, X_val = train_data.iloc[train_index], train_data.iloc[val_index]
    y_train, y_val = train_labels[train_index], train_labels[val_index]
    
X_test = test_df['Image Index']
y_test = test_df[test_df.columns[2:]].values

print(f'Train size: X: {len(X_train)}, y: {len(y_train)} | '
      f'Validation size: X: {len(X_val)}, y: {len(y_val)} | '
      f'Test size: X: {len(X_test)}, y: {len(y_test)} | '
      f'Total size: X: {len(X_train) + len(X_val) + len(X_test)}'
)

print(f'X_train: {X_train.shape}, y_train: {y_train.shape} | '
      f'X_val: {X_val.shape}, y_val: {y_val.shape} | '
      f'X_test: {X_test.shape}, y_test: {y_test.shape}'
)   

Train size: X: 30620, y: 30620 | Validation size: X: 5404, y: 5404 | Test size: X: 15735, y: 15735 | Total size: X: 51759
X_train: (30620,), y_train: (30620, 14) | X_val: (5404,), y_val: (5404, 14) | X_test: (15735,), y_test: (15735, 14)


In [None]:
y_train[:1]


array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
type(y_train)

numpy.ndarray

In [None]:
num_labels = y_train.shape[1]
class_counts = np.sum(y_train, axis=0)
total_counts = np.sum(class_counts)
class_weights = total_counts / (num_labels * class_counts)

# Normalizing weights so that the minimum weight is 1.0
class_weights = class_weights / np.min(class_weights)

class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

class_weight_dict


{0: 97.625,
 1: 9.68181818181818,
 2: 2.9272863568215888,
 3: 15.7248322147651,
 4: 4.832920792079207,
 5: 8.073742246726395,
 6: 1.5917119565217388,
 7: 3.416447944006999,
 8: 6.146379853095487,
 9: 1.6645353793691386,
 10: 5.227576974564926,
 11: 11.020696142991532,
 12: 1.0,
 13: 10.004269854824935}

In [None]:
from sklearn.utils import class_weight
weights = class_weight.compute_class_weight(class_weight="balanced",
                                            classes=np.unique(y_train),
                                            y=y_train)

weights

TypeError: unhashable type: 'numpy.ndarray'