In [2]:
# Chest X-ray Classification and Localization using DenseNet

# This notebook implements a transfer-learning approach using DenseNet for chest X-ray classification and localization on the NIH Chest X-ray dataset.

In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torch.optim as optim
from torchvision import models
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm

## Data Loading and Preprocessing

In [11]:
# Define paths
BASE_PATH = 'DL_for_HIN_Chest_X_Ray'
IMAGES_PATH = os.path.join(BASE_PATH, 'archive', 'images', 'images')
CSV_PATH = os.path.join(BASE_PATH, 'Data_Entry_2017_filtered_2.csv')
BBOX_PATH = os.path.join(BASE_PATH, 'archive', 'BBox_List_2017.csv')

# Fixed seed for the shuffle below. The shuffle decides the row order and
# which of several rows sharing an 'Image Index' is the "first" one kept by
# drop_duplicates, so without a seed every run produces a different frame.
SHUFFLE_SEED = 42

# Metadata columns dropped from the data-entry CSV right after loading.
UNUSED_COLUMNS = [
    'Follow-up #', 'Patient ID', 'Patient Age', 'Patient Gender',
    'View Position', 'OriginalImage[Width', 'Height]',
    'OriginalImagePixelSpacing[x', 'y]',
]

# Every label listed here is collapsed into the single class 'Finding'.
ABNORMAL_LABELS = [
    'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltrate', 'Infiltration',
    'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax',
]


def first_n_with_label(frame, label, n):
    """Return the first `n` rows of `frame` whose 'Finding Label' equals `label` exactly."""
    return frame[frame['Finding Label'] == label].head(n)


# Load the filtered dataset (chained instead of inplace=True; same result).
df = (
    pd.read_csv(CSV_PATH)
    .rename(columns={'Finding Labels': 'Finding Label'})
    .drop(columns=UNUSED_COLUMNS)
)

# Per-label subsets: exact-match labels only, first N rows in file order.
no_finding_images = first_n_with_label(df, 'No Finding', 10000)
effusion_images = first_n_with_label(df, 'Effusion', 3000)
infiltration_images = first_n_with_label(df, 'Infiltration', 3000)
atelectasis_images = first_n_with_label(df, 'Atelectasis', 3000)

# Bounding-box annotations: drop the unnamed trailing columns and give the
# remaining six columns names that line up with `df`.
bbox_df = pd.read_csv(BBOX_PATH).drop(columns=['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8'])
bbox_df.columns = ['Image Index', 'Finding Label', 'x', 'y', 'width', 'height']

combined_df = pd.concat([no_finding_images, bbox_df, effusion_images, infiltration_images, atelectasis_images])

# Seeded shuffle -> reproducible order on Restart & Run All.
combined_df = combined_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Collapse all pathology labels into one binary 'Finding' class.
combined_df["Finding Label"] = combined_df["Finding Label"].replace(
    {label: 'Finding' for label in ABNORMAL_LABELS}
)

# Normalise filenames BEFORE de-duplicating: names that differ only by
# surrounding whitespace refer to the same image and must count as duplicates.
combined_df['Image Index'] = combined_df['Image Index'].str.strip()

combined_df = combined_df.drop_duplicates(subset='Image Index')
combined_df = combined_df.drop(columns=['x', 'y', 'width', 'height'])

# Sanity check: this Series should be empty after drop_duplicates.
duplicates = combined_df['Image Index'].value_counts()
duplicates = duplicates[duplicates > 1]
print(duplicates)
print(combined_df['Finding Label'].value_counts())

combined_df['Image Index'].shape

Series([], Name: count, dtype: int64)
Finding Label
No Finding    10000
Finding        9771
Name: count, dtype: int64


(19771,)

In [12]:
import os
from PIL import Image

# Define the new directory to save resized images
resized_images_dir = 'resized_images_20k'
os.makedirs(resized_images_dir, exist_ok=True)  # Create the directory if it doesn't exist

# Strip whitespace from image filenames
combined_df['Image Index'] = combined_df['Image Index'].str.strip()

# Root folder that contains the images_[XXX] sub-folders
archive_path = os.path.join('DL_for_HIN_Chest_X_Ray', 'archive')  # Adjust this path as needed

# Walk the archive ONCE and remember where each filename lives. Re-walking
# the whole tree for every row costs (number of images) x (files in archive);
# a dict lookup makes each row constant time.
# setdefault keeps the first path seen for a name, which is the same match a
# per-image walk in os.walk order would have stopped at.
image_locations = {}
for root, dirs, files in os.walk(archive_path):
    for file_name in files:
        image_locations.setdefault(file_name, os.path.join(root, file_name))

# Resize and save images
for image_name in combined_df['Image Index']:
    image_path = image_locations.get(image_name)
    if image_path is None:
        print(f"File does not exist in any subdirectory: {image_name}")
        continue

    print(f"Found image: {image_path}")  # Debugging line
    try:
        # Open, resize to 224x224 and save under the same filename
        with Image.open(image_path) as img:
            img_resized = img.resize((224, 224))
            img_resized.save(os.path.join(resized_images_dir, image_name))
            print(f"Saved resized image: {image_name}")  # Debugging line
    except OSError as e:
        # A failed image is reported and the loop moves on to the next one.
        print(f"Error processing {image_name}: {e}")
    except Exception as e:
        print(f"Unexpected error with {image_name}: {e}")

Found image: DL_for_HIN_Chest_X_Ray/archive/images_002/images/00003700_000.png
Saved resized image: 00003700_000.png
Found image: DL_for_HIN_Chest_X_Ray/archive/images_002/images/00003863_012.png
Saved resized image: 00003863_012.png
Found image: DL_for_HIN_Chest_X_Ray/archive/images_004/images/00006938_007.png
Saved resized image: 00006938_007.png
Found image: DL_for_HIN_Chest_X_Ray/archive/images_003/images/00004786_000.png
Saved resized image: 00004786_000.png
Found image: DL_for_HIN_Chest_X_Ray/archive/images_001/images/00000181_035.png
Saved resized image: 00000181_035.png
Found image: DL_for_HIN_Chest_X_Ray/archive/images_002/images/00001682_000.png
Saved resized image: 00001682_000.png
Found image: DL_for_HIN_Chest_X_Ray/archive/images_006/images/00011702_074.png
Saved resized image: 00011702_074.png
Found image: DL_for_HIN_Chest_X_Ray/archive/images_002/images/00002246_000.png
Saved resized image: 00002246_000.png
Found image: DL_for_HIN_Chest_X_Ray/archive/images_002/images/00

In [13]:
# Write combined_df to disk as CSV; index=False leaves the row index out of the file.
combined_df.to_csv(
    'bbox_resized_filtered_images_20k.csv',
    index=False,
)