In [1]:
# # data download

# import kagglehub
# import os

# # Download latest version
# nih_path = kagglehub.dataset_download("nih-chest-xrays/data")

# images_dir = os.path.join(nih_path, "images")
# os.makedirs(images_dir, exist_ok=True)

# for dir in os.listdir(nih_path):
#     if os.path.isdir(os.path.join(nih_path, dir)):
#         if dir != "images":
#             for file in os.listdir(os.path.join(nih_path, dir, "images")):
#                 if file.endswith(".png"):
#                     os.rename(
#                         os.path.join(nih_path, dir, "images", file),
#                         os.path.join(images_dir, file)
#                     )

In [2]:
import pandas as pd

df = pd.read_csv("../../../.cache/kagglehub/datasets/nih-chest-xrays/data/versions/3/Data_Entry_2017.csv")

# explore Finding Labels
label_set = set()

for idx, row in df.iterrows():
    labels = row["Finding Labels"].split("|")
    for label in labels:
        label_set.add(label)

# with the label set, reconstruct the columns
# if a label is in the Finding Labels, set the corresponding column to 1, else 0
for label in label_set:
    df[label] = df["Finding Labels"].apply(lambda x: 1 if label in x.split("|") else 0)

# filter out Finding Labels == "No Finding"
df = df[df["Finding Labels"] != "No Finding"]

"""
columns to keep

['Image Index', 'Finding Labels', 'Patient ID', 'Hernia', 'Pneumothorax', 'Nodule', 'Edema', 'Effusion', 'Pleural_Thickening', 'Cardiomegaly', 'Mass', 'Fibrosis', 'Consolidation', 'Pneumonia', 'Infiltration', 'Emphysema', 'Atelectasis'
"""

df = df[[
    'Image Index', 'Finding Labels', 'Patient ID', 'Hernia', 'Pneumothorax', 'Nodule', 'Edema', 
    'Effusion', 'Pleural_Thickening', 'Cardiomegaly', 'Mass', 'Fibrosis', 'Consolidation', 
    'Pneumonia', 'Infiltration', 'Emphysema', 'Atelectasis'
]]

df.head(3)

Unnamed: 0,Image Index,Finding Labels,Patient ID,Hernia,Pneumothorax,Nodule,Edema,Effusion,Pleural_Thickening,Cardiomegaly,Mass,Fibrosis,Consolidation,Pneumonia,Infiltration,Emphysema,Atelectasis
0,00000001_000.png,Cardiomegaly,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,00000001_001.png,Cardiomegaly|Emphysema,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0
2,00000001_002.png,Cardiomegaly|Effusion,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0


In [3]:
train_val_ids = []
test_ids = []

with open("../../../.cache/kagglehub/datasets/nih-chest-xrays/data/versions/3/train_val_list.txt") as f:
    lines = f.readlines()
    for line in lines:
        train_val_ids.append(line.strip())
        
with open("../../../.cache/kagglehub/datasets/nih-chest-xrays/data/versions/3/test_list.txt") as f:
    lines = f.readlines()
    for line in lines:
        test_ids.append(line.strip())

In [4]:
# create train_val df and test df
train_val_df = df[df["Image Index"].isin(train_val_ids)].reset_index(drop=True)
test_df = df[df["Image Index"].isin(test_ids)].reset_index(drop=True)

# import df split => train_test split
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.15,
    random_state=42,
)

train_df.to_csv("../src/clean_nih_train_split.csv", index=False)
val_df.to_csv("../src/clean_nih_val_split.csv", index=False)
test_df.to_csv("../src/clean_nih_test_split.csv", index=False)

In [5]:
import pandas as pd
import numpy as np

def analyze_dataframe(df, path_col="Image Index"):
    print("========== BASIC INFO ==========")
    print("Total samples:", len(df))
    print("Columns:", list(df.columns))
    print()

    if path_col in df.columns:
        print("========== PATH COLUMN CHECK ==========")
        print("Missing paths:", df[path_col].isna().sum())
        print("Unique paths:", df[path_col].nunique())
        print("Example path:", df[path_col].iloc[0])
        print()

    # Identify label columns (everything except path)
    label_cols = [c for c in df.columns if c != path_col]
    print("Label columns:", label_cols)
    print()

    print("========== LABEL SUMMARY ==========")
    print(df[label_cols].describe(include="all"))
    print()

    print("========== NULL VALUES ==========")
    print(df[label_cols].isna().sum())
    print()

    print("========== VALUE COUNTS PER LABEL ==========")
    for col in label_cols:
        print(f"\n--- {col} ---")
        counts = df[col].value_counts(dropna=False)
        print(counts)
        print("Positives (%):", round((df[col] == 1).mean() * 100, 3))
        print("Negatives (%):", round((df[col] == 0).mean() * 100, 3))
        if (-1 in counts.index):
            print("Uncertain (%):", round((df[col] == -1).mean() * 100, 3))
    print()

    print("========== CLASS IMBALANCE RATIO ==========")
    imbalance = (df[label_cols] == 1).sum() / (df[label_cols] == 0).sum()
    print(imbalance)
    print()

    print("========== TOTAL POSITIVES PER LABEL ==========")
    print((df[label_cols] == 1).sum())
    print()

    print("========== TOTAL NEGATIVES PER LABEL ==========")
    print((df[label_cols] == 0).sum())
    print()

    if (df[label_cols] == -1).any().any():
        print("========== TOTAL UNCERTAIN (-1) PER LABEL ==========")
        print((df[label_cols] == -1).sum())
        print()

    print("========== MULTI-LABEL STATS ==========")
    df["num_labels_positive"] = (df[label_cols] == 1).sum(axis=1)
    print(df["num_labels_positive"].describe())
    print("Images with ≥1 positive label:", (df["num_labels_positive"] >= 1).sum())
    print("Images with 0 positive labels:", (df["num_labels_positive"] == 0).sum())
    print()

    print("========== CO-OCCURRENCE: LABELS PER IMAGE ==========")
    print("Histogram of positive label counts:")
    print(df["num_labels_positive"].value_counts().sort_index())
    print()

    print("========== SAMPLE ROWS ==========")
    print(df.head(5))
    print()

    print("========== DONE ==========")
    
analyze_dataframe(train_val_df, path_col="Image Index")

Total samples: 36024
Columns: ['Image Index', 'Finding Labels', 'Patient ID', 'Hernia', 'Pneumothorax', 'Nodule', 'Edema', 'Effusion', 'Pleural_Thickening', 'Cardiomegaly', 'Mass', 'Fibrosis', 'Consolidation', 'Pneumonia', 'Infiltration', 'Emphysema', 'Atelectasis']

Missing paths: 0
Unique paths: 36024
Example path: 00000001_000.png

Label columns: ['Finding Labels', 'Patient ID', 'Hernia', 'Pneumothorax', 'Nodule', 'Edema', 'Effusion', 'Pleural_Thickening', 'Cardiomegaly', 'Mass', 'Fibrosis', 'Consolidation', 'Pneumonia', 'Infiltration', 'Emphysema', 'Atelectasis']

       Finding Labels    Patient ID        Hernia  Pneumothorax        Nodule  \
count           36024  36024.000000  36024.000000  36024.000000  36024.000000   
unique            618           NaN           NaN           NaN           NaN   
top      Infiltration           NaN           NaN           NaN           NaN   
freq             7327           NaN           NaN           NaN           NaN   
mean              Na

In [6]:
analyze_dataframe(test_df, path_col="Image Index")

Total samples: 15735
Columns: ['Image Index', 'Finding Labels', 'Patient ID', 'Hernia', 'Pneumothorax', 'Nodule', 'Edema', 'Effusion', 'Pleural_Thickening', 'Cardiomegaly', 'Mass', 'Fibrosis', 'Consolidation', 'Pneumonia', 'Infiltration', 'Emphysema', 'Atelectasis']

Missing paths: 0
Unique paths: 15735
Example path: 00000003_000.png

Label columns: ['Finding Labels', 'Patient ID', 'Hernia', 'Pneumothorax', 'Nodule', 'Edema', 'Effusion', 'Pleural_Thickening', 'Cardiomegaly', 'Mass', 'Fibrosis', 'Consolidation', 'Pneumonia', 'Infiltration', 'Emphysema', 'Atelectasis']

       Finding Labels    Patient ID        Hernia  Pneumothorax        Nodule  \
count           15735  15735.000000  15735.000000  15735.000000  15735.000000   
unique            615           NaN           NaN           NaN           NaN   
top      Infiltration           NaN           NaN           NaN           NaN   
freq             2220           NaN           NaN           NaN           NaN   
mean              Na