In [1]:
import kagglehub

# Fetch the chest X-ray dataset through kagglehub. The handle carries no
# explicit version; `path` is the local directory that kagglehub returns and
# that the following cells walk.
dataset_handle = "mohamedasak/chest-x-ray-6-classes-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/mohamedasak/chest-x-ray-6-classes-dataset?dataset_version_number=1...


100%|██████████| 194M/194M [00:03<00:00, 64.7MB/s] 

Extracting files...





Path to dataset files: /users/yliu802/.cache/kagglehub/datasets/mohamedasak/chest-x-ray-6-classes-dataset/versions/1


In [2]:
import os

import pandas as pd

# Directory layout walked below, relative to the download directory `path`:
#   <path>/<top-level dir>/<split>/<class>/<image file>
# with one class directory per label:
#   ├── Covid-19/
#   ├── Pneumonia-Bacterial/
#   ├── Pneumonia-Viral/
#   ├── Tuberculosis/
#   ├── Emphysema/
#   └── Normal/


def _sorted_subdirs(parent):
    """Return the names of the immediate subdirectories of `parent`, sorted.

    Non-directory entries (stray files) are skipped so the walk never calls
    os.listdir on a file. Sorting is needed because os.listdir returns
    entries in arbitrary order.
    """
    return sorted(
        name
        for name in os.listdir(parent)
        if os.path.isdir(os.path.join(parent, name))
    )


# Maps "<split>/<class>/<file>" -> (label, split). The top-level directory name
# is intentionally not part of the key, matching the stored 'Path' format.
label_split_by_path = {}

for major_dir in _sorted_subdirs(path):
    major_path = os.path.join(path, major_dir)
    for split in _sorted_subdirs(major_path):
        split_path = os.path.join(major_path, split)
        for label in _sorted_subdirs(split_path):
            class_path = os.path.join(split_path, label)
            # Sorted so the row order (and any later seeded shuffle of these
            # rows) does not depend on filesystem enumeration order.
            for filename in sorted(os.listdir(class_path)):
                label_split_by_path[split + '/' + label + '/' + filename] = (label, split)

# One row per image: the dict key becomes the 'Path' column.
df = (
    pd.DataFrame.from_dict(label_split_by_path, orient='index', columns=['label', 'split'])
    .rename_axis('Path')
    .reset_index()
)
df

Unnamed: 0,Path,label,split
0,test/Covid-19/COVID-1.jpg,Covid-19,test
1,test/Covid-19/COVID-1028.jpg,Covid-19,test
2,test/Covid-19/COVID-1036.jpg,Covid-19,test
3,test/Covid-19/COVID-1039.jpg,Covid-19,test
4,test/Covid-19/COVID-1050.jpg,Covid-19,test
...,...,...,...
18031,val/Tuberculosis/augmented_Tuberculosis-659_0_...,Tuberculosis,val
18032,val/Tuberculosis/augmented_Tuberculosis-659_0_...,Tuberculosis,val
18033,val/Tuberculosis/augmented_Tuberculosis-659_0_...,Tuberculosis,val
18034,val/Tuberculosis/augmented_Tuberculosis-659_0_...,Tuberculosis,val


In [3]:
# Print every row that has a missing value in at least one column.
has_missing_value = df.isna().any(axis=1)
print(df[has_missing_value])

Empty DataFrame
Columns: [Path, label, split]
Index: []


In [4]:

# Replace the single 'label' column with one 0/1 indicator column per class.
# NOTE: this cell rebinds `df` without the 'label' column, so re-running it
# without first re-running the cell that builds `df` raises a KeyError.
label_indicators = pd.get_dummies(df['label']).astype('int64')  # int64 indicator values
df = pd.concat([df.drop(columns=['label']), label_indicators], axis=1)

df

Unnamed: 0,Path,split,Covid-19,Emphysema,Normal,Pneumonia-Bacterial,Pneumonia-Viral,Tuberculosis
0,test/Covid-19/COVID-1.jpg,test,1,0,0,0,0,0
1,test/Covid-19/COVID-1028.jpg,test,1,0,0,0,0,0
2,test/Covid-19/COVID-1036.jpg,test,1,0,0,0,0,0
3,test/Covid-19/COVID-1039.jpg,test,1,0,0,0,0,0
4,test/Covid-19/COVID-1050.jpg,test,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
18031,val/Tuberculosis/augmented_Tuberculosis-659_0_...,val,0,0,0,0,0,1
18032,val/Tuberculosis/augmented_Tuberculosis-659_0_...,val,0,0,0,0,0,1
18033,val/Tuberculosis/augmented_Tuberculosis-659_0_...,val,0,0,0,0,0,1
18034,val/Tuberculosis/augmented_Tuberculosis-659_0_...,val,0,0,0,0,0,1


In [5]:
# Print every row of the one-hot frame that has a missing value in any column.
has_missing_value = df.isna().any(axis=1)
print(df[has_missing_value])

Empty DataFrame
Columns: [Path, split, Covid-19, Emphysema, Normal, Pneumonia-Bacterial, Pneumonia-Viral, Tuberculosis]
Index: []


In [6]:
# Build one shuffled frame per split (nothing is written to disk in this cell).
def _shuffled_split(frame, split_name):
    """Return the rows of `frame` whose 'split' equals `split_name`, shuffled.

    The 'split' column is dropped, the rows are permuted with a fixed seed
    (random_state=42), and the index is renumbered from 0.
    """
    subset = frame[frame['split'] == split_name].drop(columns=['split'])
    return subset.sample(frac=1, random_state=42).reset_index(drop=True)


train_df = _shuffled_split(df, 'train')
val_df = _shuffled_split(df, 'val')
test_df = _shuffled_split(df, 'test')

# Report, for every column except 'Path', how often each value occurs in the
# train, val and test frames (in that order).
for split_title, split_frame in (('Train', train_df), ('Val', val_df), ('Test', test_df)):
    for column in split_frame.columns:
        if column == 'Path':
            continue
        print(f"{split_title} split - Label: {column}")
        # dropna=False: missing values get their own count instead of being hidden
        print(split_frame[column].value_counts(dropna=False))
        print()

Train split - Label: Covid-19
Covid-19
0    12134
1     2417
Name: count, dtype: int64

Train split - Label: Emphysema
Emphysema
0    12501
1     2050
Name: count, dtype: int64

Train split - Label: Normal
Normal
0    11880
1     2671
Name: count, dtype: int64

Train split - Label: Pneumonia-Bacterial
Pneumonia-Bacterial
0    12151
1     2400
Name: count, dtype: int64

Train split - Label: Pneumonia-Viral
Pneumonia-Viral
0    12138
1     2413
Name: count, dtype: int64

Train split - Label: Tuberculosis
Tuberculosis
0    11951
1     2600
Name: count, dtype: int64

Val split - Label: Covid-19
Covid-19
0    1448
1     300
Name: count, dtype: int64

Val split - Label: Emphysema
Emphysema
0    1498
1     250
Name: count, dtype: int64

Val split - Label: Normal
Normal
0    1448
1     300
Name: count, dtype: int64

Val split - Label: Pneumonia-Bacterial
Pneumonia-Bacterial
0    1448
1     300
Name: count, dtype: int64

Val split - Label: Pneumonia-Viral
Pneumonia-Viral
0    1448
1     300
Nam

In [7]:

# Write each shuffled split to its own CSV file, leaving out the index column.
for split_frame, csv_path in (
    (train_df, '../src/chestx6_train_split.csv'),
    (val_df, '../src/chestx6_val_split.csv'),
    (test_df, '../src/chestx6_test_split.csv'),
):
    split_frame.to_csv(csv_path, index=False)

# Class label names, listed in the same order as the one-hot columns shown above.
class_names = [
    "Covid-19",
    "Emphysema",
    "Normal",
    "Pneumonia-Bacterial",
    "Pneumonia-Viral",
    "Tuberculosis",
]