In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("awsaf49/vinbigdata-512-image-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/awsaf49/vinbigdata-512-image-dataset?dataset_version_number=1...


100%|██████████| 2.30G/2.30G [00:30<00:00, 81.5MB/s]

Extracting files...





Path to dataset files: /users/yliu802/.cache/kagglehub/datasets/awsaf49/vinbigdata-512-image-dataset/versions/1


In [2]:
import pandas as pd

# get from https://physionet.org/content/vindr-cxr/1.0.0/annotations/#files-panel
train_df = pd.read_csv("../src/image_labels_train.csv")

# drop no finding column
train_df = train_df.drop(columns=["No finding"])

# figure out which columns are labels
label_cols = [c for c in train_df.columns if c not in ["image_id", "rad_id"]]

# group by image
grouped = train_df.groupby("image_id")

# soft: mean label per image
soft_df = grouped[label_cols].mean()          # values in [0,1]

# hard majority vote: >= 0.5 -> 1 else 0
hard_majority_df = (soft_df >= 0.5).astype(int).reset_index()
hard_majority_df.head()

Unnamed: 0,image_id,Aortic enlargement,Atelectasis,Calcification,Cardiomegaly,Clavicle fracture,Consolidation,Edema,Emphysema,Enlarged PA,...,Pleural thickening,Pneumothorax,Pulmonary fibrosis,Rib fracture,Other lesion,COPD,Lung tumor,Pneumonia,Tuberculosis,Other diseases
0,000434271f63a053c4128a0ba6352c7f,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,00053190460d56c53cc3e57321387478,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0005e8e3701dfb1dd93d53e2ff537b6e,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
3,0006e0a85696f6bb578e84fafa9a5607,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0007d316f756b3fa0baea2ff514ce945,1,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1


In [3]:
pos_counts = hard_majority_df[label_cols].sum().sort_values()

# Choose threshold
THRESH = 50

keep_labels = pos_counts[pos_counts >= THRESH].index.tolist()
drop_labels = pos_counts[pos_counts < THRESH].index.tolist()

print("Keeping:", keep_labels)
print("Dropping:", drop_labels)

filtered_df = hard_majority_df[["image_id"] + keep_labels]

# for all the value in "image_id", add ".png" suffix
filtered_df["image_id"] = filtered_df["image_id"].apply(lambda x: x + ".png")

# count how many positive samples each label has
pos_counts = filtered_df[keep_labels].sum(axis=0)

print(pos_counts.sort_values())  # smallest first

Keeping: ['Pneumothorax', 'Atelectasis', 'Mediastinal shift', 'Consolidation', 'Lung tumor', 'ILD', 'Calcification', 'Infiltration', 'Other lesion', 'Nodule/Mass', 'Pneumonia', 'Tuberculosis', 'Lung Opacity', 'Pleural effusion', 'Pleural thickening', 'Pulmonary fibrosis', 'Cardiomegaly', 'Aortic enlargement', 'Other diseases']
Dropping: ['Edema', 'Clavicle fracture', 'Lung cyst', 'COPD', 'Emphysema', 'Lung cavity', 'Enlarged PA', 'Rib fracture']
Pneumothorax            58
Atelectasis             62
Mediastinal shift       85
Consolidation          121
Lung tumor             134
ILD                    152
Calcification          177
Infiltration           245
Other lesion           362
Nodule/Mass            409
Pneumonia              471
Tuberculosis           482
Lung Opacity           547
Pleural effusion       634
Pleural thickening     882
Pulmonary fibrosis    1017
Cardiomegaly          1817
Aortic enlargement    2346
Other diseases        4003
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["image_id"] = filtered_df["image_id"].apply(lambda x: x + ".png")


In [4]:

# split the training data into train and val sets
from sklearn.model_selection import train_test_split

train_ids, val_ids = train_test_split(
    filtered_df["image_id"].values,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

new_train_df = filtered_df[filtered_df["image_id"].isin(train_ids)].reset_index(drop=True)
new_val_df = filtered_df[filtered_df["image_id"].isin(val_ids)].reset_index(drop=True)

In [5]:
import pandas as pd

test_df = pd.read_csv("../src/image_labels_test.csv")

# rename Other disease to Other diseases
test_df = test_df.rename(columns={"Other disease": "Other diseases"})

# only keep the labels we are using
test_df = test_df[["image_id"] + keep_labels]

# add .png suffix to image_id
test_df["image_id"] = test_df["image_id"].apply(lambda x: x + ".png")

test_df.head()

Unnamed: 0,image_id,Pneumothorax,Atelectasis,Mediastinal shift,Consolidation,Lung tumor,ILD,Calcification,Infiltration,Other lesion,Nodule/Mass,Pneumonia,Tuberculosis,Lung Opacity,Pleural effusion,Pleural thickening,Pulmonary fibrosis,Cardiomegaly,Aortic enlargement,Other diseases
0,e0dc2e79105ad93532484e956ef8a71a.png,1,1,0,0,0,1,1,0,0,0,1,0,0,1,0,0,1,0,1
1,0aed23e64ebdea798486056b4f174424.png,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0
2,aa15cfcfca7605465ca0513902738b95.png,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0
3,665c4a6d2693dc0286d65ab479c9b169.png,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
4,42da2c134b53cb5594774d3d29faac59.png,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1


In [6]:
new_train_df.to_csv("../src/vindr_train_split.csv", index=False)
new_val_df.to_csv("../src/vindr_val_split.csv", index=False)
test_df.to_csv("../src/vindr_test_split.csv", index=False)