In [None]:
import kagglehub

# Fetch the latest published version of the dataset through kagglehub.
# The handle is kept in a named constant so it is easy to spot and change.
DATASET_HANDLE = "awsaf49/vinbigdata-512-image-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

# Show the value returned by the download call.
print("Path to dataset files:", path)

In [None]:
import pandas as pd

# Annotation file source:
# https://physionet.org/content/vindr-cxr/1.0.0/annotations/#files-panel
# Load the training labels and discard the "No finding" column right away.
train_df = pd.read_csv("../src/vindr/image_labels_train.csv").drop(columns=["No finding"])

# Every column other than the two identifier columns is treated as a label.
is_label_col = ~train_df.columns.isin(["image_id", "rad_id"])
label_cols = train_df.columns[is_label_col].tolist()

# Collect all rows that belong to the same image.
# NOTE(review): presumably one row per (image_id, rad_id) annotation — confirm.
grouped = train_df.groupby("image_id")

# Soft labels: per-image mean of each label column
# (for 0/1 annotations this is the fraction of rows marking the label positive).
soft_df = grouped[label_cols].mean()

# Hard labels by majority vote: a mean of at least 0.5 becomes 1, otherwise 0.
# reset_index() turns image_id back into an ordinary column.
hard_majority_df = soft_df.ge(0.5).astype(int).reset_index()
hard_majority_df.head()

In [None]:
# Number of images on which each label is positive after the majority vote,
# sorted so the rarest labels come first.
pos_counts = hard_majority_df[label_cols].sum().sort_values()

# Minimum number of positive images a label must have to be kept.
THRESH = 50

keep_labels = pos_counts[pos_counts >= THRESH].index.tolist()
drop_labels = pos_counts[pos_counts < THRESH].index.tolist()

print("Keeping:", keep_labels)
print("Dropping:", drop_labels)

# Keep only image_id plus the retained labels, and append the ".png" suffix to
# every image_id. `.assign` returns a new DataFrame, so this never writes into
# a slice of `hard_majority_df` (the original column assignment on the subset
# triggered pandas' SettingWithCopyWarning). The suffix is added with a
# vectorised string concatenation instead of a per-row `apply`.
filtered_df = hard_majority_df[["image_id"] + keep_labels].assign(
    image_id=lambda frame: frame["image_id"] + ".png"
)

# Count how many positive samples each retained label has.
pos_counts = filtered_df[keep_labels].sum(axis=0)

print(pos_counts.sort_values())  # smallest first

In [None]:

# Partition the filtered training images into a train set and a validation set.
from sklearn.model_selection import train_test_split

# Split on image ids: 10% go to validation, shuffled with a fixed seed so the
# partition is reproducible.
all_image_ids = filtered_df["image_id"].values
train_ids, val_ids = train_test_split(
    all_image_ids, test_size=0.1, random_state=42, shuffle=True
)

# Select rows by id membership so each split keeps the row order of
# `filtered_df`, then renumber the index from zero.
in_train = filtered_df["image_id"].isin(train_ids)
in_val = filtered_df["image_id"].isin(val_ids)

new_train_df = filtered_df.loc[in_train].reset_index(drop=True)
new_val_df = filtered_df.loc[in_val].reset_index(drop=True)

In [None]:
import pandas as pd

# Load the test-set image labels.
test_df = pd.read_csv("../src/vindr/image_labels_test.csv")

# The test file names this column "Other disease"; rename it to "Other diseases"
# so it matches the label name used for the training labels.
test_df = test_df.rename(columns={"Other disease": "Other diseases"})

# Fail early, with a readable message, if a label kept from the training data
# is absent from the test file (the column selection below would raise a
# KeyError anyway, but without saying where the mismatch comes from).
missing_labels = [c for c in keep_labels if c not in test_df.columns]
if missing_labels:
    raise KeyError(f"Test label file is missing expected label columns: {missing_labels}")

# Keep only image_id plus the labels in use, and append the ".png" suffix to
# every image_id. `.assign` returns a new DataFrame, which avoids the
# SettingWithCopyWarning caused by assigning a column on a column-subset frame.
test_df = test_df[["image_id"] + keep_labels].assign(
    image_id=lambda frame: frame["image_id"] + ".png"
)

test_df.head()

In [None]:
# Write the train, validation and test splits to CSV, without the index column.
for split_csv_path, split_frame in (
    ("../src/vindr_train_split.csv", new_train_df),
    ("../src/vindr_val_split.csv", new_val_df),
    ("../src/vindr_test_split.csv", test_df),
):
    split_frame.to_csv(split_csv_path, index=False)