In [1]:
import os
import pandas as pd
import shutil
from tqdm.auto import tqdm

# =========================
# 0) PATHS (adjust to your machine)
# =========================
# CSV file describing the masks.
csv_path = "masks_subset.csv"
# Folder holding the source images.
train_image_dir = "/Users/mbouchou/Downloads/airbus-ship-detection/train_v2"
# Folder the selected images are copied into.
out_dir = "/Users/mbouchou/Downloads/airbus_new_subset/images"

# =========================
# 1) Read CSV (masks)
# =========================
# Expected columns: ImageId, EncodedPixels.
masks = pd.read_csv(csv_path)
print("CSV loaded:", masks.shape)
print(masks.head())

# =========================
# 2) Unique list of ImageIds (no duplicates)
# =========================
# An ImageId can appear on several rows, so drop empty ids first and then
# keep each remaining id once.
id_column = masks["ImageId"]
non_null_ids = id_column.dropna()
image_ids = non_null_ids.unique().tolist()
print("Unique images:", len(image_ids))

# =========================
# 3) Create destination folder
# =========================
# exist_ok=True makes re-running the cell safe when the folder already exists.
os.makedirs(out_dir, exist_ok=True)
print("Output folder:", out_dir)

# =========================
# 4) Copy images with progress bar
# =========================
# Copies every id in image_ids from train_image_dir to out_dir, and records
# the ids that have no regular file in the source folder.
missing = []  # ids with no regular file at the source path
copied = 0    # number of files actually copied

for img_id in tqdm(image_ids, desc="Copying images"):
    src = os.path.join(train_image_dir, img_id)
    dst = os.path.join(out_dir, img_id)

    # isfile rather than exists: a path that exists but is not a regular file
    # (e.g. a directory) would make copy2 raise and abort the whole run, so
    # it is reported as missing instead.
    if not os.path.isfile(src):
        missing.append(img_id)
        continue

    # copy2 preserves timestamps/metadata
    shutil.copy2(src, dst)
    copied += 1

print(f"Copied: {copied}")
print(f"Missing: {len(missing)}")

# Optional: save missing list (one id per line, next to the output folder)
if missing:
    missing_path = os.path.join(os.path.dirname(out_dir), "missing_images.txt")
    # Explicit encoding so the file does not depend on the platform default.
    with open(missing_path, "w", encoding="utf-8") as f:
        f.write("\n".join(missing))
    print("Missing list saved to:", missing_path)

CSV loaded: (44202, 2)
         ImageId                                      EncodedPixels
0  000194a2d.jpg  360486 1 361252 4 362019 5 362785 8 363552 10 ...
1  000194a2d.jpg  51834 9 52602 9 53370 9 54138 9 54906 9 55674 ...
2  000194a2d.jpg  198320 10 199088 10 199856 10 200624 10 201392...
3  000194a2d.jpg  55683 1 56451 1 57219 1 57987 1 58755 1 59523 ...
4  000194a2d.jpg  254389 9 255157 17 255925 17 256693 17 257461 ...
Unique images: 12788
Output folder: /Users/mbouchou/Downloads/airbus_new_subset/images


Copying images:   0%|          | 0/12788 [00:00<?, ?it/s]

Copied: 12788
Missing: 0
