In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os, shutil, glob, random
import pandas as pd

# dataset paths (in Kaggle environment)
DATASET_DIR = "/kaggle/input/chexpert/train"
CSV_PATH = "/kaggle/input/chexpert/train.csv"

# output folder for small subset
EXPORT_DIR = "/kaggle/working/chexpert_subset"
os.makedirs(EXPORT_DIR, exist_ok=True)

print("Dataset exists:", os.path.exists(DATASET_DIR))
print("Sample folders:", os.listdir(DATASET_DIR)[:5])


Dataset exists: True
Sample folders: ['patient00734', 'patient28598', 'patient32985', 'patient43625', 'patient46811']


In [2]:
# Read metadata
df = pd.read_csv(CSV_PATH)
print("Original CSV shape:", df.shape)

# Keep only rows with valid Path and at least one label = 1.0
valid = df.dropna(subset=["Path"])
subset = valid.sample(50, random_state=42)  # pick 50 random rows (adjust N)
subset.head()


Original CSV shape: (223414, 19)


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
305,CheXpert-v1.0-small/train/patient00082/study1/...,Male,18,Frontal,PA,1.0,0.0,,,,,,,,0.0,0.0,,,
151324,CheXpert-v1.0-small/train/patient35759/study9/...,Male,90,Frontal,AP,,,1.0,,,1.0,-1.0,,1.0,,,,,1.0
19284,CheXpert-v1.0-small/train/patient04789/study1/...,Male,39,Lateral,,,,,,1.0,,,,,,1.0,,,
165026,CheXpert-v1.0-small/train/patient38491/study5/...,Male,60,Frontal,AP,,,,,,1.0,,,,,,,,1.0
26786,CheXpert-v1.0-small/train/patient06537/study3/...,Female,66,Frontal,AP,,,0.0,1.0,,,0.0,-1.0,-1.0,0.0,0.0,,,


In [6]:
# Fast: copy N random images into a single flat folder (avoids scanning all files)
import os, random, shutil, pandas as pd, time, glob
from pathlib import Path

SRC_ROOT = "/kaggle/input/chexpert/train"   # adjust if needed
DEST_DIR = "/kaggle/working/new_images"
CSV_OUT = "/kaggle/working/new_images_list.csv"
N_IMAGES = 50            # desired number of images in new_images
MAX_TRIES = 5000         # safety to avoid infinite loops

os.makedirs(DEST_DIR, exist_ok=True)

t0 = time.time()
# 1) list patient directories (one level under train)
#    patient folders look like: /kaggle/input/chexpert/train/patient00001
patient_dirs = sorted([p for p in glob.glob(os.path.join(SRC_ROOT, "patient*")) if os.path.isdir(p)])
print("Patient directories found:", len(patient_dirs))

if len(patient_dirs) == 0:
    # fallback: if CheXpert was placed differently, attempt a looser search
    print("No patient directories found under", SRC_ROOT, "— falling back to scanning one level deeper.")
    patient_dirs = sorted([os.path.dirname(p) for p in glob.glob(os.path.join(SRC_ROOT, "**", "view*.jpg"), recursive=True)])
    patient_dirs = sorted(set(patient_dirs))
    print("Fallback patient dirs:", len(patient_dirs))

# 2) Shuffle patient list and sample until we have N_IMAGES
random.shuffle(patient_dirs)
selected = []
copied = []
i_patient = 0
tries = 0

while len(selected) < N_IMAGES and tries < MAX_TRIES and i_patient < len(patient_dirs):
    pdir = patient_dirs[i_patient]
    i_patient += 1
    tries += 1
    # gather images under this patient dir (studies/subdirs)
    imgs = glob.glob(os.path.join(pdir, "**", "*.jpg"), recursive=True)
    if not imgs:
        continue
    # pick one random image from this patient's images
    src = random.choice(imgs)
    bname = os.path.basename(src)
    dst = os.path.join(DEST_DIR, bname)
    # ensure uniqueness of filename in DEST_DIR
    if os.path.exists(dst):
        base, ext = os.path.splitext(bname)
        idx = 1
        while os.path.exists(os.path.join(DEST_DIR, f"{base}_{idx}{ext}")):
            idx += 1
        dst = os.path.join(DEST_DIR, f"{base}_{idx}{ext}")
    try:
        shutil.copy2(src, dst)
        selected.append({"original": src, "copied_as": os.path.basename(dst)})
        copied.append(dst)
    except Exception as e:
        print("Copy failed:", src, e)
    # If we run out of patient_dirs but still need more images, we can re-shuffle and continue
    if i_patient >= len(patient_dirs) and len(selected) < N_IMAGES:
        # shuffle again and continue attempts (rare)
        random.shuffle(patient_dirs)
        i_patient = 0

elapsed = time.time() - t0
print(f"Copied {len(selected)} images into {DEST_DIR} in {elapsed:.1f}s")

# Save mapping CSV
if selected:
    df = pd.DataFrame(selected)
    df.to_csv(CSV_OUT, index=False)
    print("Saved CSV ->", CSV_OUT)
    print(df.head())
else:
    print("No images were copied. Check SRC_ROOT path and dataset availability.")


Patient directories found: 64540
Copied 50 images into /kaggle/working/new_images in 37.7s
Saved CSV -> /kaggle/working/new_images_list.csv
                                            original            copied_as
0  /kaggle/input/chexpert/train/patient02189/stud...    view2_lateral.jpg
1  /kaggle/input/chexpert/train/patient58785/stud...    view1_frontal.jpg
2  /kaggle/input/chexpert/train/patient14967/stud...  view1_frontal_1.jpg
3  /kaggle/input/chexpert/train/patient37716/stud...  view1_frontal_2.jpg
4  /kaggle/input/chexpert/train/patient35768/stud...  view1_frontal_3.jpg


In [7]:
import shutil, os

SRC_DIR = "/kaggle/working/new_images"
ZIP_PATH = "/kaggle/working/new_images.zip"

if os.path.exists(SRC_DIR):
    shutil.make_archive(ZIP_PATH.replace(".zip", ""), 'zip', SRC_DIR)
    print(f"✅ Zipped successfully -> {ZIP_PATH}")
    print("File size:", round(os.path.getsize(ZIP_PATH)/1024/1024, 2), "MB")
else:
    print("⚠️ Folder not found:", SRC_DIR)


✅ Zipped successfully -> /kaggle/working/new_images.zip
File size: 2.42 MB
