# Colab: Prepare Chest X-Ray Pneumonia dataset (no automatic download)

This notebook will:  
- Mount your Google Drive  
- Optionally copy a ZIP you placed on Drive into the repository (and extract it)  
- Clone or update the repository from GitHub into the Colab runtime  
- Run the included `backend/training/download_dataset.py` script in *exploration* mode (it will NOT download anything unless you edit the script or pass the `--download` flag; see "Next steps" below).  

Run cells in order. When asked for a Drive path, paste the full path to your ZIP (for example `/content/drive/MyDrive/Chest X-ray images(pneumonia) (2).zip`) or leave blank to skip copying/extraction.

In [None]:
# 1) Mount Google Drive
# Mounts at /content/drive, the prefix used by the example ZIP path in step 3.
# NOTE(review): Colab normally asks for authorization on first mount — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# 2) Clone (or pull) the repository into the Colab runtime
import os, subprocess
repo_url = 'https://github.com/Mizuinu30/xray-ml-platform.git'
repo_dir = '/content/xray-ml-platform'

# Pick the git sub-command first, then run it once; check=True aborts the
# cell with CalledProcessError if git exits non-zero.
if os.path.exists(repo_dir):
    print('Repository already exists; pulling latest...')
    git_args = ['-C', repo_dir, 'pull']
else:
    print('Cloning repository...')
    git_args = ['clone', repo_url, repo_dir]
subprocess.run(['git', *git_args], check=True)

# Later cells use paths relative to the repository root.
os.chdir(repo_dir)
print('Repository ready at', repo_dir)

In [None]:
# 3) Provide the path to your ZIP on Drive (or leave blank to skip)
print('If your ZIP is on Google Drive, provide its full path (e.g. /content/drive/MyDrive/Chest X-ray images(pneumonia) (2).zip).')
# A blank (or whitespace-only) answer becomes None, which the copy step treats as "skip".
drive_zip = input('Drive ZIP path (leave blank to skip): ').strip() or None
print('drive_zip =', drive_zip)

In [None]:
# 4) Copy the ZIP into the repository (backend/data/raw) if a path was provided
import pathlib, shutil
# Relative path: resolved against the current working directory, which step 2
# sets to the repository root.
raw_dir = pathlib.Path('backend/data/raw')
raw_dir.mkdir(parents=True, exist_ok=True)
if drive_zip:
    src = pathlib.Path(drive_zip)
    # is_file() rather than exists(): a folder path also "exists", but
    # shutil.copy2 cannot copy a directory and would raise.
    if src.is_file():
        # The archive is stored under a fixed name so the extraction step can find it.
        dst = raw_dir / 'chest-xray-pneumonia.zip'
        shutil.copy2(src, dst)
        print(f'Copied {src} -> {dst}')
    elif src.exists():
        print(f'Provided path is not a file (is it a folder?): {src}')
    else:
        print(f'Provided path does not exist: {src}')
else:
    print('No ZIP path provided; skipping copy.')

In [None]:
# 5) Extract the copied ZIP or, if it is absent, every *.zip found in backend/data/raw
#    (safe - only extracts what you provide)
import itertools, pathlib, zipfile


def extract_archives(raw_dir, preferred_name='chest-xray-pneumonia.zip'):
    """Extract dataset archives located in ``raw_dir`` into ``raw_dir``.

    If ``raw_dir / preferred_name`` exists, only that archive is extracted;
    otherwise every ``*.zip`` directly inside ``raw_dir`` is extracted.

    Args:
        raw_dir: Directory that holds the archives and receives their contents.
        preferred_name: File name of the archive that takes precedence.

    Returns:
        True if at least one archive was extracted, False if none was found.

    Raises:
        zipfile.BadZipFile: If an archive is not a valid ZIP file. The
            offending path is printed before the exception propagates.
    """
    raw_dir = pathlib.Path(raw_dir)
    preferred = raw_dir / preferred_name
    use_preferred = preferred.exists()
    archives = [preferred] if use_preferred else list(raw_dir.glob('*.zip'))
    if not archives:
        print('No zip files found in', raw_dir)
        return False
    for archive in archives:
        print('Extracting', archive)
        try:
            with zipfile.ZipFile(archive, 'r') as zf:
                zf.extractall(raw_dir)
        except zipfile.BadZipFile:
            # Name the file (e.g. an incomplete copy) before the cell fails.
            print(f'Not a valid ZIP archive (corrupt or incomplete copy?): {archive}')
            raise
    if use_preferred:
        print('Extraction completed.')
    return True


raw = pathlib.Path('backend/data/raw')
zip_path = raw / 'chest-xray-pneumonia.zip'
extracted = extract_archives(raw, zip_path.name)

if extracted:
    print('Extraction finished; available files:')
    # islice stops after 20 entries instead of listing the whole extracted tree first.
    for p in itertools.islice(raw.rglob('*'), 20):
        print('-', p)

In [None]:
# 6) Run the repository's dataset helper in exploration mode (it will NOT download by default)
import subprocess, sys, os
# Work from the repo root so the relative script path below resolves
repo_dir = '/content/xray-ml-platform'
os.chdir(repo_dir)
print('Running: python backend/training/download_dataset.py')
# Use the kernel's own interpreter to run the script.
helper_cmd = [sys.executable, 'backend/training/download_dataset.py']
result = subprocess.run(helper_cmd)
if result.returncode != 0:
    print('Script finished with non-zero exit code:', result.returncode)
    # Raises subprocess.CalledProcessError so the cell still fails visibly.
    result.check_returncode()

## Next steps

- If you want to actually download the dataset from Kaggle inside Colab, install the `kaggle` package and provide your `kaggle.json` credentials file (or set the `KAGGLE_USERNAME` and `KAGGLE_KEY` environment variables). Then run the script with the `--download` flag:  
  ```bash
  python backend/training/download_dataset.py --download
  ```
- If you extracted the data and want to run training, follow your project's training instructions (e.g., `python training/train_pneumonia.py`).