# 01 - CT Data Exploration (TCIA)

In this notebook, we will:

- Query publicly available CT datasets from TCIA using the `tciaclient` library
- List collections and their properties
- Visualize a few sample CT series
- Choose one dataset (under 10 GB) and download it into the raw-data directory (`DATA_DIR`, e.g. `data/raw`)


In [10]:
# Dependencies (run once if needed): %pip install tciaclient pydicom matplotlib numpy tqdm
import os
import zipfile
from tqdm import tqdm
from pathlib import Path
from tciaclient.core import TCIAClient
import matplotlib.pyplot as plt
import pydicom
import numpy as np

# Create the TCIA API client used for the queries in the cells below.
client = TCIAClient()


In [5]:
# Fetch the list of collections available on TCIA and report how many exist.
collections = client.get_collection_values()
print(f"Found {len(collections)} collections")

# Preview only the first ten collection names to keep the output short.
for collection_info in collections[:10]:
    print("-", collection_info["Collection"])


Found 151 collections
- 4D-Lung
- ACRIN-FLT-Breast
- ACRIN-NSCLC-FDG-PET
- APOLLO
- Anti-PD-1_Lung
- BREAST-DIAGNOSIS
- Breast-MRI-NACT-Pilot
- C4KC-KiTS
- CBIS-DDSM
- CC-Radiomics-Phantom


In [6]:
# Set constants
COLLECTION = "NSCLC-Radiomics"

# Root directory for raw downloads. It defaults to the original location, but
# can be redirected by setting the TCIA_DATA_ROOT environment variable so the
# notebook is not tied to one machine's absolute path. An unset or empty
# variable falls back to the default.
DATA_ROOT = Path(os.environ.get("TCIA_DATA_ROOT") or "/mnt/tcia_data/raw")
DATA_DIR = DATA_ROOT / COLLECTION
DATA_DIR.mkdir(parents=True, exist_ok=True)
print(f"Data directory: {DATA_DIR}")

# Initialize TCIA API client
client = TCIAClient()

# List all patients in the collection
patients = client.get_patient(collection=COLLECTION)
print(f"Found {len(patients)} patients.")

# Optional: preview a few
for p in patients[:3]:
    print("-", p["PatientID"])

Found 422 patients.
- LUNG1-001
- LUNG1-007
- LUNG1-029


In [7]:
# Use the first patient of the collection as the exploration example.
PATIENT_ID = patients[0]["PatientID"]

# Look up that patient's studies and list their UIDs.
studies = client.get_patient_study(collection=COLLECTION, patientId=PATIENT_ID)
print(f"Found {len(studies)} studies for patient {PATIENT_ID}")
for study_info in studies:
    print("-", study_info["StudyInstanceUID"])

# Take the first study and request only its CT series.
STUDY_UID = studies[0]["StudyInstanceUID"]
series_list = client.get_series(
    collection=COLLECTION,
    studyInstanceUid=STUDY_UID,
    modality="CT",
)
print(f"Found {len(series_list)} CT series in study {STUDY_UID}")

# A series entry may lack "SeriesDescription", so fall back to a placeholder.
for series_info in series_list:
    series_desc = series_info.get("SeriesDescription", "[No Description]")
    print("-", series_desc, "| UID:", series_info["SeriesInstanceUID"])

Found 1 studies for patient LUNG1-001
- 1.3.6.1.4.1.32722.99.99.239341353911714368772597187099978969331
Found 1 CT series in study 1.3.6.1.4.1.32722.99.99.239341353911714368772597187099978969331
- [No Description] | UID: 1.3.6.1.4.1.32722.99.99.298991776521342375010861296712563382046


In [11]:
# Loop over all patients and download all CT series into
# DATA_DIR/<PatientID>/<SeriesInstanceUID>/.
#
# Error handling is best-effort at two levels:
#   * a failure while listing a patient's studies/series skips that patient;
#   * a failure while downloading/extracting one series skips only that series,
#     so the patient's remaining series are still attempted.
# Every failure is recorded in `failed_downloads` and summarized at the end.
#
# NOTE: a series counts as "already downloaded" as soon as its folder contains
# any *.dcm file, so a series whose extraction was interrupted midway would be
# skipped on a re-run; delete its folder to force a fresh download.
failed_downloads = []  # (patient_id, series_uid or None, error message)

for p in tqdm(patients, desc=f"Downloading {COLLECTION}"):
    patient_id = p["PatientID"]
    try:
        studies = client.get_patient_study(collection=COLLECTION, patientId=patient_id)
        for study in studies:
            study_uid = study["StudyInstanceUID"]
            series_list = client.get_series(collection=COLLECTION, studyInstanceUid=study_uid, modality="CT")

            for series in series_list:
                series_uid = series["SeriesInstanceUID"]
                out_dir = DATA_DIR / patient_id / series_uid
                zip_file = out_dir / "series.zip"

                if out_dir.exists() and any(out_dir.glob("*.dcm")):
                    continue  # already exists

                try:
                    out_dir.mkdir(parents=True, exist_ok=True)
                    client.get_image(seriesInstanceUid=series_uid, downloadPath=out_dir, zipFileName="series.zip")

                    with zipfile.ZipFile(zip_file, "r") as zip_ref:
                        zip_ref.extractall(out_dir)
                except Exception as e:
                    failed_downloads.append((patient_id, series_uid, str(e)))
                    # tqdm.write keeps the progress bar intact, unlike print().
                    tqdm.write(f"[ERROR] {patient_id} / {series_uid}: {e}")
                finally:
                    # Remove the archive on success and on failure alike, so a
                    # broken or partial zip is never left behind.
                    if zip_file.exists():
                        zip_file.unlink()

    except Exception as e:
        failed_downloads.append((patient_id, None, str(e)))
        tqdm.write(f"[ERROR] {patient_id}: {e}")

if failed_downloads:
    print(f"[SUMMARY] {len(failed_downloads)} error(s) during download; "
          "re-run this cell to retry the series that are still missing.")


Downloading NSCLC-Radiomics: 100%|██████████| 422/422 [1:32:10<00:00, 13.11s/it]
