In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive (google.colab is only available inside Colab).
drive.mount('/content/drive')

# Your target folder on the mounted Drive (same path as SAVE_DIR in the
# download cell); it is created here if it does not exist yet.
output = "/content/drive/MyDrive/MCT_LTDiag/v_download"
os.makedirs(output, exist_ok=True)

In [None]:
import requests
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

# API token obtained through Harvard Dataverse. It is read from the
# DATAVERSE_API_TOKEN environment variable so the secret does not have to be
# saved inside the notebook; falls back to "" when the variable is unset.
API_TOKEN = os.environ.get("DATAVERSE_API_TOKEN", "")
PERSISTENT_ID = "doi:10.7910/DVN/S3RW15"
SAVE_DIR = "/content/drive/MyDrive/MCT_LTDiag/v_download" # your own data storage dir
os.makedirs(SAVE_DIR, exist_ok=True)

# Send the key header only when a token is configured, so an empty string is
# never transmitted as a credential.
# NOTE(review): assumes the dataset is readable without a token — confirm.
headers = {"X-Dataverse-key": API_TOKEN} if API_TOKEN else {}

# obtain MCT_LTDiag dataset JSON
dataset_url = f"https://dataverse.harvard.edu/api/datasets/:persistentId?persistentId={PERSISTENT_ID}"
# A timeout keeps the cell from hanging forever if the server stops responding.
resp = requests.get(dataset_url, headers=headers, timeout=60)
resp.raise_for_status()
dataset_json = resp.json()

# One entry per file of the dataset's latest version; each entry carries a
# "dataFile" dict whose "filename" and "id" are used by the download step.
files = dataset_json["data"]["latestVersion"]["files"]

def download_file(f):
    """Download one dataset file into SAVE_DIR unless it is already there.

    Parameters
    ----------
    f : dict
        One entry of the dataset file list. ``f["dataFile"]["filename"]`` is
        used as the local file name and ``f["dataFile"]["id"]`` selects the
        file on the Dataverse access endpoint.

    Returns
    -------
    str
        ``"File <name> already exists, skipped"`` when the target path exists,
        otherwise ``"Saved <name>"`` after a complete download.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; nothing is left on disk.
    requests.RequestException
        On connection problems or timeouts; nothing is left on disk.
    """
    file_name = f["dataFile"]["filename"]
    file_id = f["dataFile"]["id"]
    download_url = f"https://dataverse.harvard.edu/api/access/datafile/{file_id}"
    save_path = os.path.join(SAVE_DIR, file_name)

    if os.path.exists(save_path):
        return f"File {file_name} already exists, skipped"

    # Stream into a sibling ".part" file and rename only after the last chunk.
    # The final path therefore never holds a truncated download, which the
    # exists-check above would otherwise skip on the next run.
    part_path = save_path + ".part"

    print(f"Downloading {file_name} ...")
    try:
        # (connect, read) timeouts so a stalled transfer cannot block a worker forever.
        with requests.get(download_url, headers=headers, stream=True, timeout=(10, 120)) as r:
            # Fail on 4xx/5xx instead of saving the error body as the data file.
            r.raise_for_status()
            with open(part_path, "wb") as fd:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        fd.write(chunk)
        os.replace(part_path, save_path)
    finally:
        # After a successful rename the .part file is gone; on any failure
        # remove the leftover so no partial data remains.
        if os.path.exists(part_path):
            os.remove(part_path)
    return f"Saved {file_name}"

# Download up to 5 files concurrently.
max_workers = 5
failed_downloads = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(download_file, f) for f in files]
    # Remember which file each future belongs to so failures can be attributed.
    future_to_name = {
        fut: entry["dataFile"]["filename"] for fut, entry in zip(futures, files)
    }
    for future in as_completed(futures):
        # Handle errors per file: one failed download must not hide the
        # status of all the others.
        try:
            print(future.result())
        except Exception as exc:
            failed_downloads.append(future_to_name[future])
            print(f"FAILED {future_to_name[future]}: {exc}")

# Still fail the cell so later cells do not run on an incomplete download.
if failed_downloads:
    raise RuntimeError(
        f"{len(failed_downloads)} file(s) failed to download: {failed_downloads}"
    )

In [None]:
import os
import tarfile
from concurrent.futures import ProcessPoolExecutor, as_completed

# Directory scanned for .tar archives; each archive is unpacked into a
# sub-folder of this directory and then deleted.
# NOTE(review): this path differs from the SAVE_DIR used in the download cell —
# confirm it points at the folder that actually holds the downloaded archives.
SAVE_DIR="/inspire/hdd/project/continuinglearning/suhaoyang-240107100018/suhaoyang-240107100018/storage/tumor_dataset/dataset_scidata_format_v2/data_temp"
def extract_tar(file_path):
    """Unpack one .tar archive into its own sub-folder of SAVE_DIR, then delete it.

    Parameters
    ----------
    file_path : str
        Path of the archive. Paths that do not end in ``.tar`` are left alone.

    Returns
    -------
    str
        ``"Processed <path>"`` after extraction and removal of the archive,
        or ``"Skipped <path>"`` when the path does not end in ``.tar``.

    Raises
    ------
    tarfile.TarError, OSError
        If the archive cannot be read or extracted; the archive is then kept.
    """
    if not file_path.endswith(".tar"):
        return f"Skipped {file_path}"

    print(f"Extracting {file_path} ...")
    # Strip only the trailing ".tar". str.replace would also delete a ".tar"
    # occurring inside the name (e.g. "scan.target.tar" -> "scanget").
    archive_name = os.path.basename(file_path)[: -len(".tar")]
    target_dir = f"{SAVE_DIR}/{archive_name}"

    with tarfile.open(file_path, "r") as tar:
        if hasattr(tarfile, "data_filter"):
            # The archives were downloaded, so use the "data" extraction filter
            # where the interpreter supports it; it rejects members that
            # would be written outside target_dir.
            tar.extractall(path=target_dir, filter="data")
        else:
            tar.extractall(path=target_dir)

    # Reached only when extraction succeeded, so a failed archive is never lost.
    os.remove(file_path)
    return f"Processed {file_path}"

# Gather the full path of every .tar archive located directly in SAVE_DIR.
tar_files = [
    os.path.join(SAVE_DIR, entry)
    for entry in os.listdir(SAVE_DIR)
    if entry.endswith(".tar")
]

# Allow up to as many worker processes as os.cpu_count() reports.
max_workers = os.cpu_count()
with ProcessPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(extract_tar, archive) for archive in tar_files]
    # Print each archive's status message as soon as its extraction finishes.
    for future in as_completed(futures):
        print(future.result())

print("All .tar files have been extracted.")