In [None]:
import os
import requests
import tarfile
from tqdm import tqdm

# Dataset archives to fetch. Each key is the archive file name; each value is
# a (download URL, target folder) pair. The target folder is where the archive
# is saved and also where it is extracted.
files = {
    "dev-clean.tar.gz": ("http://www.openslr.org/resources/12/dev-clean.tar.gz", "devDataset"),
    "dev-other.tar.gz": ("http://www.openslr.org/resources/12/dev-other.tar.gz", "devDataset"),
    "test-clean.tar.gz": ("http://www.openslr.org/resources/12/test-clean.tar.gz", "testDataset"),
    "test-other.tar.gz": ("http://www.openslr.org/resources/12/test-other.tar.gz", "testDataset"),
    "train-clean-100.tar.gz": ("http://www.openslr.org/resources/12/train-clean-100.tar.gz", "dataset"),
    "train-clean-360.tar.gz": ("http://www.openslr.org/resources/12/train-clean-360.tar.gz", "dataset"),
    "train-other-500.tar.gz": ("http://www.openslr.org/resources/12/train-other-500.tar.gz", "dataset")
}

# Download helper
def download_file(url, filepath):
    """Download ``url`` to ``filepath`` while showing a tqdm progress bar.

    The body is streamed into ``<filepath>.part`` and only renamed to
    ``filepath`` once the transfer has finished. An interrupted or failed
    download therefore never leaves a truncated file at ``filepath`` that a
    later run would mistake for a finished download and skip.

    Args:
        url: HTTP(S) address of the file to fetch.
        filepath: Destination path of the completed download.

    Returns:
        True if ``filepath`` already existed with a non-zero size or was
        downloaded successfully; False if the download failed. Failures are
        printed rather than raised.
    """
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
        print(f"{filepath} 已存在，跳过下载。")
        return True

    partial_path = f"{filepath}.part"
    # 1 MiB chunks: the archives are hundreds of MB to several GB, so 1 KiB
    # chunks would mean millions of tiny writes and progress-bar updates.
    chunk_size = 1024 * 1024
    try:
        # The context manager releases the connection even if streaming fails.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))
            # With a Content-Encoding the decoded byte count legitimately
            # differs from Content-Length, so the size check is skipped then.
            body_is_encoded = bool(response.headers.get('content-encoding'))
            received = 0
            with open(partial_path, 'wb') as f, tqdm(total=total_size, unit='iB', unit_scale=True, desc=filepath) as t:
                for data in response.iter_content(chunk_size):
                    f.write(data)
                    received += len(data)
                    t.update(len(data))
        if total_size and not body_is_encoded and received < total_size:
            raise IOError(f"下载不完整：已接收 {received} / {total_size} 字节")
        # Publish the finished file in a single rename step.
        os.replace(partial_path, filepath)
        return True
    except Exception as e:
        # Deliberate best-effort boundary: report the error and let the caller
        # continue with the remaining files.
        print(f"下载失败：{filepath}，错误：{e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # Nothing was written, or the partial file is already gone.
        return False

# Extraction helper
def _safe_members(tar, extract_path):
    """Yield members of ``tar`` after checking each path stays in ``extract_path``."""
    root = os.path.realpath(extract_path)
    for member in tar:
        target = os.path.realpath(os.path.join(root, member.name))
        if os.path.commonpath([root, target]) != root:
            raise tarfile.TarError(f"归档成员路径超出解压目录：{member.name}")
        yield member


def extract_tar_gz(filepath, extract_path):
    """Extract the gzip-compressed tar archive ``filepath`` into ``extract_path``.

    The archives are fetched over plain HTTP, so they are treated as untrusted:
    where the interpreter supports tarfile extraction filters, the ``"data"``
    filter is applied; otherwise each member path is checked to resolve inside
    ``extract_path`` before it is extracted.

    Args:
        filepath: Path of the ``.tar.gz`` archive.
        extract_path: Directory the archive contents are written into.

    Returns:
        True if extraction completed, False if it failed. Failures are
        printed rather than raised.
    """
    try:
        print(f"正在解压 {filepath} 到 {extract_path}...")
        with tarfile.open(filepath, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # Interpreter supports extraction filters: let the stdlib
                # reject members that would land outside extract_path.
                tar.extractall(path=extract_path, filter="data")
            else:
                # Older interpreter: validate member paths ourselves while
                # streaming through the archive in a single pass.
                tar.extractall(path=extract_path, members=_safe_members(tar, extract_path))
        print(f"解压完成：{filepath}")
        return True
    except Exception as e:
        # Deliberate best-effort boundary: report and let the caller move on.
        print(f"解压失败：{filepath}，错误：{e}")
        return False

# Main flow: for every archive, make sure its target folder exists, download
# the archive into it, and extract it in place once the download succeeded.
for filename, (url, folder) in files.items():
    os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, filename)
    if not download_file(url, filepath):
        # download_file returned False, so there is nothing to extract.
        continue
    extract_tar_gz(filepath, folder)


devDataset/dev-clean.tar.gz: 100%|██████████| 338M/338M [00:07<00:00, 42.7MiB/s] 


正在解压 devDataset/dev-clean.tar.gz 到 devDataset...
解压完成：devDataset/dev-clean.tar.gz


devDataset/dev-other.tar.gz: 100%|██████████| 314M/314M [00:07<00:00, 42.0MiB/s] 


正在解压 devDataset/dev-other.tar.gz 到 devDataset...
解压完成：devDataset/dev-other.tar.gz


testDataset/test-clean.tar.gz: 100%|██████████| 347M/347M [00:09<00:00, 35.5MiB/s] 


正在解压 testDataset/test-clean.tar.gz 到 testDataset...
解压完成：testDataset/test-clean.tar.gz


testDataset/test-other.tar.gz: 100%|██████████| 329M/329M [00:10<00:00, 31.3MiB/s] 


正在解压 testDataset/test-other.tar.gz 到 testDataset...
解压完成：testDataset/test-other.tar.gz


dataset/train-clean-100.tar.gz:  79%|███████▉  | 5.06G/6.39G [02:07<00:29, 45.7MiB/s]