In [12]:
import datasets
from pathlib import Path
from huggingface_hub import list_repo_files, hf_hub_download

def get_lfs_files(repo_id: str):
    """
    Retrieve a list of LFS files from a Hugging Face repository.

    Args:
        repo_id: The repository ID to check for LFS files.

    Returns:
        A list of LFS files.
    """
    files = list_repo_files(repo_id=repo_id)
    lfs_files = [file for file in files if file.endswith(('.bin', '.pt', '.model', '.tar.gz', '.zip', '.ckpt'))]
    return lfs_files

def download_dataset_with_lfs(dataset_name: str, save_dir: str, repo_id: str = None):
    """
    Download a Hugging Face dataset and associated LFS files, and save them locally.

    Args:
        dataset_name: Name of the dataset on Hugging Face.
        save_dir: Directory to save the dataset and LFS files.
        repo_id: The repository ID from which to download the LFS files.

    Example:
        download_dataset_with_lfs("imdb", "./data/", "username/dataset_name")
    """
    if repo_id is None:
        repo_id = f"{dataset_name}"
    # Load the dataset from Hugging Face
    dataset = datasets.load_dataset(dataset_name)

    # Convert to a pandas DataFrame and save each split
    save_path = Path(save_dir)
    save_path.mkdir(parents=True, exist_ok=True)

    for split in dataset.keys():
        dataset[split].to_csv(save_path / f"{dataset_name}_{split}.csv", index=False)

    # Retrieve and download LFS files
    lfs_files = get_lfs_files(repo_id=repo_id)
    for file_name in lfs_files:
        hf_hub_download(repo_id=repo_id, filename=file_name, local_dir=save_path)


In [13]:
download_dataset_with_lfs("Wild-Heart/Disney-VideoGeneration-Dataset", "Videos/Disney")

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 588.84ba/s]


RepositoryNotFoundError: 401 Client Error. (Request ID: Root=1-67143659-625831112e9e66ee28cbeb35;ce920c64-1e3b-4bb1-ab15-8e611b0e6378)

Repository Not Found for url: https://huggingface.co/api/models/Wild-Heart/Disney-VideoGeneration-Dataset/tree/main?recursive=True&expand=False.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.
Invalid username or password.