In [8]:
from pathlib import Path
from dotenv import load_dotenv

def load_env_vars():
    # load ~/.secrets/.llm_apis
    load_dotenv(dotenv_path=Path.home() / ".secrets/.llm_apis")

    # load project .env
    load_dotenv()
    
load_env_vars()

In [None]:
from langchain_core.tools import tool

@tool
def get_cached_dataset_path(repo_id: str):
    """ Get the path to a cached dataset from Hugging Face Hub.
    Args:
        repo_id (str): The repository ID of the dataset on Hugging Face Hub.
    Returns:
        Path: The path to the cached dataset.
    """
    from huggingface_hub import snapshot_download
    import os
    from pathlib import Path
    
    # Load HF Token
    HF_TOKEN = os.getenv("HF_TOKEN")
    if HF_TOKEN is None:
        raise ValueError("HF_TOKEN environment variable is not set. Please set it before running the script.")
    
    # get path to cached dataset
    path = snapshot_download(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)

    return Path(path)

@tool
def get_glob_files(repo_id):
    """
    Get list of files in a directory of all patterns"""

    path: Path = get_cached_dataset_path.invoke(repo_id)

    files = []
    for _ in path.rglob("*"):
        if not _.is_file():
            raise ValueError(f"Path {path} is not a file. Please provide a valid file path.")
        
        print(_)
        files.append(_)
        
    return files


In [None]:

REPO_ID = "zayanhugsAI/pulsenet"

In [9]:
get_cached_dataset_path(REPO_ID)

Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 197379.01it/s]


PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107')

In [10]:
get_glob_files(REPO_ID)

Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 205855.41it/s]

/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_2001-2005.xlsx
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_2020-2023.xlsx
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Data Dictionary.xlsx
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_1996-2000.xlsx
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/.gitattributes
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_2016-2019.xlsx
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_2006-2010.xlsx
/home/zayan/.cache/huggingfa


