In [None]:
from pathlib import Path
from dotenv import load_dotenv

def load_env_vars():
    """Load environment variables from dotenv files.

    Two ``load_dotenv`` calls are made, in this order: first with the
    user-level file ``~/.secrets/.llm_apis``, then with no explicit path
    (the project ``.env``).
    """
    user_secrets_file = Path.home() / ".secrets/.llm_apis"
    load_dotenv(dotenv_path=user_secrets_file)
    load_dotenv()
    
# Load the dotenv files when this cell runs; later cells read HF_TOKEN from
# the environment via os.getenv.
load_env_vars()

def load_text(file_path, encoding="utf-8"):
    """Read a text file and return its entire contents as a single string.

    Args:
        file_path: Path of the file to read (anything ``open`` accepts).
        encoding: Text encoding used to decode the file. Defaults to
            ``"utf-8"`` so the result does not depend on the platform's
            locale encoding.

    Returns:
        str: The full contents of the file.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
        UnicodeDecodeError: If the contents are not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()

def load_data_manifest(path_to_manifest_file) -> dict[str, dict[str, str]]:
    """
    Load a JSON data manifest file and return its parsed contents.

    Args:
        path_to_manifest_file: Path to the manifest JSON file.

    Returns:
        The object decoded from the JSON file. The return annotation describes
        the expected layout (dataset name -> {field name: value}); the decoded
        content itself is not validated here.

    Raises:
        FileNotFoundError: If ``path_to_manifest_file`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    import json
    import os

    if not os.path.exists(path_to_manifest_file):
        raise FileNotFoundError(f"Manifest file not found: {path_to_manifest_file}")

    # Decode explicitly as UTF-8 (the standard JSON encoding) instead of
    # relying on the platform's locale encoding.
    with open(path_to_manifest_file, 'r', encoding='utf-8') as f:
        manifest: dict = json.load(f)

    return manifest

def get_data_repoIDs(path_to_manifest_file):
    """
    Map each dataset named in the manifest file to its ``repo_id``.

    Manifest entries that have no 'repo_id' key are left out of the result.
    """
    repo_ids = {}
    for dataset, info in load_data_manifest(path_to_manifest_file).items():
        if 'repo_id' in info:
            repo_ids[dataset] = info['repo_id']

    return repo_ids

In [36]:
from langchain_core.tools import tool

@tool
def get_cached_dataset_path(repo_id: str):
    """ Return the local cache path of a dataset from Hugging Face Hub.
    Args:
        repo_id (str): The repository ID of the dataset on Hugging Face Hub.
    Returns:
        Path: The path to the cached dataset.
    """
    import os
    from pathlib import Path
    from huggingface_hub import snapshot_download

    # The Hub token is read from the environment; refuse to continue without it.
    hub_token = os.getenv("HF_TOKEN")
    if hub_token is None:
        raise ValueError("HF_TOKEN environment variable is not set. Please set it before running the script.")

    # snapshot_download returns the local directory as a string; hand back a Path.
    snapshot_dir = snapshot_download(repo_id=repo_id, repo_type="dataset", token=hub_token)
    return Path(snapshot_dir)

@tool
def get_glob_files(repo_id: str) -> list[Path]:
    """
    Get the list of all files in the cached dataset of a Hugging Face repository.

    Args:
        repo_id (str): The repository ID of the dataset on Hugging Face Hub.

    Returns:
        list[Path]: Every file found recursively under the cached dataset
        directory. Directories are skipped, not reported.
    """

    path: Path = get_cached_dataset_path.invoke(repo_id)

    files = []
    for entry in path.rglob("*"):
        # rglob("*") yields sub-directories as well as files; skip them so a
        # dataset with nested folders is listed instead of raising.
        if not entry.is_file():
            continue

        print(entry)
        files.append(entry)

    return files

@tool
def find_csv_excel_files(root_dir: Path) -> list[Path]:
    """
    Recursively find all CSV and Excel files in a directory.

    Args:
        root_dir (Path): Root directory to search.

    Returns:
        List[Path]: List of file paths with .csv, .xls, or .xlsx extensions
        (matched case-insensitively). Only files are returned.

    Raises:
        NotADirectoryError: If root_dir is not an existing directory.
    """
    if not root_dir.is_dir():
        raise NotADirectoryError(f"{root_dir} is not a valid directory")

    exts = {'.csv', '.xls', '.xlsx'}
    # Check is_file() as well: a directory whose name ends in a matching
    # suffix must not be reported as a data file.
    return [f for f in root_dir.rglob("*") if f.suffix.lower() in exts and f.is_file()]


In [28]:

# Hugging Face dataset repository ID used by the cells that follow.
REPO_ID = "zayanhugsAI/pulsenet"

In [29]:
# Call the tool through .invoke(...), as the other tool calls in this notebook
# do; calling a langchain tool object directly is deprecated.
get_cached_dataset_path.invoke({"repo_id": REPO_ID})

Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 187454.93it/s]


PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107')

In [35]:
# Call the tools through .invoke(...), as the other tool calls in this
# notebook do; calling a langchain tool object directly is deprecated.
location = get_cached_dataset_path.invoke({"repo_id": REPO_ID})
get_glob_files.invoke({"repo_id": REPO_ID})

Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 185383.60it/s]
Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 171196.08it/s]

/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_2001-2005.xlsx
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_2020-2023.xlsx
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Data Dictionary.xlsx
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_1996-2000.xlsx
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/.gitattributes
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_2016-2019.xlsx
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_2006-2010.xlsx
/home/zayan/.cache/huggingfa




[PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_2001-2005.xlsx'),
 PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_2020-2023.xlsx'),
 PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Data Dictionary.xlsx'),
 PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_1996-2000.xlsx'),
 PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/.gitattributes'),
 PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107/Export_2016-2019.xlsx'),
 PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet

In [12]:
# Switch to the social vulnerability index dataset and resolve its cache path.
# The tool is called through .invoke(...); direct calls on a langchain tool
# object are deprecated.
REPO_ID = "zayanhugsAI/social_vulnerability_index"
get_cached_dataset_path.invoke({"repo_id": REPO_ID})

Fetching 3 files: 100%|██████████| 3/3 [00:00<00:00, 100663.30it/s]


PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--social_vulnerability_index/snapshots/9bb6f44cb04883ba56fd228c4ca50b922d0a4e1f')

In [14]:
# List the files of the dataset currently named by REPO_ID. The tool is called
# through .invoke(...); direct calls on a langchain tool object are deprecated.
get_glob_files.invoke({"repo_id": REPO_ID})

Fetching 3 files: 100%|██████████| 3/3 [00:00<00:00, 101475.10it/s]

/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--social_vulnerability_index/snapshots/9bb6f44cb04883ba56fd228c4ca50b922d0a4e1f/SVI_2022_US_county.csv
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--social_vulnerability_index/snapshots/9bb6f44cb04883ba56fd228c4ca50b922d0a4e1f/SVI_2020_US_county.csv
/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--social_vulnerability_index/snapshots/9bb6f44cb04883ba56fd228c4ca50b922d0a4e1f/.gitattributes





[PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--social_vulnerability_index/snapshots/9bb6f44cb04883ba56fd228c4ca50b922d0a4e1f/SVI_2022_US_county.csv'),
 PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--social_vulnerability_index/snapshots/9bb6f44cb04883ba56fd228c4ca50b922d0a4e1f/SVI_2020_US_county.csv'),
 PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--social_vulnerability_index/snapshots/9bb6f44cb04883ba56fd228c4ca50b922d0a4e1f/.gitattributes')]

In [42]:
import pandas as pd
import chardet

REPO_ID = "zayanhugsAI/census_population"
# Call the tool through .invoke(...), consistent with find_csv_excel_files
# below; direct calls on a langchain tool object are deprecated.
location = get_cached_dataset_path.invoke({"repo_id": REPO_ID})
print(location)
data_files = find_csv_excel_files.invoke({'root_dir': location})
print(data_files)

# Fail with a clear message instead of an IndexError when nothing was found.
if not data_files:
    raise FileNotFoundError(f"No CSV or Excel files found under {location}")

# Preview only the first file returned.
# NOTE(review): read_excel assumes this first file is an Excel workbook — true
# for the recorded output of this dataset; confirm before reusing elsewhere.
data_file = data_files[0]

df = pd.read_excel(data_file)
df.head()

Fetching 3 files: 100%|██████████| 3/3 [00:00<00:00, 86778.70it/s]

/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--census_population/snapshots/5a1680c29c93aae1cce72c3443c1afed4ec631cb
[PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--census_population/snapshots/5a1680c29c93aae1cce72c3443c1afed4ec631cb/2020-2023.xlsx'), PosixPath('/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--census_population/snapshots/5a1680c29c93aae1cce72c3443c1afed4ec631cb/states/mo_2020_to_2023.xlsx')]





Unnamed: 0,table with row headers in column A and column headers in rows 3 through 4 (leading dots indicate sub-parts),Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,Annual Estimates of the Resident Population fo...,,,,,
1,Geographic Area,"April 1, 2020 Estimates Base",Population Estimate (as of July 1),,,
2,,,2020,2021.0,2022.0,2023.0
3,United States,331464948,331526933,332048977.0,333271411.0,334914895.0
4,".Autauga County, Alabama",58809,58915,59203.0,59726.0,60342.0


In [17]:
# NOTE(review): the output recorded for this cell (county-level columns such as
# ST, STATE, FIPS) does not match the frame assigned to `df` in the cell above,
# so `df` here presumably came from a different, earlier execution — re-run the
# notebook top to bottom to confirm.
df

Unnamed: 0,ST,STATE,ST_ABBR,STCNTY,COUNTY,FIPS,LOCATION,AREA_SQMI,E_TOTPOP,M_TOTPOP,...,EP_ASIAN,MP_ASIAN,EP_AIAN,MP_AIAN,EP_NHPI,MP_NHPI,EP_TWOMORE,MP_TWOMORE,EP_OTHERRACE,MP_OTHERRACE
0,1,Alabama,AL,1001,Autauga County,1001,"Autauga County, Alabama",594.454786,58761,0,...,1.1,0.4,0.1,0.1,0.0,0.1,3.3,1.0,0.2,0.3
1,1,Alabama,AL,1003,Baldwin County,1003,"Baldwin County, Alabama",1589.861817,233420,0,...,0.9,0.1,0.2,0.1,0.0,0.1,3.1,0.4,0.4,0.3
2,1,Alabama,AL,1005,Barbour County,1005,"Barbour County, Alabama",885.007619,24877,0,...,0.5,0.1,0.3,0.1,0.0,0.1,1.8,0.7,1.2,0.8
3,1,Alabama,AL,1007,Bibb County,1007,"Bibb County, Alabama",622.469286,22251,0,...,0.3,0.4,0.1,0.1,0.0,0.2,1.7,1.0,0.1,0.1
4,1,Alabama,AL,1009,Blount County,1009,"Blount County, Alabama",644.890376,59077,0,...,0.2,0.2,0.1,0.1,0.2,0.2,2.8,0.7,0.1,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3139,56,Wyoming,WY,56037,Sweetwater County,56037,"Sweetwater County, Wyoming",10426.975770,42079,0,...,0.7,0.3,0.9,0.3,0.1,0.1,2.2,0.8,0.9,0.7
3140,56,Wyoming,WY,56039,Teton County,56039,"Teton County, Wyoming",3996.846933,23346,0,...,1.3,0.5,0.1,0.2,0.1,0.1,2.3,1.0,0.7,0.7
3141,56,Wyoming,WY,56041,Uinta County,56041,"Uinta County, Wyoming",2081.719807,20546,0,...,0.2,0.2,0.1,0.1,0.0,0.2,2.9,0.4,0.0,0.2
3142,56,Wyoming,WY,56043,Washakie County,56043,"Washakie County, Wyoming",2238.678265,7725,0,...,0.4,0.4,0.3,0.4,0.0,0.5,3.2,0.7,0.3,0.3


In [9]:
import os

# Absolute, machine-specific location of the data manifest.
manifest_path = "/home/zayan/Documents/code/personal_repos/research_agentic_salmonella_langgraph/data_manifest.json"
# Stop early with an explicit message when the manifest is missing.
if not os.path.exists(manifest_path):
    raise FileNotFoundError(f"Manifest file does not exist at {manifest_path}. Please check the path.")

# load_text returns the file's raw contents, so `manifest` is an unparsed
# JSON string here (not a dict).
manifest = load_text(manifest_path)
manifest


'{\n    "census_population": {\n        "cache_location": "/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--census_population/snapshots/5a1680c29c93aae1cce72c3443c1afed4ec631cb",\n        "repo_id": "zayanhugsAI/census_population"\n    },\n    "naco": {\n        "cache_location": "/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--naco/snapshots/654328b4f5b5aa3150d26fb4e5e669b77b0419ef",\n        "repo_id": "zayanhugsAI/naco"\n    },\n    "nors": {\n        "cache_location": "/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--nors/snapshots/470514dec8d88f26a7186344f7f511fa372ddd8e",\n        "repo_id": "zayanhugsAI/nors"\n    },\n    "pulsenet": {\n        "cache_location": "/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--pulsenet/snapshots/93b6ba578b340211d67ea353e77ed917ad798107",\n        "repo_id": "zayanhugsAI/pulsenet"\n    },\n    "socioecono_salmonella": {\n        "cache_location": "/home/zayan/.cache/huggingface/hub/datasets--zayanhugsAI--soci

In [12]:
import os

# Absolute, machine-specific location of the data manifest.
manifest_path = "/home/zayan/Documents/code/personal_repos/research_agentic_salmonella_langgraph/data_manifest.json"
# Stop early with an explicit message when the manifest is missing.
if not os.path.exists(manifest_path):
    raise FileNotFoundError(f"Manifest file does not exist at {manifest_path}. Please check the path.")

# Parse the manifest JSON into a Python object.
manifest = load_data_manifest(manifest_path)

# NOTE(review): get_df_heads_from_manifest is not defined in the cells visible
# here, and the recorded output of this cell is an ImportError — confirm where
# it is defined/imported before relying on this cell.
df_heads = get_df_heads_from_manifest(manifest)
df_heads

ImportError: attempted relative import with no known parent package