In [4]:
from pathlib import Path
from app.config import load_config

# Resolve the local data directory from the project configuration.
config = load_config()
if config is None:
    raise RuntimeError("Config not found")
dest_path = Path(config["local_path"])

# Map each immediate subfolder of dest_path to the number of regular files
# sitting directly inside it (non-recursive; nested directories are skipped).
folder_file_counts = dict(
    (subdir.name, len([entry for entry in subdir.iterdir() if entry.is_file()]))
    for subdir in dest_path.iterdir()
    if subdir.is_dir()
)

# Bare last expression so the notebook renders the mapping.
folder_file_counts

{'accel_all': 71,
 'fft_hub1': 71,
 'fft_hub2': 71,
 'fft_hub3': 71,
 'fft_hub4': 71,
 'sst_hub1': 71,
 'sst_hub2': 71,
 'sst_hub3': 71,
 'sst_hub4': 71}

In [5]:
from pathlib import Path
from app.config import load_config
import re
from collections import defaultdict

# Resolve the local data directory from the project configuration.
config = load_config()
if config is None:
    raise RuntimeError("Config not found")

dest_path = Path(config["local_path"])

# File names are expected to look like {year}W{week}_<anything>.parquet,
# e.g. 2025W49_fft_hub1.parquet. `match` anchors at the start of the name.
pattern = re.compile(r"(?P<year>\d{4})W(?P<week>\d{2})_.*\.parquet$")

# folder name -> set of (year, week) tuples found in that folder.
# NOTE: an entry is only created when a file name matches, so a subfolder
# without any matching file is absent from this mapping and therefore does
# not constrain the intersection computed below.
weeks_per_folder = defaultdict(set)

for folder in dest_path.iterdir():
    if not folder.is_dir():
        continue

    for f in folder.iterdir():
        if not f.is_file():
            continue

        m = pattern.match(f.name)
        if m:
            year = int(m.group("year"))
            week = int(m.group("week"))
            weeks_per_folder[folder.name].add((year, week))

# Folders that contributed at least one matching file.
all_folders = set(weeks_per_folder.keys())

# Weeks present in every contributing folder, sorted by (year, week).
# `set.intersection(*[])` raises TypeError (the unbound method receives no
# arguments), so fall back to an empty list when nothing matched at all.
week_sets = list(weeks_per_folder.values())
common_weeks = sorted(set.intersection(*week_sets)) if week_sets else []

common_weeks

[(2024, 30),
 (2024, 31),
 (2024, 32),
 (2024, 33),
 (2024, 34),
 (2024, 35),
 (2024, 36),
 (2024, 37),
 (2024, 38),
 (2024, 39),
 (2024, 40),
 (2024, 41),
 (2024, 42),
 (2024, 43),
 (2024, 44),
 (2024, 45),
 (2024, 46),
 (2024, 47),
 (2024, 48),
 (2024, 49),
 (2024, 50),
 (2024, 51),
 (2024, 52),
 (2025, 1),
 (2025, 2),
 (2025, 3),
 (2025, 4),
 (2025, 5),
 (2025, 6),
 (2025, 7),
 (2025, 8),
 (2025, 9),
 (2025, 10),
 (2025, 11),
 (2025, 12),
 (2025, 13),
 (2025, 14),
 (2025, 15),
 (2025, 16),
 (2025, 17),
 (2025, 18),
 (2025, 19),
 (2025, 20),
 (2025, 21),
 (2025, 22),
 (2025, 23),
 (2025, 24),
 (2025, 25),
 (2025, 26),
 (2025, 27),
 (2025, 28),
 (2025, 29),
 (2025, 30),
 (2025, 31),
 (2025, 32),
 (2025, 33),
 (2025, 34),
 (2025, 35),
 (2025, 36),
 (2025, 37),
 (2025, 38),
 (2025, 39),
 (2025, 40),
 (2025, 41),
 (2025, 42),
 (2025, 43),
 (2025, 44),
 (2025, 45),
 (2025, 46),
 (2025, 47),
 (2025, 48)]

In [6]:
import re
from collections import defaultdict
import boto3
from app.config import load_config

# Load the project configuration (endpoint, credentials and bucket name).
config = load_config()
if config is None:
    raise RuntimeError("Config not found")

# Object names are expected to look like {year}W{week}_<anything>.parquet,
# e.g. 2025W49_fft_hub1.parquet. `match` anchors at the start of the string.
pattern = re.compile(r"(?P<year>\d{4})W(?P<week>\d{2})_.*\.parquet$")

# S3-compatible client pointed at the configured endpoint (Cloudflare R2).
s3 = boto3.client(
    "s3",
    endpoint_url=config["endpoint"],
    aws_access_key_id=config["access_key"],
    aws_secret_access_key=config["secret_key"],
)

bucket = config["bucket"]

# prefix (text before the first "/") -> set of (year, week) tuples.
# NOTE: an entry is only created when a key matches, so a prefix without any
# matching object does not constrain the intersection computed below.
weeks_per_prefix = defaultdict(set)

# Paginate so that buckets with more objects than one listing page are
# fully covered.
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=bucket):
    # Pages without objects carry no "Contents" key.
    for obj in page.get("Contents", []):
        key = obj["Key"]

        # Expecting: folder/2025W49_folder.parquet
        if "/" not in key:
            continue

        # Split on the FIRST "/" only: for deeper keys `filename` keeps the
        # remaining path and the regex is applied to that remainder as-is.
        prefix, filename = key.split("/", 1)

        m = pattern.match(filename)
        if not m:
            continue

        year = int(m.group("year"))
        week = int(m.group("week"))
        weeks_per_prefix[prefix].add((year, week))

# Weeks present in ALL contributing prefixes (same logic as local), sorted by
# (year, week). `set.intersection(*[])` raises TypeError (the unbound method
# receives no arguments), so fall back to an empty list when nothing matched.
week_sets = list(weeks_per_prefix.values())
common_bucket_weeks = sorted(set.intersection(*week_sets)) if week_sets else []

common_bucket_weeks

[(2024, 30),
 (2024, 31),
 (2024, 32),
 (2024, 33),
 (2024, 34),
 (2024, 35),
 (2024, 36),
 (2024, 37),
 (2024, 38),
 (2024, 39),
 (2024, 40),
 (2024, 41),
 (2024, 42),
 (2024, 43),
 (2024, 44),
 (2024, 45),
 (2024, 46),
 (2024, 47),
 (2024, 48),
 (2024, 49),
 (2024, 50),
 (2024, 51),
 (2024, 52),
 (2025, 1),
 (2025, 2),
 (2025, 3),
 (2025, 4),
 (2025, 5),
 (2025, 6),
 (2025, 7),
 (2025, 8),
 (2025, 9),
 (2025, 10),
 (2025, 11),
 (2025, 12),
 (2025, 13),
 (2025, 14),
 (2025, 15),
 (2025, 16),
 (2025, 17),
 (2025, 18),
 (2025, 19),
 (2025, 20),
 (2025, 21),
 (2025, 22),
 (2025, 23),
 (2025, 24),
 (2025, 25),
 (2025, 26),
 (2025, 27),
 (2025, 28),
 (2025, 29),
 (2025, 30),
 (2025, 31),
 (2025, 32),
 (2025, 33),
 (2025, 34),
 (2025, 35),
 (2025, 36),
 (2025, 37),
 (2025, 38),
 (2025, 39),
 (2025, 40),
 (2025, 41),
 (2025, 42),
 (2025, 43),
 (2025, 44),
 (2025, 45),
 (2025, 46),
 (2025, 47),
 (2025, 48),
 (2025, 49),
 (2025, 50)]