In [1]:
import os
import json
import requests
# Path to your JSON file
json_path = "bg_noises.json"

# Base directory where the audio files live
# If they're in the same folder as the notebook, use "."
# With the empty string, os.path.join(base_dir, rel_path) returns rel_path
# unchanged, so paths resolve against the current working directory.
base_dir = ""

In [2]:
# Parse the metadata file into `data`; later cells iterate over it.
with open(json_path, encoding="utf-8") as json_file:
    data = json.loads(json_file.read())

print(f"Loaded {len(data)} entries from {json_path}")
data[:2]  # quick peek

Loaded 10 entries from bg_noises.json


[{'id': 'BG_01',
  'file_name': 'birdsong_and_wind_through_trees_in_stockwood_park.wav',
  'path': 'final/birdsong_and_wind_through_trees_in_stockwood_park.wav',
  'title': 'Gentle Park Sounds',
  'intensity': 'quiet',
  'environment': 'outdoors',
  'description': "This audio features calming sounds of birdsong and wind rustling through trees. The overall atmosphere is peaceful and relaxing, creating a serene backdrop. It's a pleasant soundscape perfect for unwinding.",
  'url': 'https://raw.githubusercontent.com/KarthikAvinashFI/bg_noises/main/final/birdsong_and_wind_through_trees_in_stockwood_park.wav',
  's3_url': ''},
 {'id': 'BG_02',
  'file_name': 'car_driving_ambience.wav',
  'path': 'final/car_driving_ambience.wav',
  'title': 'Car Driving Ambiance',
  'intensity': 'moderate',
  'environment': 'vehicle',
  'description': "This audio features the consistent sound of a car driving on a road. You can hear the engine hum and the low rumble of the tires on the asphalt. It's a contin

## Check local files

In [3]:
# Verify that every entry's "path" points at a regular file on disk.
# `existing` collects the resolved paths that were found; `missing` collects
# one diagnostic dict per entry that has no path or whose file is absent.
missing = []
existing = []

for item in data:
    # assuming structure: {"id": "BG_01", "path": "final/<name>.wav", ...}
    entry_id = item.get("id")
    rel_path = item.get("path")
    if not rel_path:
        # Record the id so an entry with no path can still be identified.
        missing.append({"id": entry_id, "path": rel_path, "reason": "missing path key"})
        continue

    full_path = os.path.join(base_dir, rel_path)
    # isfile (not exists): a directory at this path is not a usable audio file.
    if os.path.isfile(full_path):
        existing.append(full_path)
    else:
        missing.append({"id": entry_id, "path": rel_path, "full_path_tried": full_path})

print(f"Existing files: {len(existing)}")
print(f"Missing files: {len(missing)}")

if missing:
    print("\nMissing entries:")
    for m in missing:
        print(m)

Existing files: 10
Missing files: 0


## Test GitHub URLs

In [4]:
# Probe every entry's GitHub "url" and sort the ids into `ok` (HTTP 200) or
# `bad` (missing url, non-200 status, or a request error), printing one line
# per entry and a summary at the end.
ok = []
bad = []

for item in data:
    bg_id = item.get("id")
    url = item.get("url")
    if not url:
        print(f"{bg_id}: MISSING url")
        bad.append((bg_id, "missing url"))
        continue

    try:
        # Try HEAD first (cheap); fall back to GET if needed
        resp = requests.head(url, allow_redirects=True, timeout=10)
        status = resp.status_code
        resp.close()
        if status != 200:
            # stream=True avoids downloading the body just to read the status.
            # A streamed response holds its connection until it is closed, so
            # use the context manager to release it back to the pool.
            with requests.get(url, stream=True, timeout=10) as resp:
                status = resp.status_code
    except Exception as e:
        # Best-effort check: record the failure and keep going with the rest.
        print(f"{bg_id}: ERROR {e} ({url})")
        bad.append((bg_id, str(e)))
        continue

    if status == 200:
        print(f"{bg_id}: OK ({url})")
        ok.append(bg_id)
    else:
        print(f"{bg_id}: FAIL {status} ({url})")
        bad.append((bg_id, f"HTTP {status}"))

print("\nSummary:")
print(f"  OK:   {len(ok)}")
print(f"  FAIL: {len(bad)}")


BG_01: OK (https://raw.githubusercontent.com/KarthikAvinashFI/bg_noises/main/final/birdsong_and_wind_through_trees_in_stockwood_park.wav)
BG_02: OK (https://raw.githubusercontent.com/KarthikAvinashFI/bg_noises/main/final/car_driving_ambience.wav)
BG_03: OK (https://raw.githubusercontent.com/KarthikAvinashFI/bg_noises/main/final/exterior_ambiance_parking_lot_with_birds.wav)
BG_04: OK (https://raw.githubusercontent.com/KarthikAvinashFI/bg_noises/main/final/fan_hum.wav)
BG_05: OK (https://raw.githubusercontent.com/KarthikAvinashFI/bg_noises/main/final/grocery_store_ambience_new.wav)
BG_06: OK (https://raw.githubusercontent.com/KarthikAvinashFI/bg_noises/main/final/office_ambience_mix.wav)
BG_07: OK (https://raw.githubusercontent.com/KarthikAvinashFI/bg_noises/main/final/people_at_airport_room.wav)
BG_08: OK (https://raw.githubusercontent.com/KarthikAvinashFI/bg_noises/main/final/shopping_mall.wav)
BG_09: OK (https://raw.githubusercontent.com/KarthikAvinashFI/bg_noises/main/final/traffic_j

## Test `s3_url` presence and compare GitHub vs S3 content

In [5]:
import json
import hashlib
import requests

# Metadata file read by this cell; same filename as `json_path` in the first cell.
JSON_PATH = "bg_noises.json"

def fetch_hash_and_size(url: str, chunk_size: int = 8192):
    """Stream the body at ``url`` and fingerprint it.

    The response is read in pieces of up to ``chunk_size`` bytes so the whole
    payload never has to sit in memory at once.

    Returns:
        A ``(size_bytes, sha256_hex)`` tuple for the downloaded content.

    Raises:
        requests.HTTPError: via ``raise_for_status()`` on a 4xx/5xx response.
    """
    digest = hashlib.sha256()
    total_bytes = 0
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        # filter(None, ...) drops empty chunks before they are counted/hashed.
        for block in filter(None, response.iter_content(chunk_size)):
            digest.update(block)
            total_bytes += len(block)
    return total_bytes, digest.hexdigest()

# Re-read the metadata, then for every entry that carries an s3_url download
# both copies and compare their size and SHA-256 digest.
with open(JSON_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# Result buckets, each holding entry ids (errors holds (id, message) pairs).
no_s3, compared, mismatches, errors = [], [], [], []

for item in data:
    bg_id = item.get("id")
    url = item.get("url")
    s3_url = (item.get("s3_url") or "").strip()

    # Nothing to compare against when the S3 link is absent or blank.
    if s3_url == "":
        no_s3.append(bg_id)
        continue

    print(f"\nChecking {bg_id}:")
    print(f"  GitHub: {url}")
    print(f"  S3:     {s3_url}")

    try:
        github_size, github_sha = fetch_hash_and_size(url)
        bucket_size, bucket_sha = fetch_hash_and_size(s3_url)
    except Exception as exc:
        print(f"  ERROR fetching: {exc}")
        errors.append((bg_id, str(exc)))
    else:
        # Both digest and byte count must agree for the copies to match.
        identical = (github_sha, github_size) == (bucket_sha, bucket_size)
        compared.append(bg_id)

        print(f"  GitHub: size={github_size}, sha256={github_sha}")
        print(f"  S3:     size={bucket_size}, sha256={bucket_sha}")
        print(f"  MATCH:  {identical}")

        if not identical:
            mismatches.append(bg_id)

print("\nSummary:")
print(f"  Entries without s3_url: {len(no_s3)} -> {no_s3}")
print(f"  Compared (GitHub vs S3): {len(compared)}")
print(f"  Mismatches:              {len(mismatches)} -> {mismatches}")
print(f"  Errors:                  {len(errors)} -> {errors}")



Summary:
  Entries without s3_url: 10 -> ['BG_01', 'BG_02', 'BG_03', 'BG_04', 'BG_05', 'BG_06', 'BG_07', 'BG_08', 'BG_09', 'BG_10']
  Compared (GitHub vs S3): 0
  Mismatches:              0 -> []
  Errors:                  0 -> []


In [None]:
# fan_hum: 
# shopping_mall:
# busy_office_no_people_loop:
