# FACTR — Ingest (YouTube / Upload → WAV)
**Version:** v2025-09-07_1.0  
**Purpose:** Produce a WAV file and set `AUDIO_PATH`.


## How you'll use this

1. Open FACTR_02_Ingest…ipynb in a fresh Colab.

2. Run cells 0 → 3 to produce a clean AUDIO_PATH at data/processed/*_16k_mono.wav.

3. The next notebook (03_ASR+Diarize) just needs to read AUDIO_PATH (or data/processed/LAST_INGEST.json, which is written in sections 4 and 8) and proceed.



*   Mounts Drive and syncs your repo (private)
*   Installs only the minimal pins needed for ingest
*   Lets you download audio from YouTube (ungated or with cookies.txt) or upload a local file
*   Normalizes audio to WAV / 16 kHz / mono and sets AUDIO_PATH for later notebooks








## 0) Colab + Drive + Repo (standalone startup)

In [None]:
from google.colab import drive

# Mount Google Drive; the repo and the optional cookies file live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Config
# Notebook-level defaults for the source video, the optional cookies file and the
# download log.
# NOTE(review): the ingest and snapshot cells in this notebook assign their own
# YOUTUBE_URL / COOKIES_PATH / yt_log values instead of reading these names —
# confirm whether these constants are still consumed anywhere.
URL = "https://www.youtube.com/watch?v=speFWRuuJNs"  # replace when needed
COOKIES = "/content/drive/MyDrive/COLAB_FOLDER/cookies.txt"  # optional; Netscape format for gated videos
LOG_PATH = "yt_jobs.log.jsonl"


In [None]:
# === FACTR_02 Ingest ‚Äî Colab Startup (private repo on Drive, PAT from Secrets) ===
from google.colab import drive, userdata
import os, urllib.parse, shutil

# 1) Mount Drive
drive.mount('/content/drive')

# 2) Repo info
DRIVE_ROOT = "/content/drive/MyDrive"
REPO_DIR   = f"{DRIVE_ROOT}/FATCR"
OWNER_REPO = "LukmaanViscomi/FATCR"
USERNAME   = "LukmaanViscomi"

# 3) Git identity
!git config --global user.name  "Colab User"
!git config --global user.email "colab@example.com"

# 4) Ensure root and cd
os.makedirs(DRIVE_ROOT, exist_ok=True)
%cd $DRIVE_ROOT

# 5) Clone (private) with PAT from Colab Secrets
pat = userdata.get("GITHUB_PAT")
assert pat, "‚ö†Ô∏è Add your GitHub token in Colab Secrets as key 'GITHUB_PAT'."
enc_pat  = urllib.parse.quote(pat, safe="")
enc_user = urllib.parse.quote(USERNAME, safe="")
AUTH_URL = f"https://{enc_user}:{enc_pat}@github.com/{OWNER_REPO}.git"

# Clean partial clone
if os.path.isdir(REPO_DIR) and not os.path.isdir(os.path.join(REPO_DIR, ".git")):
    shutil.rmtree(REPO_DIR)

if not os.path.isdir(REPO_DIR):
    !git clone $AUTH_URL FATCR

# Keep authenticated remote (so pull/push work without prompts)
%cd $REPO_DIR
!git remote set-url origin $AUTH_URL
!git pull --ff-only || true

# 6) Ensure folders we‚Äôll use
os.makedirs("notebooks", exist_ok=True)
os.makedirs("data/raw", exist_ok=True)
os.makedirs("data/processed", exist_ok=True)

!git status -sb
print("‚úÖ Repo ready:", REPO_DIR)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive
/content/drive/MyDrive/FATCR
From https://github.com/LukmaanViscomi/FATCR
   cbe10aa..6afe6a0  main       -> origin/main
Already up to date.
Refresh index: 100% (3/3), done.
## [32mmain[m...[31morigin/main[m
 [31mM[m notebooks/FACTR_01_Setup_v2025-09-09_test.ipynb
[31m??[m notebooks/FACTR_02_Ingest_v2025-09-07_1.0.ipynb
[31m??[m notebooks/FACTR_03_ASR+Diarize_v2025-09-07_1.0.ipynb
[31m??[m notebooks/FACTR_04_Claims+Embeddings_v2025-09-07_1.0.ipynb
‚úÖ Repo ready: /content/drive/MyDrive/FATCR


## 1) Minimal installs (fast)

In [None]:
%%bash
# Installs the Python packages this notebook imports or shells out to.
# -e/-u/pipefail: stop at the first failing command, unset variable or broken pipe.
set -euo pipefail

# Keep pip modern but below the next breaking change
pip install -q --upgrade "pip<25.3" wheel

# Minimal pins needed for ingest & audio IO
# NOTE(review): the recorded output of this cell shows google-colab 1.0.0 requiring
# pandas==2.2.2 while 2.2.3 is pinned here — confirm the pin is intentional.
pip install -q "numpy==2.0.2" "pandas==2.2.3" "pyarrow>=15,<17"
# NOTE(review): yt-dlp, ffmpeg-python and soundfile are unpinned, so reruns may pick up newer releases.
pip install -q yt-dlp ffmpeg-python soundfile librosa==0.10.2.post1

# Show true breakages only (resolver noise is fine)
# `|| true` keeps a reported conflict from failing the cell under `set -e`.
pip check || true


   ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 1.8/1.8 MB 12.8 MB/s eta 0:00:00
ipython 7.34.0 requires jedi, which is not installed.
google-colab 1.0.0 has requirement pandas==2.2.2, but you have pandas 2.2.3.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.


## Quick check:

In [None]:
import sys, numpy as np, pandas as pd, soundfile as sf, librosa

# Report the versions that were actually imported after the install cell.
print("Python:", sys.version.split()[0])
print("NumPy :", np.__version__, "| Pandas:", pd.__version__)
# sf.__version__ is the python-soundfile package version; the underlying C library
# version is exposed separately as sf.__libsndfile_version__.
print("soundfile:", sf.__version__, "| libsndfile:", sf.__libsndfile_version__)
print("librosa:", librosa.__version__)

Python: 3.12.11
NumPy : 2.0.2 | Pandas: 2.2.3
libsndfile OK: 0.13.1


## 2) Choose one ingest option

## ▸ Option A: YouTube (no cookies, public videos only)

In [None]:
# Set your YouTube URL here (public/ungated only)
YOUTUBE_URL = "https://www.youtube.com/watch?v=speFWRuuJNs"  # <-- change me if needed

!yt-dlp --ignore-config \
  -f bestaudio[ext=m4a]/bestaudio \
  --extract-audio --audio-format wav \
  --no-playlist --no-warnings --restrict-filenames \
  -o "./data/raw/%(id)s.%(ext)s" "$YOUTUBE_URL"

# Pick the newest wav we just created
import glob, os
cand = sorted(glob.glob("data/raw/*.wav"), key=os.path.getmtime)
AUDIO_ORIG = cand[-1] if cand else None
print("AUDIO_ORIG =", AUDIO_ORIG)


## ▸ Option B: YouTube with cookies.txt (for age/region/gated)

In [None]:
# === YouTube download with cookies from Drive ===
YOUTUBE_URL = "https://www.youtube.com/watch?v=speFWRuuJNs"  # <-- change if needed

COOKIES_PATH = "/content/drive/MyDrive/COLAB_FOLDER/cookies.txt"
assert os.path.exists(COOKIES_PATH), f"‚ö†Ô∏è cookies.txt not found at {COOKIES_PATH}"

!yt-dlp --cookies $COOKIES_PATH \
  -f bestaudio[ext=m4a]/bestaudio \
  --extract-audio --audio-format wav \
  --no-playlist --no-warnings --restrict-filenames \
  -o "./data/raw/%(id)s.%(ext)s" "$YOUTUBE_URL"

import glob, os
cand = sorted(glob.glob("data/raw/*.wav"), key=os.path.getmtime)
AUDIO_ORIG = cand[-1] if cand else None
print("AUDIO_ORIG =", AUDIO_ORIG)


[youtube] Extracting URL: https://www.youtube.com/watch?v=speFWRuuJNs
[youtube] speFWRuuJNs: Downloading webpage
[youtube] speFWRuuJNs: Downloading tv simply player API JSON
[youtube] speFWRuuJNs: Downloading tv client config
[youtube] speFWRuuJNs: Downloading player 6740c111-main
[youtube] speFWRuuJNs: Downloading tv player API JSON
[info] speFWRuuJNs: Downloading 1 format(s): 140-6
[download] Sleeping 3.00 seconds as required by the site...
[download] Destination: ./data/raw/speFWRuuJNs.m4a
[K[download] 100% of   27.57MiB in [1;37m00:00:08[0m at [0;32m3.32MiB/s[0m
[FixupM4a] Correcting container of "./data/raw/speFWRuuJNs.m4a"
[ExtractAudio] Destination: ./data/raw/speFWRuuJNs.wav
Deleting original file ./data/raw/speFWRuuJNs.m4a (pass -k to keep)
AUDIO_ORIG = data/raw/speFWRuuJNs.wav


## ▸ Option C: Upload a local audio/video file

In [None]:
# Option C: upload a local file through the browser and move it into data/raw/.
from google.colab import files, output
import os, glob
import shutil

print("Upload an audio/video file (mp3/mp4/m4a/wav/mov...)")
up = files.upload()

uploaded_paths = list(up.keys())
print("Uploaded:", uploaded_paths)

# Fail with a clear message instead of an IndexError when nothing was uploaded.
assert uploaded_paths, "No file was uploaded. Re-run this cell and choose a file."

# Move the first uploaded file into data/raw/ (any further uploads are left where they are).
os.makedirs("data/raw", exist_ok=True)
src = uploaded_paths[0]
dst = os.path.join("data/raw", os.path.basename(src))
shutil.move(src, dst)
AUDIO_ORIG = dst
print("AUDIO_ORIG =", AUDIO_ORIG)


## 3) Normalize to WAV / 16 kHz / mono (for ASR)

In [None]:
import os, subprocess, shlex, librosa, soundfile as sf

# Refuse to continue unless one of the ingest options produced a source file.
assert AUDIO_ORIG and os.path.exists(AUDIO_ORIG), "No source audio found. Run Option A/B/C first."

# Output lands in data/processed/ as <source stem>_16k_mono.wav
TARGET_SR = 16000
stem, _ext = os.path.splitext(os.path.basename(AUDIO_ORIG))
AUDIO_PATH = os.path.join("data/processed", f"{stem}_16k_mono.wav")

# librosa resamples to TARGET_SR and downmixes to mono while loading;
# soundfile then writes the samples as 16-bit PCM.
samples, loaded_sr = librosa.load(AUDIO_ORIG, sr=TARGET_SR, mono=True)
sf.write(AUDIO_PATH, samples, TARGET_SR, subtype="PCM_16")

duration_sec = len(samples) / TARGET_SR
print("‚úÖ Normalized WAV written:", AUDIO_PATH)
print("Duration (sec):", round(duration_sec, 2))



‚úÖ Normalized WAV written: data/processed/speFWRuuJNs_16k_mono.wav
Duration (sec): 1786.2


(If you prefer ffmpeg for conversion instead, swap in this one-liner:)

In [None]:
# Alternative using ffmpeg (uncomment to use)
# !ffmpeg -y -i "$AUDIO_ORIG" -ac 1 -ar 16000 -sample_fmt s16 "$AUDIO_PATH"


## 4) Persist artifact + show where the next notebook will read from

In [None]:
import json, os, time

# Pointer file for the next notebook. Paths are stored absolute so they stay valid
# whatever the reader's working directory is, and so they pass the os.path.isabs
# check in this notebook's verification cell.
meta = {
  "when": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),  # UTC timestamp
  "audio_orig": os.path.abspath(AUDIO_ORIG),
  "audio_path": os.path.abspath(AUDIO_PATH),
  "note": "This AUDIO_PATH will be used by FACTR_03_ASR+Diarize."
}
# Explicit encoding keeps the file independent of the runtime's default locale.
with open("data/processed/LAST_INGEST.json", "w", encoding="utf-8") as f:
  json.dump(meta, f, indent=2)
print("üìù Wrote data/processed/LAST_INGEST.json")
print("‚û°Ô∏è  Use AUDIO_PATH in the next notebook:", AUDIO_PATH)


üìù Wrote data/processed/LAST_INGEST.json
‚û°Ô∏è  Use AUDIO_PATH in the next notebook: data/processed/speFWRuuJNs_16k_mono.wav


## 5) (Optional) Quick listen / peek

In [None]:
from IPython.display import Audio, display

# Build an inline player for the normalized WAV and render it in the cell output.
player = Audio(filename=AUDIO_PATH, rate=16000)
display(player)


Output hidden; open in https://colab.research.google.com to view.

## 6) (Optional) Push helper (commit ingest artifacts & notebooks)

In [None]:
# === FACTR push (PAT from Colab Secrets, safe rebase) ===
from google.colab import userdata
import urllib.parse, os, subprocess

REPO_DIR = "/content/drive/MyDrive/FATCR"
os.chdir(REPO_DIR)

print("üìÇ Repo status:")
!git status -sb

pat = userdata.get("GITHUB_PAT")
assert pat, "Missing GITHUB_PAT in Colab Secrets."
enc_pat = urllib.parse.quote(pat, safe="")
REMOTE_URL = f"https://LukmaanViscomi:{enc_pat}@github.com/LukmaanViscomi/FATCR.git"

print("\nüîÑ Pulling latest (rebase, autostash)‚Ä¶")
!git pull --rebase --autostash {REMOTE_URL} main || true

# Track processed metadata + notebooks by default (raw audio can be large; keep or drop as you prefer)
!git add notebooks data/processed *.md .gitignore .github 2>/dev/null || true

changed = subprocess.run(["git","diff","--cached","--quiet"]).returncode != 0
if changed:
    msg = "Ingest artifacts and notebooks"
    print("\n‚úèÔ∏è Committing:", msg)
    !git commit -m "{msg}"
else:
    print("\n‚ÑπÔ∏è Nothing to commit.")

print("\n‚¨ÜÔ∏è Pushing to main‚Ä¶")
!git push {REMOTE_URL} HEAD:main
print("\n‚úÖ Push complete.")


üìÇ Repo status:
## [32mmain[m...[31morigin/main[m
 [31mM[m notebooks/FACTR_01_Setup_v2025-09-09_test.ipynb
[31m??[m data/
[31m??[m notebooks/FACTR_02_Ingest_v2025-09-07_1.0.ipynb
[31m??[m notebooks/FACTR_03_ASR+Diarize_v2025-09-07_1.0.ipynb
[31m??[m notebooks/FACTR_04_Claims+Embeddings_v2025-09-07_1.0.ipynb

üîÑ Pulling latest (rebase, autostash)‚Ä¶
From https://github.com/LukmaanViscomi/FATCR
 * branch            main       -> FETCH_HEAD
Already up to date.

‚ÑπÔ∏è Nothing to commit.

‚¨ÜÔ∏è Pushing to main‚Ä¶
Everything up-to-date

‚úÖ Push complete.


## 7) Smoke test → confirms the audio was really downloaded and has a valid size.

In [None]:
# === Smoke test (verify AUDIO_PATH) ===
import os

# The processed file must exist and be larger than MIN_BYTES (10,000 bytes).
# The checks short-circuit, so getsize is only called on an existing path.
MIN_BYTES = 10_000
audio_ok = (
    bool(AUDIO_PATH)
    and os.path.exists(AUDIO_PATH)
    and os.path.getsize(AUDIO_PATH) > MIN_BYTES
)
assert audio_ok, "‚ö†Ô∏è AUDIO_PATH invalid or too small."

size_bytes = os.path.getsize(AUDIO_PATH)
print("‚úÖ Ingest smoke test passed:", AUDIO_PATH, size_bytes, "bytes")


‚úÖ Ingest smoke test passed: data/processed/speFWRuuJNs_16k_mono.wav 57158286 bytes


## 8) Snapshot → saves a JSON file in snapshots/ (under repo) with metadata.

In [None]:
# === Snapshot (log environment + ingest details) ===
import json, time, os


def _dump_json(path, payload):
    """Write `payload` to `path` as UTF-8 JSON with 2-space indentation."""
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2)


# Snapshot payload: UTC timestamp plus the absolute path of the processed WAV.
snap = {
    "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "audio_path": os.path.abspath(AUDIO_PATH),
}

# Optional: last line of the download log. Stays None when the log is missing,
# empty, or cannot be read.
yt_log = "yt_jobs.log.jsonl"
log_tail = None
if os.path.exists(yt_log):
    try:
        with open(yt_log, "r", encoding="utf-8") as fh:
            log_lines = fh.read().splitlines()
        if log_lines:
            log_tail = log_lines[-1]
    except Exception:
        log_tail = None
snap["log_tail"] = log_tail

# Two copies are written under the repo: a timestamped snapshot and a fixed-name pointer.
ROOT = "/content/drive/MyDrive/FATCR"
snapshots_dir = os.path.join(ROOT, "snapshots")
os.makedirs(snapshots_dir, exist_ok=True)

snapshot_path = os.path.join(snapshots_dir, f"INGEST_SNAPSHOT_{int(time.time())}.json")
_dump_json(snapshot_path, snap)

# Stable pointer used by FACTR_03
last_path = os.path.join(ROOT, "data", "processed", "LAST_INGEST.json")
os.makedirs(os.path.dirname(last_path), exist_ok=True)
_dump_json(last_path, snap)

print("üóÇÔ∏è Snapshot saved:", snapshot_path)
print("üìå LAST_INGEST.json updated ‚Üí", snap["audio_path"])



## Quick verification cell (run right after)

In [None]:
import json, os

# Re-read the pointer file from disk and check that the path it records is usable.
last = "/content/drive/MyDrive/FATCR/data/processed/LAST_INGEST.json"
assert os.path.exists(last), "LAST_INGEST.json not found"

with open(last, "r", encoding="utf-8") as fh:
    meta = json.loads(fh.read())

recorded_path = meta["audio_path"]
print("AUDIO_PATH:", recorded_path)

# The recorded path must be absolute and must point at an existing file.
assert os.path.isabs(recorded_path), "AUDIO_PATH is not absolute"
assert os.path.exists(recorded_path), "AUDIO_PATH does not exist on disk"
print("‚úÖ LAST_INGEST.json looks good.")
