## 📦 Imports

In [None]:
from IPython.display import clear_output
from IPython.display import clear_output, display
from IPython.display import display, Markdown
from datasets import load_from_disk
from torch.nn.functional import softmax
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import Wav2Vec2Processor
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import time
import torch

## 📁 Set Working Directory

In [None]:
# Work from the project root so that relative paths used in later cells
# (e.g. "models/wav2vec2/v1_bisaya") resolve correctly.
# The default location is machine-specific; set the BISAYA_STT_ROOT
# environment variable to point at the project on any other machine.
PROJECT_ROOT = os.environ.get(
    "BISAYA_STT_ROOT", "C:/Users/ibo.kylegregory/bisaya-stt-module"
)
os.chdir(PROJECT_ROOT)


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:/Users/ibo.kylegregory/bisaya-stt-module'

## 📊 Show Latest WER

In [None]:
def show_latest_wer():
    """Display the last five lines of the validation-metrics file as Markdown.

    Reads the module-level ``metrics_file`` path. Shows a warning when the
    file does not exist and a notice when it exists but contains no lines.
    """
    if os.path.exists(metrics_file):
        with open(metrics_file, "r", encoding="utf-8") as handle:
            records = handle.readlines()
            if records:
                # Only the tail of the file is shown, inside a fenced block
                # so the raw markdown table/text is rendered verbatim.
                tail = "".join(records[-5:])
                display(Markdown("### 🧾 Latest WER Metrics"))
                display(Markdown("```\n" + tail + "\n```"))
            else:
                display(Markdown("🟡 No WER data yet."))
    else:
        display(Markdown("⚠️ `validation_metrics.md` not found yet."))


# Render the current metrics once when the cell runs.
show_latest_wer()


## 📉 Plot Loss

In [None]:
# Cell 4.1 — Optional: Smoothing and Auto-refresh Plot

def _plot_loss_snapshot(log_path, smoothing_window):
    """Draw one raw + smoothed loss plot from the CSV at ``log_path``.

    Prints a short notice instead of plotting when the file is missing or
    holds no rows. The CSV is expected to provide ``step`` and ``loss``
    columns.
    """
    try:
        df = pd.read_csv(log_path)
    except FileNotFoundError:
        print("⚠️ Log file not found.")
        return
    except pd.errors.EmptyDataError:
        # read_csv raises (instead of returning an empty frame) when the file
        # has no header line at all, e.g. it was created but not yet written.
        print("🟡 Log file exists but is empty.")
        return

    if df.empty:
        print("🟡 Log file exists but is empty.")
        return

    df["smoothed_loss"] = df["loss"].rolling(smoothing_window).mean()
    fig = plt.figure(figsize=(10, 4))
    plt.plot(df["step"], df["loss"], label="Raw Loss", alpha=0.4)
    plt.plot(
        df["step"],
        df["smoothed_loss"],
        label=f"Smoothed (window={smoothing_window})",
        linewidth=2,
    )
    plt.title("📉 Training Loss Over Time")
    plt.xlabel("Training Step")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True)
    plt.show()
    # The loop creates a new figure on every refresh; close it explicitly so
    # figures do not pile up on backends that keep them open after show().
    plt.close(fig)


def auto_refresh_plot_loss(log_path=None, refresh_interval=60, smoothing_window=5):
    """Re-plot the training loss every time the loss log file changes.

    Runs until interrupted (Ctrl+C / kernel interrupt), polling the file's
    modification time every ``refresh_interval`` seconds.

    Args:
        log_path: Path to the loss CSV. When ``None`` (the default), the
            module-level ``loss_log`` is looked up at call time, so this
            function can be defined before ``loss_log`` exists.
        refresh_interval: Seconds to sleep between polls.
        smoothing_window: Window size for the rolling-mean smoothed curve.
    """
    if log_path is None:
        # Resolved lazily: a ``log_path=loss_log`` default is evaluated when
        # the ``def`` runs and raises NameError if the constant is not set yet.
        log_path = loss_log

    print(f"🔄 Auto-refreshing loss plot every {refresh_interval}s (Ctrl+C to stop)...")
    last_modified = 0

    try:
        while True:
            # A missing/unreadable file counts as mtime 0. Using try/except
            # avoids the race where the file vanishes between an existence
            # check and the getmtime call.
            try:
                current_modified = os.path.getmtime(log_path)
            except OSError:
                current_modified = 0

            if current_modified != last_modified:
                clear_output(wait=True)
                _plot_loss_snapshot(log_path, smoothing_window)
                last_modified = current_modified

            time.sleep(refresh_interval)
    except KeyboardInterrupt:
        print("🛑 Stopped auto-refresh plot.")


NameError: name 'loss_log' is not defined

## 🔄 Define WER Auto-Refresh

In [None]:
# Auto-refresh WER Display
def auto_refresh_wer(md_path=metrics_file, refresh_interval=60):
    """Re-display the latest WER metrics whenever the metrics file changes.

    Runs until interrupted (Ctrl+C / kernel interrupt), polling the file's
    modification time every ``refresh_interval`` seconds.

    Args:
        md_path: File whose modification time is watched. NOTE: this only
            controls change detection; ``show_latest_wer()`` is called
            without arguments and decides on its own which file it reads.
        refresh_interval: Seconds to sleep between polls.
    """
    print(f"🔄 Auto-refreshing WER every {refresh_interval}s (Ctrl+C to stop)...")
    seen_mtime = 0

    try:
        while True:
            # A missing file is treated as mtime 0, so nothing is redrawn
            # until the file first appears.
            if os.path.exists(md_path):
                mtime = os.path.getmtime(md_path)
            else:
                mtime = 0

            if mtime != seen_mtime:
                clear_output(wait=True)
                show_latest_wer()
                seen_mtime = mtime

            time.sleep(refresh_interval)
    except KeyboardInterrupt:
        print("🛑 Stopped auto-refresh WER display.")


## ▶️ Run Auto-Refresh (Loss Plot, then WER)

In [None]:
# Start the loss-plot refresher. It loops until interrupted, so the next
# statement only runs after this call has been stopped with Ctrl+C / a
# kernel interrupt.
auto_refresh_plot_loss(refresh_interval=60)

# Then start the WER refresher, which likewise loops until interrupted.
auto_refresh_wer(refresh_interval=60)


In [None]:
TRAIN_SIZE = 3080  # adjust based on your dataset['train'] count
BATCH_SIZE = 1
GRAD_ACCUM = 4

# Optimizer steps per epoch: TRAIN_SIZE divided by the effective batch size
# (BATCH_SIZE * GRAD_ACCUM), rounded up via negated floor division.
STEPS_PER_EPOCH = -(-TRAIN_SIZE // (BATCH_SIZE * GRAD_ACCUM))


## 📂 Find Latest Checkpoint

In [None]:

CHECKPOINT_DIR = "models/wav2vec2/v1_bisaya"
POLL_INTERVAL = 60  # in seconds


def get_latest_checkpoint(path=CHECKPOINT_DIR):
    """Return the highest step among ``checkpoint-<step>`` sub-directories.

    Args:
        path: Directory to scan for checkpoint folders.

    Returns:
        The largest ``<step>`` as an ``int``, or ``None`` when ``path`` is not
        an existing directory or contains no checkpoint folders.
    """
    # isdir (not exists) so that a plain file at ``path`` yields None instead
    # of making os.listdir raise.
    if not os.path.isdir(path):
        return None

    steps = []
    for entry in os.listdir(path):
        # fullmatch: names with trailing text such as "checkpoint-500.bak" or
        # "checkpoint-500-old" must not be mistaken for real checkpoints.
        found = re.fullmatch(r"checkpoint-(\d+)", entry)
        if found and os.path.isdir(os.path.join(path, entry)):
            steps.append(int(found.group(1)))
    return max(steps, default=None)


## 📡 Monitor Checkpoint Progress

In [None]:
def monitor_training_progress(interval=POLL_INTERVAL, patience_minutes=10):
    """Poll for new checkpoints and report progress until training stalls.

    Every ``interval`` seconds the latest checkpoint step is looked up via
    ``get_latest_checkpoint()`` and printed together with an epoch estimate
    (``step / STEPS_PER_EPOCH``). The loop ends when the step has not changed
    for ``patience_minutes`` worth of consecutive polls, or when interrupted
    (Ctrl+C / kernel interrupt).

    Args:
        interval: Seconds to sleep between polls.
        patience_minutes: Minutes without a new checkpoint before giving up.
    """
    print("📡 Monitoring training progress...")
    last_step = None
    stagnant_counter = 0
    # Number of consecutive unchanged polls that add up to the patience window.
    max_stagnant_checks = patience_minutes * 60 // interval

    try:
        while True:
            clear_output(wait=True)
            current_step = get_latest_checkpoint()

            if current_step is None:
                # Patience is not applied here: the loop keeps waiting for the
                # first checkpoint to appear.
                print("❌ No checkpoints found.")
            else:
                timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
                estimated_epoch = current_step / STEPS_PER_EPOCH
                print(f"[{timestamp}] ✅ Checkpoint step: {current_step}")
                print(f"📊 Estimated epoch: {estimated_epoch:.2f}")

                if last_step == current_step:
                    stagnant_counter += 1
                    print(f"⚠️ No new checkpoints. Stagnant for {stagnant_counter * interval} seconds.")
                    if stagnant_counter >= max_stagnant_checks:
                        print("🚨 Training may have stopped. No progress detected.")
                        break
                else:
                    stagnant_counter = 0

                last_step = current_step

            time.sleep(interval)
    except KeyboardInterrupt:
        # Same stop behaviour as the auto-refresh loops in this notebook:
        # end quietly with a message instead of a traceback.
        print("🛑 Stopped monitoring training progress.")


## ▶️ Run Checkpoint Monitor

In [None]:
# Poll every 5 minutes. With a 10-minute patience this allows
# 10 * 60 // 300 = 2 consecutive polls without a new checkpoint before the
# monitor reports that training may have stopped.
monitor_training_progress(interval=300, patience_minutes=10)


## 🔡 Inspect Tokenizer Labels

In [None]:

# Constants
DATASET_VERSION = "v1_training_ready_grapheme"
PROCESSOR_VERSION = "v1_grapheme"
MAX_INPUT_LENGTH_SEC = 15

# Build the artifact paths from the version constants above so that changing
# a version actually changes what gets loaded (the paths previously repeated
# the version names as literals and ignored the constants).
MODULE_ROOT = r"C:\Users\ibo.kylegregory\bisaya-stt-module"
PROCESSOR_DIR = os.path.join(MODULE_ROOT, "processor", PROCESSOR_VERSION)
DATASET_DIR = os.path.join(MODULE_ROOT, "data", "processed", DATASET_VERSION)

# === Load your processor ===
processor = Wav2Vec2Processor.from_pretrained(
    PROCESSOR_DIR,
    local_files_only=True
)

# === Load dataset ===
raw_dataset = load_from_disk(DATASET_DIR)

# Longest allowed input, in samples (sampling rate * seconds).
max_len = int(processor.feature_extractor.sampling_rate * MAX_INPUT_LENGTH_SEC)
filtered_dataset = raw_dataset.filter(lambda x: len(x["input_values"]) <= max_len)

# NOTE(review): no seed is passed, so the held-out split — and therefore the
# sample inspected below — may differ between runs; confirm that is intended.
dataset = filtered_dataset["train"].train_test_split(test_size=0.1)

# === Sample: Inspect a label sequence ===
sample = dataset["test"][0]  # or adjust index
label_ids = sample["labels"]

# === Convert label IDs to tokens ===
tokens = processor.tokenizer.convert_ids_to_tokens(label_ids)
decoded_text = processor.tokenizer.decode(label_ids, skip_special_tokens=True)

print("🔢 Label IDs:", label_ids)
print("🔡 Tokens:", tokens)
print("📝 Decoded Text:", decoded_text)


## 🌀 Auto-Refresh: WER

In [None]:

# 🔄 Auto-refresh WER Display
def auto_refresh_wer(md_path=metrics_file, refresh_interval=60):
    """Re-display the latest WER metrics whenever the metrics file changes.

    Runs until interrupted (Ctrl+C / kernel interrupt), polling the file's
    modification time every ``refresh_interval`` seconds.

    Args:
        md_path: File whose modification time is watched. NOTE: this only
            controls change detection; ``show_latest_wer()`` is called
            without arguments and decides on its own which file it reads.
        refresh_interval: Seconds to sleep between polls.
    """
    print(f"🔄 Auto-refreshing WER every {refresh_interval}s (Ctrl+C to stop)...")
    seen_mtime = 0

    try:
        while True:
            # A missing file is treated as mtime 0, so nothing is redrawn
            # until the file first appears.
            if os.path.exists(md_path):
                mtime = os.path.getmtime(md_path)
            else:
                mtime = 0

            if mtime != seen_mtime:
                clear_output(wait=True)
                show_latest_wer()
                seen_mtime = mtime

            time.sleep(refresh_interval)
    except KeyboardInterrupt:
        print("🛑 Stopped auto-refresh WER display.")


## 🌀 Auto-Refresh: Loss

In [None]:

# 🔄 Auto-refresh Loss Plot
def _plot_loss_snapshot(log_path, smoothing_window):
    """Draw one raw + smoothed loss plot from the CSV at ``log_path``.

    Prints a short notice instead of plotting when the file is missing or
    holds no rows. The CSV is expected to provide ``step`` and ``loss``
    columns.
    """
    try:
        df = pd.read_csv(log_path)
    except FileNotFoundError:
        print("⚠️ Log file not found.")
        return
    except pd.errors.EmptyDataError:
        # read_csv raises (instead of returning an empty frame) when the file
        # has no header line at all, e.g. it was created but not yet written.
        print("🟡 Log file exists but is empty.")
        return

    if df.empty:
        print("🟡 Log file exists but is empty.")
        return

    df["smoothed_loss"] = df["loss"].rolling(smoothing_window).mean()
    fig = plt.figure(figsize=(10, 4))
    plt.plot(df["step"], df["loss"], label="Raw Loss", alpha=0.4)
    plt.plot(
        df["step"],
        df["smoothed_loss"],
        label=f"Smoothed (window={smoothing_window})",
        linewidth=2,
    )
    plt.title("📉 Training Loss Over Time")
    plt.xlabel("Training Step")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True)
    plt.show()
    # The loop creates a new figure on every refresh; close it explicitly so
    # figures do not pile up on backends that keep them open after show().
    plt.close(fig)


def auto_refresh_plot_loss(log_path=None, refresh_interval=60, smoothing_window=5):
    """Re-plot the training loss every time the loss log file changes.

    Runs until interrupted (Ctrl+C / kernel interrupt), polling the file's
    modification time every ``refresh_interval`` seconds.

    Args:
        log_path: Path to the loss CSV. When ``None`` (the default), the
            module-level ``loss_log`` is looked up at call time, so this
            function can be defined before ``loss_log`` exists.
        refresh_interval: Seconds to sleep between polls.
        smoothing_window: Window size for the rolling-mean smoothed curve.
    """
    if log_path is None:
        # Resolved lazily: a ``log_path=loss_log`` default is evaluated when
        # the ``def`` runs and raises NameError if the constant is not set yet.
        log_path = loss_log

    print(f"🔄 Auto-refreshing loss plot every {refresh_interval}s (Ctrl+C to stop)...")
    last_modified = 0

    try:
        while True:
            # A missing/unreadable file counts as mtime 0. Using try/except
            # avoids the race where the file vanishes between an existence
            # check and the getmtime call.
            try:
                current_modified = os.path.getmtime(log_path)
            except OSError:
                current_modified = 0

            if current_modified != last_modified:
                clear_output(wait=True)
                _plot_loss_snapshot(log_path, smoothing_window)
                last_modified = current_modified

            time.sleep(refresh_interval)
    except KeyboardInterrupt:
        print("🛑 Stopped auto-refresh plot.")
