<a href="https://colab.research.google.com/github/GalJakob/NLP/blob/main/post_asr_20250922_15_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### THIS CELL is for installations and setup ###

### FIRST THINGS BEFORE STARTING : ###
# open terminal
# copy this: hf auth login
# copy this: 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
# press y in "add token as git credentials?"
!pip install -U datasets
!pip install transformers datasets evaluate --quiet
!pip install jiwer
!pip install torchcodec 

In [40]:
### THIS CELL is for global constants and hyper parameters for H200 + ByT5-small###

import torch
# ---- Data / paths ----

DATASET_DIR = "combined_asr_dataset"
INPUT_COL   = "asr_output"
TARGET_COL  = "sentence"
VAL_SIZE    = 0.10
SEED        = 42

# ---- Model / output ----
MODEL_NAME  = "google/byt5-small"
OUTPUT_DIR  = "checkpoints/byt5_postasr_h200"
PRECISION   = torch.bfloat16  # Hopper: prefer bf16 over fp16/fp32
MODEL_TAG = "byt5-small"       
GPU_TAG   =  "H200"

# ---- Tokenization (ByT5: bytes ≈ tokens)
# Your percentiles: inputs P99≈402 (+prefix) → 416; targets P99.5≈632 → 640
MAX_INPUT_LEN   =  408     # encoder cap
MAX_TARGET_LEN  = 544    # decoder / labels cap
TRUNCATION_SIDE = "right"
PADDING_SIDE    = "right"
PAD_TO_MULTIPLE_OF = 8     # tensor cores happy


# ---- Training (optimizer/schedule) ----
# H200 has ample VRAM; you can run bigger batches even with long targets.
TRAIN_BATCH_SIZE = 64
EVAL_BATCH_SIZE  = 64
GRADIENT_ACCUMULATION_STEPS = 1          # effective batch = 64 (single GPU)
LEARNING_RATE   = 1e-4                   # good default for T5 fine-tuning
WEIGHT_DECAY    = 0.01
WARMUP_RATIO    = 0.06
LR_SCHEDULER    = "linear"
LABEL_SMOOTHING = 0.0
GROUP_BY_LENGTH = True
GRADIENT_CLIP_NORM = 1.0

# Choose ONE of these two stopping modes:
NUM_TRAIN_EPOCHS = 3                      # typical fine-tune: 2–5 epochs
MAX_STEPS        = -1                     # set >0 to override epochs

# ---- Eval / save / logging ----
EVALUATION_STRATEGY = "steps"
EVAL_STEPS          = 2000                # evaluate ~every 2k steps
SAVE_STRATEGY       = "steps"
SAVE_STEPS          = 2000
SAVE_TOTAL_LIMIT    = 3
LOGGING_STRATEGY    = "steps"
LOGGING_STEPS       = 100
LOAD_BEST_MODEL_AT_END = True
METRIC_FOR_BEST_MODEL  = "wer"
GREATER_IS_BETTER      = False
REPORT_TO              = "none"           # set "tensorboard"/"wandb" if you use them
EARLY_STOPPING_PATIENCE = 3

# ---- Generation (used when predict_with_generate=True)
PREDICT_WITH_GENERATE = True
GEN_MAX_NEW_TOKENS    = MAX_TARGET_LEN    # cap decoder output
GEN_NUM_BEAMS         = 1                 # greedy is standard for WER

# If you want sampling/penalties, set via model.generation_config (Trainer ignores here):
DO_SAMPLE             = False
NO_REPEAT_NGRAM_SIZE  = 0
REPETITION_PENALTY    = 1.0
TEMPERATURE           = 1.0
TOP_P                 = 1.0

# ---- Trainer misc ----
REMOVE_UNUSED_COLUMNS = True
LABEL_NAMES           = ["labels"]


In [None]:
### THIS CELL is for global constants and hyper parameters for A40 + ByT5-small###

import torch
# ---- Data / paths ----
DATASET_DIR = "combined_asr_dataset"
INPUT_COL   = "asr_output"
TARGET_COL  = "sentence"
VAL_SIZE    = 0.10
SEED        = 42

# ---- Model / output ----
MODEL_NAME  = "google/byt5-small"
OUTPUT_DIR  = "checkpoints/byt5_postasr_a40"
PRECISION   = torch.float16   # use "bf16" only if your stack/device supports it reliably
MODEL_TAG = "byt5-small"       
GPU_TAG   =  "A40"

# ---- Tokenization (ByT5: bytes ≈ tokens)
# Your percentiles: inputs P99≈402 (+prefix) → 416; targets P99.5≈632 → 640
MAX_INPUT_LEN   = 416      # encoder cap
MAX_TARGET_LEN  = 640      # decoder / labels cap
TRUNCATION_SIDE = "right"
PADDING_SIDE    = "right"
PAD_TO_MULTIPLE_OF = 8     # tensor cores happy

# ---- Training (optimizer/schedule) ----
# Safe defaults for 48 GB VRAM with long decoder sequences.
# Effective train batch = PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS * num_gpus
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE  = 32
GRADIENT_ACCUMULATION_STEPS = 2          # effective train batch = 16*2=32 on 1 GPU
LEARNING_RATE   = 1e-4
WEIGHT_DECAY    = 0.01
WARMUP_RATIO    = 0.03
LR_SCHEDULER    = "linear"
LABEL_SMOOTHING = 0.0
GROUP_BY_LENGTH = True
GRADIENT_CLIP_NORM = 1.0

# Choose ONE of these two stopping modes:
NUM_TRAIN_EPOCHS = 3
MAX_STEPS        = -1

# ---- Eval / save / logging ----
EVALUATION_STRATEGY = "steps"
EVAL_STEPS          = 2000
SAVE_STRATEGY       = "steps"
SAVE_STEPS          = 2000
SAVE_TOTAL_LIMIT    = 3
LOGGING_STRATEGY    = "steps"
LOGGING_STEPS       = 100
LOAD_BEST_MODEL_AT_END = True
METRIC_FOR_BEST_MODEL  = "wer"
GREATER_IS_BETTER      = False
REPORT_TO              = "none"
EARLY_STOPPING_PATIENCE = 3

# ---- Generation (used when predict_with_generate=True)
PREDICT_WITH_GENERATE = True
GEN_MAX_NEW_TOKENS    = MAX_TARGET_LEN    # cap decoder output
GEN_NUM_BEAMS         = 1                 # greedy is standard for WER

# If you want sampling/penalties, set via model.generation_config (Trainer ignores here):
DO_SAMPLE             = False
NO_REPEAT_NGRAM_SIZE  = 0
REPETITION_PENALTY    = 1.0
TEMPERATURE           = 1.0
TOP_P                 = 1.0

# ---- Trainer misc ----
REMOVE_UNUSED_COLUMNS = True
LABEL_NAMES           = ["labels"]


In [41]:
### THIS CELL is for loading datasets made with different ASR's ###

import torch
from datasets import load_from_disk

ds = load_from_disk(DATASET_DIR)
splits = ds.train_test_split(test_size=0.1, seed=42)
training_data = splits["train"]
val_data      = splits["test"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def _row_ok(x):
    src = (x.get("asr_output") or "").strip()
    tgt = (x.get("sentence") or "").strip()
    return len(src) > 0 and len(tgt) > 0

training_data = training_data.filter(_row_ok)
val_data = val_data.filter(_row_ok)



In [42]:
### THIS CELL is for tokenizer adjustments ###
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def _clean(s):
    s = "" if s is None else str(s)
    return " ".join(s.split()).strip()

def preprocess_batch(batch):
    srcs = [f"fix mistakes: {_clean(x)}" for x in batch["asr_output"]]
    tgts = [_clean(x) for x in batch["sentence"]]

    # 1) Encode inputs
    model_inputs = tokenizer(
        srcs,
        truncation=True,
        max_length=MAX_INPUT_LEN,
        padding=False,               
        return_attention_mask=True,
    )

    # 2) Encode targets 
    target_enc = tokenizer(
        text_target=tgts,
        truncation=True,
        max_length=MAX_TARGET_LEN,
        padding=False,
    )
    model_inputs["labels"] = target_enc["input_ids"]

    return model_inputs

training_data = training_data.filter(lambda ex: ex["asr_output"] and ex["sentence"])
val_data = val_data.filter(lambda ex: ex["asr_output"] and ex["sentence"])

tokenized_training_dataset = training_data.map(
    preprocess_batch, batched=True, remove_columns=training_data.column_names
)
tokenized_test_dataset = val_data.map(
    preprocess_batch, batched=True, remove_columns=val_data.column_names
)

print("dataset tokenized")


dataset tokenized


In [43]:
### THIS CELL is for defining what's needed for training ###

import torch
from evaluate import load as load_metric
from transformers import (
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
)
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from torch.nn import CrossEntropyLoss
import math

torch.cuda.empty_cache()

# model loading 
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME,dtype=torch.bfloat16, ignore_mismatched_sizes=True).to(device)

# data_collator for dynamic padding
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,   
    pad_to_multiple_of=PAD_TO_MULTIPLE_OF      
)

# DataLoader to handle batch size and,tokenized dataset, and to collate
train_loader = DataLoader(tokenized_training_dataset,shuffle=True, batch_size=TRAIN_BATCH_SIZE, collate_fn=data_collator)
test_loader = DataLoader(tokenized_test_dataset,shuffle=False, batch_size=EVAL_BATCH_SIZE, collate_fn=data_collator)


# to compute num_warmup_steps and num_training_steps
train_size = len(tokenized_training_dataset)
steps_per_epoch = math.ceil(train_size / (TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS))
total_steps = steps_per_epoch * NUM_TRAIN_EPOCHS if MAX_STEPS <= 0 else MAX_STEPS
warmup_steps = int(WARMUP_RATIO * total_steps)
print(train_size,steps_per_epoch,warmup_steps)

# optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE,weight_decay=WEIGHT_DECAY)
lr_scheduler = get_scheduler(LR_SCHEDULER, optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)


# WER metric definition function
wer_metric = load_metric("wer")
def compute_wer_from_logits(logits, labels):
    pred_ids = torch.argmax(logits, dim=-1)
    labels = labels.clone()
    labels[labels == -100] = tokenizer.pad_token_id
    decoded_preds  = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return wer_metric.compute(predictions=decoded_preds, references=decoded_labels)


print("finished defining what's needed for training")

89948 1406 253
finished defining what's needed for training


In [None]:
### THIS CELL is for function saving model during training ###

from pathlib import Path

def save_epoch_checkpoint(model, tokenizer, epoch, base_dir = "checkpoints", model_tag = MODEL_TAG,gpu_tag = GPU_TAG):
    save_path = Path(base_dir) / f"{model_tag}_{gpu_tag}_epoch{epoch+1}"
    save_path.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"[checkpoint] Saved epoch {epoch+1} to {save_path.resolve()}")

In [None]:
### THIS CELL is for training on H200 - new,chatgpt generated ###
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from pathlib import Path
from tqdm import tqdm

torch.cuda.empty_cache()

# ---------- ONE-TIME GENERATION DEFAULTS ----------
model.generation_config.num_beams = 1        # greedy
model.generation_config.do_sample = False    # deterministic
model.generation_config.max_new_tokens = 640 # cap
model.generation_config.eos_token_id = tokenizer.eos_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id

# ---- cap eval time ----
MAX_EVAL_BATCHES = 50   # <--- evaluate only first 50 batches each epoch

print("training started")
train_losses, val_losses, wer_scores = [], [], []

for epoch in range(NUM_TRAIN_EPOCHS):
    # ------------------- TRAIN -------------------
    model.train()
    train_loop = tqdm(train_loader, leave=True, desc=f"Train | Epoch {epoch}")
    for batch in train_loop:
        optimizer.zero_grad()

        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        )
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()

        train_loop.set_postfix(loss=float(loss.item()))
        train_losses.append(float(loss.item()))

    print(f"Finished train epoch {epoch}")

    # ------------------- VALIDATION (WITH GENERATE, CAPPED) -------------------
    model.eval()
    pred_texts, ref_texts = [], []
    epoch_val_losses = []

    with torch.inference_mode():
        val_loop = tqdm(test_loader, leave=True, desc=f"Val   | Epoch {epoch}")
        for i, batch in enumerate(val_loop):
            if i >= MAX_EVAL_BATCHES:
                val_loop.set_postfix(info=f"stopped at {MAX_EVAL_BATCHES} batches")
                break

            # Forward pass only for loss (teacher forcing)
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=batch["labels"].to(device),
            )
            val_loss = float(outputs.loss.item())
            epoch_val_losses.append(val_loss)

            # Deterministic decoding for WER
            gen_ids = model.generate(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                return_dict_in_generate=False,
            )

            # Decode predictions
            preds = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)

            # Prepare and decode references (replace -100 with pad before decoding)
            labels = batch["labels"].clone()
            labels[labels == -100] = tokenizer.pad_token_id
            refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

            preds = [p.strip() for p in preds]
            refs  = [r.strip() for r in refs]

            pred_texts.extend(preds)
            ref_texts.extend(refs)

            val_loop.set_postfix(loss=val_loss, n_preds=len(pred_texts))

    # Aggregate val loss and compute WER once per epoch (on the capped subset)
    mean_val_loss = (sum(epoch_val_losses) / len(epoch_val_losses)) if epoch_val_losses else float("nan")
    val_losses.append(mean_val_loss)

    try:
        wer = compute_wer(pred_texts, ref_texts)
    except NameError:
        try:
            from jiwer import wer as jiwer_wer
            wer = jiwer_wer(ref_texts, pred_texts)
        except Exception:
            wer = float("nan")
    wer_scores.append(float(wer))

    print(f"Finished validation epoch {epoch} | ValLoss(subset): {mean_val_loss:.4f} | WER(subset): {wer:.4f}")

    # ------------------- CHECKPOINT -------------------
    save_epoch_checkpoint(
        model, tokenizer, epoch,
        base_dir="checkpoints",
        model_tag=MODEL_TAG,
        gpu_tag=GPU_TAG
    )
    print(f"saved model at epoch {epoch}")

# Optional: final summary
print(f"Training complete. Epochs: {NUM_TRAIN_EPOCHS}, "
      f"Last ValLoss(subset): {val_losses[-1]:.4f} | Last WER(subset): {wer_scores[-1]:.4f}")


In [45]:
### THIS CELL is for training -old ###
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
from pathlib import Path
from tqdm import tqdm
torch.cuda.empty_cache()
model.train()
print("training started")
train_losses = []
val_losses = []
wer_scores = []

for epoch in range(NUM_TRAIN_EPOCHS):  
    loop = tqdm(train_loader, leave=True)
    val_loop = tqdm(test_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device)
        )
        loss = outputs.loss
        loss.backward()
        
        # Gradient accumulation
        optimizer.step()
        lr_scheduler.step()

        # Update progress bar
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())
        train_losses.append(loss.item())
        
    print(f"Finished train epoch {epoch}")
    loop = tqdm(train_loader, leave=True)
    for batch in val_loop:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device)
        )
        loss = outputs.loss
        pred_ids = torch.argmax(outputs.logits, dim=-1)
        wer = compute_wer_from_logits(outputs.logits, batch['labels'].to(device))
        
        # Update progress bar
        val_loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())
        val_loop.set_postfix(wer=wer, loss = loss.item())
        val_losses.append(loss.item())
        wer_scores.append(wer)
    print(f"Finished validation epoch {epoch}")
    save_epoch_checkpoint(model, tokenizer, epoch,base_dir = "checkpoints", model_tag = MODEL_TAG,gpu_tag = GPU_TAG)
    print(f"saved model at epoch {epoch}")
    



Epoch 0:  90%|████████▉ | 1260/1406 [05:14<00:34,  4.27it/s, loss=1.16][A
Epoch 0:  90%|████████▉ | 1260/1406 [05:14<00:34,  4.27it/s, loss=1.19][A
Epoch 0:  90%|████████▉ | 1261/1406 [05:14<00:35,  4.08it/s, loss=1.19][A
Epoch 0:  90%|████████▉ | 1261/1406 [05:14<00:35,  4.08it/s, loss=1.19][A
Epoch 0:  90%|████████▉ | 1261/1406 [05:14<00:35,  4.08it/s, loss=1.17][A
Epoch 0:  90%|████████▉ | 1262/1406 [05:14<00:36,  3.96it/s, loss=1.17][A
Epoch 0:  90%|████████▉ | 1262/1406 [05:14<00:36,  3.96it/s, loss=1.17][A
Epoch 0:  90%|████████▉ | 1262/1406 [05:14<00:36,  3.96it/s, loss=1.2] [A
Epoch 0:  90%|████████▉ | 1263/1406 [05:14<00:37,  3.83it/s, loss=1.2][A
Epoch 0:  90%|████████▉ | 1263/1406 [05:15<00:37,  3.83it/s, loss=1.2][A
Epoch 0:  90%|████████▉ | 1263/1406 [05:15<00:37,  3.83it/s, loss=1.12][A
Epoch 0:  90%|████████▉ | 1264/1406 [05:15<00:36,  3.92it/s, loss=1.12][A
Epoch 0:  90%|████████▉ | 1264/1406 [05:15<00:36,  3.92it/s, loss=1.12][A
Epoch 0:  90%|████████▉ | 1

Finished train epoch 0



Epoch 0:   0%|          | 0/157 [05:54<?, ?it/s]
Epoch 0:   1%|          | 1/157 [05:57<15:21:36, 354.46s/it, loss=1.18, wer=1.05]
Epoch 0:   1%|▏         | 2/157 [06:02<6:22:23, 148.03s/it, loss=1.13, wer=1.04] 
Epoch 0:   2%|▏         | 3/157 [06:06<3:31:11, 82.28s/it, loss=1.19, wer=1.07] 
Epoch 0:   3%|▎         | 4/157 [06:10<2:11:06, 51.42s/it, loss=1.2, wer=1.1]  
Epoch 0:   3%|▎         | 5/157 [06:14<1:27:04, 34.37s/it, loss=1.12, wer=1.02]
Epoch 0:   4%|▍         | 6/157 [06:18<1:00:36, 24.08s/it, loss=1.14, wer=1.06]
Epoch 0:   4%|▍         | 7/157 [06:22<43:52, 17.55s/it, loss=1.19, wer=1.06]  
Epoch 0:   5%|▌         | 8/157 [06:26<32:54, 13.25s/it, loss=1.18, wer=1.01]
Epoch 0:   6%|▌         | 9/157 [06:29<25:42, 10.42s/it, loss=1.17, wer=1]   
Epoch 0:   6%|▋         | 10/157 [06:33<19:44,  8.06s/it, loss=1.17, wer=1.04]
Epoch 0:   7%|▋         | 11/157 [06:37<16:33,  6.81s/it, loss=1.23, wer=1.05]
Epoch 0:   8%|▊         | 12/157 [06:41<14:27,  5.98s/it, loss=1.21, we

KeyboardInterrupt: 

In [None]:
### THIS CELL is for saving model ###

from pathlib import Path
import re
import torch

# Derive a short, filesystem-safe GPU tag (or set GPU_TAG manually)
def _gpu_tag():
    if torch.cuda.is_available():
        name = torch.cuda.get_device_name(0)
    else:
        name = "cpu"
    tag = re.sub(r"[^A-Za-z0-9]+", "_", name).strip("_").lower()
    return tag

MODEL_TAG = Path(MODEL_NAME).name         
GPU_TAG   = _gpu_tag()       

SAVE_DIR = Path.cwd() / "checkpoints" / f"{MODEL_TAG}__{GPU_TAG}"
SAVE_DIR.mkdir(parents=True, exist_ok=True)

model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print(f"Saved to: {SAVE_DIR.resolve()}")


In [None]:
### THIS CELL is for plotting loss,val,WER ###

import matplotlib.pyplot as plt

plt.plot(wer_scores, marker='o')
plt.title("wer across all batches")
plt.xlabel("batch")
plt.ylabel("WER")
plt.grid(True)
plt.show()

plt.plot(train_losses, marker='o')
plt.title("train loss across all batches")
plt.xlabel("batch")
plt.ylabel("Train Loss")
plt.grid(True)
plt.show()

plt.plot(val_losses, marker='o')
plt.title("validation loss across all batches")
plt.xlabel("batch")
plt.ylabel("Validation Loss")
plt.grid(True)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
train_losses_log = np.log(train_losses)
val_losses_log = np.log(val_losses)
wer_scores_log = np.log(wer_scores)

plt.plot(train_losses_log, marker='o')
plt.title("train loss across all batches")
plt.xlabel("batch")
plt.ylabel("Train Loss")
plt.grid(True)
plt.show()

plt.plot(val_losses_log, marker='o')
plt.title("validation loss across all batches")
plt.xlabel("batch")
plt.ylabel("Validation Loss")
plt.grid(True)
plt.show()

plt.plot(wer_scores_log, marker='o')
plt.title("wer across all batches")
plt.xlabel("batch")
plt.ylabel("WER")
plt.grid(True)
plt.show()

## Statistics Over Data

In [None]:

from evaluate import load as load_metric
# training_data = Dataset.load_from_disk("/content/drive/MyDrive/nlp proj/ivrit_ai_1k/train")
# val_data = Dataset.load_from_disk("/content/drive/MyDrive/nlp proj/ivrit_ai_1k/val")

wer_metric = load_metric("wer")
cer_metric = load_metric("cer")
wer_train = wer_metric.compute(predictions=training_data[INPUT_COL], references=training_data["sentence"])
cer_train = cer_metric.compute(predictions=training_data[INPUT_COL], references=training_data["sentence"])
wer_val = wer_metric.compute(predictions=val_data[INPUT_COL], references=val_data["sentence"])
cer_val = cer_metric.compute(predictions=val_data[INPUT_COL], references=val_data["sentence"])

print(f"calculated baseline WER on the training data is: {wer_train} , The CER is {cer_train}")
print(f"calculated baseline WER on the validation data is: {wer_val} , The CER is {cer_val}")



In [None]:
# graph distribution of wer and cer
import numpy as np
import matplotlib.pyplot as plt

wer_vals = np.zeros(101)
cer_vals = np.zeros(101)
num_samples = len(training_data)
for i in range(num_samples):
  wer_samp = wer_metric.compute(predictions=[training_data[INPUT_COL][i]], references=[training_data["sentence"][i]])
  cer_samp = cer_metric.compute(predictions=[training_data[INPUT_COL][i]], references=[training_data["sentence"][i]])
  wer_vals[min(int(np.floor(wer_samp*100)), 100)] += 1/num_samples
  cer_vals[min(int(np.floor(cer_samp*100)), 100)] += 1/num_samples
  if wer_samp > 1.0 or cer_samp > 1.0:
    print(f"sample {i} has wer {wer_samp} and cer {cer_samp}")
    print(training_data[INPUT_COL][i])
    print(training_data["sentence"][i])

# print(wer_vals)
# print(cer_vals)

plt.plot(wer_vals, marker='o')
plt.title("WER ditribution across dataset")
plt.xlabel("WER")
plt.ylabel("Proportion of samples")
plt.grid(True)
plt.show()

plt.plot(cer_vals, marker='o')
plt.title("CER ditribution across dataset")
plt.xlabel("CER")
plt.ylabel("Proportion of samples")
plt.grid(True)
plt.show()


## Model Evaluation

In [None]:
samples_to_print = 100
corrected_data = []
relevant_references = training_data[TARGET_COL][0:samples_to_print]
relevant_asr_outputs = training_data[INPUT_COL][0:samples_to_print]
for i in range(samples_to_print):
  corrected_data.append(infer(training_data[INPUT_COL][i]))
  if i % 10 == 0:
    print("   ****************")
    print(training_data[TARGET_COL][i])
    print(training_data[INPUT_COL][i])
    print(corrected_data[i])
    print("   ****************")
wer_corr = wer_metric.compute(predictions=corrected_data, references=relevant_references)
cer_corr = cer_metric.compute(predictions=corrected_data, references=relevant_references)
print(f"calculated WER on the corrected data is: {wer_corr} , The CER is {cer_corr}")
wer_base = wer_metric.compute(predictions=relevant_asr_outputs, references=relevant_references)
cer_base = cer_metric.compute(predictions=relevant_asr_outputs, references=relevant_references)
print(f"calculated WER on the baseline data is: {wer_base} , The CER is {cer_base}")


In [None]:
from transformers import __version__
print(__version__)

##DEMO

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
import torch

# === Load fine-tuned model ===
model_path = "/content/drive/MyDrive/nlp proj/byt5_checkpoints"  # or wherever you saved it
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
import torch


# === Generation function ===
def correct_sentence(noisy_sentence: str, max_length: int = 128) -> str:
    inputs = tokenizer(noisy_sentence, return_tensors="pt").to(device)
    input_length = min(max_length, inputs["input_ids"].shape[1])
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            min_length=input_length - 2,
            max_length=input_length + 2,
            num_beams=8,
            do_sample=True
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def infer(prompt, max_length: int = 128):
    input = tokenizer(f"fix mistakes: {prompt}", return_tensors="pt")
    input_ids      = input["input_ids"]
    attention_mask = input["attention_mask"]

    output = model.generate(input_ids.to(device),
                            attention_mask=attention_mask.to(device),
                            max_new_tokens=max_length,
                            do_sample = True, top_k = 50, top_p = 0.85)
    output = tokenizer.decode(output[0], skip_special_tokens=True)
    return output

# === Interactive loop ===
print("🔤 Hebrew ASR Correction | Type 'exit' to quit.")
while True:
    text = input("\n🗣 Enter noisy sentence: ").strip()
    if text.lower() in {"exit", "quit"}:
        break

    try:
        corrected = infer(text)
        print(f"✅ Model Output: {corrected}")
    except Exception as e:
        print(f"⚠️ Error: {e}")


##Model Evaluation

In [None]:
from evaluate import load as load_metric
from datasets import Dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration
import torch

# === Load fine-tuned model ===
# model_path = "/content/drive/MyDrive/byt5-checkpoints/checkpoint-5000"  # or wherever you saved it
# # model_path = MODEL_NAME 
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = T5ForConditionalGeneration.from_pretrained(model_path)
# model.eval()

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

print("loaded model")

# === Generation function ===
def correct_sentence(noisy_sentence: str, max_length: int = 128) -> str:
    inputs = tokenizer(noisy_sentence, return_tensors="pt").to(device)
    input_length = min(max_length, inputs["input_ids"].shape[1])
    with torch.no_grad():
        outputs = model.generate(
          **inputs,
          min_length=input_length - 2,
          max_length=input_length + 2,
          num_beams=1,                  # Drop beam search if sampling
          do_sample=True,
          repetition_penalty=1.5,       # Stronger penalty for byte loops
          no_repeat_ngram_size=20,      # Block repeating 20-byte sequences
          temperature=0.8,
          top_p=0.9
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# === load data and metrics
# val_data = Dataset.load_from_disk("/content/drive/MyDrive/nlp proj/ivrit_ai_1k/val")

wer_metric = load_metric("wer")
cer_metric = load_metric("cer")


# === Run Model on Data
num_samples = 100

relevant_references = val_data["sentence"][0:num_samples]
print("everything was loaded successfully, starting evaluation")
corrected_data = []
for i, pair in enumerate(val_data):
  if i >= num_samples:
      break
  try:
      corrected_data.append(correct_sentence(pair[INPUT_COL]))

  except Exception as e:
      print(f"⚠️ Error: {e}")
      print(f"""Failed on input: {pair[INPUT_COL]}""")

print("done with generation, calculating metrics")

for i in range(num_samples):
  print(corrected_data[i] + " ****** " + relevant_references[i])

wer_val = wer_metric.compute(predictions=corrected_data, references=relevant_references)
cer_val = cer_metric.compute(predictions=corrected_data, references=relevant_references)

print(f"THE MODEL'S WER on the validation data is: {wer_val} , The CER is {cer_val}")



## UTILYTIES

In [None]:
function simulateMouseMove() {
  // Try multiple selectors for the notebook area
  const selectors = [
    'div#notebook-container',        // Main notebook container (classic)
    'colab-run-button',              // a common Colab element
    'colab-connect-button',          // fallback
    'body'                          // fallback to whole page
  ];

  let target = null;
  for (const sel of selectors) {
    const el = document.querySelector(sel);
    if (el) {
      target = el;
      break;
    }
  }

  if (!target) {
    console.log('No suitable element found to simulate mousemove');
    return;
  }

  const event = new MouseEvent('mousemove', {
    bubbles: true,
    cancelable: true,
    clientX: 100,
    clientY: 100
  });
  target.dispatchEvent(event);
  console.log(`Simulated mousemove on ${target.tagName}#${target.id || ''}`);
}

// Run every 60 seconds
setInterval(simulateMouseMove, 60000);
