# Setup Env

In [None]:
!pip install unidecode==1.4.0 evaluate==0.4.6 transformers==4.49.0 jiwer==4.0.0

In [None]:
import os
import pandas as pd
import numpy as np
import evaluate
from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from dataclasses import dataclass
from typing import Union, Dict, List

import pandas as pd
import numpy as np
from datasets import Dataset
import argparse
import torch
import evaluate

import os
from dataclasses import dataclass
from typing import Union, Dict, List, Optional
from transformers import AdamW, AutoTokenizer, T5ForConditionalGeneration, T5Config
from transformers import (
    DataCollator,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)


# Set the WANDB_DISABLED flag in the process environment before any trainer is built.
os.environ.update({"WANDB_DISABLED": "true"})

# Fix the random seed once for the whole notebook.
set_seed(41)

# Get Data

In [4]:
# Remote parquet shard holding the training rows.
correct_url = "https://huggingface.co/datasets/MahtaFetrat/HomoRich-G2P-Persian/resolve/main/data/train-01.parquet"

# Read the shard into memory, then persist it as CSV (index included) so the
# loader further down can read it back with `index_col=[0]`.
df = pd.read_parquet(path=correct_url)
df.to_csv(path_or_buf='PersianG2P.csv')

In [15]:
!wget https://huggingface.co/datasets/MahtaFetrat/HomoRich-G2P-Persian/resolve/main/augmented_data/PersianG2P_augmented.csv

--2025-10-30 07:32:38--  https://huggingface.co/datasets/MahtaFetrat/HomoRich-G2P-Persian/resolve/main/augmented_data/PersianG2P_augmented.csv
Resolving huggingface.co (huggingface.co)... 3.166.152.105, 3.166.152.110, 3.166.152.44, ...
Connecting to huggingface.co (huggingface.co)|3.166.152.105|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/677246f593724e487d680fd1/4f2faec3066594a874559101086d38d02c12057e746a3cdc6c53fef7cc8bc3ac?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251030%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251030T073238Z&X-Amz-Expires=3600&X-Amz-Signature=072f4f4afa8fd8e45ccc53b4913f3ac2909e1e479e53ad4f220ec4235e5ed8e0&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27PersianG2P_augmented.csv%3B+filename%3D%22PersianG2P_augmented.csv%22%3B&response-content-type=text%2Fcsv&x-id=GetObje

# Get Base Checkpoint

In [11]:
!gdown -q 1CrCX8SNhMcmi3KogffFaS4pSaC0t73nJ -O checkpoint-320.zip
!unzip checkpoint-320.zip
!mv content/checkpoint-320 checkpoint-320A

Archive:  checkpoint-320.zip
replace content/checkpoint-320/special_tokens_map.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: content/checkpoint-320/special_tokens_map.json  
  inflating: content/checkpoint-320/scheduler.pt  
  inflating: content/checkpoint-320/config.json  
  inflating: content/checkpoint-320/rng_state.pth  
  inflating: content/checkpoint-320/pytorch_model.bin  
  inflating: content/checkpoint-320/tokenizer_config.json  
  inflating: content/checkpoint-320/trainer_state.json  
  inflating: content/checkpoint-320/training_args.bin  


# Finetuning

In [12]:
def prepare_dataset(batch):
    """Expose the raw text columns under the names the collator reads.

    Copies ``batch['Grapheme']`` to ``batch['input_ids']`` and
    ``batch['Mapped Phoneme']`` to ``batch['labels']``. The values are copied
    as-is (no tokenization happens here); the same ``batch`` object is
    modified in place and returned.
    """
    column_aliases = (
        ('input_ids', 'Grapheme'),
        ('labels', 'Mapped Phoneme'),
    )
    for alias, column in column_aliases:
        batch[alias] = batch[column]
    return batch

# %%
# Data collator for padding
@dataclass
class DataCollatorWithPadding:
    """Tokenize and pad a list of raw examples into one model-ready batch.

    Each incoming feature is expected to carry the source text under
    ``"input_ids"`` and the target text under ``"labels"`` (that is how
    ``prepare_dataset`` fills them); tokenization happens here, per batch.
    """

    # Tokenizer applied to both the source and the target strings.
    tokenizer: AutoTokenizer
    # Padding argument forwarded unchanged to every tokenizer call.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build the batch: encoded sources plus ``labels`` with padding set to -100."""
        sources = []
        targets = []
        for example in features:
            sources.append(example["input_ids"])
        for example in features:
            targets.append(example["labels"])

        # Options shared by both tokenizer calls.
        common = dict(padding=self.padding, return_attention_mask=True, return_tensors='pt')

        # Sources are encoded without special tokens, targets with them.
        batch = self.tokenizer(sources, add_special_tokens=False, **common)
        target_encoding = self.tokenizer(targets, add_special_tokens=True, **common)

        # Positions where the target attention mask is not 1 are padding; they are
        # overwritten with -100 in the labels.
        padded_positions = target_encoding.attention_mask.ne(1)
        batch['labels'] = target_encoding['input_ids'].masked_fill(padded_positions, -100)
        return batch

# %%
# Compute metrics (CER and WER)
def compute_metrics(pred):
    """Decode predictions and references, then score them with CER and WER.

    Args:
        pred: object exposing ``predictions`` and ``label_ids`` (token-id arrays),
            as handed to ``compute_metrics`` by the trainer. ``predictions`` may
            also be a tuple, in which case its first element is used.

    Returns:
        dict with the keys ``"cer"`` and ``"wer"``.

    Relies on the module-level ``tokenizer``, ``cer_metric`` and ``wer_metric``.
    """
    pad_id = tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # -100 is the ignore marker used for padded positions and is not a valid token
    # id. It is swapped for the pad id in BOTH arrays before decoding, and
    # np.where builds new arrays so the caller's label_ids are left untouched.
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_id)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer, 'wer': wer}

# Load the two scorers that compute_metrics looks up as module-level names:
# character error rate and word error rate.
cer_metric, wer_metric = evaluate.load("cer"), evaluate.load('wer')

## Phase 1

In [None]:
def load_pronuncation_dictionary(path, train=True, homograph_only=False, human=False,
                                 dev_size=90, seed=41) -> "Dataset":
    """Load a G2P CSV, filter it, and return either its train or its dev split.

    The CSV is read with its first column as the index and must contain the
    columns 'Source', 'Source ID', 'Mapped Phoneme' and 'Homograph Grapheme'.

    Args:
        path: CSV file to read.
        train: when True return the training part, otherwise the held-out part.
        homograph_only: when True keep only rows with a non-empty
            'Homograph Grapheme'; when False keep only rows where it is empty.
        human: only consulted when ``homograph_only`` is True. True keeps rows
            whose 'Source' is 'human', False keeps all other rows.
        dev_size: number of rows held out as the dev split (taken from the end
            of the shuffled frame).
        seed: ``random_state`` for the shuffle. Because it is fixed, the
            ``train=True`` and ``train=False`` calls see the SAME permutation, so
            the two splits are disjoint. An unseeded shuffle gives each call a
            different order and leaks dev rows into the training split.

    Returns:
        A ``Dataset`` built from the selected rows.
    """
    df = pd.read_csv(path, index_col=[0])

    # The 'Source' filter applies to homograph runs only.
    if homograph_only:
        if human:
            df = df[df['Source'] == 'human']
        else:
            df = df[df['Source'] != 'human']

    # Drop bookkeeping columns that are not needed downstream.
    df = df.drop(['Source', 'Source ID'], axis=1)

    # Rows without a target pronunciation cannot be used.
    df = df.dropna(subset=['Mapped Phoneme'])

    # Keep only rows whose phoneme string is shorter than 512 characters.
    phoneme_lengths = np.array([len(phoneme) for phoneme in df['Mapped Phoneme']])
    df = df.iloc[phoneme_lengths < 512, :]

    # Select homograph / non-homograph rows.
    has_homograph = df['Homograph Grapheme'].notna() & (df['Homograph Grapheme'] != '')
    if homograph_only:
        df = df[has_homograph]
    else:
        df = df[df['Homograph Grapheme'].isna() | (df['Homograph Grapheme'] == '')]

    # Deterministic shuffle: identical order for the train and the dev call.
    df = df.sample(frac=1, random_state=seed)

    # Clamp at zero so a frame with fewer than `dev_size` rows does not turn the
    # boundary into a negative (from-the-end) index.
    split_at = max(len(df) - dev_size, 0)
    if train:
        return Dataset.from_pandas(df.iloc[:split_at, :])
    else:
        return Dataset.from_pandas(df.iloc[split_at:, :])

# %%
# Phase 1 data: homograph_only is left at its default (False), so the loader
# keeps the rows whose 'Homograph Grapheme' is empty.
train_data = load_pronuncation_dictionary('PersianG2P.csv', train=True).map(prepare_dataset)
train_dataset = train_data

# Held-out split from the same file, prepared the same way.
dev_data = load_pronuncation_dictionary('PersianG2P.csv', train=False).map(prepare_dataset)
dev_dataset = dev_data

# Load tokenizer and model from the base checkpoint
checkpoint_path = "checkpoint-320"  # Directory with the unpacked base checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)

# Data collator (tokenizes and pads each batch)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments for phase 1
training_args = Seq2SeqTrainingArguments(
    output_dir="./phase1-30-ep",  # Directory to save the fine-tuned model
    predict_with_generate=True,
    generation_num_beams=5,
    generation_max_length=512,
    eval_strategy="steps",  # current name of the deprecated `evaluation_strategy`
    per_device_train_batch_size=32,
    per_device_eval_batch_size=100,
    num_train_epochs=5,  # Fewer epochs for this step
    learning_rate=5e-4,
    warmup_steps=1000,
    logging_steps=1000,
    save_steps=4000,  # kept a multiple of eval_steps
    eval_steps=1000,
    save_total_limit=2,  # Cap the number of checkpoints kept on disk
    load_best_model_at_end=True,  # Load the best model at the end of training
    fp16=False,  # Train in full precision
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    processing_class=tokenizer,  # current name of the deprecated `tokenizer` argument
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model (phase 2 loads it from this directory)
trainer.save_model("./phase1-30-ep")

# %%
import matplotlib.pyplot as plt

# Extract training and validation loss from the log history together with the
# step at which each value was logged. Training and evaluation entries are
# separate records, so each curve keeps its own x values; plotting both against
# their list index would not match the "Steps" axis label.
train_steps, train_loss = [], []
val_steps, val_loss = [], []
for log in trainer.state.log_history:
    if "loss" in log:
        # Fall back to the running index if an entry carries no step.
        train_steps.append(log.get("step", len(train_loss)))
        train_loss.append(log["loss"])
    if "eval_loss" in log:
        val_steps.append(log.get("step", len(val_loss)))
        val_loss.append(log["eval_loss"])

# Plot the training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(train_steps, train_loss, label="Training Loss", marker="o")
plt.plot(val_steps, val_loss, label="Validation Loss", marker="o")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid()

# Save the plot to disk
plt.savefig("phase1-30-ep.png")

# Close the figure to free up memory
plt.close()

## Phase 2

In [None]:
# %%
# Phase 2 data: homograph rows only (non-empty 'Homograph Grapheme'). `human` is
# left at False, so rows whose 'Source' is 'human' are excluded.
train_data = load_pronuncation_dictionary(
    'PersianG2P.csv',
    train=True,
    homograph_only=True,
).map(prepare_dataset)
train_dataset = train_data

# Held-out split selected with the same filters.
dev_data = load_pronuncation_dictionary(
    'PersianG2P.csv',
    train=False,
    homograph_only=True,
).map(prepare_dataset)
dev_dataset = dev_data

# Load tokenizer and model from the previous fine-tuning step
checkpoint_path = "./phase1-30-ep"  # Path to the model saved by phase 1
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)

# Data collator (tokenizes and pads each batch)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments for phase 2
# NOTE(review): logging and evaluation fire every 1000 optimizer steps; if this
# phase runs for fewer steps no eval is recorded — confirm against dataset size.
training_args = Seq2SeqTrainingArguments(
    output_dir="./phase2-30-ep",  # Directory to save the fine-tuned model
    predict_with_generate=True,
    generation_num_beams=5,
    generation_max_length=512,
    eval_strategy="steps",  # current name of the deprecated `evaluation_strategy`
    per_device_train_batch_size=32,
    per_device_eval_batch_size=100,
    num_train_epochs=30,  # More epochs for this step
    learning_rate=5e-4,  # Same learning rate as phase 1
    warmup_steps=1000,
    logging_steps=1000,
    save_steps=4000,  # kept a multiple of eval_steps
    eval_steps=1000,
    save_total_limit=2,  # Cap the number of checkpoints kept on disk
    load_best_model_at_end=True,  # Load the best model at the end of training
    fp16=False,  # Train in full precision
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    processing_class=tokenizer,  # current name of the deprecated `tokenizer` argument
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model (phase 3 loads it from this directory)
trainer.save_model("./phase2-30-ep")


# %%
import matplotlib.pyplot as plt

# Extract training and validation loss from the log history together with the
# step at which each value was logged. Training and evaluation entries are
# separate records, so each curve keeps its own x values; plotting both against
# their list index would not match the "Steps" axis label.
train_steps, train_loss = [], []
val_steps, val_loss = [], []
for log in trainer.state.log_history:
    if "loss" in log:
        # Fall back to the running index if an entry carries no step.
        train_steps.append(log.get("step", len(train_loss)))
        train_loss.append(log["loss"])
    if "eval_loss" in log:
        val_steps.append(log.get("step", len(val_loss)))
        val_loss.append(log["eval_loss"])

# Plot the training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(train_steps, train_loss, label="Training Loss", marker="o")
plt.plot(val_steps, val_loss, label="Validation Loss", marker="o")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid()

# Save the plot to disk
plt.savefig("phase2-30-ep.png")

# Close the figure to free up memory
plt.close()

## Phase 3

In [None]:
# %%
# Phase 3 data: homograph rows of the augmented CSV, restricted to rows whose
# 'Source' is 'human'.
train_data = load_pronuncation_dictionary(
    'PersianG2P_augmented.csv',
    train=True,
    homograph_only=True,
    human=True,
).map(prepare_dataset)
train_dataset = train_data

# Held-out split selected with the same filters.
dev_data = load_pronuncation_dictionary(
    'PersianG2P_augmented.csv',
    train=False,
    homograph_only=True,
    human=True,
).map(prepare_dataset)
dev_dataset = dev_data

# Load tokenizer and model from the previous fine-tuning step
checkpoint_path = "./phase2-30-ep"  # Path to the model saved by phase 2
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)

# Data collator (tokenizes and pads each batch)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments for phase 3
# NOTE(review): logging and evaluation fire every 1000 optimizer steps; if this
# phase runs for fewer steps no eval is recorded — confirm against dataset size.
training_args = Seq2SeqTrainingArguments(
    output_dir="./phase3-30-ep",  # Directory to save the final fine-tuned model
    predict_with_generate=True,
    generation_num_beams=5,
    generation_max_length=512,
    eval_strategy="steps",  # current name of the deprecated `evaluation_strategy`
    per_device_train_batch_size=32,
    per_device_eval_batch_size=100,
    num_train_epochs=50,  # More epochs for this step
    learning_rate=5e-4,  # Same learning rate as the earlier phases
    warmup_steps=1000,
    logging_steps=1000,
    save_steps=4000,  # kept a multiple of eval_steps
    eval_steps=1000,
    save_total_limit=2,  # Cap the number of checkpoints kept on disk
    load_best_model_at_end=True,  # Load the best model at the end of training
    fp16=False,  # Train in full precision
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    processing_class=tokenizer,  # current name of the deprecated `tokenizer` argument
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the final fine-tuned model
trainer.save_model("./phase3-30-ep")


# %%
import matplotlib.pyplot as plt

# Extract training and validation loss from the log history together with the
# step at which each value was logged. Training and evaluation entries are
# separate records, so each curve keeps its own x values; plotting both against
# their list index would not match the "Steps" axis label.
train_steps, train_loss = [], []
val_steps, val_loss = [], []
for log in trainer.state.log_history:
    if "loss" in log:
        # Fall back to the running index if an entry carries no step.
        train_steps.append(log.get("step", len(train_loss)))
        train_loss.append(log["loss"])
    if "eval_loss" in log:
        val_steps.append(log.get("step", len(val_loss)))
        val_loss.append(log["eval_loss"])

# Plot the training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(train_steps, train_loss, label="Training Loss", marker="o")
plt.plot(val_steps, val_loss, label="Validation Loss", marker="o")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid()

# Save the plot to disk
plt.savefig("phase3-30-ep.png")

# Close the figure to free up memory
plt.close()

