In [1]:
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan,SpeechT5Tokenizer

In [None]:
dataset = load_dataset("mozilla-foundation/common_voice_17_0","ur",trust_remote_code=True,split=["train"])
dataset = dataset[0]

In [None]:
dataset= dataset.remove_columns(column_names=['up_votes', 'down_votes', 'age','accent', 'locale', 'segment', 'variant'])

In [None]:
all_text = " ".join(dataset['sentence'])

In [None]:
vocab = sorted(set(all_text))
len(vocab)

In [None]:
special_tokens = ["<s>", "</s>", "<pad>", "<unk>", "<mask>"]
vocab_list = special_tokens + vocab


In [None]:
sentences = dataset['sentence']

In [None]:
import json

vocab_dict = {char: idx for idx, char in enumerate(vocab_list)}
vocab_dict["▁"] = vocab_dict[" "]
del vocab_dict[" "]

# Save the dictionary as a JSON file
with open("urdu_vocab.json", "w", encoding="utf-8") as f:
    json.dump(vocab_dict, f, ensure_ascii=False, indent=4)

# Assuming `all_text` contains the concatenated sentences
with open("urdu_text.txt", "w",) as f:
    for sen in sentences:
        f.write(sen+"\n")


In [None]:
with open("urdu_text.txt", "w",encoding="utf-8") as f:
    for sen in sentences:
        f.write(sen+"\n")

In [None]:
import sentencepiece as spm

# Train the SentencePiece model
spm.SentencePieceTrainer.Train(
    '--input=urdu_text.txt --model_prefix=urdu_sp --vocab_size=96 --bos_id=0 --eos_id=1 --pad_id=2 --unk_id=3'
)


In [None]:
from transformers import SpeechT5Tokenizer

# Initialize the SpeechT5Tokenizer with the trained SentencePiece model
tokenizer = SpeechT5Tokenizer(vocab_file="urdu_sp.model", 
                              bos_token="<s>", eos_token="</s>", 
                              pad_token="<pad>", unk_token="<unk>")


In [None]:
encoded =  tokenizer("میں ابھی تیار ہو تاہوں")

In [None]:
tokenizer.decode(encoded.input_ids)

In [2]:
checkpoint = "HamzaSidhu786/urdu_text_to_speech_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)

In [None]:
feature_extractor = processor.feature_extractor
vocab = processor.tokenizer.get_vocab
len(list(vocab))

In [None]:
processor = SpeechT5Processor(feature_extractor=feature_extractor,tokenizer=tokenizer)

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
repo_name = "urdu_text_to_speech_tts"
processor.push_to_hub(repo_name)

In [None]:
from collections import defaultdict

speaker_counts = defaultdict(int)

for speaker_id in dataset["client_id"]:
    speaker_counts[speaker_id] += 1

In [None]:
def select_speaker(speaker_id):
    return speaker_counts[speaker_id]>80

dataset = dataset.filter(select_speaker, input_columns=["client_id"])

In [None]:
dataset = dataset.remove_columns(column_names=['client_id', 'path'])

In [None]:
from datasets import Audio
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))

In [None]:
dataset[0]

In [None]:
import os
import shutil
import torch
from speechbrain.inference import EncoderClassifier
from huggingface_hub import hf_hub_download

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

device = "cuda" if torch.cuda.is_available() else "cpu"

# Create a temporary directory to save the model
savedir = os.path.join("C:/temp", spk_model_name.replace('/', '_'))

# Ensure the directory exists
os.makedirs(savedir, exist_ok=True)

# Load the model from the specified directory
speaker_model = EncoderClassifier.from_hparams(
    source=savedir,
    run_opts={"device": device},
)

def create_speaker_embedding(waveform):
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


In [None]:
def prepare_dataset(example):
    audio = example["audio"]

    example = processor(
        text=example["sentence"],
        audio_target=audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )

    # strip off the batch dimension
    example["labels"] = example["labels"][0]

    # use SpeechBrain to obtain x-vector
    example["speaker_embeddings"] = create_speaker_embedding(audio["array"])

    return example

In [None]:
processed_example = prepare_dataset(dataset[0])
processed_example

In [None]:
processed_example['speaker_embeddings'].shape

In [None]:
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

In [3]:
dataset = load_dataset("HamzaSidhu786/urdu_text_to_speech_dataset")

In [4]:
dataset = dataset['train'].train_test_split(test_size=0.1)

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'speaker_embeddings'],
        num_rows: 3886
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'speaker_embeddings'],
        num_rows: 432
    })
})

In [6]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch


@dataclass
class TTSDataCollatorWithPadding:
    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
        label_features = [{"input_values": feature["labels"]} for feature in features]
        speaker_features = [feature["speaker_embeddings"] for feature in features]

        # collate the inputs and targets into a batch
        batch = processor.pad(
            input_ids=input_ids, labels=label_features, return_tensors="pt"
        )

        # replace padding with -100 to ignore loss correctly
        batch["labels"] = batch["labels"].masked_fill(
            batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
        )

        # not used during fine-tuning
        del batch["decoder_attention_mask"]

        # round down target lengths to multiple of reduction factor
        if model.config.reduction_factor > 1:
            target_lengths = torch.tensor(
                [len(feature["input_values"]) for feature in label_features]
            )
            target_lengths = target_lengths.new(
                [
                    length - length % model.config.reduction_factor
                    for length in target_lengths
                ]
            )
            max_length = max(target_lengths)
            batch["labels"] = batch["labels"][:, :max_length]

        # also add in the speaker embeddings
        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch

In [7]:
data_collator = TTSDataCollatorWithPadding(processor=processor)

In [8]:
from transformers import SpeechT5ForTextToSpeech
import os


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SpeechT5ForTextToSpeech.from_pretrained(
    "microsoft/speecht5_tts",
    ignore_mismatched_sizes=True
)

model.resize_token_embeddings(len(processor.tokenizer))

Embedding(96, 768)

In [12]:
from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 20
repo_name = "urdu_text_to_speech_tts"

training_args = Seq2SeqTrainingArguments(
    output_dir=repo_name,  # Change to a repo name of your choice
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=500,
    num_train_epochs=num_train_epochs,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=2,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
    push_to_hub=True,
)




In [13]:
from transformers import Seq2SeqTrainer
import os


trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    tokenizer=processor,
)

In [14]:
trainer.train()

  0%|          | 0/9720 [00:00<?, ?it/s]

{'loss': 0.9114, 'grad_norm': 10.997079849243164, 'learning_rate': 4.6000000000000004e-07, 'epoch': 0.05}
{'loss': 0.8359, 'grad_norm': 4.4994025230407715, 'learning_rate': 9.600000000000001e-07, 'epoch': 0.1}
{'loss': 0.877, 'grad_norm': 5.132769584655762, 'learning_rate': 1.46e-06, 'epoch': 0.15}
{'loss': 0.8542, 'grad_norm': 4.406528949737549, 'learning_rate': 1.9600000000000003e-06, 'epoch': 0.21}
{'loss': 0.8176, 'grad_norm': 10.743416786193848, 'learning_rate': 2.46e-06, 'epoch': 0.26}
{'loss': 0.8092, 'grad_norm': 3.64143443107605, 'learning_rate': 2.96e-06, 'epoch': 0.31}
{'loss': 0.7713, 'grad_norm': 6.745840549468994, 'learning_rate': 3.46e-06, 'epoch': 0.36}
{'loss': 0.7625, 'grad_norm': 4.192050457000732, 'learning_rate': 3.96e-06, 'epoch': 0.41}
{'loss': 0.7729, 'grad_norm': 3.902493476867676, 'learning_rate': 4.4600000000000005e-06, 'epoch': 0.46}
{'loss': 0.783, 'grad_norm': 7.834615230560303, 'learning_rate': 4.960000000000001e-06, 'epoch': 0.51}
{'loss': 0.7404, 'grad_

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.5707175135612488, 'eval_runtime': 10.2686, 'eval_samples_per_second': 42.07, 'eval_steps_per_second': 21.035, 'epoch': 1.0}




{'loss': 0.6288, 'grad_norm': 5.124589443206787, 'learning_rate': 9.940000000000001e-06, 'epoch': 1.03}
{'loss': 0.6747, 'grad_norm': 21.063234329223633, 'learning_rate': 9.976138828633406e-06, 'epoch': 1.08}
{'loss': 0.6325, 'grad_norm': 8.16789722442627, 'learning_rate': 9.949023861171367e-06, 'epoch': 1.13}
{'loss': 0.6379, 'grad_norm': 4.2858476638793945, 'learning_rate': 9.921908893709329e-06, 'epoch': 1.18}
{'loss': 0.6528, 'grad_norm': 10.04606819152832, 'learning_rate': 9.894793926247289e-06, 'epoch': 1.23}
{'loss': 0.6264, 'grad_norm': 3.9601223468780518, 'learning_rate': 9.86767895878525e-06, 'epoch': 1.29}
{'loss': 0.6479, 'grad_norm': 5.14469575881958, 'learning_rate': 9.840563991323211e-06, 'epoch': 1.34}
{'loss': 0.6252, 'grad_norm': 5.869032859802246, 'learning_rate': 9.813449023861173e-06, 'epoch': 1.39}
{'loss': 0.6266, 'grad_norm': 4.930250644683838, 'learning_rate': 9.786334056399133e-06, 'epoch': 1.44}
{'loss': 0.6323, 'grad_norm': 5.389865398406982, 'learning_rate'

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.5318584442138672, 'eval_runtime': 10.4964, 'eval_samples_per_second': 41.157, 'eval_steps_per_second': 20.578, 'epoch': 2.0}




{'loss': 0.5814, 'grad_norm': 7.272294521331787, 'learning_rate': 9.488069414316705e-06, 'epoch': 2.01}
{'loss': 0.5889, 'grad_norm': 4.437044620513916, 'learning_rate': 9.460954446854665e-06, 'epoch': 2.06}
{'loss': 0.5924, 'grad_norm': 3.758173942565918, 'learning_rate': 9.433839479392626e-06, 'epoch': 2.11}
{'loss': 0.6089, 'grad_norm': 5.154407024383545, 'learning_rate': 9.406724511930586e-06, 'epoch': 2.16}
{'loss': 0.5845, 'grad_norm': 3.2607545852661133, 'learning_rate': 9.379609544468547e-06, 'epoch': 2.21}
{'loss': 0.5996, 'grad_norm': 20.547828674316406, 'learning_rate': 9.352494577006509e-06, 'epoch': 2.26}
{'loss': 0.5685, 'grad_norm': 3.5959651470184326, 'learning_rate': 9.32537960954447e-06, 'epoch': 2.31}
{'loss': 0.5799, 'grad_norm': 5.675164222717285, 'learning_rate': 9.298264642082431e-06, 'epoch': 2.37}
{'loss': 0.5728, 'grad_norm': 7.9192328453063965, 'learning_rate': 9.271149674620391e-06, 'epoch': 2.42}
{'loss': 0.5767, 'grad_norm': 5.177300453186035, 'learning_ra

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.5264674425125122, 'eval_runtime': 10.1826, 'eval_samples_per_second': 42.425, 'eval_steps_per_second': 21.213, 'epoch': 3.0}




{'loss': 0.5634, 'grad_norm': 7.339261054992676, 'learning_rate': 8.945770065075923e-06, 'epoch': 3.03}
{'loss': 0.5551, 'grad_norm': 3.785226821899414, 'learning_rate': 8.918655097613883e-06, 'epoch': 3.09}
{'loss': 0.5653, 'grad_norm': 5.295050621032715, 'learning_rate': 8.891540130151844e-06, 'epoch': 3.14}
{'loss': 0.5545, 'grad_norm': 5.035737991333008, 'learning_rate': 8.864425162689806e-06, 'epoch': 3.19}
{'loss': 0.5677, 'grad_norm': 3.527210235595703, 'learning_rate': 8.837310195227767e-06, 'epoch': 3.24}
{'loss': 0.568, 'grad_norm': 4.7067413330078125, 'learning_rate': 8.810195227765728e-06, 'epoch': 3.29}
{'loss': 0.5898, 'grad_norm': 3.6692609786987305, 'learning_rate': 8.783080260303688e-06, 'epoch': 3.34}
{'loss': 0.5739, 'grad_norm': 4.023890495300293, 'learning_rate': 8.75596529284165e-06, 'epoch': 3.4}
{'loss': 0.5927, 'grad_norm': 5.170442581176758, 'learning_rate': 8.72885032537961e-06, 'epoch': 3.45}
{'loss': 0.5499, 'grad_norm': 6.889874458312988, 'learning_rate': 

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.517765462398529, 'eval_runtime': 10.1504, 'eval_samples_per_second': 42.56, 'eval_steps_per_second': 21.28, 'epoch': 4.0}




{'loss': 0.5728, 'grad_norm': 5.242778778076172, 'learning_rate': 8.430585683297182e-06, 'epoch': 4.01}
{'loss': 0.5715, 'grad_norm': 8.996541976928711, 'learning_rate': 8.403470715835141e-06, 'epoch': 4.06}
{'loss': 0.554, 'grad_norm': 5.452279090881348, 'learning_rate': 8.376355748373103e-06, 'epoch': 4.12}
{'loss': 0.5736, 'grad_norm': 2.9349215030670166, 'learning_rate': 8.349240780911062e-06, 'epoch': 4.17}
{'loss': 0.5411, 'grad_norm': 4.15626335144043, 'learning_rate': 8.322125813449024e-06, 'epoch': 4.22}
{'loss': 0.5708, 'grad_norm': 5.112911224365234, 'learning_rate': 8.295010845986985e-06, 'epoch': 4.27}
{'loss': 0.5406, 'grad_norm': 13.629467964172363, 'learning_rate': 8.267895878524947e-06, 'epoch': 4.32}
{'loss': 0.5568, 'grad_norm': 6.237457275390625, 'learning_rate': 8.240780911062908e-06, 'epoch': 4.37}
{'loss': 0.5654, 'grad_norm': 7.005387783050537, 'learning_rate': 8.213665943600868e-06, 'epoch': 4.42}
{'loss': 0.5609, 'grad_norm': 5.57333517074585, 'learning_rate':

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.5142039656639099, 'eval_runtime': 10.0199, 'eval_samples_per_second': 43.114, 'eval_steps_per_second': 21.557, 'epoch': 5.0}




{'loss': 0.5708, 'grad_norm': 12.452983856201172, 'learning_rate': 7.8882863340564e-06, 'epoch': 5.04}
{'loss': 0.5508, 'grad_norm': 5.313774585723877, 'learning_rate': 7.861171366594361e-06, 'epoch': 5.09}
{'loss': 0.5351, 'grad_norm': 3.2509355545043945, 'learning_rate': 7.834056399132321e-06, 'epoch': 5.14}
{'loss': 0.5297, 'grad_norm': 4.524580955505371, 'learning_rate': 7.806941431670282e-06, 'epoch': 5.2}
{'loss': 0.5839, 'grad_norm': 4.675780296325684, 'learning_rate': 7.779826464208244e-06, 'epoch': 5.25}
{'loss': 0.5637, 'grad_norm': 5.421511650085449, 'learning_rate': 7.752711496746205e-06, 'epoch': 5.3}
{'loss': 0.5582, 'grad_norm': 7.746039390563965, 'learning_rate': 7.725596529284165e-06, 'epoch': 5.35}
{'loss': 0.5666, 'grad_norm': 4.6751933097839355, 'learning_rate': 7.698481561822126e-06, 'epoch': 5.4}
{'loss': 0.5732, 'grad_norm': 3.2314035892486572, 'learning_rate': 7.671366594360088e-06, 'epoch': 5.45}
{'loss': 0.5662, 'grad_norm': 3.134005308151245, 'learning_rate':

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.5073474049568176, 'eval_runtime': 10.4347, 'eval_samples_per_second': 41.4, 'eval_steps_per_second': 20.7, 'epoch': 6.0}




{'loss': 0.5667, 'grad_norm': 3.4356741905212402, 'learning_rate': 7.374186550976139e-06, 'epoch': 6.02}
{'loss': 0.5392, 'grad_norm': 5.094729900360107, 'learning_rate': 7.3470715835141e-06, 'epoch': 6.07}
{'loss': 0.5348, 'grad_norm': 5.810414791107178, 'learning_rate': 7.319956616052062e-06, 'epoch': 6.12}
{'loss': 0.5472, 'grad_norm': 2.6866109371185303, 'learning_rate': 7.292841648590022e-06, 'epoch': 6.17}
{'loss': 0.5535, 'grad_norm': 7.947140216827393, 'learning_rate': 7.2657266811279836e-06, 'epoch': 6.22}
{'loss': 0.5399, 'grad_norm': 4.591921329498291, 'learning_rate': 7.238611713665944e-06, 'epoch': 6.28}
{'loss': 0.5369, 'grad_norm': 3.1778085231781006, 'learning_rate': 7.2114967462039056e-06, 'epoch': 6.33}
{'loss': 0.5413, 'grad_norm': 5.555600166320801, 'learning_rate': 7.184381778741865e-06, 'epoch': 6.38}
{'loss': 0.546, 'grad_norm': 4.644460201263428, 'learning_rate': 7.1572668112798276e-06, 'epoch': 6.43}
{'loss': 0.5391, 'grad_norm': 3.689602851867676, 'learning_ra

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.5014516711235046, 'eval_runtime': 8.7797, 'eval_samples_per_second': 49.204, 'eval_steps_per_second': 24.602, 'epoch': 7.0}




{'loss': 0.5561, 'grad_norm': 5.104550838470459, 'learning_rate': 6.831887201735359e-06, 'epoch': 7.05}
{'loss': 0.5376, 'grad_norm': 3.751511812210083, 'learning_rate': 6.804772234273319e-06, 'epoch': 7.1}
{'loss': 0.5258, 'grad_norm': 8.7872314453125, 'learning_rate': 6.777657266811281e-06, 'epoch': 7.15}
{'loss': 0.5408, 'grad_norm': 5.870528221130371, 'learning_rate': 6.750542299349241e-06, 'epoch': 7.2}
{'loss': 0.5378, 'grad_norm': 3.1934218406677246, 'learning_rate': 6.723427331887203e-06, 'epoch': 7.25}
{'loss': 0.5486, 'grad_norm': 4.101743221282959, 'learning_rate': 6.696312364425164e-06, 'epoch': 7.3}
{'loss': 0.5357, 'grad_norm': 5.055153846740723, 'learning_rate': 6.669197396963124e-06, 'epoch': 7.36}
{'loss': 0.5434, 'grad_norm': 6.036901950836182, 'learning_rate': 6.642082429501085e-06, 'epoch': 7.41}
{'loss': 0.5482, 'grad_norm': 9.086446762084961, 'learning_rate': 6.614967462039046e-06, 'epoch': 7.46}
{'loss': 0.5247, 'grad_norm': 5.397604465484619, 'learning_rate': 6.

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.4992269277572632, 'eval_runtime': 9.0324, 'eval_samples_per_second': 47.828, 'eval_steps_per_second': 23.914, 'epoch': 8.0}




{'loss': 0.5313, 'grad_norm': 4.710996627807617, 'learning_rate': 6.316702819956616e-06, 'epoch': 8.02}
{'loss': 0.5473, 'grad_norm': 5.378105640411377, 'learning_rate': 6.289587852494578e-06, 'epoch': 8.08}
{'loss': 0.5434, 'grad_norm': 5.906246662139893, 'learning_rate': 6.262472885032539e-06, 'epoch': 8.13}
{'loss': 0.5418, 'grad_norm': 3.6615874767303467, 'learning_rate': 6.235357917570499e-06, 'epoch': 8.18}
{'loss': 0.5319, 'grad_norm': 5.089931488037109, 'learning_rate': 6.20824295010846e-06, 'epoch': 8.23}
{'loss': 0.5142, 'grad_norm': 7.018446922302246, 'learning_rate': 6.181127982646421e-06, 'epoch': 8.28}
{'loss': 0.5444, 'grad_norm': 4.285708904266357, 'learning_rate': 6.154013015184382e-06, 'epoch': 8.33}
{'loss': 0.5362, 'grad_norm': 5.14425802230835, 'learning_rate': 6.126898047722343e-06, 'epoch': 8.38}
{'loss': 0.5315, 'grad_norm': 4.268003940582275, 'learning_rate': 6.099783080260304e-06, 'epoch': 8.44}
{'loss': 0.5532, 'grad_norm': 7.215863227844238, 'learning_rate':

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.5021665692329407, 'eval_runtime': 9.3345, 'eval_samples_per_second': 46.28, 'eval_steps_per_second': 23.14, 'epoch': 9.0}




{'loss': 0.5514, 'grad_norm': 4.725673198699951, 'learning_rate': 5.801518438177875e-06, 'epoch': 9.0}
{'loss': 0.522, 'grad_norm': 4.177104949951172, 'learning_rate': 5.774403470715836e-06, 'epoch': 9.05}


'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/63/f3/63f399aab1bdba7654a85fd9c2ec8981d95d87816d5d4bed57b6adc5d3519dd4/656d41e7092dd6db28f7904f62c042bb8ea317e33d22f08a4c41d011890153d2?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20240728%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240728T105858Z&X-Amz-Expires=86400&X-Amz-Signature=4f7b92d683fbdf5dbd4816c34ccff59da0b05ad901570ac4d0a11bcd73bacc69&X-Amz-SignedHeaders=host&partNumber=31&uploadId=pGvDZpC68_7GeFoc_TGViZNKCJWRXxdbD6ccl.de3TPZY_qcHLbHAyXn_9iMFmcl0ZYavBXuSx_G0wmifp_tbjXyOyJk3iDxqAaDGNNtBxEuP6s3E0aFXF9oRxLD7eHe&x-id=UploadPart (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2427)')))"), '(Request ID: 61b2ff63-bff0-47a2-92a3-49acc1ee6e20)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/63/f3/

{'loss': 0.5239, 'grad_norm': 6.212646007537842, 'learning_rate': 5.747288503253796e-06, 'epoch': 9.1}
{'loss': 0.5436, 'grad_norm': 5.4273295402526855, 'learning_rate': 5.720173535791757e-06, 'epoch': 9.16}
{'loss': 0.5572, 'grad_norm': 11.77038288116455, 'learning_rate': 5.693058568329718e-06, 'epoch': 9.21}
{'loss': 0.5408, 'grad_norm': 3.2966699600219727, 'learning_rate': 5.665943600867679e-06, 'epoch': 9.26}
{'loss': 0.5216, 'grad_norm': 7.058143615722656, 'learning_rate': 5.638828633405641e-06, 'epoch': 9.31}
{'loss': 0.5191, 'grad_norm': 6.027545928955078, 'learning_rate': 5.611713665943601e-06, 'epoch': 9.36}
{'loss': 0.5512, 'grad_norm': 5.026013374328613, 'learning_rate': 5.584598698481563e-06, 'epoch': 9.41}
{'loss': 0.5158, 'grad_norm': 3.1758065223693848, 'learning_rate': 5.5574837310195225e-06, 'epoch': 9.47}
{'loss': 0.5177, 'grad_norm': 4.798967361450195, 'learning_rate': 5.530368763557484e-06, 'epoch': 9.52}
{'loss': 0.5413, 'grad_norm': 4.277230262756348, 'learning_ra

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.49770984053611755, 'eval_runtime': 9.9691, 'eval_samples_per_second': 43.334, 'eval_steps_per_second': 21.667, 'epoch': 10.0}




{'loss': 0.5403, 'grad_norm': 6.465397357940674, 'learning_rate': 5.259219088937094e-06, 'epoch': 10.03}
{'loss': 0.5124, 'grad_norm': 4.562380313873291, 'learning_rate': 5.2321041214750545e-06, 'epoch': 10.08}
{'loss': 0.5267, 'grad_norm': 3.859264612197876, 'learning_rate': 5.204989154013016e-06, 'epoch': 10.13}
{'loss': 0.5429, 'grad_norm': 5.466004848480225, 'learning_rate': 5.1778741865509765e-06, 'epoch': 10.19}
{'loss': 0.5252, 'grad_norm': 3.6936070919036865, 'learning_rate': 5.150759219088938e-06, 'epoch': 10.24}
{'loss': 0.5494, 'grad_norm': 5.52163028717041, 'learning_rate': 5.1236442516268985e-06, 'epoch': 10.29}
{'loss': 0.5357, 'grad_norm': 6.809316635131836, 'learning_rate': 5.09652928416486e-06, 'epoch': 10.34}
{'loss': 0.5352, 'grad_norm': 4.302192211151123, 'learning_rate': 5.069414316702821e-06, 'epoch': 10.39}
{'loss': 0.5505, 'grad_norm': 3.9345309734344482, 'learning_rate': 5.042299349240781e-06, 'epoch': 10.44}
{'loss': 0.5374, 'grad_norm': 4.914389610290527, 'le

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.49749577045440674, 'eval_runtime': 10.3672, 'eval_samples_per_second': 41.67, 'eval_steps_per_second': 20.835, 'epoch': 11.0}




{'loss': 0.5187, 'grad_norm': 4.2929816246032715, 'learning_rate': 4.744034707158352e-06, 'epoch': 11.01}
{'loss': 0.5399, 'grad_norm': 4.2792253494262695, 'learning_rate': 4.716919739696313e-06, 'epoch': 11.06}
{'loss': 0.5238, 'grad_norm': 4.445128917694092, 'learning_rate': 4.6898047722342736e-06, 'epoch': 11.11}
{'loss': 0.526, 'grad_norm': 5.6370978355407715, 'learning_rate': 4.662689804772235e-06, 'epoch': 11.16}
{'loss': 0.5539, 'grad_norm': 3.4419546127319336, 'learning_rate': 4.6355748373101956e-06, 'epoch': 11.21}
{'loss': 0.5268, 'grad_norm': 3.517758846282959, 'learning_rate': 4.608459869848156e-06, 'epoch': 11.27}
{'loss': 0.5236, 'grad_norm': 4.773644924163818, 'learning_rate': 4.5813449023861175e-06, 'epoch': 11.32}
{'loss': 0.5295, 'grad_norm': 4.537932872772217, 'learning_rate': 4.554229934924079e-06, 'epoch': 11.37}
{'loss': 0.5115, 'grad_norm': 3.955444574356079, 'learning_rate': 4.5271149674620395e-06, 'epoch': 11.42}
{'loss': 0.5247, 'grad_norm': 4.27235746383667, 

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.49699103832244873, 'eval_runtime': 10.9825, 'eval_samples_per_second': 39.335, 'eval_steps_per_second': 19.668, 'epoch': 12.0}




{'loss': 0.5299, 'grad_norm': 4.762423992156982, 'learning_rate': 4.202819956616052e-06, 'epoch': 12.04}
{'loss': 0.5097, 'grad_norm': 4.261786937713623, 'learning_rate': 4.175704989154013e-06, 'epoch': 12.09}
{'loss': 0.5214, 'grad_norm': 3.915036916732788, 'learning_rate': 4.148590021691975e-06, 'epoch': 12.14}
{'loss': 0.5262, 'grad_norm': 4.811728477478027, 'learning_rate': 4.121475054229935e-06, 'epoch': 12.19}
{'loss': 0.5273, 'grad_norm': 15.730298042297363, 'learning_rate': 4.094360086767896e-06, 'epoch': 12.24}
{'loss': 0.5191, 'grad_norm': 3.6283211708068848, 'learning_rate': 4.067245119305857e-06, 'epoch': 12.29}
{'loss': 0.5172, 'grad_norm': 5.336327075958252, 'learning_rate': 4.040130151843818e-06, 'epoch': 12.35}
{'loss': 0.5283, 'grad_norm': 4.64885950088501, 'learning_rate': 4.0130151843817785e-06, 'epoch': 12.4}
{'loss': 0.5081, 'grad_norm': 7.08329439163208, 'learning_rate': 3.98590021691974e-06, 'epoch': 12.45}
{'loss': 0.5375, 'grad_norm': 3.620811700820923, 'learni

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.4937259256839752, 'eval_runtime': 9.5043, 'eval_samples_per_second': 45.453, 'eval_steps_per_second': 22.727, 'epoch': 13.0}




{'loss': 0.5091, 'grad_norm': 5.098037242889404, 'learning_rate': 3.6876355748373104e-06, 'epoch': 13.01}
{'loss': 0.5248, 'grad_norm': 4.7581892013549805, 'learning_rate': 3.6605206073752714e-06, 'epoch': 13.07}
{'loss': 0.5257, 'grad_norm': 5.6759114265441895, 'learning_rate': 3.6334056399132324e-06, 'epoch': 13.12}
{'loss': 0.5396, 'grad_norm': 4.132021903991699, 'learning_rate': 3.606290672451193e-06, 'epoch': 13.17}
{'loss': 0.5259, 'grad_norm': 8.249797821044922, 'learning_rate': 3.5791757049891544e-06, 'epoch': 13.22}
{'loss': 0.5159, 'grad_norm': 3.610530138015747, 'learning_rate': 3.5520607375271154e-06, 'epoch': 13.27}
{'loss': 0.5191, 'grad_norm': 8.475055694580078, 'learning_rate': 3.5249457700650764e-06, 'epoch': 13.32}
{'loss': 0.5416, 'grad_norm': 3.4401819705963135, 'learning_rate': 3.4978308026030374e-06, 'epoch': 13.37}
{'loss': 0.5228, 'grad_norm': 3.354264974594116, 'learning_rate': 3.470715835140998e-06, 'epoch': 13.43}
{'loss': 0.5093, 'grad_norm': 4.2071094512939

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.4942667782306671, 'eval_runtime': 10.5531, 'eval_samples_per_second': 40.936, 'eval_steps_per_second': 20.468, 'epoch': 14.0}




{'loss': 0.5279, 'grad_norm': 6.827444553375244, 'learning_rate': 3.14533622559653e-06, 'epoch': 14.04}
{'loss': 0.5203, 'grad_norm': 6.747674465179443, 'learning_rate': 3.1182212581344905e-06, 'epoch': 14.09}
{'loss': 0.5185, 'grad_norm': 6.584140300750732, 'learning_rate': 3.0911062906724515e-06, 'epoch': 14.15}
{'loss': 0.5353, 'grad_norm': 25.02531623840332, 'learning_rate': 3.0639913232104125e-06, 'epoch': 14.2}
{'loss': 0.5236, 'grad_norm': 5.718808174133301, 'learning_rate': 3.036876355748373e-06, 'epoch': 14.25}
{'loss': 0.5287, 'grad_norm': 3.2913262844085693, 'learning_rate': 3.009761388286334e-06, 'epoch': 14.3}
{'loss': 0.5111, 'grad_norm': 4.809070110321045, 'learning_rate': 2.982646420824295e-06, 'epoch': 14.35}
{'loss': 0.5315, 'grad_norm': 4.377767086029053, 'learning_rate': 2.9555314533622565e-06, 'epoch': 14.4}
{'loss': 0.5255, 'grad_norm': 3.9803125858306885, 'learning_rate': 2.9284164859002175e-06, 'epoch': 14.45}
{'loss': 0.5279, 'grad_norm': 4.7946672439575195, 'l

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.4921085238456726, 'eval_runtime': 10.4695, 'eval_samples_per_second': 41.263, 'eval_steps_per_second': 20.631, 'epoch': 15.0}




{'loss': 0.519, 'grad_norm': 7.500046253204346, 'learning_rate': 2.631236442516269e-06, 'epoch': 15.02}
{'loss': 0.5423, 'grad_norm': 4.3894572257995605, 'learning_rate': 2.6041214750542303e-06, 'epoch': 15.07}
{'loss': 0.5289, 'grad_norm': 3.636378049850464, 'learning_rate': 2.5770065075921913e-06, 'epoch': 15.12}
{'loss': 0.5149, 'grad_norm': 6.4018449783325195, 'learning_rate': 2.5498915401301523e-06, 'epoch': 15.17}
{'loss': 0.5211, 'grad_norm': 4.843406677246094, 'learning_rate': 2.522776572668113e-06, 'epoch': 15.23}
{'loss': 0.5211, 'grad_norm': 4.4747138023376465, 'learning_rate': 2.495661605206074e-06, 'epoch': 15.28}
{'loss': 0.5238, 'grad_norm': 4.378927707672119, 'learning_rate': 2.468546637744035e-06, 'epoch': 15.33}
{'loss': 0.5202, 'grad_norm': 7.364276885986328, 'learning_rate': 2.441431670281996e-06, 'epoch': 15.38}
{'loss': 0.5116, 'grad_norm': 4.228830337524414, 'learning_rate': 2.414316702819957e-06, 'epoch': 15.43}
{'loss': 0.5423, 'grad_norm': 6.150075912475586, '

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.49459895491600037, 'eval_runtime': 9.1183, 'eval_samples_per_second': 47.377, 'eval_steps_per_second': 23.689, 'epoch': 16.0}




{'loss': 0.5125, 'grad_norm': 5.255313396453857, 'learning_rate': 2.0889370932754884e-06, 'epoch': 16.05}
{'loss': 0.5115, 'grad_norm': 4.556352138519287, 'learning_rate': 2.061822125813449e-06, 'epoch': 16.1}
{'loss': 0.5081, 'grad_norm': 4.686473369598389, 'learning_rate': 2.0347071583514104e-06, 'epoch': 16.15}
{'loss': 0.5047, 'grad_norm': 8.198234558105469, 'learning_rate': 2.007592190889371e-06, 'epoch': 16.2}
{'loss': 0.5199, 'grad_norm': 4.915348529815674, 'learning_rate': 1.980477223427332e-06, 'epoch': 16.26}
{'loss': 0.5062, 'grad_norm': 6.680322170257568, 'learning_rate': 1.953362255965293e-06, 'epoch': 16.31}
{'loss': 0.5145, 'grad_norm': 7.817899227142334, 'learning_rate': 1.926247288503254e-06, 'epoch': 16.36}
{'loss': 0.5378, 'grad_norm': 3.675462007522583, 'learning_rate': 1.899132321041215e-06, 'epoch': 16.41}
{'loss': 0.5119, 'grad_norm': 3.4628615379333496, 'learning_rate': 1.8720173535791757e-06, 'epoch': 16.46}
{'loss': 0.5081, 'grad_norm': 3.6009714603424072, 'le

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.4930541515350342, 'eval_runtime': 10.9335, 'eval_samples_per_second': 39.512, 'eval_steps_per_second': 19.756, 'epoch': 17.0}




{'loss': 0.5168, 'grad_norm': 3.444053888320923, 'learning_rate': 1.5737527114967462e-06, 'epoch': 17.03}
{'loss': 0.5079, 'grad_norm': 5.668766021728516, 'learning_rate': 1.5466377440347072e-06, 'epoch': 17.08}
{'loss': 0.5072, 'grad_norm': 7.674752235412598, 'learning_rate': 1.5195227765726682e-06, 'epoch': 17.13}
{'loss': 0.5275, 'grad_norm': 5.401914119720459, 'learning_rate': 1.4924078091106292e-06, 'epoch': 17.18}
{'loss': 0.5026, 'grad_norm': 3.510282039642334, 'learning_rate': 1.46529284164859e-06, 'epoch': 17.23}
{'loss': 0.5088, 'grad_norm': 8.376921653747559, 'learning_rate': 1.438177874186551e-06, 'epoch': 17.28}
{'loss': 0.5121, 'grad_norm': 3.937462568283081, 'learning_rate': 1.4110629067245122e-06, 'epoch': 17.34}
{'loss': 0.5297, 'grad_norm': 7.1793718338012695, 'learning_rate': 1.383947939262473e-06, 'epoch': 17.39}
{'loss': 0.5053, 'grad_norm': 3.378804922103882, 'learning_rate': 1.3568329718004338e-06, 'epoch': 17.44}
{'loss': 0.5116, 'grad_norm': 4.921199321746826, 

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.4947637617588043, 'eval_runtime': 10.7026, 'eval_samples_per_second': 40.364, 'eval_steps_per_second': 20.182, 'epoch': 18.0}




{'loss': 0.5027, 'grad_norm': 7.282534599304199, 'learning_rate': 1.0585683297180043e-06, 'epoch': 18.0}
{'loss': 0.5113, 'grad_norm': 5.414830684661865, 'learning_rate': 1.0314533622559653e-06, 'epoch': 18.06}
{'loss': 0.5077, 'grad_norm': 4.596520900726318, 'learning_rate': 1.0043383947939263e-06, 'epoch': 18.11}
{'loss': 0.5143, 'grad_norm': 4.830936431884766, 'learning_rate': 9.772234273318873e-07, 'epoch': 18.16}
{'loss': 0.5025, 'grad_norm': 7.923271179199219, 'learning_rate': 9.501084598698482e-07, 'epoch': 18.21}
{'loss': 0.5141, 'grad_norm': 9.67483901977539, 'learning_rate': 9.229934924078092e-07, 'epoch': 18.26}
{'loss': 0.5015, 'grad_norm': 5.650562286376953, 'learning_rate': 8.958785249457701e-07, 'epoch': 18.31}
{'loss': 0.5122, 'grad_norm': 4.924361228942871, 'learning_rate': 8.687635574837311e-07, 'epoch': 18.36}
{'loss': 0.5139, 'grad_norm': 11.35871410369873, 'learning_rate': 8.416485900216921e-07, 'epoch': 18.42}
{'loss': 0.5147, 'grad_norm': 5.770892143249512, 'lear

  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.4936120808124542, 'eval_runtime': 9.2947, 'eval_samples_per_second': 46.478, 'eval_steps_per_second': 23.239, 'epoch': 19.0}




{'loss': 0.5206, 'grad_norm': 3.928406238555908, 'learning_rate': 5.162689804772235e-07, 'epoch': 19.03}
{'loss': 0.5239, 'grad_norm': 3.6950273513793945, 'learning_rate': 4.891540130151844e-07, 'epoch': 19.08}
{'loss': 0.5205, 'grad_norm': 5.919267177581787, 'learning_rate': 4.620390455531454e-07, 'epoch': 19.14}
{'loss': 0.5116, 'grad_norm': 4.1917524337768555, 'learning_rate': 4.3492407809110634e-07, 'epoch': 19.19}
{'loss': 0.5065, 'grad_norm': 6.075621128082275, 'learning_rate': 4.078091106290673e-07, 'epoch': 19.24}
{'loss': 0.5075, 'grad_norm': 6.048943519592285, 'learning_rate': 3.806941431670282e-07, 'epoch': 19.29}
{'loss': 0.5163, 'grad_norm': 3.5021588802337646, 'learning_rate': 3.5357917570498917e-07, 'epoch': 19.34}
{'loss': 0.5095, 'grad_norm': 3.5059444904327393, 'learning_rate': 3.264642082429501e-07, 'epoch': 19.39}
{'loss': 0.5161, 'grad_norm': 3.0514986515045166, 'learning_rate': 2.9934924078091105e-07, 'epoch': 19.44}
{'loss': 0.5076, 'grad_norm': 4.406310081481934

Non-default generation parameters: {'max_length': 1876}


  0%|          | 0/216 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1876}


{'eval_loss': 0.49357128143310547, 'eval_runtime': 10.8552, 'eval_samples_per_second': 39.797, 'eval_steps_per_second': 19.898, 'epoch': 20.0}
{'train_runtime': 2278.9013, 'train_samples_per_second': 34.104, 'train_steps_per_second': 4.265, 'train_loss': 0.5488485797442526, 'epoch': 20.0}


TrainOutput(global_step=9720, training_loss=0.5488485797442526, metrics={'train_runtime': 2278.9013, 'train_samples_per_second': 34.104, 'train_steps_per_second': 4.265, 'total_flos': 3299807655432288.0, 'train_loss': 0.5488485797442526, 'epoch': 20.0})

In [15]:
trainer.push_to_hub(repo_name)

Non-default generation parameters: {'max_length': 1876}


model.safetensors:   0%|          | 0.00/578M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1722163552.Hamza.5788.1:   0%|          | 0.00/94.1k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HamzaSidhu786/urdu_text_to_speech_tts/commit/8d263e4f4cfa99f118a457e8a32b5054c9d6c41b', commit_message='urdu_text_to_speech_tts', commit_description='', oid='8d263e4f4cfa99f118a457e8a32b5054c9d6c41b', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

processor = SpeechT5Processor.from_pretrained("HamzaSidhu786/urdu_text_to_speech_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("HamzaSidhu786/urdu_text_to_speech_tts")

special_tokens_map.json:   0%|          | 0.00/582 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/578M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

In [17]:
from transformers import SpeechT5HifiGan

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

In [54]:
example = dataset["train"][120]

In [55]:
speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)

In [56]:
imput_encoded = torch.tensor(example["input_ids"]).unsqueeze(0)

In [57]:
speech = model.generate_speech(imput_encoded, speaker_embeddings, vocoder=vocoder)

In [58]:
from IPython.display import Audio

Audio(speech, rate=16000)

In [62]:
spt5processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
encoded = processor(text = "وقت پر رقوم کبھی بھی واپس نہ کیں۔")

In [64]:
processor.decode(encoded["input_ids"])

'وقت پر رقوم کبھی بھی واپس نہ کیں۔</s>'