In [3]:
from datasets import load_dataset,Dataset, Audio
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import SpeechT5Processor
import os
import torch
from speechbrain.pretrained import EncoderClassifier
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer


dataset_stream = load_dataset("facebook/voxpopuli", "en", streaming=True)

dataset = {
    'train': dataset_stream["train"],
    'validation': dataset_stream["validation"],
    'test': dataset_stream["test"]
}

# Cast the audio column for preprocessing
dataset['train'] = dataset['train'].cast_column("audio", Audio(sampling_rate=16000))
dataset['validation'] = dataset['validation'].cast_column("audio", Audio(sampling_rate=16000))
dataset['test'] = dataset['test'].cast_column("audio", Audio(sampling_rate=16000))


train_dataset = list(dataset_stream["train"].take(4000))
validation_dataset = list(dataset_stream["validation"].take(500))
test_dataset = list(dataset_stream["test"].take(500))

dataset = {
    'train': Dataset.from_list(train_dataset),
    'validation': Dataset.from_list(validation_dataset),
    'test': Dataset.from_list(test_dataset)
}
train_dataset = dataset['train']
valid_dataset = dataset['validation']
test_dataset = dataset['test']

Downloading readme:   0%|          | 0.00/10.7k [00:00<?, ?B/s]



In [4]:
model_path = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(model_path)

model = SpeechT5ForTextToSpeech.from_pretrained(model_path)
model.config.use_cache=False

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

In [6]:
spk_model_name="speechbrain/spkrec-xvect-voxceleb"

device="cuda"
speaker_model=EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)


def get_embedding(wave):
    with torch.no_grad():
        embeddings=speaker_model.encode_batch(torch.tensor(wave))
        embeddings=torch.nn.functional.normalize(embeddings, dim=2)
        embeddings=embeddings.squeeze().cpu().numpy()
    return embeddings


def apply_preprocess(row):
    audio=row["audio"]
    
    row=processor(
        text=row["normalized_text"],
        audio_target=audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )
    row["labels"]=row["labels"][0]
    row["my_embeddings"] = get_embedding(audio["array"])

    return row

hyperparams.yaml:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

embedding_model.ckpt:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

In [8]:
# applying the processing function to the entire dataset
train_dataset=train_dataset.map(apply_preprocess)
valid_dataset=valid_dataset.map(apply_preprocess)
test_dataset=test_dataset.map(apply_preprocess)
# Columns to remove
columns_to_remove = [
    "audio_id",
    "language",
    "audio",
    "raw_text",
    "normalized_text",
    "gender",
    "speaker_id",
    "is_gold_transcript",
    "accent",
]

# Remove columns
train_dataset = train_dataset.remove_columns(columns_to_remove)
test_dataset = test_dataset.remove_columns(columns_to_remove)
valid_dataset = valid_dataset.remove_columns(columns_to_remove)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 600). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [9]:
len(train_dataset)

4000

In [10]:
def clip(input_ids):
    return len(input_ids) < 180

train_dataset=train_dataset.filter(clip, input_columns=["input_ids"])
test_dataset=test_dataset.filter(clip, input_columns=["input_ids"])
valid_dataset=valid_dataset.filter(clip, input_columns=["input_ids"])

Filter:   0%|          | 0/4000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

In [11]:
@dataclass
class Pad_collator:
    
    processor: Any
    
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_ids=[{"input_ids":feature["input_ids"]} for feature in features]
        label_features=[{"input_values":feature["labels"]} for feature in features]
        embeded_features = [feature["my_embeddings"] for feature in features]
        
        batch=processor.pad(input_ids=input_ids, labels=label_features, return_tensors="pt")
        
        batch["labels"]=batch["labels"].masked_fill(batch.decoder_attention_mask.unsqueeze(-1).ne(1),-100)
        del batch["decoder_attention_mask"]
        
        if model.config.reduction_factor>1:
            target_lengths=torch.tensor([len(feature["input_values"]) for feature in label_features])
            target_lengths=target_lengths.new(
                [length-length%model.config.reduction_factor for length in target_lengths]
            )
            max_length=max(target_lengths)
            batch["labels"]=batch["labels"][:, :max_length]
        
        batch["my_embeddings"]=torch.tensor(embeded_features)
        
        return batch

In [12]:
collator = Pad_collator(processor=processor)

In [20]:
training_args=Seq2SeqTrainingArguments(
    output_dir="sohail3",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    learning_rate=0.0002,
    warmup_steps=50,
    max_steps=250,
    gradient_checkpointing=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=2,
    save_steps=100,
    eval_steps=50,
    logging_steps=25,
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
    push_to_hub=False,
)

trainer=Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=collator,
    tokenizer=processor,
)

trainer.train()

Step,Training Loss,Validation Loss
50,0.5717,0.511195
100,0.5235,0.494162
150,0.5013,0.479873
200,0.4858,0.474672
250,0.4771,0.471012


TrainOutput(global_step=250, training_loss=0.5293196563720703, metrics={'train_runtime': 1156.9723, 'train_samples_per_second': 13.829, 'train_steps_per_second': 0.216, 'total_flos': 2217259472950368.0, 'train_loss': 0.5293196563720703, 'epoch': 5.92})

In [29]:
trainer.evaluate(test_dataset)

AttributeError: 'SpeechT5Model' object has no attribute 'evaluate'

In [30]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torchaudio

checkpoint_path = "/kaggle/working/sohail/checkpoint-200"

processor = SpeechT5Processor.from_pretrained("sohail2003/pattern3.1")
model = SpeechT5ForTextToSpeech.from_pretrained("sohail2003/pattern3.1")

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)

def create_speaker_embedding(wave):
    with torch.no_grad():
        my_embeddings = speaker_model.encode_batch(torch.tensor(wave).to(device))
        my_embeddings = torch.nn.functional.normalize(my_embeddings, dim=2)
        return my_embeddings.squeeze().cpu().numpy()

def prepare_dataset(example):
    audio = example["audio"]
    example = processor(
        text=example["normalized_text"],
        audio_target=audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )
    example["labels"] = example["labels"][0]

    example["my_embeddings"] = create_speaker_embedding(audio["array"])
    return example

def generate_random_speaker_embedding():
    return torch.randn(1, 512).float()

def generate_speech(text):
    speaker_embedding = generate_random_speaker_embedding()
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
    if speech.dim() == 1:  
        speech = speech.unsqueeze(0)
    torchaudio.save("output.wav", speech, 16000)
    print("Speech saved to 'output.wav'")
    return speech


text_input = "why are you doing that are you crazy "
generated_speech = generate_speech(text_input)
from IPython.display import Audio
Audio("output.wav")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Speech saved to 'output.wav'
