In [1]:
!pip install fairseq -q
!pip install g2p_en -q

[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [2]:
import torch

from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface


class TTSModel:
    """Wrapper around the fairseq FastSpeech 2 LJSpeech checkpoint with a HiFi-GAN vocoder.

    Besides plain text-to-speech (`full_tts`), it exposes the predicted
    per-token durations so they can be edited before synthesis
    (`simple_change` + `get_wav`).
    """

    def __init__(self):
        """Download/load the checkpoint and build the waveform generator."""
        models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
            "facebook/fastspeech2-en-ljspeech",
            arg_overrides={"vocoder": "hifigan", "fp16": False}
        )
        self.model = models[0]
        self.task = task
        # Inference only: switch to eval mode up front so that get_durations /
        # get_wav do not depend on full_tts having been called first.
        self.model.eval()

        TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
        self.generator = self.task.build_generator(models, cfg)

    def get_sample(self, text):
        """Convert raw `text` into the model input dict (contains "net_input")."""
        return TTSHubInterface.get_model_input(self.task, text)

    def get_durations(self, sample):
        """Return the predicted duration (in frames, float) of every source token.

        The duration predictor is trained on log(duration + 1), so the value in
        frames is exp(log_dur) - 1, floored at zero. This mirrors how fairseq
        derives durations internally; the previous exp(log_dur) was
        systematically one frame too long per token.
        """
        # encoder.forward args: src_tokens, src_lengths=None, speaker=None, durations=None, pitches=None, energies=None,
        with torch.no_grad():
            _, _, _, log_dur_out, _, _ = self.model.encoder(**sample["net_input"])
        return torch.clamp(torch.exp(log_dur_out) - 1, min=0)

    def simple_change(self, text, dur_factor=1., token_id=11):
        """Build a sample for `text` whose `token_id` tokens are stretched by `dur_factor`.

        Args:
            text: sentence to synthesise.
            dur_factor: multiplier applied to the duration of matching tokens.
            token_id: source-dictionary index to stretch (default 11 == ',').

        Returns:
            The model input dict with explicit integer `durations` added to
            "net_input"; pass it to `get_wav`.
        """
        sample = self.get_sample(text)
        durs = self.get_durations(sample)

        stretch_mask = sample["net_input"]["src_tokens"] == token_id
        durs = torch.where(stretch_mask, durs * dur_factor, durs)
        # Round (rather than truncate) so that dur_factor == 1 reproduces the
        # durations the model would have chosen on its own.
        # NOTE(review): padded positions are not zeroed here; get_sample is
        # assumed to yield a single, unpadded utterance - confirm before batching.
        sample["net_input"]["durations"] = torch.clamp(torch.round(durs), min=0).long()

        return sample

    def get_wav(self, sample):
        """Synthesise the first utterance of `sample`, honouring explicit durations.

        Returns:
            (waveform, sample_rate) for batch item 0.
        """
        encoder = self.model.encoder
        bsz = sample["net_input"]["src_tokens"].size(0)
        n_frames_per_step = encoder.n_frames_per_step
        raw_dim = encoder.out_dim // n_frames_per_step

        with torch.no_grad():
            feat, feat_post, out_lens, _, _, _ = encoder(**sample["net_input"])
            # Prefer the post-net output when the model provides one, as
            # fairseq's own generator does; otherwise keep the raw features.
            if feat_post is not None:
                feat = feat_post

            feat = feat.view(bsz, -1, raw_dim)
            feat = self.generator.gcmvn_denormalize(feat)

            # Only the first item is returned, so only the first item is vocoded.
            n_frames = int(out_lens[0]) * n_frames_per_step
            mel = feat[0, :n_frames] if n_frames > 0 else feat.new_zeros([1, raw_dim])
            waveform = self.generator.get_waveform(mel)

        return waveform, self.task.sr

    def full_tts(self, text):
        """Run the stock fairseq pipeline on `text`; returns (waveform, sample_rate)."""
        sample = self.get_sample(text)
        wav, rate = TTSHubInterface.get_prediction(self.task, self.model, self.generator, sample)
        return wav, rate
        
        

2024-03-21 11:44:03.088699: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-21 11:44:03.088829: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-21 11:44:03.260416: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Instantiate the fairseq wrapper once; the constructor loads the
# "facebook/fastspeech2-en-ljspeech" checkpoint from the Hugging Face hub.
tts = TTSModel()

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

hifigan.bin:   0%|          | 0.00/55.8M [00:00<?, ?B/s]

pytorch_model.pt:   0%|          | 0.00/495M [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

fbank_mfa_gcmvn_stats.npz:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

run_fast_speech_2.py:   0%|          | 0.00/306 [00:00<?, ?B/s]

hifigan.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/612 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/602 [00:00<?, ?B/s]



In [4]:
import IPython.display as ipd

# Reference sentence reused as the caption for every clip logged below.
text = "Hello, this is a test run"

# Stock fairseq pipeline: text -> waveform + sample rate.
wav, sr = tts.full_tts(text)
ipd.Audio(data=wav, rate=sr)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package cmudict to /usr/share/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [5]:
import wandb
from kaggle_secrets import UserSecretsClient


# The W&B API key is read from Kaggle's secret store, never hard-coded.
secret_label = "wandb_key"
secrets_client = UserSecretsClient()
secret_value = secrets_client.get_secret(secret_label)

# Authenticate, then open the run that collects all audio clips.
wandb.login(key=secret_value)
wandb.init(project="fastspeech_audio")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkvdmitrieva[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# Baseline through the custom path: explicit durations, no stretching.
baseline_sample = tts.simple_change(text)
wav, sr = tts.get_wav(baseline_sample)
ipd.Audio(data=wav, rate=sr)

In [7]:
# Log the unmodified fairseq clip.
fairseq_test_clip = wandb.Audio(wav.numpy(), caption=text, sample_rate=sr)
wandb.log({"fairseq model / test audio": fairseq_test_clip})

In [8]:
# Same sentence, but the comma token is held ten times longer.
stretched_sample = tts.simple_change(text, dur_factor=10.)
wav, sr = tts.get_wav(stretched_sample)
ipd.Audio(data=wav, rate=sr)

In [9]:
# Log the duration-stretched fairseq clip.
fairseq_aug_clip = wandb.Audio(wav.numpy(), caption=text, sample_rate=sr)
wandb.log({"fairseq model / aug audio": fairseq_aug_clip})

In [10]:
!git clone https://github.com/speechbrain/speechbrain.git
%cd speechbrain
!pip install -q -r requirements.txt
!pip install --editable .


  pid, fd = os.forkpty()


Cloning into 'speechbrain'...
remote: Enumerating objects: 78043, done.[K
remote: Counting objects: 100% (817/817), done.[K
remote: Compressing objects: 100% (506/506), done.[K
remote: Total 78043 (delta 356), reused 613 (delta 282), pack-reused 77226[K
Receiving objects: 100% (78043/78043), 85.93 MiB | 20.46 MiB/s, done.
Resolving deltas: 100% (52135/52135), done.
/kaggle/working/speechbrain
[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autopep8 2.0.4 requires pycodestyl

In [11]:
import torchaudio
from speechbrain.inference.TTS import FastSpeech2
from speechbrain.inference.vocoders import HIFIGAN


# Acoustic model (text -> mel spectrogram); files are cached under savedir.
fastspeech2 = FastSpeech2.from_hparams(
    source="speechbrain/tts-fastspeech2-ljspeech",
    savedir="pretrained_models/tts-fastspeech2-ljspeech",
)
# Vocoder (mel spectrogram -> waveform).
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="pretrained_models/tts-hifigan-ljspeech",
)


# example from doc
# input_phonemes = ['W', 'ER', 'DH', 'AH', 'L', 'IY', 'D', 'ER', 'Z', 'IH', 'N', 'DH', 'IH', 'S', 'L', 'AH', 'K', 'L', 'AH', 'S', 'CH', 'EY', 'N', 'JH', 'spn', 'DH', 'OW', 'AW', 'ER', 'OW', 'N', 'B', 'AE', 'S', 'K', 'ER', 'V', 'IH', 'L', 'spn', 'HH', 'UW', 'W', 'AA', 'Z', 'AE', 'T', 'W', 'ER', 'K', 'S', 'AH', 'M', 'Y', 'IH', 'R', 'Z', 'B', 'IH', 'F', 'AO', 'R', 'DH', 'EH', 'M', 'spn', 'W', 'EH', 'N', 'T', 'M', 'AH', 'CH', 'AA', 'N', 'DH', 'AH', 'S', 'EY', 'M', 'L', 'AY', 'N', 'Z', 'spn']
# mel_output, durations, pitch, energy = fastspeech2.encode_phoneme(
#   [input_phonemes],
#   pace=1.0,        # scale up/down the speed
#   pitch_rate=1.0,  # scale up/down the pitch
#   energy_rate=1.0, # scale up/down the energy
# )

# # Running Vocoder (spectrogram-to-waveform)
# waveforms = hifi_gan.decode_batch(mel_output)


hyperparams.yaml:   0%|          | 0.00/3.41k [00:00<?, ?B/s]

spn_predictor.ckpt:   0%|          | 0.00/76.4M [00:00<?, ?B/s]

model.ckpt:   0%|          | 0.00/181M [00:00<?, ?B/s]

hyperparams.yaml:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

model.ckpt:   0%|          | 0.00/129M [00:00<?, ?B/s]

ctc_lin.ckpt:   0%|          | 0.00/177k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

hyperparams.yaml:   0%|          | 0.00/1.16k [00:00<?, ?B/s]



generator.ckpt:   0%|          | 0.00/55.8M [00:00<?, ?B/s]

In [12]:
input_text = "Hello, this is a test run"

# Neutral prosody: all rate factors left at 1.0.
mel_output, durations, pitch, energy = fastspeech2.encode_text(
  [input_text],
  pace=1.0,
  pitch_rate=1.0,
  energy_rate=1.0,
)

waveforms = hifi_gan.decode_batch(mel_output)
# Use the LJSpeech rate (22050 Hz) explicitly instead of `sr`, which is a
# leftover from the unrelated fairseq model above.
ipd.Audio(waveforms.squeeze(1), rate=22050)

In [13]:
# Log the unmodified SpeechBrain clip. The sample rate is given explicitly
# (22050 Hz, LJSpeech) rather than borrowed from the fairseq model's `sr`.
wandb.log({
    "speechbrain model / test audio": wandb.Audio(
        waveforms.squeeze().numpy(), caption=text, sample_rate=22050
    )
})

In [14]:
input_text = "Hello, ; ; ; ; ; ; this is a test run."  # temporary workaround: extra punctuation to lengthen the pause

mel_output, durations, pitch, energy = fastspeech2.encode_text(
  [input_text],
  pace=1.0,
  pitch_rate=1.0,
  energy_rate=1.0,
)

waveforms = hifi_gan.decode_batch(mel_output)
# Use the LJSpeech rate (22050 Hz) explicitly instead of `sr`, which is a
# leftover from the unrelated fairseq model above.
ipd.Audio(waveforms.squeeze(1), rate=22050)

In [15]:
# Log the pause-augmented SpeechBrain clip. The sample rate is given explicitly
# (22050 Hz, LJSpeech) rather than borrowed from the fairseq model's `sr`.
wandb.log({
    "speechbrain model / aug audio": wandb.Audio(
        waveforms.squeeze().numpy(), caption=text, sample_rate=22050
    )
})

In [16]:
from transformers import FastSpeech2ConformerTokenizer, FastSpeech2ConformerModel, FastSpeech2ConformerHifiGan


# Hub identifiers of the acoustic model (shared with its tokenizer) and the vocoder.
ESPNET_ACOUSTIC_CKPT = "espnet/fastspeech2_conformer"
ESPNET_VOCODER_CKPT = "espnet/fastspeech2_conformer_hifigan"

tokenizer = FastSpeech2ConformerTokenizer.from_pretrained(ESPNET_ACOUSTIC_CKPT)
model = FastSpeech2ConformerModel.from_pretrained(ESPNET_ACOUSTIC_CKPT)
hifigan = FastSpeech2ConformerHifiGan.from_pretrained(ESPNET_VOCODER_CKPT)

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/809 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/109 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/281M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/55.8M [00:00<?, ?B/s]

Some weights of FastSpeech2ConformerHifiGan were not initialized from the model checkpoint at espnet/fastspeech2_conformer_hifigan and are newly initialized: ['mean', 'scale']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
input_text = "Hello, this is a test run."

model.eval()
input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"]

# Pure inference: skip building the autograd graph for both the acoustic
# model and the vocoder.
with torch.no_grad():
    output_dict = model(input_ids, return_dict=True)
    waveform = hifigan(output_dict["spectrogram"])

# The predicted durations / pitch / energy listed here are reused by the
# duration-stretching cell further down.
print(output_dict.keys())

ipd.Audio(waveform.detach().numpy(), rate=22050)

odict_keys(['spectrogram', 'encoder_last_hidden_state', 'duration_outputs', 'pitch_outputs', 'energy_outputs'])


In [18]:
# Log the unmodified ESPnet clip.
espnet_test_clip = wandb.Audio(
    waveform.squeeze().detach().numpy(), caption=text, sample_rate=22050
)
wandb.log({"espnet model / test audio": espnet_test_clip})

In [19]:
import torch

STRETCH_TOKEN_ID = 23  # NOTE(review): presumably the id of ',' in this tokenizer - confirm
STRETCH_FACTOR = 10
N_MEL_BINS = 80        # NOTE(review): assumed mel dimension of the model - confirm against its config

# Start from the model's own predictions and lengthen only the chosen token.
new_durations = output_dict["duration_outputs"].clone()
new_durations[input_ids == STRETCH_TOKEN_ID] *= STRETCH_FACTOR

# NOTE(review): train mode appears to be what makes the model consume the
# *_labels arguments instead of its own predictions - confirm against the
# Transformers source. It also activates training-only layers, so eval mode is
# restored afterwards to avoid leaving the model in train mode for later cells
# or re-runs.
model.train()
try:
    with torch.inference_mode():
        output_dict_2 = model(input_ids,
                              # Dummy target: only its length (total frame count) matters here.
                              spectrogram_labels=torch.zeros((1, int(new_durations.sum()), N_MEL_BINS)),
                              duration_labels=new_durations,
                              pitch_labels=output_dict["pitch_outputs"],
                              energy_labels=output_dict["energy_outputs"],
                              return_dict=True)

        waveform_2 = hifigan(output_dict_2["spectrogram"])
finally:
    model.eval()
ipd.Audio(waveform_2.detach().numpy(), rate=22050)

In [20]:
# Log the duration-stretched ESPnet clip.
espnet_aug_clip = wandb.Audio(
    waveform_2.squeeze().detach().numpy(), caption=text, sample_rate=22050
)
wandb.log({"espnet model / aug audio": espnet_aug_clip})