In [None]:
import librosa

# Load librosa's bundled "trumpet" example clip: the waveform samples plus the
# sampling rate they were decoded at.
trumpet_path = librosa.ex("trumpet")
array, sampling_rate = librosa.load(trumpet_path)

# Waveform shape and sampling rate, followed by the clip duration in seconds
# (number of samples divided by samples-per-second).
print(array.shape, sampling_rate)
print(len(array) / sampling_rate)

In [None]:
import matplotlib.pyplot as plt
import librosa.display

# Create the figure first and widen it so the waveform is not squeezed horizontally.
waveform_fig = plt.figure()
waveform_fig.set_figwidth(12)

# Draw the clip's amplitude over time on the current figure.
librosa.display.waveshow(array, sr=sampling_rate)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Analyse only a short excerpt: the first 4096 samples of the trumpet clip.
# A single DFT describes one fixed stretch of signal, and the notes that follow
# this cell describe the plot as the spectrum of those first 4096 samples, not
# of the whole recording.
dft_input = array[:4096]

# calculate the DFT
# The Hann window tapers both ends of the excerpt towards zero, which reduces
# the spectral leakage caused by cutting the signal at arbitrary points.
window = np.hanning(len(dft_input))
windowed_input = dft_input * window
# rfft returns only the non-negative frequency bins of the transform.
dft = np.fft.rfft(windowed_input)

# get the amplitude spectrum in decibels
# ref=np.max expresses every bin relative to the strongest one (which is 0 dB).
amplitude = np.abs(dft)
amplitude_db = librosa.amplitude_to_db(amplitude, ref=np.max)

# get the frequency bins
# n_fft is the excerpt length so that there is one frequency value per rfft bin.
frequency = librosa.fft_frequencies(sr=sampling_rate, n_fft=len(dft_input))

# Plot amplitude against frequency on a logarithmic frequency axis.
plt.figure().set_figwidth(12)
plt.plot(frequency, amplitude_db)
plt.xlabel("Frequency (Hz)")
plt.ylabel("Amplitude (dB)")
plt.xscale("log")


The frequency spectrum of an audio signal contains the exact same information as its waveform — they are simply two different ways of looking at the same data (here, the first 4096 samples from the trumpet sound). Where the waveform plots the amplitude of the audio signal over time, the spectrum visualizes the amplitudes of the individual frequencies at a fixed point in time.

In [1]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

# The processor and the text-to-speech model are loaded from the same checkpoint.
tts_checkpoint = "microsoft/speecht5_tts"

processor = SpeechT5Processor.from_pretrained(
    tts_checkpoint,
    do_normalize=True,
)
model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)

In [23]:
# Passage to synthesise. Adjacent string literals are joined by Python at compile
# time, so this is a single string with one space between sentences.
text = (
    "OLTP-based relational databases are, by definition, meant for transactional loads. "
    "For analytical loads, data lakes, data warehouses, data marts, there’s another list of databases. "
    "In theory, you can create data warehouses using OLTP databases, but at scale, it never ends well. "
    "Been there, done that."
)

In [24]:

# Shorter alternative input for quick experiments (uncomment to use):
# text = "Don't count the days, make the days count. 1000 days past so fast, 1001 even faster"

# Turn the text into model inputs, returned as PyTorch tensors.
inputs = processor(text=text, return_tensors="pt")

# Show the full encoding, then compare the token-id tensor's shape with the
# number of characters: the tokenizer is character based, so the two are close.
print(inputs)
print(inputs["input_ids"].size())
print(len(text))

{'input_ids': tensor([[ 4, 50, 52, 32, 49, 39, 25,  7, 12,  5, 14,  4, 13,  5, 15,  7,  6, 10,
          8,  9,  7, 15,  4, 14,  7,  6,  7, 25,  7, 12,  5, 12,  4,  7, 13,  5,
         23,  4, 25, 22,  4, 14,  5, 19, 10,  9, 10,  6, 10,  8,  9, 23,  4, 18,
          5,  7,  9,  6,  4, 19,  8, 13,  4,  6, 13,  7,  9, 12,  7, 17,  6, 10,
          8,  9,  7, 15,  4, 15,  8,  7, 14, 12, 26,  4, 55,  8, 13,  4,  7,  9,
          7, 15, 22,  6, 10, 17,  7, 15,  4, 15,  8,  7, 14, 12, 23,  4, 14,  7,
          6,  7,  4, 15,  7, 28,  5, 12, 23,  4, 14,  7,  6,  7,  4, 20,  7, 13,
          5, 11,  8, 16, 12,  5, 12, 23,  4, 14,  7,  6,  7,  4, 18,  7, 13,  6,
         12, 23,  4,  6, 11,  5, 13,  5,  3, 12,  4,  7,  9,  8,  6, 11,  5, 13,
          4, 15, 10, 12,  6,  4,  8, 19,  4, 14,  7,  6,  7, 25,  7, 12,  5, 12,
         26,  4, 30,  9,  4,  6, 11,  5,  8, 13, 22, 23,  4, 22,  8, 16,  4, 17,
          7,  9,  4, 17, 13,  5,  7,  6,  5,  4, 14,  7,  6,  7,  4, 20,  7, 13,
          5, 1

In [None]:
# Round-trip check: decode the first sequence of token ids in the batch back to text.
first_sequence_ids = inputs["input_ids"][0]
processor.tokenizer.decode(first_sequence_ids)

In [None]:
# get_vocab() maps token string -> id; building a set from the mapping keeps
# only its keys, i.e. the token strings.
tokenizer_vocab = set(processor.tokenizer.get_vocab())

# The tokens themselves, then how many there are.
print(tokenizer_vocab)
print(len(tokenizer_vocab))

In [18]:
from datasets import load_dataset

# Validation split of the precomputed x-vector speaker embeddings dataset.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)

# Display the dataset summary (features and row count).
embeddings_dataset

Dataset({
    features: ['filename', 'xvector'],
    num_rows: 7931
})

In [25]:
import torch

# Pick one stored x-vector. Row 7000 selects a particular voice; which speaker
# that row belongs to is not shown in this notebook.
xvector = embeddings_dataset[7000]["xvector"]

# Add a leading batch axis so the embedding is 2-D.
speaker_embeddings = torch.tensor(xvector).unsqueeze(dim=0)
speaker_embeddings.shape

torch.Size([1, 512])

In [26]:
from transformers import SpeechT5HifiGan

# Vocoder checkpoint; it is handed to model.generate_speech(...) in the cells below.
vocoder = SpeechT5HifiGan.from_pretrained(
    "microsoft/speecht5_hifigan",
)

In [27]:
# Synthesise the tokenised text in the voice described by the dataset x-vector.
speech = model.generate_speech(
    input_ids=inputs["input_ids"],
    speaker_embeddings=speaker_embeddings,
    vocoder=vocoder,
)

In [28]:
from IPython.display import Audio

# Embed an audio player for the generated speech.
# NOTE(review): 16000 Hz is assumed to be the vocoder's output rate — confirm.
Audio(data=speech, rate=16000)

In [34]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Prefer the GPU when torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained speaker encoder; its files are stored under /tmp/<model name>.
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("/tmp", spk_model_name),
    run_opts={"device": device},
)


def create_speaker_embedding(waveform):
    """Compute a unit-length speaker embedding for ``waveform``.

    Args:
        waveform: Audio samples accepted by ``torch.tensor`` (for example a
            1-D NumPy array). NOTE(review): the sampling rate the speaker model
            expects is not checked here — confirm the caller resamples first.

    Returns:
        A NumPy array holding the embedding with all size-1 axes removed. For
        the single recording embedded later in this notebook the shape is
        ``(512,)``.
    """
    # Inference only: skip autograd bookkeeping for the encoder call.
    with torch.no_grad():
        raw_embedding = speaker_model.encode_batch(torch.tensor(waveform))
        # Scale each embedding vector (along axis 2) to unit L2 norm.
        unit_embedding = torch.nn.functional.normalize(raw_embedding, dim=2)

    # Drop singleton axes and hand back a CPU-side NumPy array.
    return unit_embedding.squeeze().cpu().numpy()

torchvision is not available - cannot save figures


In [30]:
import librosa

In [32]:
from IPython.display import Audio
# Decode the reference recording at 22.05 kHz and inspect what came back.
y, sr = librosa.load("../data/quran.wav", sr=22050)
print(type(y))
print(y.shape)

# Resample to 16 kHz before computing the speaker embedding, then play it.
# NOTE(review): 16 kHz is presumably the rate the speaker encoder expects — confirm.
w_resample = librosa.resample(y, orig_sr=sr, target_sr=16000)
Audio(data=w_resample, rate=16000)


<class 'numpy.ndarray'>
(2646000,)


In [35]:
# Speaker embedding of the resampled reference recording.
emd = create_speaker_embedding(waveform=w_resample)

# Shape of the resulting embedding vector.
print(emd.shape)

(512,)


In [36]:
# Synthesise the same text with the embedding from the reference recording.
# The flat embedding vector gets a leading batch axis first, matching the 2-D
# embedding passed for the dataset voice.
speech = model.generate_speech(
    input_ids=inputs["input_ids"],
    speaker_embeddings=torch.tensor(emd).unsqueeze(0),
    vocoder=vocoder,
)

In [37]:
from IPython.display import Audio

# Embed an audio player for the speech generated with the custom speaker embedding.
# NOTE(review): 16000 Hz is assumed to be the vocoder's output rate — confirm.
Audio(data=speech, rate=16000)