In [1]:
import IPython.display as ipd

import torch
import torchaudio
import pathlib

import textless
from textless import dispatch_dense_model, dispatch_quantizer
from textless.data.speech_encoder import SpeechEncoder
from textless.data.quantized_datasets import QuantizedLibriSpeech
from textless.vocoders.tacotron2.vocoder import TacotronVocoder

Let us configure which dense model and quantizer we will use:

In [2]:
# Names and vocabulary size that select the pretrained models used in the cells below.
dense_model_name = "hubert-base-ls960" # another value noted by the author: "cpc-big-ll6k"
quantizer_name = "kmeans"
vocab_size = 200 # one of [50, 100, 200]

We can initialise a SpeechEncoder by name; this way the corresponding checkpoint will be downloaded.

In [3]:
# Gather the encoder options first, then build the encoder by name and move it to the GPU.
encoder_options = dict(
    dense_model_name=dense_model_name,
    quantizer_model_name=quantizer_name,
    vocab_size=vocab_size,
    need_f0=False,
    deduplicate=True,
    f0_normalizer=None,
    f0_quantizer=None,
)
encoder = SpeechEncoder.by_name(**encoder_options).cuda()

We will use the LibriSpeech dataset for our example. We can start with a vanilla version of it, load a single example and listen to it:

In [4]:
# Create the dataset directory; nothing happens if it already exists.
# Pure Python instead of a shell call, so the cell does not depend on a POSIX `mkdir -p`.
pathlib.Path("datasets").mkdir(parents=True, exist_ok=True)

In [5]:
# Plain torchaudio LibriSpeech dataset ("dev-clean"), rooted at ./datasets.
# download=True asks torchaudio to fetch the data — presumably only when it is
# not already under `root`; confirm against the torchaudio docs.
raw_dataset = torchaudio.datasets.LIBRISPEECH(
    root="./datasets",
    url="dev-clean",
    download=True,
)

In [6]:
# Take one example; its first two fields are the waveform and its sample rate.
example = raw_dataset[7]
audio, input_sample_rate = example[:2]
audio

tensor([[-0.0003, -0.0006, -0.0006,  ..., -0.0003, -0.0003, -0.0003]])

In [7]:
# Play the original waveform at the sample rate that was returned with it.
ipd.Audio(audio, rate=input_sample_rate)

textless-lib provides a simple wrapper around it that returns a "textless" representation of the datapoints:

In [8]:
# Wrap the same LibriSpeech subset ("dev-clean" under ./datasets) with the encoder.
# download=False here; the raw dataset cell above already requested the download.
dataset = QuantizedLibriSpeech(
    encoder,
    root="./datasets",
    url="dev-clean",
    download=False,
)

The datapoints are encoded using the provided SpeechEncoder model; each datapoint is a dictionary with a few key-value pairs:

In [9]:
# Fetch index 7 (the same index as the raw example above) and list the fields
# of the encoded datapoint.
datum = dataset[7]
datum.keys()

dict_keys(['units', 'durations', 'dense', 'rest'])

For instance, 'units' values contain discrete tokens and 'durations' encode their corresponding durations:

In [10]:
# First ten entries of the 'units' field.
datum['units'][:10]

tensor([ 14, 131, 191,  11,  22,  86,  22, 125,  10, 154], dtype=torch.int32)

In [11]:
# First ten entries of the 'durations' field.
datum['durations'][:10]

tensor([4, 8, 8, 2, 1, 1, 1, 1, 1, 2])

Now let us initialise a corresponding Tacotron instance with a matching configuration:

In [12]:
# Build the vocoder from the same dense model / quantizer / vocabulary size
# that were used for the encoder, and move it to the GPU.
vocoder = TacotronVocoder.by_name(
    dense_model_name,
    quantizer_name,
    vocab_size,
).cuda()

In [13]:
# Run the vocoder on the discrete units, then bring the result back to the CPU
# as a NumPy array.
resynth_audio = vocoder(datum['units']).cpu().numpy()

In [14]:
# Listen to the resynthesised audio at the vocoder's output sample rate.
ipd.Audio(resynth_audio, rate=vocoder.output_sample_rate)

In [15]:
import sys

# `sampler` is imported from the examples/gslm directory located next to the
# `textless` package, so that directory is added to sys.path first.
gslm_examples_dir = str(pathlib.Path(textless.__path__[0]).parent / 'examples' / 'gslm')
if gslm_examples_dir not in sys.path:  # re-running this cell must not add duplicate entries
    sys.path.append(gslm_examples_dir)
from sampler import UnitLanguageModelSampler

In [16]:
! mkdir -p LM && \
    wget https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/lm_km200/hubert200_lm.tgz -O LM/hubert200_lm.tgz && \
    cd LM/ && \
    tar -xvf hubert200_lm.tgz 

--2022-02-13 10:47:44--  https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/lm_km200/hubert200_lm.tgz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 172.67.9.4, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1450463515 (1.4G) [application/gzip]
Saving to: ‘LM/hubert200_lm.tgz’


2022-02-13 10:48:08 (60.0 MB/s) - ‘LM/hubert200_lm.tgz’ saved [1450463515/1450463515]

hubert200_lm/
hubert200_lm/dict.txt
hubert200_lm/checkpoint_best.pt


In [17]:
# Use the first 5 seconds of the example as the prompt (samples run along the second axis).
prompt_seconds = 5
prompt = audio[:, : prompt_seconds * input_sample_rate]
ipd.Audio(prompt, rate=input_sample_rate)

In [18]:
# Encode the prompt; the 'units' entry of the result is what the following cells use.
encoded = encoder(prompt)
units = encoded['units']
units

tensor([ 14, 131, 191,  11,  22, 125,  22, 125,  10, 154,  46,  49,  50,  12,
         93,  66,  31, 127, 160,  17, 112,  23,  96,  12, 172,  85,  89,  31,
         46, 190,  33,   9,  87, 157,  41, 136,   1, 111,  19, 141, 120, 152,
        133,  57, 113,  28,   1, 151, 192,  87,  19, 152,  36, 162, 166, 191,
          8,  11, 149, 125,   8, 125,  22, 125,  89, 174,  37,  79, 143, 104,
        136, 115, 172,  13, 156,  44, 187,  79, 104, 109,  38, 119,  51, 182,
         93,  66, 196, 128, 199,  33, 169, 136, 172,  71,  31, 144,  61, 198,
         12,  85,  89,  31, 100, 115, 177, 106, 193,  72, 170,  78, 111,  19,
         15,  41, 115,  54, 177, 106, 193, 148,  35,  69, 127, 170,   1,  95,
         30,  39, 152,  36, 149, 197,  20, 125,  20, 137,  92],
       device='cuda:0', dtype=torch.int32)

In [19]:
# Resynthesise the prompt from its units (moved to CPU / NumPy) and listen to it.
resynth_prompt = vocoder(units).cpu().numpy()
ipd.Audio(resynth_prompt, rate=vocoder.output_sample_rate)

In [20]:
# Load the unit language model sampler from the directory the archive was extracted to.
sampler = UnitLanguageModelSampler.from_pretrained(model_name_or_path="LM/hubert200_lm")

In [21]:
# Options passed to `sampler.sample(...)` as keyword arguments.
sampling_kwargs = dict(
    temperature=0.7,
    sampling=True,
    beam=1,
    prefix_size=-1,
    max_len_a=0.0,
    max_len_b=400,
)

In [22]:
# Serialise the prompt units as a space-separated string; that is what the sampler is given.
unit_str = " ".join(str(unit) for unit in units.tolist())
sampled_unit_str = sampler.sample([unit_str], **sampling_kwargs)[0]
# Parse the sampled string back into an integer tensor on the GPU.
continuation = torch.tensor(list(map(int, sampled_unit_str.split()))).cuda()

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1631630839582/work/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [23]:
# Display the sampled unit sequence; in the recorded output it begins with the prompt units.
continuation

tensor([ 14, 131, 191,  11,  22, 125,  22, 125,  10, 154,  46,  49,  50,  12,
         93,  66,  31, 127, 160,  17, 112,  23,  96,  12, 172,  85,  89,  31,
         46, 190,  33,   9,  87, 157,  41, 136,   1, 111,  19, 141, 120, 152,
        133,  57, 113,  28,   1, 151, 192,  87,  19, 152,  36, 162, 166, 191,
          8,  11, 149, 125,   8, 125,  22, 125,  89, 174,  37,  79, 143, 104,
        136, 115, 172,  13, 156,  44, 187,  79, 104, 109,  38, 119,  51, 182,
         93,  66, 196, 128, 199,  33, 169, 136, 172,  71,  31, 144,  61, 198,
         12,  85,  89,  31, 100, 115, 177, 106, 193,  72, 170,  78, 111,  19,
         15,  41, 115,  54, 177, 106, 193, 148,  35,  69, 127, 170,   1,  95,
         30,  39, 152,  36, 149, 197,  20, 125,  20, 137,  92,  14, 131, 132,
         22, 125, 110,  18,  82, 182, 183,   3,  81, 170,  79,  71,  73, 109,
          3, 194,  12,  87, 157, 193, 170,  25,  51,  63,  93, 168,  19, 120,
         48,  62, 121,  59,  27, 115,  85, 154,  46, 148,  35,  

In [24]:
# Resynthesise the sampled units and keep at most 10 seconds' worth of samples
# (assuming the vocoder output is a 1-D waveform — TODO confirm).
max_seconds = 10
max_samples = max_seconds * vocoder.output_sample_rate
resynth_continuation = vocoder(continuation).cpu().numpy()[:max_samples]
ipd.Audio(resynth_continuation, rate=vocoder.output_sample_rate)