In [1]:
import json
import torch
import re
from pathlib import Path
from src.models.hifi_gan.models import Generator, load_model as load_hifi
from src.train_config import TrainParams, load_config
from src.preprocessing.text.cleaners import english_cleaners
import subprocess
from scipy.io.wavfile import write as wav_write
from tqdm.notebook import tqdm


In [2]:
# Load the run configuration from configs/esd_tune.yml. Later cells read
# `device`, `checkpoint_name`, `train_hifi.model_param` and `n_mels` from it.
config = load_config("configs/esd_tune.yml")

In [3]:
# Device taken from the config; the feature model, the vocoder and the input
# tensors are all placed on it in the cells below.
device = config.device

In [4]:
# Root directory of this run's checkpoints; the "feature" and "hifi"
# sub-directories are read from it further down.
checkpoint_path = Path(f"checkpoints/{config.checkpoint_name}")

In [5]:
# Collect every vocoder generator checkpoint: any path under <checkpoint>/hifi
# whose name contains a dot (the "*.*" glob) and starts with "g_".
generators = list(
    filter(
        lambda candidate: candidate.name.startswith("g_"),
        (checkpoint_path / "hifi").rglob("*.*"),
    )
)

In [6]:
# Model archive handed to `mfa g2p` by `text_to_file`.
G2P_MODEL_PATH = "models/en/g2p/english_g2p.zip"
# File `mfa g2p` writes its result to; also the default input of `parse_g2p`.
G2P_OUTPUT_PATH = "predictions/to_g2p.txt"

In [7]:
def text_to_file(user_query: str) -> None:
    """Run grapheme-to-phoneme conversion for ``user_query`` via ``mfa g2p``.

    The query is normalized with ``english_cleaners``, reduced to its runs of
    ASCII letters joined by single spaces, and written to ``tmp.txt`` in the
    current working directory. ``mfa g2p`` is then invoked with
    ``G2P_MODEL_PATH`` as the model and ``G2P_OUTPUT_PATH`` as the output
    file. The temporary file is removed afterwards, even if a step fails.

    Args:
        user_query: Raw text to convert.

    Raises:
        subprocess.CalledProcessError: If ``mfa g2p`` exits with a non-zero
            status, so that a stale output file is not parsed by mistake.
        FileNotFoundError: If the ``mfa`` executable cannot be found.
    """
    text_path = Path("tmp.txt")
    # Normalize before touching the filesystem so a failure in the cleaner
    # does not leave an empty temporary file behind.
    normalized_content = english_cleaners(user_query)
    normalized_content = " ".join(re.findall("[a-zA-Z]+", normalized_content))
    try:
        with open(text_path, "w") as fout:
            fout.write(normalized_content)
        # `check=True` surfaces a failed G2P run instead of ignoring the exit code.
        subprocess.run(
            ["mfa", "g2p", G2P_MODEL_PATH, str(text_path.absolute()), G2P_OUTPUT_PATH],
            check=True,
        )
    finally:
        # Always clean up; the file may be missing if opening it failed.
        if text_path.exists():
            text_path.unlink()

In [8]:
# Hand-written pronunciations that take precedence over the G2P output for
# these words (matched against the first tab-separated column of each line).
default = {"he": "HH IY1", "she": "SH IY1", "we": "W IY1", "be": "B IY0", "the": "DH AH0", "whenever": "W EH0 N EH1 V ER0", "year": "AH0 Y IH1 R"}


def parse_g2p(PHONEMES_TO_IDS, g2p_path: str = G2P_OUTPUT_PATH):
    """Convert a G2P output file into a flat list of phoneme ids.

    Each non-blank line is expected to look like ``word<TAB>PH1 PH2 ...``.
    Words listed in ``default`` use the pronunciation stored there instead of
    the one found in the file. The id mapped to the empty string ``""`` is
    placed at the start and at the end of the result (presumably a
    pause/boundary token — confirm against the phoneme vocabulary).

    Args:
        PHONEMES_TO_IDS: Mapping from phoneme symbol to integer id; must
            contain the key ``""``.
        g2p_path: Path of the G2P output file to read.

    Returns:
        List of phoneme ids for the whole file, wrapped in the ``""`` id.

    Raises:
        KeyError: If a phoneme (or ``""``) is missing from ``PHONEMES_TO_IDS``.
        ValueError: If a non-blank line contains no tab separator.
    """
    with open(g2p_path, "r") as fin:
        boundary_id = PHONEMES_TO_IDS[""]
        phonemes_ids = [boundary_id]
        for line in fin:
            line = line.rstrip()
            if not line:
                # A blank line (e.g. a trailing newline at the end of the
                # file) carries no word; skip it instead of failing to unpack.
                continue
            word, word_to_phones = line.split("\t", 1)
            word_to_phones = default.get(word, word_to_phones)
            phonemes_ids.extend(
                PHONEMES_TO_IDS[ph] for ph in word_to_phones.split(" ")
            )
        phonemes_ids.append(boundary_id)
    return phonemes_ids

In [9]:
texts_old = [
    "Something must be done for them whenever they leave their current place and settle in a new home.",
    "He then really thought himself equal to it.",
    "I had felt that the best of life was over for me!",
    "I hope a hundred pounds a year would make them all comfortable.",
    "Perhaps it would have been as well if he had left it wholly to myself.",
    "A pleasant renewing of old acquaintances, that was all I had thought it, not foreseeing that I was shortly to plunge into this whole situation."
]

In [10]:
huawei_phones_old = """S AH1 M TH IH0 NG
M AH1 S T
B IY1
D AH1 N
F AO1 R
DH EH1 M
W EH0 N EH1 V ER0
DH EY1
L IY1 V
DH EH1 R
K ER1 AH0 N T
P L EY1 S
AH0 N D
S EH1 T AH0 L
IH0 N
AH0
N UW1
HH OW1 M


HH IY1
DH EH1 N
R IH1 L IY0
TH AO1 T
HH IH0 M S EH1 L F
IY1 K W AH0 L
T UW1
IH1 T


AY1
HH AE1 D
F EH1 L T
DH AE1 T
DH AH0
B EH1 S T
AH1 V
L AY1 F
W AA1 Z
OW1 V ER0
F AO1 R
M IY1


AY1
HH OW1 P
AH0
HH AH1 N D R AH0 D
P AW1 N D Z
AH0
Y IH1 R
W UH1 D
M EY1 K
DH EH1 M
AO1 L
K AH1 M F ER0 T AH0 B AH0 L


P ER0 HH AE1 P S
IH1 T
W UH1 D
HH AE1 V
B IH1 N
AE1 Z
W EH1 L
IH1 F
HH IY1
HH AE1 D
L EH1 F T
IH1 T
HH OW1 L IY0
T UW0
M AY2 S EH1 L F


AH0
P L EH1 Z AH0 N T
R IH0 N UW1 IH0 NG
AH1 V
OW1 L D
AH0 K W EY1 N T AH0 N S IH0 Z
 
DH AE1 T
W AA1 Z
AO1 L
AY1
HH AE1 D
TH AO1 T
IH1 T
 
N AA1 T
F AO0 R S IY1 IH0 NG
DH AE1 T
AY1
W AA1 Z
SH AO1 R T L IY0
T UW0
P L AH1 N JH
IH1 N T UW0
DH IH1 S
HH OW1 L
S IH2 CH UW0 EY1 SH AH0 N"""

In [11]:
test_texts = """
1	We got few vegetables and fruits , and became fish eaters .
2	Fifty yards ahead of her were the first of the rocks .
3	It seemed the ordained order of things that dogs should work .
4	The journey was continued at dawn .
5	Was it the rendezvous of those who were striving to work his ruin .
6	A dead man is of no use on a plantation .
7	The Claudine was leaving next morning for Honolulu .
8	Prosecutors have opened a massive investigation into allegations of fixing games and illegal betting.
9	Different telescope designs perform differently, and have different strengths and weaknesses.
10	Humans also judge distance by using the relative sizes of objects.
11	If this is true then those who tend to think creatively, really are somehow different.
12	But really in the grand scheme of things, this information is insignificant.
13	He had a private jet with three king-size beds, expensive rugs, porcelain vases and a dining area.
14	When I reached Atlanta my steadily increasing disappointment was not lessened. I found it a big, dull, red town.
15	"She woke Meg with a ""Merry Christmas"", and bade her see what was under her pillow. A green–covered book appeared, with the same picture inside, and a few words written by their mother, which made their one present very precious in their eyes."
16	Does Jane know about your new job? No, and don't you dare tell her! She will be furious!
17	Does she like ice cream or sweets? She likes any kind of ice cream. Chocolate, vanilla, strawberry, banana, the one with caramel, coconut, any you can think of!
18	Earthquakes damage all structures, including bridges.
19	Luckily, this kind of collapse is relatively infrequent.
20	Was it using language that caused their brains to develop?
21	If everyone followed a similar plan, the results would be impressive.
22	Next, the hero overcomes obstacles on the way to facing their greatest challenge.
23	For more than two hundred years the pessimists have been winning the public debate.
24	It's wearing me out trying to juggle work with looking after my children and my family.
25	Young people want to feel supported and appreciated by their company and their superiors.
26	We start to see the evidence of early human civilisation, through cave paintings for example.
27	"In this culture a so-called ""smile of respect"" is seen as insincere and often regarded with suspicion."
28	We can express complex thoughts, convey subtle emotions and communicate about some abstract concepts.
29	The activists send a clear message to companies that people are no longer willing to accept the environmental and human cost of overconsumption.
30	All this is thanks to his childhood in the mountains and to genetics, but it is his mental strength that sets him apart.
"""

In [12]:
huawei_phones = [
    ' W IY1 G AA1 T F Y UW1 V EH1 JH T AH0 B AH0 L Z AH0 N D F R UW1 T S  AH0 N D B IH0 K EY1 M F IH1 SH IY1 T ER0 Z  ',
    ' F IH1 F T IY0 Y AA1 R D Z AH0 HH EH1 D AH1 V HH ER1 W ER1 DH AH0 F ER1 S T AH1 V DH AH0 R AA1 K S  ',
    ' IH1 T S IY1 M D DH IY0 AO0 R D EY1 N D AO1 R D ER0 AH1 V TH IH1 NG Z DH AE1 T D AA1 G Z SH UH1 D W ER1 K  ',
    ' DH AH0 JH ER1 N IY0 W AA1 Z K AH0 N T IH1 N Y UW0 D AE1 T D AO1 N  ',
    ' W AA1 Z IH1 T DH AH0 R AA1 N D IH0 V UW2 AH1 V DH OW1 Z HH UW1 W ER1 S T R AY1 V IH0 NG T UW1 W ER1 K HH IH1 Z R UW1 AH0 N  ',
    ' AH0 D EH1 D M AE1 N IH1 Z AH1 V N OW1 Y UW1 S AA1 N AH0 P L AE2 N T EY1 SH AH0 N  ',
    ' DH AH0 K L AO0 D IY1 N W AA1 Z L IY1 V IH0 NG N EH1 K S T M AO1 R N IH0 NG F AO1 R HH AA2 N AH0 L UW1 L UW0  ',
    ' P R AA1 S IH0 K Y UW2 T ER0 Z HH AE1 V OW1 P AH0 N D AH0 M AE1 S IH0 V IH2 N V EH2 S T AH0 G EY1 SH AH0 N IH1 N T UW0 AE2 L AH0 G EY1 SH AH0 N Z AH1 V F IH1 K S IH0 NG G EY1 M Z AH0 N D IH2 L IY1 G AH0 L B EH1 T IH0 NG  ',
    ' D IH1 F ER0 AH0 N T T EH1 L AH0 S K OW2 P D IH0 Z AY1 N Z P ER0 F AO1 R M D IH1 F R AH0 N T L IY0  AH0 N D HH AE1 V D IH1 F ER0 AH0 N T S T R EH1 NG K TH S AH0 N D W IY1 K N AH0 S AH0 Z  ',
    ' HH Y UW1 M AH0 N Z AO1 L S OW0 JH AH1 JH D IH1 S T AH0 N S B AY1 Y UW1 Z IH0 NG DH AH0 R EH1 L AH0 T IH0 V S AY1 Z AH0 Z AH1 V AA1 B JH EH0 K T S  ',
    ' IH1 F DH IH1 S IH1 Z T R UW1 DH EH1 N DH OW1 Z HH UW1 T EH1 N D T UW1 TH IH1 NG K K R IY0 EY1 T IH0 V L IY0  R IH1 L IY0 AA1 R S AH1 M HH AW2 D IH1 F ER0 AH0 N T  ',
    ' B AH1 T R IH1 L IY0 IH0 N DH AH0 G R AE1 N D S K IY1 M AH1 V TH IH1 NG Z  DH IH1 S IH2 N F ER0 M EY1 SH AH0 N IH1 Z IH2 N S IH0 G N Y IH1 F IH0 K AH0 N T  ',
    ' HH IY1 HH AE1 D AH0 P R AY1 V AH0 T JH EH1 T W IH1 DH TH R IY1 K IH1 NG S AY1 Z B EH1 D Z  IH0 K S P EH1 N S IH0 V R AH1 G Z  P AO1 R S AH0 L AH0 N V EY1 S AH0 Z AH0 N D AH0 D AY1 N IH0 NG EH1 R IY0 AH0  ',
    ' W EH1 N AY1 R IY1 CH T AE0 T L AE1 N T AH0 M AY1 S T EH1 D AH0 L IY0 IH2 N K R IY1 S IH0 NG D IH2 S AH0 P OY1 N T M AH0 N T W AA1 Z N AA1 T L EH1 S AH0 N D  AY1 F AW1 N D IH1 T AH0 B IH1 G  D AH1 L  R EH1 D T AW1 N  ',
    ' SH IY1 W OW1 K M EH1 G W IH1 DH AH0 M EH1 R IY0 K R IH1 S M AH0 S  AH0 N D B EY1 D HH ER1 S IY1 W AH1 T W AA1 Z AH1 N D ER0 HH ER1 P IH1 L OW0  AH0 G R IY1 N K AH1 V ER0 D B UH1 K AH0 P IH1 R D  W IH1 DH DH AH0 S EY1 M P IH1 K CH ER0 IH2 N S AY1 D  AH0 N D AH0 F Y UW1 W ER1 D Z R IH1 T AH0 N B AY1 DH EH1 R M AH1 DH ER0  W IH1 CH M EY1 D DH EH1 R W AH1 N P R EH1 Z AH0 N T V EH1 R IY0 P R EH1 SH AH0 S IH0 N DH EH1 R AY1 Z  ',
    ' D AH1 Z JH EY1 N N OW1 AH0 B AW1 T Y AO1 R N UW1 JH AA1 B  N OW1  AH0 N D D OW1 N T Y UW1 D EH1 R T EH1 L HH ER1  SH IY1 W IH1 L B IY1 F Y UH1 R IY0 AH0 S  ',
    ' D AH1 Z SH IY1 L AY1 K AY1 S K R IY1 M AO1 R S W IY1 T S  SH IY1 L AY1 K S EH1 N IY0 K AY1 N D AH1 V AY1 S K R IY1 M  CH AO1 K L AH0 T  V AH0 N IH1 L AH0  S T R AO1 B EH2 R IY0  B AH0 N AE1 N AH0  DH AH0 W AH1 N W IH1 DH K EH1 R AH0 M AH0 L  K OW1 K AH0 N AH2 T  EH1 N IY0 Y UW1 K AE1 N TH IH1 NG K AH1 V  ',
    ' ER1 TH K W EY2 K S D AE1 M AH0 JH AO1 L S T R AH1 K CH ER0 Z  IH2 N K L UW1 D IH0 NG B R IH1 JH AH0 Z  ',
    ' L AH1 K AH0 L IY0  DH IH1 S K AY1 N D AH1 V K AH0 L AE1 P S IH1 Z R EH1 L AH0 T IH0 V L IY0 IH2 N F R IY1 K W AH0 N T  ',
    ' W AA1 Z IH1 T Y UW1 Z IH0 NG L AE1 NG G W AH0 JH DH AE1 T K AA1 Z D DH EH1 R B R EY1 N Z T UW1 D IH0 V EH1 L AH0 P  ',
    ' IH1 F EH1 V R IY0 W AH2 N F AA1 L OW0 D AH0 S IH1 M AH0 L ER0 P L AE1 N  DH AH0 R IH0 Z AH1 L T S W UH1 D B IY1 IH2 M P R EH1 S IH0 V  ',
    ' N EH1 K S T  DH AH0 HH IH1 R OW0 OW1 V ER0 K AH2 M Z AA1 B S T AH0 K AH0 L Z AA1 N DH AH0 W EY1 T UW1 F EY1 S IH0 NG DH EH1 R G R EY1 T AH0 S T CH AE1 L AH0 N JH  ',
    ' F AO1 R M AO1 R DH AE1 N T UW1 HH AH1 N D R AH0 D Y IH1 R Z DH AH0 P EH1 S AH0 M IH0 S T S HH AE1 V B IH1 N W IH1 N IH0 NG DH AH0 P AH1 B L IH0 K D AH0 B EY1 T  ',
    ' IH1 T S W EH1 R IH0 NG M IY1 AW1 T T R AY1 IH0 NG T UW1 JH AH1 G AH0 L W ER1 K W IH1 DH L UH1 K IH0 NG AE1 F T ER0 M AY1 CH IH1 L D R AH0 N AH0 N D M AY1 F AE1 M AH0 L IY0  ',
    ' Y AH1 NG P IY1 P AH0 L W AA1 N T T UW1 F IY1 L S AH0 P AO1 R T IH0 D AH0 N D AH0 P R IY1 SH IY0 EY2 T IH0 D B AY1 DH EH1 R K AH1 M P AH0 N IY2 AH0 N D DH EH1 R S UW0 P IH1 R IY0 ER0 Z  ',
    ' W IY1 S T AA1 R T T UW1 S IY1 DH IY0 EH1 V AH0 D AH0 N S AH1 V ER1 L IY0 HH Y UW1 M AH0 N S IH1 V AH0 L IH0 S EY1 SH AH0 N  TH R UW1 K EY1 V P EY1 N T IH0 NG Z F AO1 R IH0 G Z AE1 M P AH0 L  ',
    ' IH0 N DH IH1 S K AH1 L CH ER0 AH0 S OW1 K AO1 L D S M AY1 L AH1 V R IH0 S P EH1 K T IH1 Z S IY1 N AE1 Z IH2 N S IH0 N S IH1 R AH0 N D AO1 F AH0 N R IH0 G AA1 R D IH0 D W IH1 DH S AH0 S P IH1 SH AH0 N  ',
    ' W IY1 K AE1 N IH0 K S P R EH1 S K AA1 M P L EH0 K S TH AO1 T S  K AH0 N V EY1 S AH1 T AH0 L IH0 M OW1 SH AH0 N Z AH0 N D K AH0 M Y UW1 N AH0 K EY2 T AH0 B AW1 T S AH1 M AE1 B S T R AE0 K T K AA1 N S EH0 P T S  ',
    ' DH IY0 AE1 K T AH0 V AH0 S T S S EH1 N D AH0 K L IH1 R M EH1 S AH0 JH T UW1 K AH1 M P AH0 N IY2 Z DH AE1 T P IY1 P AH0 L AA1 R N OW1 L AO1 NG G ER0 W IH1 L IH0 NG T UW1 AE0 K S EH1 P T DH IY0 IH0 N V AY2 R AH0 N M EH1 N T AH0 L AH0 N D HH Y UW1 M AH0 N K AA1 S T AH1 V OW1 V ER0 K AH0 N S AH2 M P SH AH0 N  ',
    ' AO1 L DH IH1 S IH1 Z TH AE1 NG K S T UW1 HH IH1 Z CH AY1 L D HH UH2 D IH0 N DH AH0 M AW1 N T AH0 N Z AH0 N D T UW1 JH AH0 N EH1 T IH0 K S  B AH1 T IH1 T IH1 Z HH IH1 Z M EH1 N T AH0 L S T R EH1 NG K TH DH AE1 T S EH1 T S HH IH1 M AH0 P AA1 R T  '
]

In [13]:
def to_phones_old(PHONEMES_TO_IDS, phones):
    """Convert an "old format" phoneme transcript into a list of ids.

    The old format is a multi-line string with one word per line, phonemes
    separated by single spaces. Completely empty lines are skipped. The id
    mapped to ``""`` is added at the start and at the end of the result.

    This used to be a first definition of ``to_phones`` that was silently
    shadowed by the second one in the same cell; it now has its own name so
    both converters stay usable.

    Args:
        PHONEMES_TO_IDS: Mapping from phoneme symbol to integer id; must
            contain the key ``""``.
        phones: Newline-separated transcript, one word per line.

    Returns:
        List of phoneme ids wrapped in the ``""`` id.
    """
    boundary_id = PHONEMES_TO_IDS[""]
    phonemes_ids = [boundary_id]
    for line in phones.split("\n"):
        if not line:
            continue
        # A line holding only a space yields two "" symbols here, exactly as
        # the original implementation did.
        phonemes_ids.extend(PHONEMES_TO_IDS[ph] for ph in line.split(" "))
    phonemes_ids.append(boundary_id)
    return phonemes_ids


def to_phones(PHONEMES_TO_IDS, phones):
    """Convert a "new format" phoneme string into a list of ids.

    The new format is a single space-separated string. Leading, trailing and
    doubled spaces produce empty-string symbols, which are looked up under
    the key ``""`` like any other symbol.

    Args:
        PHONEMES_TO_IDS: Mapping from phoneme symbol to integer id.
        phones: Space-separated phoneme string.

    Returns:
        List of phoneme ids, one per space-separated symbol.
    """
    return [PHONEMES_TO_IDS[ph] for ph in phones.split(" ")]

In [14]:
# Map every phoneme string in `huawei_phones` to a list of ids, using the
# phoneme vocabulary stored alongside the feature-model checkpoint.
phonemes_list = []
with (checkpoint_path / "feature" / "phonemes.json").open() as f:
    phonemes_to_ids = json.load(f)
for hp in huawei_phones:
    phoneme_ids = to_phones(phonemes_to_ids, hp)
    phonemes_list += [phoneme_ids]

In [15]:
# Load the feature-model checkpoint onto `device`. The loaded object is used
# directly as a model below (`.eval()`, `.inference(...)`).
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
# NOTE(review): newer PyTorch releases may default to `weights_only=True`, which
# can reject a fully pickled module — confirm against the installed version.
feature_model = torch.load(checkpoint_path / "feature" / "feature_model.pth", map_location=device)

In [16]:
# Switch the feature model to evaluation mode before running inference.
feature_model = feature_model.eval()

In [17]:
def get_tacotron_batch(
    phonemes_ids, reference, speaker_id, device, mels_mean, mels_std
):
    """Assemble the single-utterance input tuple for the feature model.

    Args:
        phonemes_ids: Sequence of integer phoneme ids for one utterance.
        reference: Rank-3 reference tensor; it is normalized with
            ``mels_mean`` / ``mels_std`` and its last two axes are swapped
            (presumably (batch, frames, mels) -> (batch, mels, frames) —
            TODO confirm).
        speaker_id: Integer id of the target speaker.
        device: Device the phoneme, speaker and reference tensors are moved to.
        mels_mean: Mean subtracted from ``reference``.
        mels_std: Standard deviation ``reference`` is divided by.

    Returns:
        Tuple ``(phoneme ids [1, len], lengths [1], speaker ids [1],
        normalized reference)``. The lengths tensor is not moved to ``device``.
    """
    # Normalize first, on whatever device the inputs already live on, then
    # swap the last two axes and move the result to the target device.
    normalized_reference = ((reference - mels_mean) / mels_std).permute(0, 2, 1).to(device)
    phoneme_batch = torch.LongTensor(phonemes_ids).unsqueeze(0).to(device)
    speaker_batch = torch.LongTensor([speaker_id]).to(device)
    length_batch = torch.LongTensor([len(phonemes_ids)])
    return phoneme_batch, length_batch, speaker_batch, normalized_reference

In [18]:
# Directory searched recursively for "*.pkl" reference files in the synthesis
# loop; each file's parent directory name is used as the speaker name.
reference_pathes = Path("references/")

In [19]:
# Output root for the synthesized WAV files of this checkpoint.
generated_path = Path(f"generated_hifi/{config.checkpoint_name}")

In [20]:
# Speaker-name -> id mapping stored next to the feature-model checkpoint.
with (checkpoint_path / "feature" / "speakers.json").open() as f:
    speaker_to_id = json.load(f)

In [21]:
# Normalization statistics stored with the feature checkpoint, cast to float32.
# They normalize the reference in `get_tacotron_batch` and de-normalize the
# predicted mels in the synthesis loop.
# NOTE(review): no `map_location` is given, so the tensors load onto the device
# they were saved from — confirm this matches the device of the reference tensors.
mels_mean = torch.load(checkpoint_path / "feature" / "mels_mean.pth").float()
mels_std = torch.load(checkpoint_path / "feature" / "mels_std.pth").float()

In [22]:
# Synthesis: for every reference file and every phoneme sequence, predict a mel
# spectrogram with the feature model and vocode it with each HiFi-GAN generator
# checkpoint. Output layout:
#   <generated_path>/<generator stem>/<speaker>/<reference stem>/<utterance no>.wav

SAMPLE_RATE = 22050  # sample rate (Hz) written into the WAV header
INT16_MIN, INT16_MAX = -32768, 32767

# Build each generator once, up front. It does not depend on the reference or
# the utterance, so re-reading the checkpoint inside the inner loop repeated
# identical disk I/O and model construction for every synthesized file.
# NOTE(review): all generators stay resident on `device` at the same time; if
# there are many checkpoints and memory is tight, process them in groups.
loaded_generators = {}
for generator_path in generators:
    # NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
    state_dict = torch.load(generator_path, map_location="cpu")
    generator = Generator(config=config.train_hifi.model_param, num_mels=config.n_mels).to(device)
    # load_state_dict copies the CPU tensors into the parameters already on `device`.
    generator.load_state_dict(state_dict["generator"])
    generator.remove_weight_norm()
    generator.eval()
    loaded_generators[generator_path] = generator

# Device copies of the de-normalization statistics, hoisted out of the loop.
mels_mean_on_device = mels_mean.to(device)
mels_std_on_device = mels_std.to(device)

for reference in tqdm(list(reference_pathes.rglob("*.pkl"))):
    # The speaker is identified by the name of the directory holding the reference.
    speaker = reference.parent.name
    speaker_id = speaker_to_id[speaker]
    # NOTE(review): torch.load unpickles the file — only load trusted references.
    ref_mel = torch.load(reference)
    for i, phonemes in enumerate(phonemes_list):
        batch = get_tacotron_batch(phonemes, ref_mel, speaker_id, device, mels_mean, mels_std)
        with torch.no_grad():
            mels = feature_model.inference(batch)
            mels = mels.permute(0, 2, 1).squeeze(0)
            # Undo the mel normalization before vocoding.
            mels = mels * mels_std_on_device + mels_mean_on_device
            x = mels.unsqueeze(0)
            for generator_path, generator in loaded_generators.items():
                y_g_hat = generator(x)
                audio = y_g_hat.squeeze()
                # Scale to the 16-bit range and clamp: a sample at +1.0 would
                # become 32768, which does not fit into int16.
                audio = (audio * 32768).clamp(INT16_MIN, INT16_MAX)
                audio = audio.type(torch.int16).detach().cpu().numpy()
                save_path = generated_path / generator_path.stem / speaker / reference.stem
                save_path.mkdir(exist_ok=True, parents=True)
                # Files are numbered from 1 in the order of `phonemes_list`.
                wav_write(save_path / f"{i + 1}.wav", SAMPLE_RATE, audio)
        # Release cached blocks once per utterance; input lengths vary between
        # utterances, not between generators.
        torch.cuda.empty_cache()


  0%|          | 0/45 [00:00<?, ?it/s]