In [28]:
import os
os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = r'C:\Program Files\eSpeak NG\libespeak-ng.dll' # path to espeak-ng dll on Windows

In [30]:
import sys
import os
import torch
import unicodedata
from phonemizer import phonemize
import re


sys.path.insert(0, os.path.join(os.getcwd(), 'Matcha_TTS_main')) # Matcha_TTS_main is the code from the original repo 
from matcha.models.matcha_tts import MatchaTTS

In [31]:
# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##### Define symbols


In [32]:
_pad = '_'
_punctuation = ';:,.!?¡¿—…"«»"" '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Combine all symbols
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)


# Create symbol to ID mapping
_symbol_to_id = {s: i for i, s in enumerate(symbols)}


In [33]:
text = "Hello, Mr. Arezki! How are you?"
print("Original text:", text)

Original text: Hello, Mr. Arezki! How are you?


##### 1) Convert text to ASCII, removing accents


In [34]:
text = unicodedata.normalize('NFKD', text)
text = text.encode('ascii', 'ignore').decode('ascii')
print(f"After ASCII conversion: '{text}'")

After ASCII conversion: 'Hello, Mr. Arezki! How are you?'


##### 2) Lowercase


In [35]:
text = text.lower()
print(f"After lowercase: '{text}'")

After lowercase: 'hello, mr. arezki! how are you?'


##### 3)  Expand abbreviations




In [36]:
abbreviations = {
        'mr.': 'mister',
        'mrs.': 'misess',
        'dr.': 'doctor',
        'st.': 'saint',
        'co.': 'company',
        'jr.': 'junior',
        'sr.': 'senior',
        'etc.': 'et cetera',
        'vs.': 'versus',
        'ltd.': 'limited',
    }
    




for abbr, expansion in abbreviations.items():
        text = text.replace(abbr, expansion)
print(f"After expanding abbreviations: '{text}'")



After expanding abbreviations: 'hello, mister arezki! how are you?'


##### 4) Convert to phonemes using phonemizer


In [37]:

phonemes = phonemize(
    text,
    language='en-us',
    backend='espeak',
    strip=True,
    preserve_punctuation=True,
    with_stress=True)
print(f"After phonemization: '{phonemes}'")

After phonemization: 'həlˈoʊ, mˈɪstɚɹ ˈæɹɛzki! hˈaʊ ɑːɹ juː?'


#### 5) Remove brackets


In [38]:

phonemes = re.sub(r'\[.*?\]', '', phonemes)
phonemes = re.sub(r'\(.*?\)', '', phonemes)
phonemes = re.sub(r'\{.*?\}', '', phonemes) 
print(f"After removing brackets: '{phonemes}'")

After removing brackets: 'həlˈoʊ, mˈɪstɚɹ ˈæɹɛzki! hˈaʊ ɑːɹ juː?'


##### 6) Collapse whitespace


In [39]:
phonemes = re.sub(r'\s+', ' ', phonemes).strip()
print(f"After collapsing whitespace: '{phonemes}'")

After collapsing whitespace: 'həlˈoʊ, mˈɪstɚɹ ˈæɹɛzki! hˈaʊ ɑːɹ juː?'


##### 7) Convert phonetic text to sequence of IDs


In [40]:

sequence = []
    
# Convert each character/phoneme to its ID
for symbol in text:
    if symbol in _symbol_to_id:
        symbol_id = _symbol_to_id[symbol]
        sequence.append(symbol_id)
    else:
        # If symbol not in vocabulary, skip it or use a default
        print(f"Warning: Symbol '{symbol}' not in vocabulary, skipping")

print(f"Final sequence of IDs: {sequence}")

Final sequence of IDs: [50, 47, 54, 54, 57, 3, 16, 55, 51, 61, 62, 47, 60, 16, 43, 60, 47, 68, 53, 51, 5, 16, 50, 57, 65, 16, 43, 60, 47, 16, 67, 57, 63, 6]


##### 8) Intersperse with blanks (ID 0)


In [41]:
sequence_with_blanks = []
for i, element in enumerate(sequence):
    sequence_with_blanks.append(element)
    if i < len(sequence) - 1:  # Don't add after last element
        sequence_with_blanks.append(0)
print(f"Sequence with blanks: {sequence_with_blanks}")


Sequence with blanks: [50, 0, 47, 0, 54, 0, 54, 0, 57, 0, 3, 0, 16, 0, 55, 0, 51, 0, 61, 0, 62, 0, 47, 0, 60, 0, 16, 0, 43, 0, 60, 0, 47, 0, 68, 0, 53, 0, 51, 0, 5, 0, 16, 0, 50, 0, 57, 0, 65, 0, 16, 0, 43, 0, 60, 0, 47, 0, 16, 0, 67, 0, 57, 0, 63, 0, 6]


##### 9) Convert to PyTorch tensor


In [42]:
x = torch.tensor(sequence_with_blanks, dtype=torch.long, device=device)
x = x.unsqueeze(0)  # Same as [None] - adds dimension at front
x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)



print(f"Final tensor shape: {x.shape}")

Final tensor shape: torch.Size([1, 67])


In [47]:

final_result = {
    'x_orig': text,              # Original text
    'x_phones': phonemes,         # Phonetic representation
    'x': x,                       # Tensor with batch dimension
    'x_lengths': x_lengths,       # Length tensor
    'sequence': sequence 
        }

print(final_result)


{'x_orig': 'hello, mister arezki! how are you?', 'x_phones': 'həlˈoʊ, mˈɪstɚɹ ˈæɹɛzki! hˈaʊ ɑːɹ juː?', 'x': tensor([[50,  0, 47,  0, 54,  0, 54,  0, 57,  0,  3,  0, 16,  0, 55,  0, 51,  0,
         61,  0, 62,  0, 47,  0, 60,  0, 16,  0, 43,  0, 60,  0, 47,  0, 68,  0,
         53,  0, 51,  0,  5,  0, 16,  0, 50,  0, 57,  0, 65,  0, 16,  0, 43,  0,
         60,  0, 47,  0, 16,  0, 67,  0, 57,  0, 63,  0,  6]]), 'x_lengths': tensor([67]), 'sequence': [50, 47, 54, 54, 57, 3, 16, 55, 51, 61, 62, 47, 60, 16, 43, 60, 47, 68, 53, 51, 5, 16, 50, 57, 65, 16, 43, 60, 47, 16, 67, 57, 63, 6]}
