In [1]:
import torch
import torchaudio
from torchaudio.pipelines import MMS_FA
import torchaudio.functional as F
from pathlib import Path

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# pip install --upgrade torch torchaudio


In [3]:
print(torch.__version__)
print(torchaudio.__version__)

2.4.1+cpu
2.4.1+cpu


In [4]:
print(device)

cpu


In [None]:
# json_path = Path("/home/mendo/Downloads/LM/LM-5/LM-TTS-Main/data_marte_wav/Bulu/json_files/MAT.json")

In [5]:
audio_path = Path("home/mendo/Downloads/LM/LM-5/LM-TTS-Main/data_marte_wav/Bulu_OT_NT_audios/Bulu_NT_audio/wavs_16/MAT/MAT_001.wav")
json_path = Path("/home/mendo/Downloads/LM/LM-5/LM-TTS-Main/data_marte_wav/Bafia/json_files/MAT.json")
book, chapter = json_path.stem, audio_path.stem

In [11]:
chapter

'MAT_001'

In [12]:
import json
import re
import string
import unicodedata
from unidecode import unidecode
from num2words import num2words

In [13]:


def preprocess_verse(text: str) -> str:
    text = unidecode(text)
    text = unicodedata.normalize('NFKC', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub("\s+", " ", text)
    return text

def load_transcripts(json_path, chapter):
    with open(json_path, "r") as f:
        data = json.load(f)
        print(data)
    
    # convert MAT.19.1 -> MAT_019
    get_chapter = lambda x: x.split('.')[0] + '_' + x.split('.')[1].zfill(3)
    # filter by book and chapter
    transcripts = [d["verset"] for d in data if get_chapter(d["numVerset"]) == chapter]
    verse_ids = [d["numVerset"] for d in data if get_chapter(d["numVerset"]) == chapter]
    return verse_ids, transcripts

In [14]:
verse_ids, transcripts = load_transcripts(json_path, chapter)

[{'numVerset': 'MAT.1.1', 'verset': 'Kalate éndane Yésus Krist, e mona David, e mon Abraham.'}, {'numVerset': 'MAT.1.2', 'verset': 'Abraham a nga biaé Izak, Izak a nga biaé Yakob, Yakob a nga biaé Yuda baa be bobenyañ. '}, {'numVerset': 'MAT.1.3', 'verset': 'Yuda a nga biaé Farés a Zara aluʼu baa Tamar. Farés a nga biaé Esrôm, Esrôm a nga biaé Aram. '}, {'numVerset': 'MAT.1.4', 'verset': 'Aram a nga biaé Aminadab, Aminadab ke a biaé Naasôn. Naasôn ke a biaé Salmôn. '}, {'numVerset': 'MAT.1.5', 'verset': 'Salmôn a nga biaé Bôaz aluʼu baa Rahab. Bôaz ke a biaé Yôbéd aluʼu baa Rut. Yôbéd a nga biaé Izayi. '}, {'numVerset': 'MAT.1.6', 'verset': 'Izayi ke a biaé David, njôô bôt. David a nga biaé Solômon aluʼu baa minga Urie, '}, {'numVerset': 'MAT.1.7', 'verset': 'Solômon a nga biaé Rôbôam, Rôbôam a nga biaé Abia, Abia ke a nga biaé Asaf. '}, {'numVerset': 'MAT.1.8', 'verset': 'Asaf a nga biaé Yôzafat, Yôzafat ke a nga biaé Yôram, Yôram ke a biaé Ozias. '}, {'numVerset': 'MAT.1.9', 'verset'

In [15]:
verse_ids

['MAT.1.1',
 'MAT.1.2',
 'MAT.1.3',
 'MAT.1.4',
 'MAT.1.5',
 'MAT.1.6',
 'MAT.1.7',
 'MAT.1.8',
 'MAT.1.9',
 'MAT.1.10',
 'MAT.1.11',
 'MAT.1.12',
 'MAT.1.13',
 'MAT.1.14',
 'MAT.1.15',
 'MAT.1.16',
 'MAT.1.17',
 'MAT.1.18',
 'MAT.1.19',
 'MAT.1.20',
 'MAT.1.21',
 'MAT.1.22',
 'MAT.1.23',
 'MAT.1.24',
 'MAT.1.25']

In [16]:
transcripts

['Kalate éndane Yésus Krist, e mona David, e mon Abraham.',
 'Abraham a nga biaé Izak, Izak a nga biaé Yakob, Yakob a nga biaé Yuda baa be bobenyañ. ',
 'Yuda a nga biaé Farés a Zara aluʼu baa Tamar. Farés a nga biaé Esrôm, Esrôm a nga biaé Aram. ',
 'Aram a nga biaé Aminadab, Aminadab ke a biaé Naasôn. Naasôn ke a biaé Salmôn. ',
 'Salmôn a nga biaé Bôaz aluʼu baa Rahab. Bôaz ke a biaé Yôbéd aluʼu baa Rut. Yôbéd a nga biaé Izayi. ',
 'Izayi ke a biaé David, njôô bôt. David a nga biaé Solômon aluʼu baa minga Urie, ',
 'Solômon a nga biaé Rôbôam, Rôbôam a nga biaé Abia, Abia ke a nga biaé Asaf. ',
 'Asaf a nga biaé Yôzafat, Yôzafat ke a nga biaé Yôram, Yôram ke a biaé Ozias. ',
 'Ozias a nga biaé Yôatam, Yôatam a nga biaé Akaz, Akaz ke a biaé Ezékias; ',
 'Ezékias a nga biaé Manasé, Manasé a nga biaé Amôs, Amôs ke a nga biaé Yôzias; ',
 'Yôzias ke a biaé Yékônias a bobenyañ, valé be too minkôm e Babilôn. ',
 'E mvus éto minkôme ya Babilôn, Yékônias a nga biaé Salatiél, Salatiél ke a bia

In [17]:
verses = [preprocess_verse(v) for v in transcripts]

In [18]:
transcripts[0]

'Kalate éndane Yésus Krist, e mona David, e mon Abraham.'

In [19]:
verses[0]

'kalate endane yesus krist e mona david e mon abraham'

In [20]:
verses[0]

'kalate endane yesus krist e mona david e mon abraham'

In [21]:
augmented_verses = ["*"] * len(verses) * 2
augmented_verses[1::2] = verses

words = [verse.split() for verse in verses]
augmented_words = [word for verse in augmented_verses for word in verse.split()]

In [22]:
augmented_verses

['*',
 'kalate endane yesus krist e mona david e mon abraham',
 '*',
 'abraham a nga biae izak izak a nga biae yakob yakob a nga biae yuda baa be bobenyan ',
 '*',
 'yuda a nga biae fares a zara aluu baa tamar fares a nga biae esrom esrom a nga biae aram ',
 '*',
 'aram a nga biae aminadab aminadab ke a biae naason naason ke a biae salmon ',
 '*',
 'salmon a nga biae boaz aluu baa rahab boaz ke a biae yobed aluu baa rut yobed a nga biae izayi ',
 '*',
 'izayi ke a biae david njoo bot david a nga biae solomon aluu baa minga urie ',
 '*',
 'solomon a nga biae roboam roboam a nga biae abia abia ke a nga biae asaf ',
 '*',
 'asaf a nga biae yozafat yozafat ke a nga biae yoram yoram ke a biae ozias ',
 '*',
 'ozias a nga biae yoatam yoatam a nga biae akaz akaz ke a biae ezekias ',
 '*',
 'ezekias a nga biae manase manase a nga biae amos amos ke a nga biae yozias ',
 '*',
 'yozias ke a biae yekonias a bobenyan vale be too minkom e babilon ',
 '*',
 'e mvus eto minkome ya babilon yekonias a n

In [23]:
words

[['kalate',
  'endane',
  'yesus',
  'krist',
  'e',
  'mona',
  'david',
  'e',
  'mon',
  'abraham'],
 ['abraham',
  'a',
  'nga',
  'biae',
  'izak',
  'izak',
  'a',
  'nga',
  'biae',
  'yakob',
  'yakob',
  'a',
  'nga',
  'biae',
  'yuda',
  'baa',
  'be',
  'bobenyan'],
 ['yuda',
  'a',
  'nga',
  'biae',
  'fares',
  'a',
  'zara',
  'aluu',
  'baa',
  'tamar',
  'fares',
  'a',
  'nga',
  'biae',
  'esrom',
  'esrom',
  'a',
  'nga',
  'biae',
  'aram'],
 ['aram',
  'a',
  'nga',
  'biae',
  'aminadab',
  'aminadab',
  'ke',
  'a',
  'biae',
  'naason',
  'naason',
  'ke',
  'a',
  'biae',
  'salmon'],
 ['salmon',
  'a',
  'nga',
  'biae',
  'boaz',
  'aluu',
  'baa',
  'rahab',
  'boaz',
  'ke',
  'a',
  'biae',
  'yobed',
  'aluu',
  'baa',
  'rut',
  'yobed',
  'a',
  'nga',
  'biae',
  'izayi'],
 ['izayi',
  'ke',
  'a',
  'biae',
  'david',
  'njoo',
  'bot',
  'david',
  'a',
  'nga',
  'biae',
  'solomon',
  'aluu',
  'baa',
  'minga',
  'urie'],
 ['solomon',
  'a',
  

In [24]:
augmented_words

['*',
 'kalate',
 'endane',
 'yesus',
 'krist',
 'e',
 'mona',
 'david',
 'e',
 'mon',
 'abraham',
 '*',
 'abraham',
 'a',
 'nga',
 'biae',
 'izak',
 'izak',
 'a',
 'nga',
 'biae',
 'yakob',
 'yakob',
 'a',
 'nga',
 'biae',
 'yuda',
 'baa',
 'be',
 'bobenyan',
 '*',
 'yuda',
 'a',
 'nga',
 'biae',
 'fares',
 'a',
 'zara',
 'aluu',
 'baa',
 'tamar',
 'fares',
 'a',
 'nga',
 'biae',
 'esrom',
 'esrom',
 'a',
 'nga',
 'biae',
 'aram',
 '*',
 'aram',
 'a',
 'nga',
 'biae',
 'aminadab',
 'aminadab',
 'ke',
 'a',
 'biae',
 'naason',
 'naason',
 'ke',
 'a',
 'biae',
 'salmon',
 '*',
 'salmon',
 'a',
 'nga',
 'biae',
 'boaz',
 'aluu',
 'baa',
 'rahab',
 'boaz',
 'ke',
 'a',
 'biae',
 'yobed',
 'aluu',
 'baa',
 'rut',
 'yobed',
 'a',
 'nga',
 'biae',
 'izayi',
 '*',
 'izayi',
 'ke',
 'a',
 'biae',
 'david',
 'njoo',
 'bot',
 'david',
 'a',
 'nga',
 'biae',
 'solomon',
 'aluu',
 'baa',
 'minga',
 'urie',
 '*',
 'solomon',
 'a',
 'nga',
 'biae',
 'roboam',
 'roboam',
 'a',
 'nga',
 'biae',
 'abia

### SEGMENTATION ET ALIGNEMENT VERSE-AUDIO (MMS-CTC FORCE ALIGNMENT API)

In [None]:
bundle = MMS_FA
model = bundle.get_model(with_star=True).to(device)
LABELS = bundle.get_labels()
DICTIONARY = bundle.get_dict()

In [None]:
chunk_size_s = 15
waveform, sr = torchaudio.load(audio_path)
chunk_size_frames = chunk_size_s * sr
chunks = [waveform[:, i : i + chunk_size_frames] for i in range(0, waveform.shape[1], chunk_size_frames)]

In [None]:
emissions = []

with torch.inference_mode():
    for chunk in chunks:
        if chunk.size(1) >= 400:
            emission, _ = model(chunk.to(device))
            emissions.append(emission)

emission = torch.cat(emissions, dim=1)
assert len(DICTIONARY) == emission.shape[2]
num_frames = emission.size(1)

In [None]:
# probs = torch.softmax(emission, dim=2)
# greedy_path = torch.argmax(probs, dim=-1).squeeze().cpu().numpy()
# predicted_tokens = [LABELS[i] == "*" for i in greedy_path]

In [None]:
def align(emission, tokens):
    targets = torch.tensor([tokens], dtype=torch.int32, device=device)
    alignments, scores = F.forced_align(emission, targets, blank=0)

    alignments, scores = alignments[0], scores[0]  # remove batch dimension for simplicity
    scores = scores.exp()  # convert back to probability
    return alignments, scores

def unflatten(list_, lengths):
    assert len(list_) == sum(lengths)
    i = 0
    ret = []
    for l in lengths:
        ret.append(list_[i : i + l])
        i += l
    return ret

def compute_alignments(emission, transcript, dictionary):
    tokens = [dictionary[char] for word in transcript for char in word]
    alignment, scores = align(emission, tokens)
    token_spans = F.merge_tokens(alignment, scores)
    word_spans = unflatten(token_spans, [len(word) for word in transcript])
    return word_spans

In [None]:
word_spans = compute_alignments(emission, augmented_words, DICTIONARY)

In [None]:
from IPython.display import Audio

def _score(spans):
    return sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans)

def preview_word(waveform, spans, num_frames, transcript, sample_rate=bundle.sample_rate):
    ratio = waveform.size(1) / num_frames
    x0 = int(ratio * spans[0].start)
    x1 = int(ratio * spans[-1].end)
    print(f"{transcript} ({_score(spans):.2f}): {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec")
    segment = waveform[:, x0:x1]
    return Audio(segment.numpy(), rate=sample_rate)

In [None]:
word_only_spans = [spans for spans, word in zip(word_spans, augmented_words) if word != "*"]
assert len(word_only_spans) == sum(len(word) for word in words)

In [None]:
segments, labels = [], []
start = 0
for verse_words in words:
    end = start + len(verse_words)
    verse_spans = word_only_spans[start: end]
    ratio = waveform.size(1) / num_frames
    x0 = int(ratio * verse_spans[0][0].start)
    x1 = int(ratio * verse_spans[-1][-1].end)
    transcript = " ".join(verse_words)
    segment = waveform[:, x0: x1]
    start = end
    segments.append(segment)
    labels.append(transcript)

In [None]:
assert len(segments) == len(verse_ids)

In [None]:
from IPython.display import Audio

idx = -1
print(labels[idx])
Audio(segments[idx].numpy(), rate=bundle.sample_rate)

In [None]:
from pathlib import Path
from scipy.io.wavfile import write

# MAT.1.2 -> MAT_001

output_dir = Path("../outputs/openbible_swahili/")
output_dir = output_dir / book / chapter
output_dir.mkdir(parents=True, exist_ok=True)

for verse_id, segment, label in zip(verse_ids, segments, labels):
    verse_number = verse_id.split(".")[-1].zfill(3)
    verse_file_name = chapter + "_" + verse_number
    audio_path = (output_dir / verse_file_name).with_suffix(".wav")
    transcript_path = (output_dir / verse_file_name).with_suffix(".txt")
    write(audio_path, bundle.sample_rate, segment.squeeze().numpy())
    with open(transcript_path, "w") as f:
        f.write(label)