# TIMIT Test
- Testing the base Wav2Vec2Phoneme model (`facebook/wav2vec2-xlsr-53-espeak-cv-ft`) on TIMIT audio
    - Note: this test does not apply the TIMIT-adjusted transcription (i.e. mapping `kcl` to `k` and `k` to `REL`ease); the model's phoneme output is printed as-is

In [15]:
import librosa
import soundfile as sf
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

In [16]:
# Load in Model
# The processor and the CTC model must come from the same checkpoint, so the
# checkpoint id is written once and shared by both `from_pretrained` calls.
MODEL_ID = "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)



In [17]:
# Constants
# Sample rate (Hz) the clip is resampled to; the same value is handed to the
# processor as `sampling_rate` and to the verification write-out.
TARGET_SR = 16_000
# Recording that gets transcribed (path is relative to the notebook's working directory).
audio_path = "./TestAudioData/TheMealWasCooked_NormalSpeed.wav"

In [18]:
# Resample Audio
# sr=None makes librosa return the file at its native sample rate. Without it,
# librosa.load resamples to its 22,050 Hz default, so `original_sr` would not be
# the file's real rate and the signal would be resampled twice on the way to
# TARGET_SR.
waveform, original_sr = librosa.load(audio_path, sr=None)
# Single resampling step from the native rate to the rate used downstream.
resampled_audio = librosa.resample(waveform, orig_sr=original_sr, target_sr=TARGET_SR)
print(resampled_audio.shape)

(39765,)


In [19]:
# Process Audio
# Turn the resampled waveform into the model's input tensor, then force it into
# a (1, num_samples) batch of one.
input_values = processor(
    resampled_audio,
    sampling_rate=TARGET_SR,
    return_tensors="pt",
).input_values
input_values = input_values.reshape(1, -1)

# Forward pass only; gradient tracking is switched off.
with torch.no_grad():
    logits = model(input_values).logits

# Pick the highest-scoring token id along the last axis, then decode the ids.
# The result is a list holding one space-separated phoneme string per batch item.
predicted_ids = logits.argmax(dim=-1)
transcription = processor.batch_decode(predicted_ids)

print(transcription)
print(input_values.shape)

['ð ə m iː l w ʌ z k ʊ k t b ə f oː ð ə b ɛ l ɹ ɪ ŋ']
torch.Size([1, 39765])


In [20]:
# Verify Audio
# Write the resampled signal back to disk at TARGET_SR so the audio that was fed
# to the model can be checked by ear. `sf` (soundfile) is imported in the
# notebook's import cell rather than here, so all dependencies are declared up front.
sf.write('./TestAudioData/output.wav', resampled_audio, TARGET_SR)