In [23]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import Dataset, load_dataset
import soundfile as sf
import torch
from jiwer import wer
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm

In [13]:
def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch


def map_to_pred(batch):
    inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding="longest")
    # input_values = inputs.input_values.to("cuda")
    input_values = inputs.input_values
    
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    batch["transcription"] = transcription
    return batch

In [30]:
def read_audio(audio_path):
    ext = Path(audio_path).suffix
    if ext in [".wav", ".flac"]:
        y, sr = sf.read(audio_path, dtype="int16")
        if y.ndim == 2:
            print("This audio file is stereo!")
        y = y.astype(np.int32)
    elif ext == ".pcm":
        y = np.memmap(audio_path, dtype="h", mode="r")
    return y, sr


def normalize_signal(signal):
    return signal / (2 ** 15)


def create_pebble_dataset(data_dir):
    data_dir = Path(data_dir)

    ls_audio_path = []
    ls_signal_norm = []
    ls_transcript = []
    for audio_path in tqdm(sorted(data_dir.glob("*/*.wav"))):
        signal, _ = read_audio(audio_path)
        signal_norm = normalize_signal(signal)
        
        txt_path = Path(str(audio_path).replace("wav", "txt"))
        with open(txt_path, mode="r") as f:
                transcript = f.read()
        ls_signal_norm.append(signal_norm)
        ls_transcript.append(transcript)
        ls_audio_path.append(str(audio_path))

    dic_for_ds = {"file": ls_audio_path, "audio": ls_signal_norm, "text": ls_transcript}
    ds = Dataset.from_dict(dic_for_ds)
    return ds

In [31]:

# processor = Wav2Vec2Processor.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")

# # model = Wav2Vec2ForCTC.from_pretrained("kresnik/wav2vec2-large-xlsr-korean").to('cuda')
# model = Wav2Vec2ForCTC.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")
# ds = load_dataset(path="kresnik/zeroth_korean", name="clean", split="test")


processor = Wav2Vec2Processor.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")

model = Wav2Vec2ForCTC.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")
ds = create_pebble_dataset("/Users/jongbeom.kim/project/corpus_raw/b2b_projects/2022/2022-PB-01/samples/male/processing")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 3000/3000 [00:04<00:00, 702.37it/s]


In [32]:
ds[0]

{'file': '/Users/jongbeom.kim/project/corpus_raw/b2b_projects/2022/2022-PB-01/samples/male/processing/male_121_019/male_121_019_male_(10)_c_index_speed_12001.wav',
 'audio': [0.0,
  3.0517578125e-05,
  0.0,
  0.0,
  0.0,
  6.103515625e-05,
  0.0,
  3.0517578125e-05,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0517578125e-05,
  -9.1552734375e-05,
  -3.0517578125e-05,
  -6.103515625e-05,
  -6.103515625e-05,
  -9.1552734375e-05,
  -3.0517578125e-05,
  -6.103515625e-05,
  -9.1552734375e-05,
  -6.103515625e-05,
  -0.0001220703125,
  -0.0001220703125,
  -6.103515625e-05,
  -6.103515625e-05,
  -3.0517578125e-05,
  -6.103515625e-05,
  0.0,
  -3.0517578125e-05,
  0.0,
  0.0,
  -3.0517578125e-05,
  0.0,
  0.0,
  0.0,
  -3.0517578125e-05,
  -3.0517578125e-05,
  0.0,
  3.0517578125e-05,
  0.0,
  3.0517578125e-05,
  -3.0517578125e-05,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  -3.0517578125e-05,
  -3.0517578125e-05,
  -3.0517578125e-05,
  -3.0517578125e-05,
  -3.0517578125e-05,
  -3.0517578125e-05,
  0

In [12]:
ds[0]

{'file': '/Users/jongbeom.kim/.cache/huggingface/datasets/downloads/extracted/7b239cc48d3b22740312008bea001966ffc71e97561eb44722219e1c021db7e6/test_data_01/003/132/132_003_0014.flac',
 'audio': {'path': '/Users/jongbeom.kim/.cache/huggingface/datasets/downloads/extracted/7b239cc48d3b22740312008bea001966ffc71e97561eb44722219e1c021db7e6/test_data_01/003/132/132_003_0014.flac',
  'array': array([ 0.0000000e+00,  3.0517578e-05,  0.0000000e+00, ...,
         -1.2207031e-04,  1.8310547e-04, -1.2207031e-04], dtype=float32),
  'sampling_rate': 16000},
 'text': '대외변수에 실적 불안감까지 겹치면서 코스피 낙폭이 어느 정도 수준까지 이어질지 가늠하기도 어려운 상황이다',
 'speaker_id': 132,
 'chapter_id': 3,
 'id': '132_003_0014'}

In [33]:

test_ds = ds
test_ds = test_ds.select([0, 1, 3, 4, 5])
test_ds = test_ds.map(map_to_array)

result = test_ds.map(map_to_pred, batched=True, batch_size=16, remove_columns=["speech"])

print("WER:", wer(result["text"], result["transcription"]))

100%|██████████| 5/5 [00:00<00:00, 32.34ex/s]
100%|██████████| 1/1 [00:03<00:00,  3.49s/ba]

WER: 1.0





In [34]:
result["text"]

['전원 모드', '일괄 점등', '불 켜줘', '전원 켜', '전원 켜줘']

In [35]:
result["transcription"]

['다', '그', '난', '언다', '홍 껑']