# model import

In [1]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab_jamos.json",
                                 unk_token="[UNK]",
                                 pad_token="[PAD]",
                                 word_delimiter_token="|")

In [2]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=True)

In [3]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)

In [4]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    ".\service_1\Assets\jamo_base_model",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

In [5]:
print(model)

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureExtractor(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (4): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,)

# model predict

In [6]:
import librosa
import pandas as pd
import numpy as np
import torch

In [7]:
array,_ = librosa.load('./dataset/audio/script1_g_0044-6001-01-01-KSM-F-05-A.wav',16000)
array = processor(array, sampling_rate=16000).input_values[0]

In [8]:
pred = model.forward(torch.from_numpy(array.reshape(1,-1)))

In [9]:
def pred_decode(pred):
    pred_logits = pred['logits'].detach().numpy()
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred_str = processor.batch_decode(pred_ids)
    return pred_str

In [10]:
pred_str = ''.join(pred_decode(pred))
print(pred_str)

ㅂㅗ<pad>ㄱ<pad>ㅗ ㅇ<pad>ㅣ<pad>ㅆ<pad>ㄴ<pad>ㅡ<pad>ㄴ<pad> ㅇ<pad>ㅕ<pad>ㅇ<pad>ㅅ<pad>ㅏ<pad>ㅇ<pad> ㅈ<pad>ㅓ<pad>ㅇ<pad>ㅈ<pad>ㅣ<pad>ㅅ<pad>ㅣ<pad>ㅋ<pad>ㅕ ㅈㅝ


In [11]:
import re
remove_pad_token = re.sub('<pad>','',pred_str)
print(remove_pad_token)

ㅂㅗㄱㅗ ㅇㅣㅆㄴㅡㄴ ㅇㅕㅇㅅㅏㅇ ㅈㅓㅇㅈㅣㅅㅣㅋㅕ ㅈㅝ


# to onnx

In [12]:
import torch

In [13]:
torch.onnx.export(model,
                  (torch.from_numpy(array.reshape(1,-1))),
                  "./outputs/jamo_base_model.onnx",
                  input_names=["input"],
                  output_names=["output"],
                  )

  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):


In [14]:
import onnxruntime

In [15]:
session = onnxruntime.InferenceSession('./outputs/jamo_base_model.onnx')

In [16]:
ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(array)
results = session.run(["output"],{"input":array.reshape(1,-1)})
# print(results)
print(results[0].shape)
print(np.argmax(results[0],axis=-1))

(1, 179, 111)
[[109 107   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  31   0
    0 107 107   4   4   4  13   0   8   0  12  12   0  20   0  50   0  20
   20   0   4   4  13   0   0  42   0  13  13   0   0   0  33  33   0   0
   67   0   0  13   0   0   0   0   4  85  85   0  35   0   0  13  13   0
    0   0   0   0  85   0   0   8   8   0   0   0  33  33   0   8   0   0
    0   0  17  17   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0  42   4   4  85  85  98  98   4]]


In [17]:
pred_ids = np.argmax(results[0],axis=-1)

In [18]:
import json

with open('vocab_jamos.json','r') as f:
    word_to_index = json.load(f)
print(word_to_index)    

{'ㅣ': 8, 'ㄶ': 9, 'V': 10, 'C': 11, 'ㅆ': 12, 'ㅇ': 13, 'k': 14, 't': 15, 'ㄼ': 16, 'ㅋ': 17, 'ㅚ': 18, 'ㄵ': 19, 'ㄴ': 20, 'ㅉ': 21, 'ㅜ': 22, 'b': 23, 'o': 24, 'B': 25, 'ㅛ': 26, 'v': 27, 'I': 28, 'i': 29, 'X': 30, 'ㄱ': 31, 'ㅄ': 32, 'ㅅ': 33, 'ㅠ': 34, 'ㅓ': 35, 'L': 36, 'm': 37, 'Z': 38, 'q': 39, 'ㄸ': 40, 'G': 41, 'ㅕ': 42, 'K': 43, 'd': 44, 'S': 45, 'Y': 46, 'M': 47, 'h': 48, 'w': 49, 'ㅡ': 50, 'ㅍ': 51, 'ㅐ': 52, 'j': 53, 'ㄷ': 54, 'ㄽ': 55, 'p': 56, 'ㄾ': 57, 'e': 58, 'N': 59, 'ㅞ': 60, 'x': 61, 'ㅒ': 62, 'ㅑ': 63, 'H': 64, 'r': 65, 'T': 66, 'ㅏ': 67, 'g': 68, 'ㄹ': 69, 'ㅀ': 70, 'ㄻ': 71, 'J': 72, 'u': 73, 'A': 74, 'ㄿ': 75, 'y': 76, 'F': 77, 'ㄲ': 78, 'c': 79, 'ㅔ': 80, 'ㅎ': 81, 'O': 82, 'ㅌ': 83, 'ㅢ': 84, 'ㅈ': 85, 'ㅁ': 86, 'ㅊ': 87, 'ㅙ': 88, 'E': 89, 'ㅖ': 90, 'P': 91, 'n': 92, 'Q': 93, 'l': 94, 'ㄳ': 95, 'ㅟ': 96, 'z': 97, 'ㅝ': 98, 'D': 99, 's': 100, 'ㅘ': 101, 'ㅃ': 102, 'R': 103, 'f': 104, 'a': 105, 'W': 106, 'ㅗ': 107, 'U': 108, 'ㅂ': 109, 'ㄺ': 110, '<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '|': 4, '<b>': 5, 

In [19]:
index_to_word = {index:word for word,index in word_to_index.items()}
print(index_to_word)

{8: 'ㅣ', 9: 'ㄶ', 10: 'V', 11: 'C', 12: 'ㅆ', 13: 'ㅇ', 14: 'k', 15: 't', 16: 'ㄼ', 17: 'ㅋ', 18: 'ㅚ', 19: 'ㄵ', 20: 'ㄴ', 21: 'ㅉ', 22: 'ㅜ', 23: 'b', 24: 'o', 25: 'B', 26: 'ㅛ', 27: 'v', 28: 'I', 29: 'i', 30: 'X', 31: 'ㄱ', 32: 'ㅄ', 33: 'ㅅ', 34: 'ㅠ', 35: 'ㅓ', 36: 'L', 37: 'm', 38: 'Z', 39: 'q', 40: 'ㄸ', 41: 'G', 42: 'ㅕ', 43: 'K', 44: 'd', 45: 'S', 46: 'Y', 47: 'M', 48: 'h', 49: 'w', 50: 'ㅡ', 51: 'ㅍ', 52: 'ㅐ', 53: 'j', 54: 'ㄷ', 55: 'ㄽ', 56: 'p', 57: 'ㄾ', 58: 'e', 59: 'N', 60: 'ㅞ', 61: 'x', 62: 'ㅒ', 63: 'ㅑ', 64: 'H', 65: 'r', 66: 'T', 67: 'ㅏ', 68: 'g', 69: 'ㄹ', 70: 'ㅀ', 71: 'ㄻ', 72: 'J', 73: 'u', 74: 'A', 75: 'ㄿ', 76: 'y', 77: 'F', 78: 'ㄲ', 79: 'c', 80: 'ㅔ', 81: 'ㅎ', 82: 'O', 83: 'ㅌ', 84: 'ㅢ', 85: 'ㅈ', 86: 'ㅁ', 87: 'ㅊ', 88: 'ㅙ', 89: 'E', 90: 'ㅖ', 91: 'P', 92: 'n', 93: 'Q', 94: 'l', 95: 'ㄳ', 96: 'ㅟ', 97: 'z', 98: 'ㅝ', 99: 'D', 100: 's', 101: 'ㅘ', 102: 'ㅃ', 103: 'R', 104: 'f', 105: 'a', 106: 'W', 107: 'ㅗ', 108: 'U', 109: 'ㅂ', 110: 'ㄺ', 0: '<pad>', 1: '<s>', 2: '</s>', 3: '<unk>', 4: '|', 5: '<b>', 

In [20]:
pred_str = [index_to_word[idx] for idx in pred_ids.flatten()]
print(pred_str)

['ㅂ', 'ㅗ', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 'ㄱ', '<pad>', '<pad>', 'ㅗ', 'ㅗ', '|', '|', '|', 'ㅇ', '<pad>', 'ㅣ', '<pad>', 'ㅆ', 'ㅆ', '<pad>', 'ㄴ', '<pad>', 'ㅡ', '<pad>', 'ㄴ', 'ㄴ', '<pad>', '|', '|', 'ㅇ', '<pad>', '<pad>', 'ㅕ', '<pad>', 'ㅇ', 'ㅇ', '<pad>', '<pad>', '<pad>', 'ㅅ', 'ㅅ', '<pad>', '<pad>', 'ㅏ', '<pad>', '<pad>', 'ㅇ', '<pad>', '<pad>', '<pad>', '<pad>', '|', 'ㅈ', 'ㅈ', '<pad>', 'ㅓ', '<pad>', '<pad>', 'ㅇ', 'ㅇ', '<

In [21]:
remove_pad_token = re.sub('<pad>','',''.join(pred_str))
print(remove_pad_token)

ㅂㅗㄱㅗㅗ|||ㅇㅣㅆㅆㄴㅡㄴㄴ||ㅇㅕㅇㅇㅅㅅㅏㅇ|ㅈㅈㅓㅇㅇㅈㅣㅣㅅㅅㅣㅋㅋㅕ||ㅈㅈㅝㅝ|


In [22]:
ctc = []
tmp = ""
for s in remove_pad_token:
    if s == '|':
        s = " "
    if s == tmp:
        continue
    else:
        ctc.append(s)
    tmp = s
print(ctc)

['ㅂ', 'ㅗ', 'ㄱ', 'ㅗ', ' ', 'ㅇ', 'ㅣ', 'ㅆ', 'ㄴ', 'ㅡ', 'ㄴ', ' ', 'ㅇ', 'ㅕ', 'ㅇ', 'ㅅ', 'ㅏ', 'ㅇ', ' ', 'ㅈ', 'ㅓ', 'ㅇ', 'ㅈ', 'ㅣ', 'ㅅ', 'ㅣ', 'ㅋ', 'ㅕ', ' ', 'ㅈ', 'ㅝ', ' ']


In [23]:
"".join(ctc)

'ㅂㅗㄱㅗ ㅇㅣㅆㄴㅡㄴ ㅇㅕㅇㅅㅏㅇ ㅈㅓㅇㅈㅣㅅㅣㅋㅕ ㅈㅝ '

In [24]:
from unicode import join_jamos
join_jamos("".join(ctc))

'보고 있는 영상 정지시켜 줘 '