# Korean Speech Recognition using Wav2Vec2 Model
This notebook demonstrates how to use the pre-trained Wav2Vec2 model from Hugging Face to convert Korean speech to text.

In [1]:
# Import necessary libraries
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio
from jamo import h2j, j2hcj, is_jamo

## Load Model and Processor
Load the pre-trained Wav2Vec2 model and processor from the Hugging Face Hub.

In [2]:
# Load the processor (feature extraction + tokenizer) and the CTC model from the Hugging Face Hub
model_name = "Kkonjeong/wav2vec2-base-korean"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# Prefer the GPU, but fall back to the CPU so this cell still runs on machines without CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the chosen device and switch it to evaluation mode.
# Assigning the result keeps the cell from echoing the full model repr as output.
model = model.to(device).eval()

## Jamo Combination Function
Define a function that recombines decomposed Jamo characters (initial consonant, vowel, and optional final consonant) into complete Hangul syllables.

In [3]:
# Function to combine Jamo characters
def combine_jamo(jamo_text):
    """Compose a string of conjoining Hangul jamo into precomposed syllables.

    Each jamo is classified by its role (initial consonant, vowel, final
    consonant) rather than grouped blindly in threes, so syllables without a
    final consonant are composed correctly even when directly followed by
    another syllable (e.g. the four jamo of "가나").

    Args:
        jamo_text: String that may contain modern conjoining jamo
            (U+1100-U+1112, U+1161-U+1175, U+11A8-U+11C2) mixed with any
            other characters.

    Returns:
        A string in which every <initial, vowel[, final]> jamo sequence has
        been replaced by the corresponding Hangul syllable (U+AC00 block).
        Characters that are not part of such a sequence (spaces, punctuation,
        stray or archaic jamo, ...) are copied through unchanged.
    """
    initial_base = 0x1100    # first modern initial consonant (choseong)
    medial_base = 0x1161     # first modern vowel (jungseong)
    final_base = 0x11A7      # one below the first final consonant; offset 0 means "no final"
    syllable_base = 0xAC00   # first precomposed syllable
    n_medial, n_final = 21, 28

    def is_initial(ch):
        return 0 <= ord(ch) - initial_base < 19

    def is_medial(ch):
        return 0 <= ord(ch) - medial_base < n_medial

    def is_final(ch):
        # Offset 0 is excluded: U+11A7 itself is not a final consonant.
        return 1 <= ord(ch) - final_base < n_final

    pieces = []   # finished output fragments, joined once at the end
    buffer = []   # jamo of the syllable being built: [initial], [initial, vowel] or all three

    def flush():
        """Emit the pending syllable (or a lone initial consonant) and reset the buffer."""
        if len(buffer) >= 2:
            code = (
                syllable_base
                + (ord(buffer[0]) - initial_base) * n_medial * n_final
                + (ord(buffer[1]) - medial_base) * n_final
            )
            if len(buffer) == 3:
                code += ord(buffer[2]) - final_base
            pieces.append(chr(code))
        else:
            pieces.extend(buffer)
        buffer.clear()

    for ch in jamo_text:
        if is_initial(ch):
            # An initial consonant always starts a new syllable.
            flush()
            buffer.append(ch)
        elif is_medial(ch) and len(buffer) == 1:
            buffer.append(ch)
        elif is_final(ch) and len(buffer) == 2:
            # A final consonant completes the syllable.
            buffer.append(ch)
            flush()
        else:
            # Anything else ends the pending syllable and is passed through as-is.
            flush()
            pieces.append(ch)

    flush()
    return "".join(pieces)

## Speech-to-Text Prediction Function
Define a function to perform inference on an audio file and predict the text.

In [4]:
# Function to perform inference on an audio file and predict the text
def predict_from_audio(audio_path):
    """Transcribe one audio file to Hangul text.

    The file is loaded with torchaudio, downmixed to a single channel,
    resampled to 16 kHz when necessary, run through the notebook-level
    ``processor`` and ``model``, greedily decoded, and finally passed through
    ``combine_jamo``.

    Args:
        audio_path: Path of the audio file; forwarded to ``torchaudio.load``.

    Returns:
        The decoded text for the file after ``combine_jamo`` has been applied.
    """
    target_rate = 16000  # sampling rate passed to the processor

    # Load the audio file
    speech_array, sampling_rate = torchaudio.load(audio_path)

    # Average the channels so the processor always receives one 1-D waveform;
    # squeeze() alone would leave multi-channel (e.g. stereo) input 2-D.
    speech_array = speech_array.mean(dim=0)

    # Resample if the sampling rate is not 16000Hz
    if sampling_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=target_rate)
        speech_array = resampler(speech_array)

    # Preprocess the waveform and place it on whatever device the model lives on
    input_values = processor(
        speech_array.numpy(), sampling_rate=target_rate, return_tensors="pt"
    ).input_values
    input_values = input_values.to(model.device)

    # Perform inference without tracking gradients
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: most likely token per frame, then decode to Jamo text
    pred_ids = torch.argmax(logits, dim=-1)
    jamo_text = processor.batch_decode(pred_ids)[0]

    # Combine the Jamo text to form the final sentence
    return combine_jamo(jamo_text)

## Example: Predicting Text from an Audio File
Run the prediction function on a sample audio file and print the resulting transcription.

In [5]:
# Sample recording to transcribe (path is relative to the notebook's working directory)
audio_path = "jiwon_.wav"

# Run the speech-to-text pipeline and show the transcription
predicted_text = predict_from_audio(audio_path=audio_path)
print("Predicted Text: ", predicted_text)

Predicted Text:  Example text
