# Wav2Vec2 Fine-Tuning for Korean Speech Recognition
This notebook demonstrates the fine-tuning of the Wav2Vec2 model for Korean speech recognition using the Zeroth-Korean dataset.

In [None]:
!pip install transformers[torch] accelerate -U
!pip install datasets torchaudio -U
!pip install jiwer jamo
!pip install tensorboard

## Data Preprocessing

In [None]:
from datasets import load_dataset
import re
from jamo import h2j, j2hcj

# Load Zeroth-Korean dataset
# Requests the 'clean' configuration of 'zeroth_korean'; the result is indexed
# by split name in later cells (e.g. dataset['train']).
dataset = load_dataset('zeroth_korean', 'clean')

# Text cleaning function
def clean_text(text):
    """Return ``text`` reduced to Hangul characters and single spaces.

    Whitespace runs (spaces, tabs, newlines, full-width spaces) are collapsed
    to one space *before* other characters are stripped, so that words
    separated by a tab or newline are not fused together. Every character
    that is not a space, a Hangul compatibility jamo (ㄱ-ㅣ) or a Hangul
    syllable (가-힣) is then removed.

    Args:
        text: Raw transcript string.

    Returns:
        The cleaned string without leading or trailing spaces.
    """
    # Normalize first: the character filter below keeps only U+0020 spaces.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^ ㄱ-ㅣ가-힣]', '', text)
    # Dropping punctuation such as "가 , 나" leaves adjacent spaces behind.
    text = re.sub(r' +', ' ', text)
    return text.strip()

# Apply text cleaning and Jamo separation
def prepare_data(batch):
    """Clean one example's transcript and split it into jamo.

    The ``'text'`` entry is first replaced by its ``clean_text`` result and
    then by that result passed through ``h2j`` followed by ``j2hcj``.

    Args:
        batch: Mapping with a ``'text'`` entry; it is modified in place.

    Returns:
        The same ``batch`` object.
    """
    cleaned = clean_text(batch['text'])
    batch['text'] = cleaned
    jamo_sequence = h2j(cleaned)
    batch['text'] = j2hcj(jamo_sequence)
    return batch

# Run prepare_data through dataset.map and rebind `dataset` to the result, so
# later cells see the cleaned, jamo-separated 'text' field.
dataset = dataset.map(prepare_data)

## Tokenizer and Vocabulary

In [None]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2Processor

# Define the tokenizer
import json

from transformers import Wav2Vec2FeatureExtractor

# Sort the character set so token ids are identical from run to run
# (iteration order of a set of strings is not stable across processes).
vocab_list = sorted(set(''.join(dataset['train']['text'])))
vocab_dict = {c: i for i, c in enumerate(vocab_list)}
# CTC tokenizers use a visible word delimiter instead of a raw space.
if ' ' in vocab_dict:
    vocab_dict['|'] = vocab_dict.pop(' ')
vocab_dict['[UNK]'] = len(vocab_dict)
vocab_dict['[PAD]'] = len(vocab_dict)

# The tokenizer is built from a vocabulary *file*; from_pretrained() expects a
# model id or directory and cannot take the dict directly.
with open('vocab.json', 'w', encoding='utf-8') as vocab_file:
    json.dump(vocab_dict, vocab_file, ensure_ascii=False)

tokenizer = Wav2Vec2CTCTokenizer(
    'vocab.json',
    unk_token='[UNK]',
    pad_token='[PAD]',
    word_delimiter_token='|',
)

# Define the processor
# Raw 16 kHz mono waveform in, zero-mean/unit-variance normalized values out.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

## Model Configuration

In [None]:
from transformers import Wav2Vec2ForCTC

# Load the Wav2Vec2 model
# Configuration overrides are passed to from_pretrained() so they are in
# effect while the network is being built: dropout layers read their rate at
# construction time, so updating model.config afterwards would not change them.
# vocab_size / pad_token_id tie the CTC head and the blank token to the
# tokenizer defined above.
model = Wav2Vec2ForCTC.from_pretrained(
    'facebook/wav2vec2-base',
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

# Configure the model
# Gradient checkpointing is enabled through the model API; a
# 'gradient_checkpointing' config entry set after loading is not acted upon.
model.gradient_checkpointing_enable()

## Training

In [None]:
from transformers import Trainer, TrainingArguments

import inspect


def prepare_features(batch):
    """Turn one example into model inputs.

    Reads the decoded waveform from ``batch['audio']`` and the transcript from
    ``batch['text']``; writes ``'input_values'`` (normalized waveform) and
    ``'labels'`` (token ids) and returns the same mapping.
    """
    audio = batch['audio']
    batch['input_values'] = processor.feature_extractor(
        audio['array'], sampling_rate=audio['sampling_rate']
    ).input_values[0]
    batch['labels'] = processor.tokenizer(batch['text']).input_ids
    return batch


def data_collator(features):
    """Pad a list of examples into one batch of tensors.

    Inputs and labels have different lengths, so they are padded separately.
    Label padding is replaced by -100 so the loss ignores those positions.
    """
    input_features = [{'input_values': f['input_values']} for f in features]
    label_features = [{'input_ids': f['labels']} for f in features]
    batch = processor.feature_extractor.pad(
        input_features, padding=True, return_tensors='pt'
    )
    labels_batch = processor.tokenizer.pad(
        label_features, padding=True, return_tensors='pt'
    )
    batch['labels'] = labels_batch['input_ids'].masked_fill(
        labels_batch['attention_mask'].ne(1), -100
    )
    return batch


def _edit_distance(ref, hyp):
    """Return the Levenshtein distance between two sequences."""
    previous = list(range(len(hyp) + 1))
    for i, ref_item in enumerate(ref, start=1):
        current = [i]
        for j, hyp_item in enumerate(hyp, start=1):
            current.append(min(
                previous[j] + 1,                       # deletion
                current[j - 1] + 1,                    # insertion
                previous[j - 1] + (ref_item != hyp_item),  # substitution
            ))
        previous = current
    return previous[-1]


def compute_metrics(pred):
    """Compute the character error rate reported by the Trainer as 'eval_cer'.

    Transcripts were split into jamo during preprocessing, so the rate is
    measured over jamo characters.
    """
    pad_id = processor.tokenizer.pad_token_id
    logits = pred.predictions
    pred_ids = logits.argmax(axis=-1)
    # Frames added when batches of different length are concatenated are
    # filled with -100; map them to the pad/blank id instead of token 0.
    pred_ids[(logits == -100).all(axis=-1)] = pad_id

    label_ids = pred.label_ids.copy()
    label_ids[label_ids == -100] = pad_id

    hypotheses = processor.batch_decode(pred_ids)
    # Labels are plain text, not CTC output: repeated tokens must be kept.
    references = processor.batch_decode(label_ids, group_tokens=False)

    errors = sum(_edit_distance(r, h) for r, h in zip(references, hypotheses))
    total = sum(len(r) for r in references)
    return {'cer': errors / total if total else 0.0}


# Extract input_values/labels once and drop the raw columns.
encoded_dataset = dataset.map(
    prepare_features,
    remove_columns=dataset['train'].column_names,
)
# Fall back to the test split when the dataset has no validation split.
eval_split = 'validation' if 'validation' in encoded_dataset else 'test'

# The evaluation-schedule keyword and the Trainer's processor keyword were
# renamed across transformers releases; use whichever the installed one has.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)
_processor_key = (
    'processing_class'
    if 'processing_class' in inspect.signature(Trainer.__init__).parameters
    else 'tokenizer'
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    group_by_length=True,
    per_device_train_batch_size=32,
    num_train_epochs=10,
    gradient_accumulation_steps=2,
    fp16=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=500,
    learning_rate=1e-4,
    warmup_steps=500,
    save_total_limit=2,
    **{_eval_strategy_key: 'steps'},
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset[eval_split],
    **{_processor_key: processor.feature_extractor},
)

# Train the model
trainer.train()

## Evaluation

In [None]:
from datasets import load_metric

# Load CER metric
cer = load_metric('cer')

# Evaluate the model
results = trainer.evaluate()
# Double quotes outside, single quotes inside: reusing the same quote type in
# an f-string is a SyntaxError before Python 3.12.
# NOTE(review): 'eval_cer' is only present when the Trainer was given a
# compute_metrics function that returns a 'cer' entry - confirm.
print(f"CER: {results['eval_cer']}")

## Inference

In [None]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio

# Load the model and processor
# Both are loaded from the same pretrained identifier and rebind the `model`
# and `processor` names that predict() below reads as globals.
model_name = 'Kkonjeong/wav2vec2-base-korean'
model = Wav2Vec2ForCTC.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Perform inference on an audio file
def predict(file_path):
    """Transcribe one audio file with the global ``model`` and ``processor``.

    Args:
        file_path: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        The decoded transcription string for the file.
    """
    # Load and preprocess the audio file
    speech_array, sampling_rate = torchaudio.load(file_path)
    # torchaudio returns (channels, frames). Down-mix to mono so a stereo file
    # is not interpreted as a batch of two utterances.
    speech_array = speech_array.mean(dim=0)
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
        speech_array = resampler(speech_array)
    input_values = processor(
        speech_array.numpy(), sampling_rate=16000, return_tensors='pt'
    ).input_values
    # Follow the model's device instead of assuming CUDA: the model is never
    # moved in this cell, so hard-coding 'cuda' causes a device mismatch.
    input_values = input_values.to(model.device)

    # Get model predictions
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription

# Example run: transcribe a local WAV file (path is relative to the working
# directory) and print the result.
audio_file_path = 'jiwon_.wav'
transcription = predict(audio_file_path)
print('Transcription:', transcription)