In [40]:
import os
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import seaborn as sns
import wave
import contextlib
import torch
import torchaudio
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import shutil
import re
import random
import json
import evaluate  # Thay thế load_metric bằng evaluate
from datasets import load_dataset, Audio, Dataset
from transformers import (Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments,
                          Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, DataCollatorWithPadding, AutoConfig)
from dataclasses import dataclass
from typing import Any, Dict, List, Union
%matplotlib inline




In [41]:
# Seed every random number generator used by the notebook with the same value.
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)

# When a GPU is present, also seed the CUDA generators (current device, then all devices).
if torch.cuda.is_available():
    for _cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        _cuda_seed_fn(seed)

# Request deterministic cuDNN behaviour and switch cuDNN benchmarking off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [42]:
# Select the compute device: CUDA when available, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Sử dụng thiết bị: {device}")

# Locations of the VIVOS dataset files: audio folder, prompts file, genders file.
train_audio_path, train_prompts_path, train_genders_path = (
    'vivos/train/waves',
    'vivos/train/prompts.txt',
    'vivos/train/genders.txt',
)

test_audio_path, test_prompts_path, test_genders_path = (
    'vivos/test/waves',
    'vivos/test/prompts.txt',
    'vivos/test/genders.txt',
)

Sử dụng thiết bị: cpu


In [43]:
# Read a prompts.txt file and return its contents as a DataFrame.
def load_prompts(prompts_path):
    """Parse a prompts file into a DataFrame with columns ``id`` and ``text``.

    Each non-blank line is expected to look like ``<utterance_id> <transcript>``:
    everything before the first space is the id, everything after it is the
    transcript, which is lower-cased.

    Args:
        prompts_path: Path of the UTF-8 encoded prompts file.

    Returns:
        pandas.DataFrame with one row per non-blank line and the columns
        ``id`` and ``text``. The columns are present even when the file
        contains no usable line.

    Raises:
        OSError: If the file cannot be opened.
    """
    records = []
    with open(prompts_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no utterance; skip them instead of failing on unpacking.
            if not stripped:
                continue
            # partition() never raises: a line holding only an id gives an
            # empty transcript rather than a ValueError.
            utt_id, _, text = stripped.partition(' ')
            records.append({'id': utt_id, 'text': text.lower()})
    # Fix the column set explicitly so an empty file still yields 'id'/'text'.
    return pd.DataFrame(records, columns=['id', 'text'])

# Build the transcript DataFrames for the train split and then the test split.
train_transcripts, test_transcripts = map(
    load_prompts, (train_prompts_path, test_prompts_path)
)

In [44]:
# Resolve the audio file path that belongs to an utterance id.
def get_audio_path(audio_base_path, audio_id):
    """Return ``<audio_base_path>/<speaker>/<audio_id>.wav``.

    The speaker folder name is the part of ``audio_id`` before its first
    underscore (the whole id when it contains no underscore).
    """
    speaker_dir, _, _ = audio_id.partition('_')
    wav_name = audio_id + '.wav'
    return os.path.join(audio_base_path, speaker_dir, wav_name)

# Store the resolved .wav path of every utterance in a new 'audio' column.
train_transcripts['audio'] = [
    get_audio_path(train_audio_path, utt_id) for utt_id in train_transcripts['id']
]
test_transcripts['audio'] = [
    get_audio_path(test_audio_path, utt_id) for utt_id in test_transcripts['id']
]

In [45]:
# Strip special characters from the transcript and lower-case it.
# Raw string literal: the pattern value is unchanged, but sequences such as
# "\," or "\?" are no longer parsed as (invalid) string escapes, which newer
# Python versions warn about.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:"“%‘”�]'

def remove_special_characters(batch):
    """Remove the ignored characters from ``batch["text"]`` and lower-case it.

    Args:
        batch: Mapping-like record (e.g. a dict or a DataFrame row) that has a
            string stored under the key ``"text"``.

    Returns:
        The same ``batch`` object, with ``batch["text"]`` replaced in place by
        the cleaned text.
    """
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
    return batch

# Run the text clean-up on every row of the train and test transcripts.
train_transcripts, test_transcripts = (
    frame.apply(remove_special_characters, axis='columns')
    for frame in (train_transcripts, test_transcripts)
)

In [46]:
# Load the processor and the model, then restore the fine-tuned weights.
# Per the original note, the processor is kept as saved because it does not
# take part in fine-tuning.
# The checkpoint directory defaults to the original Kaggle dataset location and
# can be overridden with the ASR_CHECKPOINT_DIR environment variable, so the
# cell also runs where that dataset is mounted somewhere else.
checkpoint_dir = os.environ.get(
    "ASR_CHECKPOINT_DIR",
    "/kaggle/input/asr-nvlb-24-30/wav2vec2-vivos/checkpoints",
)
checkpoint_file = os.path.join(checkpoint_dir, "checkpoint_epoch_29.pth")
processor_dir = os.path.join(checkpoint_dir, "processor")

# Fail before any loading starts, naming the missing path and how to fix it.
if not os.path.isfile(checkpoint_file):
    raise FileNotFoundError(
        f"Checkpoint file not found: {checkpoint_file!r}. Attach the dataset or "
        "set the ASR_CHECKPOINT_DIR environment variable to the checkpoints folder."
    )
if not os.path.isdir(processor_dir):
    raise FileNotFoundError(
        f"Processor directory not found: {processor_dir!r}. Attach the dataset or "
        "set the ASR_CHECKPOINT_DIR environment variable to the checkpoints folder."
    )

# NOTE(review): torch.load unpickles the file; only load checkpoints that come
# from a trusted source.
checkpoint = torch.load(checkpoint_file, map_location=device)
processor = Wav2Vec2Processor.from_pretrained(processor_dir)

model = Wav2Vec2ForCTC.from_pretrained(
    "nguyenvulebinh/wav2vec2-base-vietnamese-250h",
    attention_dropout=0.25,  # higher dropout to reduce overfitting
    hidden_dropout=0.25,
    activation_dropout=0.25,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)
# Overwrite the pre-trained weights with the ones stored in the checkpoint.
model.load_state_dict(checkpoint['model_state_dict'])

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/asr-nvlb-24-30/wav2vec2-vivos/checkpoints/checkpoint_epoch_29.pth'