# ====================================================================
# PROYEK TUGAS AKHIR DEEP LEARNING: SPEECH RECOGNITION
# Judul: Speech Recognition (Suara ke Teks) menggunakan Mozilla Common Voice
# Model: Wav2Vec2 Fine-tuning
# Kelompok 7 - Studi Kasus 2
# Muhammad Jovi Syawal Difa - 227006516003
# Joevan Pramana Achmad - 227006516015
# Rifki Eko Pratomo - 227006516024
# Tongam Deni Gamaliel Situmoran - 227006516061
# ====================================================================

In [1]:
# =============================================================================
# 1. SETUP ENVIRONMENT
# =============================================================================
from google.colab import drive
import os
print("🔗 Mounting Google Drive...")
drive.mount('/content/drive')
print("✅ Google Drive berhasil di-mount!")

print("📦 Installing required libraries...")
!pip install -q transformers datasets torchaudio librosa jiwer soundfile accelerate pandas
print("✅ Semua library berhasil diinstall!")

🔗 Mounting Google Drive...
Mounted at /content/drive
✅ Google Drive berhasil di-mount!
📦 Installing required libraries...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# =============================================================================
# IMPORT LIBRARIES
# =============================================================================
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchaudio
import librosa
import soundfile as sf
import json
from IPython.display import Audio, display
import warnings
warnings.filterwarnings('ignore')

from transformers import Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from jiwer import wer
from dataclasses import dataclass
from typing import Dict, List, Union

# Set random seed untuk reproducibility
torch.manual_seed(42)
np.random.seed(42)

print("✅ Setup environment selesai!")

✅ Setup environment selesai!


In [3]:
# =============================================================================
# 2. KONFIGURASI DATASET PATH
# =============================================================================
DATASET_PATH = "/content/drive/MyDrive/cv-corpus-19.0-delta-2024-09-13/en"
CLIPS_PATH = os.path.join(DATASET_PATH, "clips")
VALIDATED_TSV = os.path.join(DATASET_PATH, "validated.tsv")

print(f"📁 Dataset path: {DATASET_PATH}")
print(f"🎵 Audio clips path: {CLIPS_PATH}")
print(f"📋 Metadata file: {VALIDATED_TSV}")

📁 Dataset path: /content/drive/MyDrive/cv-corpus-19.0-delta-2024-09-13/en
🎵 Audio clips path: /content/drive/MyDrive/cv-corpus-19.0-delta-2024-09-13/en/clips
📋 Metadata file: /content/drive/MyDrive/cv-corpus-19.0-delta-2024-09-13/en/validated.tsv


In [4]:
# =============================================================================
# 3. LOAD DATA & CREATE VOCABULARY
# =============================================================================
try:
    df = pd.read_csv(VALIDATED_TSV, sep='\t')
    df['audio_path'] = df['path'].apply(lambda x: os.path.join(CLIPS_PATH, x))

    file_exists = df['audio_path'].apply(os.path.exists)
    df_valid = df[file_exists].copy()

    # Gunakan semua data yang valid karena model lebih ringan
    print(f"👍 Ditemukan {len(df_valid):,} file audio yang valid.")

    # Buat Vocabulary dari karakter unik
    all_text = " ".join(df_valid["sentence"].str.lower())
    vocab = sorted(list(set(all_text)))

    # CTC blank token di index 0, pad token di index terakhir
    ctc_blank_token = "^"
    pad_token = "_"
    vocab.insert(0, ctc_blank_token)
    vocab.append(pad_token)

    char_to_int = {char: i for i, char in enumerate(vocab)}
    int_to_char = {i: char for i, char in enumerate(vocab)}

    print("🔤 Vocabulary created:")
    print(f"   - Size: {len(vocab)}")
    print(f"   - Characters: {''.join(vocab)}")

    # Split data
    full_dataset = Dataset.from_pandas(df_valid)
    dataset_split = full_dataset.train_test_split(test_size=0.2, seed=42)
    raw_datasets = DatasetDict({
        'train': dataset_split['train'],
        'test': dataset_split['test']
    })
    print(f"\n📈 Training samples: {len(raw_datasets['train']):,}")
    print(f"📉 Testing samples:  {len(raw_datasets['test']):,}")

except Exception as e:
    print(f"❌ Error: {e}")

👍 Ditemukan 137 file audio yang valid.
🔤 Vocabulary created:
   - Size: 35
   - Characters: ^ "',-.?abcdefghijklmnopqrstuvwxyz_

📈 Training samples: 109
📉 Testing samples:  28


In [5]:
# =============================================================================
# 4. PREPROCESSING (AUDIO TO SPECTROGRAM)
# =============================================================================
# Transformasi audio ke Mel Spectrogram
mel_spectrogram_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000, n_mels=128
)

def preprocess_function(batch):
    try:
        # Load audio
        speech, sr = sf.read(batch["audio_path"])
        if sr != 16000:
            speech = librosa.resample(speech, orig_sr=sr, target_sr=16000)
        speech_tensor = torch.tensor(speech, dtype=torch.float32)

        # Buat spectrogram
        mel_spec = mel_spectrogram_transform(speech_tensor)
        batch["input_features"] = mel_spec.squeeze().T # (Time, Freq)

        # Tokenize text
        text = batch["sentence"].lower()
        batch["labels"] = [char_to_int[char] for char in text]
        return batch
    except Exception:
        return None # Abaikan file yang error

print("🔄 Applying preprocessing...")
processed_datasets = raw_datasets.map(
    preprocess_function,
    remove_columns=raw_datasets["train"].column_names
).filter(lambda x: x is not None)

print("✅ Preprocessing selesai!")

🔄 Applying preprocessing...


Map:   0%|          | 0/109 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Filter:   0%|          | 0/109 [00:00<?, ? examples/s]

Filter:   0%|          | 0/28 [00:00<?, ? examples/s]

✅ Preprocessing selesai!


In [21]:
# =============================================================================
# 5. DEFINE THE CNN-RNN MODEL
# =============================================================================
class SpeechRecognitionModel(nn.Module):
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        n_feats = n_feats // 2
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=1)
        self.rnn = nn.GRU(
            input_size=32 * n_feats,
            hidden_size=rnn_dim,
            num_layers=n_rnn_layers,
            batch_first=True,
            bidirectional=True
        )
        self.classifier = nn.Linear(rnn_dim * 2, n_class)
        self.ctc_loss = nn.CTCLoss(blank=char_to_int[ctc_blank_token], zero_infinity=True)

    def forward(self, input_values, labels=None, **kwargs):
        # input_values shape: (batch, time, freq)
        x = input_values.unsqueeze(1) # (batch, 1, time, freq)
        x = self.cnn(x) # (batch, channels, time, freq)

        b, c, t, f = x.size()
        x = x.view(b, t, c * f) # (batch, time, channels * freq)

        x, _ = self.rnn(x)
        x = self.classifier(x)
        log_probs = nn.functional.log_softmax(x, dim=2)

        loss = None
        if labels is not None:
            input_lengths = torch.full(size=(b,), fill_value=log_probs.size(1), dtype=torch.long)

            # DIUBAH: Menghitung panjang label dari tensor yang sudah di-pad
            labels_mask = labels >= 0
            label_lengths = labels_mask.sum(dim=1)

            # DIUBAH: Mengganti -100 dengan ID token pad yang sebenarnya untuk fungsi loss
            loss_labels = labels.clone()
            loss_labels[loss_labels == -100] = char_to_int[pad_token]

            loss = self.ctc_loss(log_probs.permute(1, 0, 2), loss_labels, input_lengths, label_lengths)

        return (loss, log_probs) if loss is not None else (None, log_probs)

# Inisialisasi model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SpeechRecognitionModel(
    n_cnn_layers=1,
    n_rnn_layers=3,
    rnn_dim=256,
    n_class=len(char_to_int),
    n_feats=128,
    stride=2,
    dropout=0.1
).to(device)

print("✅ Model CNN-RNN berhasil dibuat!")
print(model)

✅ Model CNN-RNN berhasil dibuat!
SpeechRecognitionModel(
  (cnn): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (rnn): GRU(2048, 256, num_layers=3, batch_first=True, bidirectional=True)
  (classifier): Linear(in_features=512, out_features=35, bias=True)
  (ctc_loss): CTCLoss()
)


In [22]:
# =============================================================================
# 6. DATA COLLATOR & EVALUATION METRIC
# =============================================================================
@dataclass
class DataCollatorSpeech:
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # DIUBAH: Mengekstrak fitur dan label dengan benar
        input_features = [f["input_features"] for f in features]
        labels = [f["labels"] for f in features]

        # Pad input features menjadi tensor
        padded_inputs = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(feat) for feat in input_features],
            batch_first=True,
            padding_value=0.0
        )

        # DIUBAH: Pad labels menjadi tensor, menggunakan -100 sebagai penanda padding
        padded_labels = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(l) for l in labels],
            batch_first=True,
            padding_value=-100
        )

        return {"input_values": padded_inputs, "labels": padded_labels}

data_collator = DataCollatorSpeech()
print("✅ Data collator siap!")

def compute_metrics(pred):
    pred_log_probs = pred.predictions
    pred_ids = np.argmax(pred_log_probs, axis=-1)

    # Decode predictions
    pred_str = []
    for ids in pred_ids:
        text = ""
        for i, token_id in enumerate(ids):
            if token_id == char_to_int[ctc_blank_token]:
                continue
            if i > 0 and token_id == ids[i-1]: # Hapus duplikat
                continue
            text += int_to_char.get(token_id, "")
        pred_str.append(text)

    # DIUBAH: Decode labels dengan menangani padding -100
    pred.label_ids[pred.label_ids == -100] = char_to_int[pad_token]
    label_str = []
    for label_ids in pred.label_ids:
        # Filter token pad sebelum decoding
        filtered_ids = [l for l in label_ids if l != char_to_int[pad_token]]
        label_str.append("".join([int_to_char.get(l, "") for l in filtered_ids]))

    wer_score = wer(label_str, pred_str)
    return {"wer": wer_score}

print("✅ Fungsi evaluasi WER siap!")

✅ Data collator siap!
✅ Fungsi evaluasi WER siap!


In [23]:
# =============================================================================
# 7. TRAINING ARGUMENTS
# =============================================================================
output_dir = "/content/drive/MyDrive/cnn-rnn-common-voice-en"
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    fp16=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=50,
    learning_rate=1e-4,
    warmup_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    eval_strategy="steps",
    report_to="none",
    remove_unused_columns=False,
)
print("✅ Training arguments siap!")

✅ Training arguments siap!


In [24]:
# =============================================================================
# 8. INITIALIZE AND RUN TRAINER
# =============================================================================
class CustomTrainer(Trainer):
    # DIUBAH: Menambahkan **kwargs untuk menerima argumen tak terduga
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs = model(**inputs)
        loss = outputs[0]
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["test"],
    compute_metrics=compute_metrics,
)

print("🚀 Memulai fine-tuning model...")
try:
    trainer.train()
    print("✅ Training selesai!")
    trainer.save_model(output_dir)
    # Simpan vocabulary
    with open(os.path.join(output_dir, 'vocab.json'), 'w') as f:
        json.dump(char_to_int, f)
    print(f"💾 Model & vocabulary berhasil disimpan di: {output_dir}")
except Exception as e:
    print(f"❌ Error saat training: {e}")

🚀 Memulai fine-tuning model...


Step,Training Loss,Validation Loss


✅ Training selesai!
💾 Model & vocabulary berhasil disimpan di: /content/drive/MyDrive/cnn-rnn-common-voice-en


In [25]:
# =============================================================================
# 9. EVALUASI FINAL & TESTING
# =============================================================================
print("\n📊 Melakukan evaluasi final...")
eval_results = trainer.evaluate()
final_wer = eval_results.get('eval_wer', 0)

print("🎯 HASIL EVALUASI:")
print(f"📈 Word Error Rate (WER): {final_wer:.4f} ({final_wer*100:.2f}%)")

def transcribe_audio(audio_path, model, device):
    model.eval()
    # Preprocess audio
    speech, sr = sf.read(audio_path)
    if sr != 16000:
        speech = librosa.resample(speech, orig_sr=sr, target_sr=16000)
    speech_tensor = torch.tensor(speech, dtype=torch.float32).to(device)
    mel_spec = mel_spectrogram_transform(speech_tensor).squeeze().T

    with torch.no_grad():
        _, log_probs = model(mel_spec.unsqueeze(0))

    pred_ids = torch.argmax(log_probs, dim=-1).squeeze().tolist()

    # Decode
    text = ""
    for i, token_id in enumerate(pred_ids):
        if token_id == char_to_int[ctc_blank_token]: continue
        if i > 0 and token_id == pred_ids[i-1]: continue
        text += int_to_char.get(token_id, "")
    return text

print("\n🎤 TESTING FUNGSI TRANSKRIPSI")
test_sample = raw_datasets['test'].shuffle().select([0])[0]
predicted_text = transcribe_audio(test_sample['audio_path'], model, device)
print(f"📝 Teks Asli: {test_sample['sentence']}")
print(f"🤖 Prediksi:  {predicted_text}")
display(Audio(test_sample['audio_path']))


📊 Melakukan evaluasi final...


🎯 HASIL EVALUASI:
📈 Word Error Rate (WER): 1.0000 (100.00%)

🎤 TESTING FUNGSI TRANSKRIPSI
📝 Teks Asli: The phrase originated in a comic strip of the same name.
🤖 Prediksi:    
