# ====================================================================
# PROYEK TUGAS AKHIR DEEP LEARNING: SPEECH RECOGNITION
# Judul: Speech Recognition (Suara ke Teks) menggunakan Mozilla Common Voice
# Model: Wav2Vec2 Fine-tuning
# Kelompok 7 - Studi Kasus 2
# Muhammad Jovi Syawal Difa - 227006516003
# Joevan Pramana Achmad - 227006516015
# Rifki Eko Pratomo - 227006516024
# Tongam Deni Gamaliel Situmoran - 227006516061
# ====================================================================

In [26]:
# Cell 1: Setup Environment
from google.colab import drive
import os
print("🔗 Menghubungkan Google Drive...")
drive.mount('/content/drive')
print("✅ Google Drive berhasil terhubung!")
print("\n📦 Menginstall library yang diperlukan...")
!pip install -q datasets transformers torchaudio jiwer accelerate
print("✅ Semua library berhasil diinstall!")

🔗 Menghubungkan Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive berhasil terhubung!

📦 Menginstall library yang diperlukan...
✅ Semua library berhasil diinstall!


In [27]:
# Cell 2: Import Libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchaudio
import librosa
import json
import gc
from datasets import Dataset, DatasetDict
from transformers import Trainer, TrainingArguments
from jiwer import wer
from dataclasses import dataclass
from typing import Dict, List, Union
import warnings

warnings.filterwarnings('ignore')
torch.manual_seed(42)
np.random.seed(42)
print("✅ Library berhasil diimpor!")

✅ Library berhasil diimpor!


In [28]:
# Cell 3: Konfigurasi Path
DATASET_PATH = "/content/drive/MyDrive/cv-corpus-19.0-delta-2024-09-13/en"
CLIPS_PATH = os.path.join(DATASET_PATH, "clips")
VALIDATED_TSV = os.path.join(DATASET_PATH, "validated.tsv")
OUTPUT_DIR = "/content/drive/MyDrive/cnn-rnn-common-voice-en-finetuned"

print(f"📁 Dataset path: {VALIDATED_TSV}")
print(f"💾 Output Directory: {OUTPUT_DIR}")

📁 Dataset path: /content/drive/MyDrive/cv-corpus-19.0-delta-2024-09-13/en/validated.tsv
💾 Output Directory: /content/drive/MyDrive/cnn-rnn-common-voice-en-finetuned


In [29]:
# Cell 4: Memuat Data & Membuat Vocabulary (dengan Sampel Lebih Besar)
print("📂 Memuat sampel data yang lebih besar...")

# [PERBAIKAN LANGKAH 1] Gunakan sampel data yang jauh lebih besar.
# 7500 adalah titik awal yang baik. Jika RAM masih kuat, Anda bisa menaikkannya.
NUM_SAMPLES = 7500
df = pd.read_csv(VALIDATED_TSV, sep='\t', usecols=['path', 'sentence']).dropna()
df = df.sample(n=min(NUM_SAMPLES, len(df)), random_state=42)

df['audio_path'] = df['path'].apply(lambda x: os.path.join(CLIPS_PATH, x))
df = df[df['audio_path'].apply(os.path.exists)].copy()

print(f"👍 Menggunakan {len(df):,} sampel audio yang valid.")

# Membuat Vocabulary dari karakter unik
all_text = " ".join(df["sentence"].str.lower())
vocab_list = sorted(list(set(all_text)))

# Token khusus untuk CTC Loss
ctc_blank_token = "<blank>"
pad_token = "<pad>"
vocab_list.insert(0, ctc_blank_token)
vocab_list.append(pad_token)

char_to_int = {char: i for i, char in enumerate(vocab_list)}
int_to_char = {i: char for i, char in enumerate(vocab_list)}

print(f"🔤 Vocabulary dibuat dengan ukuran: {len(vocab_list)}")

# Split data
full_dataset = Dataset.from_pandas(df)
dataset_split = full_dataset.train_test_split(test_size=0.1, seed=42)
raw_datasets = DatasetDict({
    'train': dataset_split['train'],
    'test': dataset_split['test']
})
print(f"\n📈 Training samples: {len(raw_datasets['train']):,}")
print(f"📉 Testing samples:  {len(raw_datasets['test']):,}")

# Membersihkan memori
del df, all_text, full_dataset, dataset_split
gc.collect()

📂 Memuat sampel data yang lebih besar...
👍 Menggunakan 137 sampel audio yang valid.
🔤 Vocabulary dibuat dengan ukuran: 35

📈 Training samples: 123
📉 Testing samples:  14


152

In [30]:
# Cell 5: Preprocessing (Audio ke Spectrogram)
mel_spectrogram_transform = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128)

def preprocess_function(batch):
    try:
        speech, sr = torchaudio.load(batch["audio_path"])
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            speech = resampler(speech)

        # Konversi ke mono
        if speech.shape[0] > 1:
            speech = torch.mean(speech, dim=0, keepdim=True)

        mel_spec = mel_spectrogram_transform(speech)
        batch["input_features"] = mel_spec.squeeze(0).T

        text = batch["sentence"].lower()
        batch["labels"] = [char_to_int[char] for char in text]
        return batch
    except Exception as e:
        return None

print("🔄 Menerapkan preprocessing...")
processed_datasets = raw_datasets.map(
    preprocess_function,
    remove_columns=raw_datasets["train"].column_names
).filter(lambda x: x is not None)
print("✅ Preprocessing selesai!")

🔄 Menerapkan preprocessing...


Map:   0%|          | 0/123 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Filter:   0%|          | 0/123 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14 [00:00<?, ? examples/s]

✅ Preprocessing selesai!


In [31]:
# Cell 6: Mendefinisikan Model CNN-RNN yang Ditingkatkan

# [PERBAIKAN LANGKAH 2] Arsitektur model yang lebih solid
class SpeechRecognitionModel(nn.Module):
    def __init__(self, rnn_dim, n_class, n_feats, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()

        # Lapisan CNN untuk ekstraksi fitur
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

        # Menghitung ukuran input untuk RNN setelah melewati CNN
        # n_feats_after_cnn = n_feats // 4 (karena 2x stride=2)
        rnn_input_size = 64 * (n_feats // 4)

        # Lapisan RNN (GRU) untuk memproses sekuens waktu
        self.rnn = nn.GRU(
            input_size=rnn_input_size,
            hidden_size=rnn_dim,
            num_layers=3, # Lebih banyak lapisan
            batch_first=True,
            bidirectional=True,
            dropout=dropout
        )

        # Klasifikasi ke vocabulary
        self.classifier = nn.Linear(rnn_dim * 2, n_class)
        self.ctc_loss = nn.CTCLoss(blank=char_to_int[ctc_blank_token], zero_infinity=True)

    def forward(self, input_values, labels=None, **kwargs):
        # input_values shape: (batch, time, freq)
        x = input_values.unsqueeze(1) # -> (batch, 1, time, freq)
        x = self.cnn(x) # -> (batch, channels, time_reduced, freq_reduced)

        b, c, t, f = x.size()
        x = x.permute(0, 2, 1, 3) # -> (batch, time_reduced, channels, freq_reduced)
        x = x.reshape(b, t, c * f) # -> (batch, time_reduced, channels * freq_reduced)

        x, _ = self.rnn(x)
        x = self.classifier(x)
        log_probs = nn.functional.log_softmax(x, dim=2)

        loss = None
        if labels is not None:
            input_lengths = torch.full(size=(b,), fill_value=log_probs.size(1), dtype=torch.long)
            labels_mask = labels >= 0
            label_lengths = labels_mask.sum(dim=1)

            loss_labels = labels.clone()
            loss_labels[loss_labels == -100] = char_to_int[pad_token]

            loss = self.ctc_loss(log_probs.permute(1, 0, 2), loss_labels, input_lengths, label_lengths)

        return (loss, log_probs) if loss is not None else (None, log_probs)

# Inisialisasi model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SpeechRecognitionModel(
    rnn_dim=512,
    n_class=len(char_to_int),
    n_feats=128, # n_mels dari spectrogram
    dropout=0.1
).to(device)

print("✅ Model CNN-RNN yang ditingkatkan berhasil dibuat!")
print(model)

✅ Model CNN-RNN yang ditingkatkan berhasil dibuat!
SpeechRecognitionModel(
  (cnn): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.1, inplace=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.1, inplace=False)
  )
  (rnn): GRU(2048, 512, num_layers=3, batch_first=True, dropout=0.1, bidirectional=True)
  (classifier): Linear(in_features=1024, out_features=35, bias=True)
  (ctc_loss): CTCLoss()
)


In [33]:
# Cell 7: Data Collator & Evaluation Metric
@dataclass
class DataCollatorSpeech:
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": f["input_features"]} for f in features]
        labels = [{"labels": f["labels"]} for f in features]

        padded_inputs = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(f["input_features"]) for f in features],
            batch_first=True, padding_value=0.0
        )

        padded_labels = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(l["labels"]) for l in labels],
            batch_first=True, padding_value=-100
        )

        return {"input_values": padded_inputs, "labels": padded_labels}

data_collator = DataCollatorSpeech()

def compute_metrics(pred):
    pred_log_probs = pred.predictions
    pred_ids = np.argmax(pred_log_probs, axis=-1)

    pred.label_ids[pred.label_ids == -100] = char_to_int[pad_token]

    decoded_preds = []
    for ids in pred_ids:
        text = ""
        for i, token_id in enumerate(ids):
            if token_id == char_to_int[ctc_blank_token]: continue
            if i > 0 and token_id == ids[i-1]: continue
            text += int_to_char.get(token_id, "")
        decoded_preds.append(text)

    decoded_labels = []
    for label_ids in pred.label_ids:
        filtered_ids = [l for l in label_ids if l != char_to_int[pad_token]]
        decoded_labels.append("".join([int_to_char.get(l, "") for l in filtered_ids]))

    wer_score = wer(decoded_labels, decoded_preds)
    return {"wer": wer_score}

print("✅ Data collator dan fungsi evaluasi siap!")

✅ Data collator dan fungsi evaluasi siap!


In [36]:
# Cell 8: Training Arguments yang Dioptimalkan (Perbaikan)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    fp16=True,
    learning_rate=1e-4,
    warmup_steps=500,
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    report_to="none",
    remove_unused_columns=False,
)

print("✅ Training arguments siap!")

✅ Training arguments siap!


In [38]:
# Cell 9: Initialize and Run Trainer (Perbaikan)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs = model(**inputs)
        loss = outputs[0]
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["test"],
    compute_metrics=compute_metrics,
)

print("\n🚀 Memulai fine-tuning model CNN-RNN yang ditingkatkan...")
# Proses ini akan memakan waktu LAMA (beberapa jam)
trainer.train()
print("\n✅ Training selesai!")
trainer.save_model(OUTPUT_DIR)


🚀 Memulai fine-tuning model CNN-RNN yang ditingkatkan...


Epoch,Training Loss,Validation Loss,Wer
1,17.3605,8.914878,1.0
2,17.4169,8.896986,1.0
3,17.1003,8.863618,1.0
4,17.4811,8.815743,1.0
5,16.8307,8.752847,1.0
6,16.7476,8.673981,1.0
7,16.4441,8.573668,1.0
8,16.727,8.435616,1.0
9,15.6929,8.286146,1.0
10,15.8812,8.092406,1.0



✅ Training selesai!


In [39]:
# Cell 10: Evaluasi Final
print("\n📊 Melakukan evaluasi final...")
eval_results = trainer.evaluate()
final_wer = eval_results.get('eval_wer', 0)

print("\n🎯 HASIL EVALUASI AKHIR:")
print(f"   - Word Error Rate (WER): {final_wer:.4f} ({final_wer*100:.2f}%)")
if final_wer < 0.3:
    print("   - 🎉 Selamat! Target WER < 30% tercapai.")
else:
    print("   - ⚠️ Target WER < 30% mungkin belum tercapai. Coba tambah data atau epoch training.")


📊 Melakukan evaluasi final...



🎯 HASIL EVALUASI AKHIR:
   - Word Error Rate (WER): 1.0000 (100.00%)
   - ⚠️ Target WER < 30% mungkin belum tercapai. Coba tambah data atau epoch training.


In [40]:
# Cell 11: Pengujian Transkripsi pada Sampel Acak

print("\n🎤 MENGUJI FUNGSI TRANSKRIPSI PADA SAMPEL ACAK")

def transcribe_audio(audio_path, model, device):
    """Fungsi untuk mentranskripsi satu file audio."""
    model.eval() # Set model ke mode evaluasi

    # Preprocessing audio sama seperti saat training
    try:
        speech, sr = torchaudio.load(audio_path)
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            speech = resampler(speech)
        if speech.shape[0] > 1:
            speech = torch.mean(speech, dim=0, keepdim=True)

        mel_spec = mel_spectrogram_transform(speech).squeeze(0).T
        mel_spec = mel_spec.to(device)

        # Lakukan prediksi
        with torch.no_grad():
            _, log_probs = model(mel_spec.unsqueeze(0))

        pred_ids = torch.argmax(log_probs, dim=-1).squeeze().tolist()

        # Decode hasil prediksi
        decoded_text = ""
        for i, token_id in enumerate(pred_ids):
            if token_id == char_to_int[ctc_blank_token]:
                continue
            if i > 0 and token_id == pred_ids[i-1]:
                continue
            decoded_text += int_to_char.get(token_id, "")

        return decoded_text

    except Exception as e:
        return f"Error processing audio: {e}"

# Pilih satu sampel acak dari test set
test_sample = raw_datasets['test'].shuffle(seed=42)[0]
audio_path = test_sample['audio_path']

# Panggil fungsi transkripsi
predicted_text = transcribe_audio(audio_path, model, device)

print(f"\n📝 Teks Asli: {test_sample['sentence']}")
print(f"🤖 Prediksi  : {predicted_text}")

# Putar audio sampel untuk didengarkan
from IPython.display import Audio as IPythonAudio
display(IPythonAudio(audio_path))


🎤 MENGUJI FUNGSI TRANSKRIPSI PADA SAMPEL ACAK

📝 Teks Asli: Like most towns, Creedmoor has a variety of media.
🤖 Prediksi  : ?syednjqdj?ededq dedeiye?mjcd"
