In [26]:
import numpy as np
import tensorflow as tf
import tensorflow_io as tfio
import matplotlib.pyplot as plt
import pandas as pd
import io
import os
import re
import string
from tqdm import tqdm
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import (
  Embedding,Input,LSTM,Input,Conv2D,Softmax,Dropout,Dense,GRU,
  MaxPooling2D,LayerNormalization,Reshape,BatchNormalization,Bidirectional)
from tensorflow.keras.optimizers import Adam
import warnings
warnings.filterwarnings('ignore')

In [27]:
print(tf.__version__)

2.13.1


In [28]:
transcripts = pd.read_csv('Dataset/data.csv')

transcripts.head()

Unnamed: 0,path,sentence
0,common_voice_id_39599471.mp3,dia tidak sepenuhnya mempercayaiku
1,common_voice_id_32165882.mp3,aku juga tidak mengerti
2,common_voice_id_27657418.mp3,berharap kau di sini
3,common_voice_id_23033570.mp3,maha suci allah
4,common_voice_id_40088845.mp3,pada akhirnya dia memilih anak kucing yang sat...


In [29]:
# file MP3
def load_mp3(file_path, target_sample_rate=16000):
    # Baca file MP3
    audio_binary = tf.io.read_file(file_path)
    
    # Decode MP3 menjadi waveform
    waveform = tfio.audio.decode_mp3(audio_binary)
    
    # Resample 
    original_sample_rate = 44100  # sr default MP3
    if target_sample_rate != original_sample_rate:
        waveform = tfio.audio.resample(waveform, rate_in=original_sample_rate, rate_out=target_sample_rate)
    
    return waveform, target_sample_rate

In [30]:
file_path = "Dataset/clips/common_voice_id_39599471.mp3"
waveform, sample_rate = load_mp3(file_path)

In [31]:
# Fungsi untuk normalisasi waveform
def normalize_waveform(waveform):
    # Cari amplitudo maksimum
    max_amplitude = tf.reduce_max(tf.abs(waveform))
    
    # Normalisasi waveform ke rentang [-1, 1]
    normalized_waveform = waveform / max_amplitude
    return normalized_waveform

In [32]:
# menambahkan padding jika perlu
def pad_waveform(waveform, frame_length, frame_step):
    # Hitung jumlah padding yang diperlukan di dimensi waktu
    waveform_length = tf.shape(waveform)[0]
    padding_needed = frame_length - (waveform_length % frame_step)
    
    # Jika padding diperlukan, tambahkan padding di dimensi waktu
    if padding_needed != 0:
        padded_waveform = tf.pad(waveform, paddings=[[0, padding_needed]])
    else:
        padded_waveform = waveform
    
    return padded_waveform

In [33]:
# Normalisasi
normalized_waveform = normalize_waveform(waveform)

# Cetak informasi
print(f"Sample rate: {sample_rate}")
print(f"Waveform shape sebelum di normalisasi: {waveform.shape}")
print(f"Waveform shape setelah di normalisasi: {normalized_waveform.shape}")

Sample rate: 16000
Waveform shape sebelum di normalisasi: (47229, 1)
Waveform shape setelah di normalisasi: (47229, 1)


In [36]:
# spektrogram dari waveform
def create_spectrogram(waveform, frame_length=256, frame_step=128, fft_length=256):
    # Hitung STFT
    stft = tf.signal.stft(
        waveform,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length
    )
    
    # Ambil magnitudo (hilangkan fase)
    spectrogram = tf.abs(stft)
    return spectrogram

frame_length = 256  # Panjang frame untuk STFT
frame_step = 128    # Langkah antar frame

# spektrogram dari waveform yang sudah dinormalisasi
spectrogram = create_spectrogram(normalized_waveform)

print(f"Spectrogram shape: {spectrogram.shape}")

Spectrogram shape: (47229, 0, 129)
