<a href="https://colab.research.google.com/github/yezzzzin/object-detectinon/blob/main/data_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import scipy.io.wavfile as wav
from keras.layers import LSTM, Bidirectional, Dense, TimeDistributed
from keras.models import Sequential
from scipy.fft import dct

def compute_mfcc(audio_signal, sample_rate=44100, frame_size=0.025, frame_stride=0.01, num_mfcc=13, nfilt=26, nfft=512):
    """Compute MFCC features for an audio signal.

    The signal is cut into overlapping frames; each frame is pre-emphasized,
    Hamming-windowed and turned into a power spectrum, which is projected onto
    a mel filterbank, log-compressed and decorrelated with a type-II DCT.

    Args:
        audio_signal: Sequence of samples. A multi-channel array laid out as
            (n_samples, n_channels) is mixed down to one channel by averaging.
        sample_rate: Sampling rate in Hz.
        frame_size: Frame duration in seconds.
        frame_stride: Hop between consecutive frame starts, in seconds.
        num_mfcc: Number of cepstral coefficients kept per frame. DCT
            coefficient 0 is skipped, so coefficients 1..num_mfcc are returned.
        nfilt: Number of mel filters.
        nfft: FFT size. If it is smaller than the frame length it is raised to
            the next power of two, so that frames are zero-padded rather than
            truncated by the FFT.

    Returns:
        Array of shape (num_frames, min(num_mfcc, nfilt - 1)).

    Raises:
        ValueError: If the signal is empty, or if the frame length or frame
            step rounds to less than one sample.
    """
    signal = np.asarray(audio_signal, dtype=np.float64)
    if signal.size == 0:
        raise ValueError("audio_signal must contain at least one sample")
    if signal.ndim > 1:
        # np.append would flatten (and thereby interleave) the channels, so
        # collapse everything after the sample axis into a single channel.
        signal = signal.reshape(len(signal), -1).mean(axis=1)
    signal_length = len(signal)

    # Step 1: frame segmentation
    frame_length = int(round(frame_size * sample_rate))
    frame_step = int(round(frame_stride * sample_rate))
    if frame_length < 1 or frame_step < 1:
        raise ValueError("frame_size and frame_stride must each span at least one sample")

    # Always produce at least one frame, and include the final partial frame.
    num_frames = 1 + int(np.ceil(max(signal_length - frame_length, 0) / frame_step))

    # Zero-pad so that the last frame is complete.
    pad_signal_length = (num_frames - 1) * frame_step + frame_length
    pad_signal = np.append(signal, np.zeros(pad_signal_length - signal_length))

    # Row i of `indices` holds the sample positions belonging to frame i.
    frame_starts = np.arange(num_frames) * frame_step
    indices = frame_starts[:, None] + np.arange(frame_length)[None, :]
    frames = pad_signal[indices]

    # Pre-emphasis within each frame: the first sample is kept unchanged.
    emphasized = np.empty_like(frames)
    emphasized[:, 0] = frames[:, 0]
    emphasized[:, 1:] = frames[:, 1:] - 0.97 * frames[:, :-1]

    # Hamming window, broadcast across all frames.
    windowed = emphasized * np.hamming(frame_length)

    # An FFT shorter than the frame would discard the tail of every frame.
    if nfft < frame_length:
        nfft = 1 << (frame_length - 1).bit_length()

    # rfft yields nfft // 2 + 1 bins, matching the filterbank's column count.
    magnitude_spectrum = np.abs(np.fft.rfft(windowed, n=nfft, axis=1))
    power_spectrum = (1.0 / nfft) * (magnitude_spectrum ** 2)

    # The filterbank depends only on the parameters, so build it once.
    mel_filters = get_mel_filterbank(nfilt, nfft, sample_rate)
    mel_spectrum = np.dot(power_spectrum, mel_filters.T)

    # The small offset keeps log() finite for empty filter outputs.
    log_mel_spectrum = np.log(mel_spectrum + 1e-10)

    # DCT along the filter axis; drop coefficient 0 and keep the next num_mfcc.
    return dct(log_mel_spectrum, type=2, axis=1, norm='ortho')[:, 1 : (num_mfcc + 1)]

def get_mel_filterbank(nfilt=26, nfft=512, sample_rate=44100, low_freq=0, high_freq=None):
    """Build a bank of triangular filters spaced evenly on the mel scale.

    Args:
        nfilt: Number of filters.
        nfft: FFT size; each filter has nfft // 2 + 1 weights.
        sample_rate: Sampling rate in Hz.
        low_freq: Lower edge of the first filter in Hz.
        high_freq: Upper edge of the last filter in Hz. None (or 0) selects
            sample_rate // 2.

    Returns:
        Array of shape (nfilt, nfft // 2 + 1). Row m rises linearly from 0 at
        its left edge bin towards 1 at its centre bin and falls back to 0 at
        its right edge bin.

    Raises:
        ValueError: If the band edges do not satisfy
            0 <= low_freq < high_freq <= sample_rate / 2.
    """
    high_freq = high_freq or sample_rate // 2
    if not 0 <= low_freq < high_freq <= sample_rate / 2:
        raise ValueError("expected 0 <= low_freq < high_freq <= sample_rate / 2")

    # nfilt filters need nfilt + 2 edge points, equally spaced in mel.
    mel_points = np.linspace(hz_to_mel(low_freq), hz_to_mel(high_freq), nfilt + 2)
    hz_points = mel_to_hz(mel_points)
    bin_points = np.floor((nfft + 1) * hz_points / sample_rate).astype(int)
    filterbank = np.zeros((nfilt, nfft // 2 + 1))

    for i in range(1, nfilt + 1):
        left, center, right = bin_points[i - 1], bin_points[i], bin_points[i + 1]
        # Adjacent edge points can land on the same FFT bin when filters are
        # narrow; the guards skip those empty (zero-width) slopes.
        if center > left:
            rising_bins = np.arange(left, center)
            filterbank[i - 1, left:center] = (rising_bins - left) / (center - left)
        if right > center:
            falling_bins = np.arange(center, right)
            filterbank[i - 1, center:right] = (right - falling_bins) / (right - center)

    return filterbank

def hz_to_mel(hz):
    """Convert a frequency in Hz (scalar or array) to the mel scale."""
    ratio = hz / 700
    return np.log10(ratio + 1) * 2595

def mel_to_hz(mel):
    """Convert a mel-scale value (scalar or array) back to a frequency in Hz."""
    exponent = mel / 2595
    return (10 ** exponent - 1) * 700

# Read multiple WAV files and extract MFCC features from each one.
file_paths = ["audio1.wav", "audio2.wav", "audio3.wav"]

mfccs_list = []  # one MFCC result per WAV file, in file_paths order

for file_path in file_paths:
    # wav.read returns the file's sample rate together with its sample data;
    # the rate is forwarded so framing uses the file's own sampling rate.
    sample_rate, audio_signal = wav.read(file_path)
    mfccs = compute_mfcc(audio_signal, sample_rate)
    mfccs_list.append(mfccs)

# Convert the MFCCs to a numpy array to feed the BiLSTM.
# NOTE(review): stacking into one regular array presumably requires every
# file to yield an MFCC array of the same shape (i.e. equal-length
# recordings) - confirm, or pad/truncate before this step.
X_train = np.array(mfccs_list)

# Define the BiLSTM model: a bidirectional LSTM that returns its full output
# sequence, followed by a sigmoid Dense unit applied at every time step.
model = Sequential()
# input_shape is taken from axes 1 and 2 of X_train, so X_train must be 3-D.
model.add(Bidirectional(LSTM(units=64, return_sequences=True), input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(TimeDistributed(Dense(1, activation='sigmoid')))

# Compile and train the model.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# NOTE(review): y_train is not defined anywhere in the visible source - it
# must be created (presumably in another notebook cell) before this call.
model.fit(X_train, y_train, epochs=10, batch_size=32)
