In [157]:
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, LayerNormalization, Input, Permute, Lambda
from tensorflow.keras.models import Model

In [158]:
# Extract MFCC features from a single audio file.
def extract_features(file_path, max_pad_len=100, n_mfcc=14):
    """Load an audio file and return a fixed-width MFCC matrix.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        max_pad_len: Number of columns (frames) in the returned matrix.
            Shorter results are zero-padded on the right, longer ones are
            truncated.
        n_mfcc: Number of MFCC coefficients (rows) to compute. Defaults to
            14, the value this notebook has always used.

    Returns:
        Array of shape ``(n_mfcc, max_pad_len)``. If anything goes wrong
        while processing the file, the error is printed and an all-zero
        array of that same shape is returned so batch extraction can go on.
    """
    try:
        # sr=None: do not force a target sampling rate when loading.
        y, sr = librosa.load(file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        pad_width = max_pad_len - mfccs.shape[1]
        if pad_width > 0:
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_pad_len]
        return mfccs
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        # The fallback must have the same row count as a successful result,
        # so it is derived from n_mfcc instead of a second hard-coded 14.
        return np.zeros((n_mfcc, max_pad_len))

In [159]:
def load_data(audio_folder):
    """Collect ``.wav`` file paths and their folder-derived labels.

    Every sub-directory of ``audio_folder`` is treated as one class: its name
    becomes the label of each ``.wav`` file located directly inside it.
    Entries of ``audio_folder`` that are not directories are ignored.

    Both directory listings are sorted because ``os.listdir`` returns entries
    in arbitrary order; sorting makes the sample order reproducible between
    runs and machines.

    Args:
        audio_folder: Directory containing one sub-directory per class.

    Returns:
        ``(filenames, labels)``: two parallel lists holding the full path of
        each ``.wav`` file and the name of the sub-directory it was found in.
    """
    filenames = []
    labels = []
    for emotion in sorted(os.listdir(audio_folder)):
        emotion_folder = os.path.join(audio_folder, emotion)
        if not os.path.isdir(emotion_folder):
            continue
        for filename in sorted(os.listdir(emotion_folder)):
            if filename.endswith(".wav"):
                filenames.append(os.path.join(emotion_folder, filename))
                labels.append(emotion)
    return filenames, labels



In [160]:
# Build the sinusoidal positional-encoding table.
def positional_encoding(position, d_model):
    """Return a ``(position, d_model)`` float32 tensor of sin/cos encodings.

    Each entry starts as ``pos / 10000 ** (2 * (i // 2) / d_model)``; the
    sine is then applied to even columns and the cosine to odd columns.

    Args:
        position: Number of rows (positions) in the table.
        d_model: Number of columns (encoding dimensions).

    Returns:
        A ``tf.float32`` tensor holding the encoding table.
    """
    row_index = np.arange(position)[:, np.newaxis]
    col_index = np.arange(d_model)[np.newaxis, :]
    exponent = (2 * (col_index // 2)) / np.float32(d_model)
    table = row_index / np.power(10000, exponent)

    # Overwrite in place: sine on even columns, cosine on odd columns.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    return tf.cast(table, dtype=tf.float32)

def multi_head_attention(query, key, value, num_heads):
    """Scaled dot-product multi-head attention assembled from Keras layers.

    Each of ``query``, ``key`` and ``value`` goes through its own
    ``Dense(d_model)`` projection, is split into ``num_heads`` heads of size
    ``d_model // num_heads``, and attention is computed per head as
    ``softmax(Q K^T / sqrt(depth)) V``. The heads are then merged back into a
    last axis of size ``d_model``. No further projection is applied after
    merging.

    Args:
        query, key, value: Keras tensors; ``d_model`` is read from the static
            last dimension of ``query``.
            # assumes all three are (batch, seq_len, d_model) - TODO confirm
        num_heads: Number of attention heads; must divide ``d_model``.

    Returns:
        Keras tensor reshaped to ``(batch, -1, d_model)``.

    Raises:
        ValueError: If the last dimension of ``query`` is unknown, or if it
            is not divisible by ``num_heads``.
    """
    d_model = query.shape[-1]  # static feature size of the query tensor

    if d_model is None:
        raise ValueError("d_model ต้องถูกกำหนดค่า (ไม่ใช่ None)")

    if d_model % num_heads != 0:
        raise ValueError(f"d_model ({d_model}) ต้องหาร num_heads ({num_heads}) ลงตัว")

    depth = d_model // num_heads

    query = Dense(d_model)(query)
    key = Dense(d_model)(key)
    value = Dense(d_model)(value)

    # Split the feature axis into heads: (batch, -1, num_heads, depth).
    # The batch size is read dynamically inside the Lambda, so no separate
    # batch-size tensor is needed (the previously unused one was removed).
    reshape_fn = Lambda(lambda x: tf.reshape(x, (tf.shape(x)[0], -1, num_heads, depth)))
    query = reshape_fn(query)
    key = reshape_fn(key)
    value = reshape_fn(value)

    # Swap the first two non-batch axes so the head axis comes first.
    query = Permute((2, 1, 3))(query)
    key = Permute((2, 1, 3))(key)
    value = Permute((2, 1, 3))(value)

    # Attention: scores = Q K^T / sqrt(depth); weights = softmax over the last axis.
    attention_scores = Lambda(lambda x: tf.matmul(x[0], x[1], transpose_b=True))([query, key])
    attention_scores = Lambda(lambda x: x / tf.math.sqrt(tf.cast(depth, tf.float32)))(attention_scores)
    attention_weights = Lambda(lambda x: tf.nn.softmax(x, axis=-1))(attention_scores)
    attention_output = Lambda(lambda x: tf.matmul(x[0], x[1]))([attention_weights, value])

    # Undo the axis swap and merge the heads back into a d_model-sized axis.
    attention_output = Permute((2, 1, 3))(attention_output)
    attention_output = Lambda(lambda x: tf.reshape(x, (tf.shape(x)[0], -1, d_model)))(attention_output)

    return attention_output




In [161]:
def transformer_block(inputs, num_heads, dff, d_model, dropout_rate=0.1):
    """Apply one encoder block: self-attention, then a two-layer feed-forward net.

    Both sub-layers are followed by dropout, a residual addition and layer
    normalization. The tensor shape is printed at three points for debugging.

    Args:
        inputs: Tensor passed as query, key and value to
            ``multi_head_attention``.
        num_heads: Number of attention heads.
        dff: Width of the hidden ReLU layer in the feed-forward part.
        d_model: Output width of the feed-forward part.
        dropout_rate: Rate used by both ``Dropout`` layers.

    Returns:
        The layer-normalized output of the feed-forward sub-layer.
    """
    print(f'Input shape before attention: {inputs.shape}')  # shape on entry
    # A rank-2 tensor gets a leading axis of size 1 prepended.
    # NOTE(review): Keras symbolic inputs normally already carry a batch axis,
    # so this branch presumably never fires in this notebook - confirm.
    if len(inputs.shape) == 2:
        inputs = tf.expand_dims(inputs, axis=0)

    print(f'Input shape after expanding dimensions: {inputs.shape}')  # shape after the rank check
    # Self-attention: the same tensor serves as query, key and value.
    attn = multi_head_attention(inputs, inputs, inputs, num_heads)
    print(f'Attention output shape: {attn.shape}')  # shape after attention

    attn = Dropout(dropout_rate)(attn)
    attn_norm = LayerNormalization(epsilon=1e-6)
    attn_block = attn_norm(inputs + attn)  # residual connection around attention

    hidden = Dense(dff, activation='relu')(attn_block)
    hidden = Dense(d_model)(hidden)
    hidden = Dropout(dropout_rate)(hidden)
    ffn_norm = LayerNormalization(epsilon=1e-6)
    return ffn_norm(attn_block + hidden)  # residual connection around the feed-forward net


In [162]:
def extract_features_from_all(filenames, max_pad_len=100):
    """Run ``extract_features`` on every path and stack the results.

    Args:
        filenames: Iterable of audio file paths.
        max_pad_len: Forwarded to ``extract_features`` for each file.

    Returns:
        ``np.ndarray`` built from the per-file feature matrices, in the same
        order as ``filenames``.
    """
    return np.array([extract_features(path, max_pad_len) for path in filenames])


In [163]:
def create_transformer_model(input_shape, num_classes, num_heads=4, dff=256, d_model=128, num_layers=2, dropout_rate=0.1):
    """Build and compile the transformer classifier.

    The input is projected to ``d_model`` features with a ``Dense`` layer, a
    positional encoding of length ``input_shape[0]`` is added, and the result
    runs through ``num_layers`` transformer blocks. The sequence is then
    average-pooled, passed through dropout and classified by a softmax layer.

    Args:
        input_shape: Per-sample input shape; ``input_shape[0]`` is used as the
            number of positions for the positional encoding.
        num_classes: Number of units in the softmax output layer.
        num_heads: Attention heads per transformer block.
        dff: Hidden width of each block's feed-forward part.
        d_model: Feature width used throughout the blocks.
        num_layers: Number of stacked transformer blocks.
        dropout_rate: Dropout rate inside the blocks and before the output.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer, categorical
        cross-entropy loss and the accuracy metric.
    """
    model_input = Input(shape=input_shape)
    encoding = positional_encoding(input_shape[0], d_model)
    hidden = Dense(d_model)(model_input)

    # Add the positional encoding to the projected input.
    hidden = hidden + encoding
    print(f'Input shape after adding positional encoding: {hidden.shape}')  # debug: shape after the addition

    for _block in range(num_layers):
        hidden = transformer_block(hidden, num_heads, dff, d_model, dropout_rate)

    pooled = tf.keras.layers.GlobalAveragePooling1D()(hidden)
    pooled = Dropout(dropout_rate)(pooled)
    class_probs = Dense(num_classes, activation='softmax')(pooled)

    model = Model(model_input, class_probs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [164]:
# Dataset locations. These are machine-specific absolute paths; adjust them
# for your environment.
train_folder = '/Users/gam/Desktop/DEEP/Woekshop#3/DATASET/Dataset_2/train'
val_folder = '/Users/gam/Desktop/DEEP/Woekshop#3/DATASET/Dataset_2/val'
test_folder = '/Users/gam/Desktop/DEEP/Woekshop#3/DATASET/Dataset_2/test'

# Collect file paths and folder-derived labels for each split.
train_filenames, train_labels = load_data(train_folder)
test_filenames, test_labels = load_data(test_folder)
val_filenames, val_labels = load_data(val_folder)

# Fit the label encoder on the training labels only, then reuse it for the
# test and validation labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(train_labels)
y_test_encoded = label_encoder.transform(test_labels)
y_val_encoded = label_encoder.transform(val_labels)

# One-hot encode with an explicit class count. Without num_classes,
# to_categorical infers the width from the largest label present, so a split
# that lacks the last class would get fewer columns than the model outputs.
num_label_classes = len(label_encoder.classes_)
y_train_encoded = to_categorical(y_train_encoded, num_classes=num_label_classes)
y_test_encoded = to_categorical(y_test_encoded, num_classes=num_label_classes)
y_val_encoded = to_categorical(y_val_encoded, num_classes=num_label_classes)

# Extract MFCC feature matrices for every file in each split.
X_train_features = extract_features_from_all(train_filenames)
X_test_features = extract_features_from_all(test_filenames)
X_val_features = extract_features_from_all(val_filenames)

# The feature arrays are fed to the model as-is; no extra dimension is added.
X_test = X_test_features
X_train = X_train_features
X_val = X_val_features


In [None]:
n_classes = len(label_encoder.classes_)

# A NumPy shape never contains None, so check what can actually go wrong:
# np.array() over an empty feature list (no .wav files found) is 1-D, and
# indexing shape[1] / shape[2] would then fail with an opaque IndexError.
if X_train.ndim != 3:
    raise ValueError(
        f"X_train must be a 3-D array (samples, dim1, dim2); got shape {X_train.shape}"
    )

# Per-sample input shape: everything after the sample axis.
input_shape = (X_train.shape[1], X_train.shape[2])
print(f"Input Shape: {input_shape}")  # Debugging

model = create_transformer_model(input_shape, n_classes)

In [None]:
# Early stopping watches val_loss with a patience of 3 and restores the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Train for up to 20 epochs in batches of 50, validating on the val split.
history = model.fit(
    X_train,
    y_train_encoded,
    epochs=20,
    batch_size=50,
    validation_data=(X_val, y_val_encoded),
    callbacks=[early_stop],
)

In [None]:
# Training vs. validation curves side by side: loss on the left, accuracy on the right.
plt.figure(figsize=(12, 6))
for panel, (metric_key, metric_name) in enumerate((('loss', 'Loss'), ('accuracy', 'Accuracy')), start=1):
    plt.subplot(1, 2, panel)
    plt.plot(history.history[metric_key], label=f'Training {metric_name}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_name}')
    plt.title(f'Training and Validation {metric_name}')
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Evaluate on the test split; the model was compiled with a single metric
# (accuracy), so the result unpacks into exactly two values.
(test_loss, test_acc) = model.evaluate(
    X_test,
    y_test_encoded,
)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_acc}")

In [None]:
# Predict on the test data.
predictions = model.predict(X_test)
predicted_classes = predictions.argmax(axis=1)  # index of the highest-scoring class per sample
true_classes = y_test_encoded.argmax(axis=1)  # recover class indices from the one-hot labels

# Prediction accuracy: matching predictions divided by the number of samples.
accuracy = (predicted_classes == true_classes).sum() / len(true_classes)
print(f"Accuracy: {accuracy*100:.2f}%")

In [None]:
class_labels = label_encoder.classes_

# Pin the label set so the matrix always has one row and column per known
# class. Without `labels`, a class absent from both the true and predicted
# labels is dropped and the matrix no longer lines up with the tick labels.
cm = confusion_matrix(true_classes, predicted_classes, labels=np.arange(len(class_labels)))

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# Save the trained model.
model.save('emotion_recognition_model.h5')


In [None]:
# Load an audio file and convert it to a log-scaled Mel spectrogram.
def preprocess_audio(file_path, sr=16000, n_mels=128, n_fft=400, hop_length=160, target_length=100):
    """Return a fixed-length log-Mel spectrogram with time on the first axis.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        sr: Sampling rate requested from ``librosa.load``.
        n_mels: Number of Mel bands.
        n_fft: FFT window size.
        hop_length: Hop length between frames.
        target_length: Number of frames in the result. Shorter spectrograms
            are padded on the right, longer ones are truncated. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        Array of shape ``(target_length, n_mels)``.
    """
    y, sr = librosa.load(file_path, sr=sr)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

    # Pad or truncate along the frame axis to the requested length.
    n_frames = log_mel_spec.shape[1]
    if n_frames < target_length:
        # With ref=np.max the loudest bin sits at 0 dB and everything else is
        # negative, so zero-padding would append full-energy frames. Pad with
        # the quietest level present instead.
        pad_value = log_mel_spec.min() if log_mel_spec.size else 0.0
        log_mel_spec = np.pad(
            log_mel_spec,
            ((0, 0), (0, target_length - n_frames)),
            mode='constant',
            constant_values=pad_value,
        )
    else:
        log_mel_spec = log_mel_spec[:, :target_length]

    return log_mel_spec.T  # transpose to (time, features)

# Reload the saved Transformer model.
# NOTE: safe_mode=False permits deserializing the Lambda layers' Python code;
# only load model files that come from a trusted source.
model = tf.keras.models.load_model("emotion_recognition_model.h5", safe_mode=False)

# Audio file to classify (change this to your own file path).
file_path = "C:/DeepLearnning/Gam/workshop-3/DATASET/Dataset_2/test/angry/OAF_calm_angry.wav"  # change to your own audio file path
if not os.path.isfile(file_path):
    # extract_features swallows load errors and returns zeros, which would
    # silently yield a meaningless prediction, so fail loudly here instead.
    raise FileNotFoundError(f"Audio file not found: {file_path}")

# Use the same MFCC pipeline the model was trained on. preprocess_audio
# produces a (100, 128) log-Mel matrix, which does not match the model's
# input shape taken from the MFCC training arrays.
input_data = extract_features(file_path)
input_data = np.expand_dims(input_data, axis=0)  # add the batch dimension

# Predict and pick the class with the highest score.
predictions = model.predict(input_data)
predicted_label = np.argmax(predictions, axis=1)

print("Predicted Label:", predicted_label)
# Map the class index back to its label name via the fitted encoder.
print("Predicted Emotion:", label_encoder.classes_[predicted_label])