In [3]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Input,
    Conv2D,
    MaxPooling2D,
    BatchNormalization,
    Dropout,
    Flatten,
    Dense
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib

DATA_PATH = "../preprocessed_audio_data"
MODEL_PATH = "../models/speech_model_trained.keras"

print("Loading data...")

X = np.load(os.path.join(DATA_PATH, "X.npy"))
y = np.load(os.path.join(DATA_PATH, "y.npy"))

print("X shape:", X.shape)
print("y shape:", y.shape)

print("Original y shape:", y.shape)

Loading data...
X shape: (1440, 128, 128, 1)
y shape: (1440, 8)
Original y shape: (1440, 8)


In [4]:
# If y is already one-hot encoded
if len(y.shape) == 2:
    y_labels = np.argmax(y, axis=1)
else:
    y_labels = y

label_encoder = LabelEncoder()
y_labels = label_encoder.fit_transform(y_labels)

print("y_labels shape (after fix):", y_labels.shape)

y_labels shape (after fix): (1440,)


In [5]:
# Split using 1D labels
X_train, X_val, y_train_labels, y_val_labels = train_test_split(
    X,
    y_labels,
    test_size=0.2,
    random_state=42,
    stratify=y_labels
)

# Convert back to one-hot for training
NUM_CLASSES = len(np.unique(y_labels))

y_train = to_categorical(y_train_labels, NUM_CLASSES)
y_val = to_categorical(y_val_labels, NUM_CLASSES)

print("Split successful.")
print("Train labels shape:", y_train.shape)
print("Val labels shape:", y_val.shape)

Split successful.
Train labels shape: (1152, 8)
Val labels shape: (288, 8)


In [None]:
# Model
model = Sequential([
    Input(shape=(128, 128, 1)),

    tf.keras.layers.RandomTranslation(0.1, 0.1),
    tf.keras.layers.RandomZoom(0.1),

    Conv2D(32, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2,2),
    Dropout(0.25),

    Conv2D(64, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2,2),
    Dropout(0.3),

    Conv2D(128, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2,2),
    Dropout(0.4),

    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

callbacks = [
    EarlyStopping(patience=8, restore_best_weights=True),
    ReduceLROnPlateau(patience=4)
]

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    callbacks=callbacks
)

os.makedirs("../models", exist_ok=True)

model.save(MODEL_PATH)
joblib.dump(label_encoder, "../models/label_encoder.pkl")

print("Training complete.")
print("Model saved to:", MODEL_PATH)

Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 248ms/step - accuracy: 0.1493 - loss: 6.0042 - val_accuracy: 0.1458 - val_loss: 15.5049 - learning_rate: 0.0010
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 240ms/step - accuracy: 0.1484 - loss: 2.0850 - val_accuracy: 0.1528 - val_loss: 23.2840 - learning_rate: 0.0010
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 255ms/step - accuracy: 0.1224 - loss: 2.0781 - val_accuracy: 0.1319 - val_loss: 27.4250 - learning_rate: 0.0010
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 260ms/step - accuracy: 0.1484 - loss: 2.0593 - val_accuracy: 0.1389 - val_loss: 30.0215 - learning_rate: 0.0010
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 264ms/step - accuracy: 0.1389 - loss: 2.0678 - val_accuracy: 0.1319 - val_loss: 32.4678 - learning_rate: 0.0010
Epoch 6/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m