In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
from transformers import DeiTFeatureExtractor, DeiTForImageClassification
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, LSTM, Dense, Dropout, MaxPooling1D, Flatten, BatchNormalization
from tensorflow.keras.models import Model
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Set parameters
img_height, img_width = 224, 224
num_classes = 7
batch_size = 16
epochs = 25

In [3]:
import tensorflow as tf
import numpy as np

# Define parameters
img_height = 224
img_width = 224


# Load datasets
train_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    '/kaggle/input/speechemomel/Melspectro/train',
    image_size=(img_height, img_width),
    batch_size=batch_size,
    shuffle=True
)

test_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    '/kaggle/input/speechemomel/Melspectro/test',
    image_size=(img_height, img_width),
    batch_size=batch_size,
    shuffle=False
)

# Convert TensorFlow dataset to NumPy arrays
def dataset_to_numpy(dataset):
    images = []
    labels = []
    for image_batch, label_batch in dataset:
        images.append(image_batch.numpy())  # Convert tensors to numpy
        labels.append(label_batch.numpy())

    return np.concatenate(images), np.concatenate(labels)

# Extract x_train, y_train, x_test, y_test
x_train, y_train = dataset_to_numpy(train_dataset)
x_test, y_test = dataset_to_numpy(test_dataset)

# Normalize images (optional)
x_train = x_train / 255.0
x_test = x_test / 255.0

print(f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}, y_test shape: {y_test.shape}")

Found 1963 files belonging to 7 classes.
Found 563 files belonging to 7 classes.
x_train shape: (1963, 224, 224, 3), y_train shape: (1963,)
x_test shape: (563, 224, 224, 3), y_test shape: (563,)


In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dropout, BatchNormalization, Dense, LSTM, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping

# Assuming x_train is of shape (num_samples, height, width, channels)
# Define input shape
height, width, channels = x_train.shape[1], x_train.shape[2], x_train.shape[3]
num_classes = len(set(y_train))  # Dynamically set number of classes

# Data Augmentation
datagen = ImageDataGenerator(
)

# Define the model
input_layer = Input(shape=(height, width, channels))

# CNN Feature Extractor
x = Conv2D(filters=32, kernel_size=(3, 3), activation="relu")(input_layer)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Dropout(0.3)(x)  # Increased dropout

x = Conv2D(filters=64, kernel_size=(3, 3), activation="relu")(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Dropout(0.3)(x)  # Increased dropout

# Flatten the output from CNN before passing to LSTM
x = Flatten()(x)

# Reshape the output to be compatible with LSTM
x = Reshape((1, -1))(x)  # Reshape to (batch_size, 1, features)

# LSTM Layer
x = LSTM(32, return_sequences=False)(x)  # Reduced LSTM units

# Dropout Layer
x = Dropout(0.3)(x)  # Increased dropout

# Batch Normalization
x = BatchNormalization()(x)

# Output Layer
output_layer = Dense(num_classes, activation="softmax")(x)

# Create the model
cnn_lstm_model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
cnn_lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.000001),  # Adjusted learning rate
                        loss="sparse_categorical_crossentropy",
                        metrics=["accuracy"])

# Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with data augmentation
history = cnn_lstm_model.fit(datagen.flow(x_train, y_train, batch_size=8),  # Use data generator
                              validation_data=(x_test, y_test),
                              epochs=50,
                              callbacks=[early_stopping])  # Added early stopping

# Evaluate the model
test_loss, test_acc = cnn_lstm_model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

Epoch 1/50


  self._warn_if_super_not_called()


[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 50ms/step - accuracy: 0.1617 - loss: 2.3178 - val_accuracy: 0.2771 - val_loss: 1.9061
Epoch 2/50
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - accuracy: 0.2294 - loss: 1.9315 - val_accuracy: 0.3748 - val_loss: 1.7712
Epoch 3/50
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - accuracy: 0.2958 - loss: 1.7802 - val_accuracy: 0.6146 - val_loss: 1.5207
Epoch 4/50
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - accuracy: 0.3719 - loss: 1.6456 - val_accuracy: 0.6465 - val_loss: 1.3480
Epoch 5/50
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - accuracy: 0.4483 - loss: 1.5021 - val_accuracy: 0.7282 - val_loss: 1.2159
Epoch 6/50
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - accuracy: 0.4785 - loss: 1.4309 - val_accuracy: 0.7460 - val_loss: 1.1397
Epoch 7/50
[1m246/246[0m 

In [8]:
# Save the model in .h5 format
cnn_lstm_model.save("/kaggle/working/speech_emotion_model1.h5")