In [1]:
!pip install tensorflow



In [2]:
import numpy as np
import pandas as pd
import os
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GlobalAveragePooling2D
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

In [3]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"jonathanwiii","key":"93883db93c24411e400761c37ea66b7d"}'}

In [4]:
!pip install kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json



In [None]:
!kaggle datasets download -d sandeep16064/1000-male-voice-samples
!unzip 1000-male-voice-samples.zip -d '/content/malevoice'
!kaggle datasets download -d sandeep16064/2000-voice-samples
!unzip 2000-voice-samples.zip -d '/content/femalevoice'

In [18]:
def extract_audio_features(file_path):
    audio, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20)
    features = np.mean(mfccs.T, axis=0)
    return features

In [19]:
data = pd.DataFrame(columns=['features', 'label'])

male_folder = '/content/malevoice'

female_folder = '/content/femalevoice'


all samples

In [20]:
for file in os.listdir(male_folder):
    file_path = os.path.join(male_folder, file)
    features = extract_audio_features(file_path)
    data = pd.concat([data, pd.DataFrame({'features': [features], 'label': [0]})], ignore_index=True)

# Extract features from female voice files
for file in os.listdir(female_folder):
    file_path = os.path.join(female_folder, file)
    features = extract_audio_features(file_path)
    data = pd.concat([data, pd.DataFrame({'features': [features], 'label': [1]})], ignore_index=True)

In [21]:
X = np.array(data['features'].tolist())
y = np.array(data['label'].tolist())

# Convert labels to categorical
y = to_categorical(y)

# Split the data into training and testing sets (80% train, 20% test)
split_index = int(len(X) * 0.7)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

In [22]:
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# SOTA

In [23]:
def create_lstm_model(input_shape, num_classes):
    model = Sequential()
    model.add(LSTM(units=100, activation='tanh', input_shape=input_shape))
    model.add(Dense(units=num_classes, activation='softmax'))

    return model

In [24]:
input_shape = (X_train.shape[1], 1)
num_classes = len(np.unique(y_train))
model_lstm = create_lstm_model(input_shape, num_classes)

In [25]:
model_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 100)               40800     
                                                                 
 dense_2 (Dense)             (None, 2)                 202       
                                                                 
Total params: 41002 (160.16 KB)
Trainable params: 41002 (160.16 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [26]:
model_lstm.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model_lstm.fit(X_train, y_train, batch_size=16, epochs=25, validation_data=(X_test, y_test))

train_loss_lstm, train_accuracy_lstm = model_lstm.evaluate(X_train, y_train, verbose=0)
test_loss_lstm, test_accuracy_lstm = model_lstm.evaluate(X_test, y_test, verbose=0)
print(f"Training Accuracy: {train_accuracy_lstm:.4f}")
print(f"Training Loss: {train_loss_lstm:.4f}")
print(f"Validation Accuracy: {test_accuracy_lstm:.4f}")
print(f"Validation Loss: {test_loss_lstm:.4f}")

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Training Accuracy: 0.9959
Training Loss: 0.0104
Validation Accuracy: 0.9981
Validation Loss: 0.0021


# Proposed

In [27]:
from tensorflow.keras.optimizers import SGD


In [28]:
model_proposed = Sequential()
model_proposed.add(LSTM(units=128, input_shape=(X_train.shape[1], X_train.shape[2])))
model_proposed.add(Dense(units=2, activation='softmax'))

model_proposed.compile(optimizer=SGD(), loss='categorical_crossentropy', metrics=['accuracy'])
model_proposed.fit(X_train, y_train, epochs=25, batch_size=16, validation_data=(X_test, y_test))

train_loss_proposed, train_accuracy_proposed = model_proposed.evaluate(X_train, y_train, verbose=0)
test_loss_proposed, test_accuracy_proposed = model_proposed.evaluate(X_test, y_test, verbose=0)
print(f"Training Accuracy: {train_accuracy_proposed:.4f}")
print(f"Training Loss: {train_loss_proposed:.4f}")
print(f"Validation Accuracy: {test_accuracy_proposed:.4f}")
print(f"Validation Loss: {test_loss_proposed:.4f}")

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Training Accuracy: 0.9860
Training Loss: 0.0486
Validation Accuracy: 0.9885
Validation Loss: 0.0516


#Test model

In [29]:
from IPython.display import Audio
Audio("/content/malevoice/indianmale (100).wav")

In [30]:
model_proposed.save('proposed_model.h5')

new_audio_file_path = '/content/malevoice/indianmale (100).wav'
features = extract_audio_features(new_audio_file_path)
features = features.reshape(1, features.shape[0], 1)

predictions = model_proposed.predict(features)

predicted_label = np.argmax(predictions)

if predicted_label == 0:
    gender = 'male'
else:
    gender = 'female'

print(f"Predicted Gender: {gender}")

  saving_api.save_model(


Predicted Gender: male
