In [1]:
!pip install librosa numpy matplotlib tqdm

Defaulting to user installation because normal site-packages is not writeable


In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

2025-03-07 09:38:36.868517: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-07 09:38:38.009512: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-07 09:38:38.488023: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741320519.039430    4731 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741320519.230360    4731 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-07 09:38:40.594258: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [7]:
import os
import librosa
import numpy as np
import hashlib
from tqdm import tqdm


In [3]:
dataset_path = "data"  # Folder containing 'male' and 'female' subfolders
categories = ["male", "female"]
output_path = "preprocessed_data"
os.makedirs(output_path, exist_ok=True)

sr = 22050  # Sampling rate
n_mels = 128  # Number of Mel bands
max_duration = 3  # Max duration of audio clips in seconds
max_length = sr * max_duration  # Max length in samples

In [4]:
def hash_file(file_path):
    """Generate a hash for an audio file."""
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        buf = f.read()
        hasher.update(buf)
    return hasher.hexdigest()

def remove_duplicate_files():
    """Remove duplicate audio files based on file hashes."""
    seen_hashes = set()
    for category in categories:
        category_path = os.path.join(dataset_path, category)
        for filename in tqdm(os.listdir(category_path), desc=f"Checking duplicates in {category}"):
            file_path = os.path.join(category_path, filename)
            file_hash = hash_file(file_path)
            if file_hash in seen_hashes:
                os.remove(file_path)
                print(f"Deleted duplicate: {file_path}")
            else:
                seen_hashes.add(file_hash)
    print("Duplicate removal complete!")


In [5]:
remove_duplicate_files()

Checking duplicates in male: 100%|██████████| 9841/9841 [03:41<00:00, 44.34it/s]
Checking duplicates in female: 100%|████████| 5229/5229 [01:33<00:00, 56.17it/s]

Duplicate removal complete!





In [6]:
def process_audio_files():
    data, labels = [], []
    total_files = 0

    for label, category in enumerate(categories):
        category_path = os.path.join(dataset_path, category)
        print(f"Checking folder: {category_path}")
        files = os.listdir(category_path)
        print(f"Total files found in {category}: {len(files)}")

        for filename in tqdm(files, desc=f"Processing {category}"):
            file_path = os.path.join(category_path, filename)
            try:
                y, _ = librosa.load(file_path, sr=sr, duration=max_duration)
                
                if len(y) == 0:
                    print(f"Warning: Empty file {file_path}")
                    continue

                y = librosa.util.fix_length(y, size=max_length)  
                mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)  # ✅ FIXED
                mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

                data.append(mel_spec_db)
                labels.append(label)
                total_files += 1

            except Exception as e:
                print(f"Error processing {file_path}: {e}")

    print(f"Total files successfully processed: {total_files}")

    # Convert lists to numpy arrays
    data = np.array(data)
    labels = np.array(labels)

    # Save to .npy files
    np.save(os.path.join(output_path, "data.npy"), data)
    np.save(os.path.join(output_path, "labels.npy"), labels)
    print("Preprocessing complete! Data saved.")


In [7]:
process_audio_files()

Checking folder: data/male
Total files found in male: 9841


Processing male: 100%|██████████████████████| 9841/9841 [07:09<00:00, 22.89it/s]


Checking folder: data/female
Total files found in female: 5229


Processing female: 100%|████████████████████| 5229/5229 [03:19<00:00, 26.20it/s]


Total files successfully processed: 15070
Preprocessing complete! Data saved.


In [1]:
import numpy as np

# Load the preprocessed data
data = np.load("preprocessed_data/data.npy")
labels = np.load("preprocessed_data/labels.npy")

# Print the shape of data
print(f"Data shape: {data.shape}")  # Expected: (num_samples, 128, time_steps)
print(f"Labels shape: {labels.shape}")  # Expected: (num_samples,)


Data shape: (15070, 128, 130)
Labels shape: (15070,)


In [2]:
def build_cnn_model(input_shape):
    model = keras.Sequential([
        layers.Conv2D(64, (3, 3), activation='relu', input_shape=input_shape),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(256, (3, 3), activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001), 
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    return model

In [3]:
# Reshape data for CNN
X = data[..., np.newaxis]  # Add channel dimension for CNN
y = labels

In [4]:
# Split data into train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [7]:
# Build and train the model
input_shape = (X_train.shape[1], X_train.shape[2], 1)
model = build_cnn_model(input_shape)
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-03-07 09:39:04.626820: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
2025-03-07 09:39:05.461140: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 51380224 exceeds 10% of free system memory.
2025-03-07 09:39:05.504171: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 51380224 exceeds 10% of free system memory.
2025-03-07 09:39:05.520347: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 51380224 exceeds 10% of free system memory.


In [8]:
history = model.fit(X_train, y_train, epochs=2, batch_size=16, validation_data=(X_test, y_test))

2025-03-07 09:39:38.434367: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 802447360 exceeds 10% of free system memory.


Epoch 1/2


2025-03-07 09:39:47.822183: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 51380224 exceeds 10% of free system memory.


[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1283s[0m 2s/step - accuracy: 0.9025 - loss: 0.3849 - val_accuracy: 0.9954 - val_loss: 0.0145
Epoch 2/2
[1m754/754[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1282s[0m 2s/step - accuracy: 0.9895 - loss: 0.0305 - val_accuracy: 0.9980 - val_loss: 0.0107


In [9]:
# Evaluate model
_, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 738ms/step - accuracy: 0.9980 - loss: 0.0074
Test Accuracy: 99.80%


In [10]:
model.save("preprocessed_data/gender_cnn_model.h5")



In [11]:
from tensorflow.keras.models import load_model
import numpy as np
import librosa

# Load trained model
model = load_model("preprocessed_data/gender_cnn_model.h5")  # Change path if needed

# Model summary
model.summary()




In [12]:
def preprocess_audio(file_path, sr=22050, n_mels=128, max_duration=3):
    """Load and preprocess an audio file into a Mel spectrogram for model prediction."""
    try:
        y, _ = librosa.load(file_path, sr=sr, duration=max_duration)
        y = librosa.util.fix_length(y, size=sr * max_duration)  # Pad or truncate

        # Convert to Mel spectrogram
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        # Reshape for model input (128, time_steps, 1)
        mel_spec_db = mel_spec_db[..., np.newaxis]

        return np.array([mel_spec_db])  # Add batch dimension
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


In [13]:
# List of test audio files (provide paths)
test_files = [
    "data/male/arctic_a0001.wav",
    "data/female/arctic_a0007.wav",
    "data/male/arctic_a0009.wav",
    "data/female/arctic_a0003.wav"
]

# Predict gender for each sample
for file in test_files:
    processed_audio = preprocess_audio(file)
    if processed_audio is not None:
        prediction = model.predict(processed_audio)
        gender = "Male" if prediction[0][0] < 0.5 else "Female"
        print(f"Prediction for {file}: {gender} (Score: {prediction[0][0]:.4f})")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388ms/step
Prediction for data/male/arctic_a0001.wav: Male (Score: 0.0000)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
Prediction for data/female/arctic_a0007.wav: Female (Score: 1.0000)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
Prediction for data/male/arctic_a0009.wav: Male (Score: 0.0000)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
Prediction for data/female/arctic_a0003.wav: Female (Score: 1.0000)
