In [1]:
import os
import torchaudio
from torchaudio.transforms import Resample

dataset_path = r"C:\Users\PMLS\Documents\Sound recordings\numbers"
target_sample_rate = 8000
data = []

for label_folder in os.listdir(dataset_path):
    folder_path = os.path.join(dataset_path, label_folder)
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.wav'):
                file_path = os.path.join(folder_path, file_name)
                waveform, sample_rate = torchaudio.load(file_path)
                
                # Resample if needed
                if sample_rate != target_sample_rate:
                    resampler = Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
                    waveform = resampler(waveform)
                
                data.append((waveform, int(label_folder)))  # label from folder
                print(f"Loaded {file_name} at 8kHz with label {label_folder}")

print(f"\nTotal samples loaded: {len(data)}")




Loaded 0_0.wav at 8kHz with label 0
Loaded 0_1.wav at 8kHz with label 0
Loaded 0_10.wav at 8kHz with label 0
Loaded 0_2.wav at 8kHz with label 0
Loaded 0_3.wav at 8kHz with label 0
Loaded 0_4.wav at 8kHz with label 0
Loaded 0_5.wav at 8kHz with label 0
Loaded 0_6.wav at 8kHz with label 0
Loaded 0_7.wav at 8kHz with label 0
Loaded 0_8.wav at 8kHz with label 0
Loaded 0_9.wav at 8kHz with label 0
Loaded 1_0.wav at 8kHz with label 1
Loaded 1_1.wav at 8kHz with label 1
Loaded 1_10.wav at 8kHz with label 1
Loaded 1_2.wav at 8kHz with label 1
Loaded 1_3.wav at 8kHz with label 1
Loaded 1_4.wav at 8kHz with label 1
Loaded 1_5.wav at 8kHz with label 1
Loaded 1_6.wav at 8kHz with label 1
Loaded 1_7.wav at 8kHz with label 1
Loaded 1_9.wav at 8kHz with label 1
Loaded 2_0.wav at 8kHz with label 2
Loaded 2_1.wav at 8kHz with label 2
Loaded 2_10.wav at 8kHz with label 2
Loaded 2_2.wav at 8kHz with label 2
Loaded 2_3.wav at 8kHz with label 2
Loaded 2_4.wav at 8kHz with label 2
Loaded 2_5.wav at 8kHz wi

## CNN Model

In [3]:
import os
import numpy as np
import cv2
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from random import shuffle
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
from keras.layers import Dense, Dropout, Flatten
from keras.models import Sequential,Model
import keras
import tensorflow as tf
from tensorflow.keras import optimizers

In [17]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load audio files and extract MFCC
dataset_path = r"C:\Users\PMLS\Documents\Sound recordings\numbers"
target_sr = 8000
n_mfcc = 13

X = []
y = []

for label_folder in os.listdir(dataset_path):
    folder_path = os.path.join(dataset_path, label_folder)
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.wav'):
                file_path = os.path.join(folder_path, file_name)
                y_audio, sr = librosa.load(file_path, sr=target_sr)
                mfcc = librosa.feature.mfcc(y=y_audio, sr=sr, n_mfcc=n_mfcc)
                mfcc_mean = np.mean(mfcc.T, axis=0)  # shape = (13,)
                X.append(mfcc_mean)
                y.append(int(label_folder))

# Convert to numpy arrays
X = np.array(X)
y = np.array(y)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [21]:
X_train.shape

(85, 13)

In [23]:
# Reshape for CNN (add channel dimension)
X_train = X_train[..., np.newaxis]  
X_test = X_test[..., np.newaxis]



# Build CNN model

In [30]:

model = Sequential([
    Conv2D(64, kernel_size=(3, 3), activation='relu', input_shape=(n_mfcc, max_pad_len, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.3),
    
    Conv2D(128, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.3),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(y_cat.shape[1], activation='softmax')
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# Evaluate
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.2f}")


Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 158ms/step - accuracy: 0.1017 - loss: 34.9289 - val_accuracy: 0.0909 - val_loss: 9.5124
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.0764 - loss: 21.8621 - val_accuracy: 0.0909 - val_loss: 5.1446
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.0861 - loss: 13.9877 - val_accuracy: 0.2273 - val_loss: 3.5674
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.1389 - loss: 8.8656 - val_accuracy: 0.3636 - val_loss: 2.5652
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.1214 - loss: 7.0298 - val_accuracy: 0.2727 - val_loss: 2.2486
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.1506 - loss: 4.8666 - val_accuracy: 0.0909 - val_loss: 2.3354
Epoch 7/100
[1m3/3[0m [32m━━━━━━━

## Change parameters 

In [32]:
# Build CNN model
model = Sequential([
    Conv2D(128, kernel_size=(3, 3), activation='relu', input_shape=(n_mfcc, max_pad_len, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.3),
    
    Conv2D(256, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.3),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(y_cat.shape[1], activation='softmax')
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train
model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test))

# Evaluate
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.2f}")


Epoch 1/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 196ms/step - accuracy: 0.0939 - loss: 38.8081 - val_accuracy: 0.0909 - val_loss: 20.5067
Epoch 2/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.1350 - loss: 26.4335 - val_accuracy: 0.0455 - val_loss: 8.5728
Epoch 3/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.1390 - loss: 14.2687 - val_accuracy: 0.1364 - val_loss: 3.2898
Epoch 4/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.1722 - loss: 5.9948 - val_accuracy: 0.1818 - val_loss: 3.0655
Epoch 5/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.1448 - loss: 4.7775 - val_accuracy: 0.1364 - val_loss: 2.2160
Epoch 6/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.0960 - loss: 2.5430 - val_accuracy: 0.0455 - val_loss: 2.3175
Epoch 7/200
[1m3/3[0m [32m━━━━━━

In [48]:
# Build CNN model
model = Sequential([
    Conv2D(128, kernel_size=(3, 3), activation='relu', input_shape=(n_mfcc, max_pad_len, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.3),
    
    Conv2D(256, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.3),

    
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(y_cat.shape[1], activation='softmax')
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train
model.fit(X_train, y_train, epochs=500, batch_size=32, validation_data=(X_test, y_test))

# Evaluate
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.2f}")


Epoch 1/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 182ms/step - accuracy: 0.0763 - loss: 39.0501 - val_accuracy: 0.1364 - val_loss: 26.5886
Epoch 2/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.0802 - loss: 42.1939 - val_accuracy: 0.2273 - val_loss: 12.7636
Epoch 3/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.1233 - loss: 17.6354 - val_accuracy: 0.0909 - val_loss: 5.9414
Epoch 4/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.1468 - loss: 8.5285 - val_accuracy: 0.1818 - val_loss: 3.5321
Epoch 5/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.2289 - loss: 4.5627 - val_accuracy: 0.2727 - val_loss: 1.9992
Epoch 6/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.1370 - loss: 3.3406 - val_accuracy: 0.2273 - val_loss: 1.8463
Epoch 7/500
[1m3/3[0m [32m━━━━━

## Classification Report

In [46]:
from sklearn.metrics import classification_report
import numpy as np

# Predict class probabilities
y_pred_probs = model.predict(X_test)

# Convert to class labels
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=encoder.classes_)
print(report)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      0.33      0.50         3
           2       0.50      1.00      0.67         1
           3       0.67      1.00      0.80         2
           4       1.00      1.00      1.00         2
           5       0.50      1.00      0.67         1
           6       1.00      0.67      0.80         3
           7       1.00      1.00      1.00         4
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         2

    accuracy                           0.86        22
   macro avg       0.87      0.90      0.84        22
weighted avg       0.92      0.86      0.86        22

