## Train the RNN model

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Read the table of pre-extracted features.
features_df = pd.read_csv('features.csv')

# The last column holds the class label; every column before it is a feature.
# Labels are integer-encoded so they can be used as CrossEntropyLoss targets.
label_encoder = LabelEncoder()
X = torch.tensor(features_df.iloc[:, :-1].to_numpy(), dtype=torch.float32)
y = torch.tensor(
    label_encoder.fit_transform(features_df.iloc[:, -1].to_numpy()),
    dtype=torch.int64,
)

# Hold out 40 % of the samples, then halve that hold-out into validation and
# test sets (same seed for both splits).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Fit the scaler on the training split only and reuse its statistics for the
# validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# Define the RNN model
class RNN(nn.Module):
    """`nn.RNN` encoder followed by a linear classification head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return class logits computed from the last time step of ``x``."""
        sequence_output, _final_hidden = self.rnn(x)
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

# Model dimensions are derived from the training split.
print(X_train.shape)
input_size = X_train.shape[-1]
hidden_size = 64
num_classes = np.unique(y_train).size

model = RNN(input_size, hidden_size, num_classes)

# Cross-entropy loss over the class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Training loop
num_epochs = 300

# The splits do not change between epochs, so build their tensors once
# instead of on every iteration.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

for epoch in range(num_epochs):
    # One full-batch optimisation step. unsqueeze(1) feeds every sample to the
    # RNN as a sequence of length 1.
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor.unsqueeze(1))
    # The logits are already (batch, num_classes). They are passed as-is
    # because squeezing would drop the batch dimension for a one-sample split.
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {loss.item():.4f}')

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor.unsqueeze(1))
        val_loss = criterion(val_outputs, y_val)
        _, val_predicted = torch.max(val_outputs, 1)
        val_accuracy = accuracy_score(y_val.numpy(), val_predicted.numpy())

    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss.item():.4f}, Validation Accuracy: {val_accuracy:.4f}')

# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor.unsqueeze(1))
    _, test_predicted = torch.max(test_outputs, 1)
    test_accuracy = accuracy_score(y_test.numpy(), test_predicted.numpy())

print("Test Accuracy:", test_accuracy)

(4207, 62)
Epoch [1/300], Training Loss: 0.6062
Epoch [1/300], Validation Loss: 0.5376, Validation Accuracy: 0.8632
Epoch [2/300], Training Loss: 0.5364
Epoch [2/300], Validation Loss: 0.4764, Validation Accuracy: 0.9073
Epoch [3/300], Training Loss: 0.4746
Epoch [3/300], Validation Loss: 0.4226, Validation Accuracy: 0.9301
Epoch [4/300], Training Loss: 0.4205
Epoch [4/300], Validation Loss: 0.3759, Validation Accuracy: 0.9416
Epoch [5/300], Training Loss: 0.3734
Epoch [5/300], Validation Loss: 0.3354, Validation Accuracy: 0.9494
Epoch [6/300], Training Loss: 0.3328
Epoch [6/300], Validation Loss: 0.3005, Validation Accuracy: 0.9544
Epoch [7/300], Training Loss: 0.2978
Epoch [7/300], Validation Loss: 0.2704, Validation Accuracy: 0.9579
Epoch [8/300], Training Loss: 0.2677
Epoch [8/300], Validation Loss: 0.2445, Validation Accuracy: 0.9629
Epoch [9/300], Training Loss: 0.2419
Epoch [9/300], Validation Loss: 0.2222, Validation Accuracy: 0.9665
Epoch [10/300], Training Loss: 0.2197
Epoch 

Epoch [91/300], Training Loss: 0.0273
Epoch [91/300], Validation Loss: 0.0260, Validation Accuracy: 0.9914
Epoch [92/300], Training Loss: 0.0270
Epoch [92/300], Validation Loss: 0.0258, Validation Accuracy: 0.9914
Epoch [93/300], Training Loss: 0.0268
Epoch [93/300], Validation Loss: 0.0255, Validation Accuracy: 0.9914
Epoch [94/300], Training Loss: 0.0266
Epoch [94/300], Validation Loss: 0.0252, Validation Accuracy: 0.9922
Epoch [95/300], Training Loss: 0.0264
Epoch [95/300], Validation Loss: 0.0250, Validation Accuracy: 0.9922
Epoch [96/300], Training Loss: 0.0262
Epoch [96/300], Validation Loss: 0.0248, Validation Accuracy: 0.9922
Epoch [97/300], Training Loss: 0.0260
Epoch [97/300], Validation Loss: 0.0245, Validation Accuracy: 0.9929
Epoch [98/300], Training Loss: 0.0258
Epoch [98/300], Validation Loss: 0.0243, Validation Accuracy: 0.9936
Epoch [99/300], Training Loss: 0.0256
Epoch [99/300], Validation Loss: 0.0241, Validation Accuracy: 0.9936
Epoch [100/300], Training Loss: 0.025

Epoch [185/300], Validation Loss: 0.0119, Validation Accuracy: 0.9964
Epoch [186/300], Training Loss: 0.0143
Epoch [186/300], Validation Loss: 0.0118, Validation Accuracy: 0.9964
Epoch [187/300], Training Loss: 0.0143
Epoch [187/300], Validation Loss: 0.0117, Validation Accuracy: 0.9971
Epoch [188/300], Training Loss: 0.0142
Epoch [188/300], Validation Loss: 0.0116, Validation Accuracy: 0.9971
Epoch [189/300], Training Loss: 0.0141
Epoch [189/300], Validation Loss: 0.0115, Validation Accuracy: 0.9971
Epoch [190/300], Training Loss: 0.0140
Epoch [190/300], Validation Loss: 0.0114, Validation Accuracy: 0.9971
Epoch [191/300], Training Loss: 0.0139
Epoch [191/300], Validation Loss: 0.0114, Validation Accuracy: 0.9971
Epoch [192/300], Training Loss: 0.0139
Epoch [192/300], Validation Loss: 0.0113, Validation Accuracy: 0.9971
Epoch [193/300], Training Loss: 0.0138
Epoch [193/300], Validation Loss: 0.0112, Validation Accuracy: 0.9979
Epoch [194/300], Training Loss: 0.0137
Epoch [194/300], Va

Epoch [281/300], Validation Loss: 0.0066, Validation Accuracy: 0.9993
Epoch [282/300], Training Loss: 0.0086
Epoch [282/300], Validation Loss: 0.0065, Validation Accuracy: 0.9993
Epoch [283/300], Training Loss: 0.0086
Epoch [283/300], Validation Loss: 0.0065, Validation Accuracy: 0.9993
Epoch [284/300], Training Loss: 0.0085
Epoch [284/300], Validation Loss: 0.0065, Validation Accuracy: 0.9993
Epoch [285/300], Training Loss: 0.0085
Epoch [285/300], Validation Loss: 0.0064, Validation Accuracy: 0.9993
Epoch [286/300], Training Loss: 0.0084
Epoch [286/300], Validation Loss: 0.0064, Validation Accuracy: 0.9993
Epoch [287/300], Training Loss: 0.0084
Epoch [287/300], Validation Loss: 0.0064, Validation Accuracy: 0.9993
Epoch [288/300], Training Loss: 0.0084
Epoch [288/300], Validation Loss: 0.0063, Validation Accuracy: 0.9993
Epoch [289/300], Training Loss: 0.0083
Epoch [289/300], Validation Loss: 0.0063, Validation Accuracy: 0.9993
Epoch [290/300], Training Loss: 0.0083
Epoch [290/300], Va

## Test phase 

In [23]:
import os
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm

def extract_features(file_name):
    """Summarise an audio file as one flat vector of time-averaged features.

    The vector concatenates, in this order, the per-frame means of: 40 MFCCs,
    chroma STFT, spectral contrast, spectral centroid, zero-crossing rate and
    spectral rolloff.

    Args:
        file_name: Path of the audio file to load with librosa.

    Returns:
        A 1-D NumPy array of features, or None if loading or feature
        extraction raised an exception (the reason is printed).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # Each entry is a (n_coefficients, n_frames) matrix; order matters
        # because it fixes the column layout of the feature vector.
        frame_level_features = [
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40),
            librosa.feature.chroma_stft(y=audio, sr=sample_rate),
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate),
            librosa.feature.zero_crossing_rate(y=audio),
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate),
        ]

        # Average every coefficient over time and join into a single 1-D array.
        return np.hstack([np.mean(matrix.T, axis=0) for matrix in frame_level_features])
    except Exception as e:
        # Best-effort: callers treat None as "skip this file". The cause is
        # reported so the failure can actually be diagnosed.
        print(f"Error encountered while parsing file: {file_name} ({type(e).__name__}: {e})")
        return None

# Specify the directories containing the .wav files
directories = ['something']

# One Series of feature values per successfully parsed file. They are joined
# once after the loop instead of re-concatenating the frame for every file.
feature_columns = []

for directory in directories:
    print(f"Processing files in {directory} directory")
    # Sorted so that the column order is reproducible between runs.
    for filename in tqdm(sorted(os.listdir(directory))):
        if not filename.endswith('.wav'):
            continue
        file_path = os.path.join(directory, filename)
        # extract_features reports its own errors and returns None on failure.
        features = extract_features(file_path)
        if features is not None:
            feature_columns.append(pd.Series(features))

# Each file becomes its own column (feature values run down the rows), so
# transposing the frame yields one row per file. Stacking along axis=0 would
# instead merge all files into a single ever-longer column.
if feature_columns:
    features_df = pd.concat(feature_columns, axis=1, ignore_index=True)
else:
    features_df = pd.DataFrame()


Processing files in something directory


100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.14s/it]


In [24]:
# features_df holds the extracted feature values down its rows; transpose so
# they run along the columns instead.
X_new = features_df.transpose()
X_new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
0,-224.141861,123.548615,-11.934964,35.14872,-2.978472,11.988633,-10.192609,7.214376,-13.9877,1.976113,...,20.868756,15.711451,19.730115,19.38868,19.780333,19.137458,41.547228,1692.69833,0.082245,3374.557483


## Load the model

<All keys matched successfully>

In [4]:
# Requires `loaded_model`, `scaler` and `X_new` to exist in the current kernel
# session; re-run the cells that create them after a restart.
loaded_model.eval()  # evaluation mode

# Standardise with the scaler's stored statistics, then convert to a tensor.
X_new = scaler.transform(X_new)
X_new_tensor = torch.tensor(X_new, dtype=torch.float32)

# unsqueeze(1) inserts a size-1 dimension at position 1 before the forward
# pass; the predicted class is the index of the largest logit.
with torch.no_grad():
    new_outputs = loaded_model(X_new_tensor.unsqueeze(1))
    new_predicted = torch.max(new_outputs, 1)[1]

print(new_predicted)


NameError: name 'X_new' is not defined

In [5]:

# Rebuild the network with the same dimensions used for training so the saved
# parameter shapes line up.
model = RNN(input_size, hidden_size, num_classes)

# map_location='cpu' keeps the restored weights on the CPU regardless of the
# device they were saved from; the inference code calls .numpy() on the
# predictions, which needs CPU tensors.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load('rnn_model.pth', map_location='cpu'))


<All keys matched successfully>

In [None]:
import pyaudio
import wave
import time
import os

def extract_features(file_name):
    """Summarise an audio file as one flat vector of time-averaged features.

    The vector concatenates, in this order, the per-frame means of: 40 MFCCs,
    chroma STFT, spectral contrast, spectral centroid, zero-crossing rate and
    spectral rolloff.

    Args:
        file_name: Path of the audio file to load with librosa.

    Returns:
        A 1-D NumPy array of features, or None if loading or feature
        extraction raised an exception (the reason is printed).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # Each entry is a (n_coefficients, n_frames) matrix; order matters
        # because it fixes the column layout of the feature vector.
        frame_level_features = [
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40),
            librosa.feature.chroma_stft(y=audio, sr=sample_rate),
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate),
            librosa.feature.zero_crossing_rate(y=audio),
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate),
        ]

        # Average every coefficient over time and join into a single 1-D array.
        return np.hstack([np.mean(matrix.T, axis=0) for matrix in frame_level_features])
    except Exception as e:
        # Best-effort: callers treat None as "skip this clip". The cause is
        # reported so the failure can actually be diagnosed.
        print(f"Error encountered while parsing file: {file_name} ({type(e).__name__}: {e})")
        return None
    
def record_and_classify_audio():
    """Record one minute of microphone audio, classify it and print the label.

    The clip is captured through PyAudio (mono, 16-bit, 44.1 kHz) and written
    to ``recorded_audio.wav``. Its features come from ``extract_features``,
    are standardised with the module-level ``scaler`` and passed through the
    module-level ``model``; the predicted index is mapped back to a label with
    ``label_encoder``.

    Returns:
        None. The predicted class, or an error message when feature
        extraction fails, is printed.
    """
    audio_duration = 60  # seconds
    sample_rate = 44100  # Hz
    frames_per_buffer = 1024
    sample_format = pyaudio.paInt16
    output_audio_file = "recorded_audio.wav"

    audio_frames = []

    p = pyaudio.PyAudio()
    try:
        print("Recording audio for 1 minute...")
        stream = p.open(format=sample_format, channels=1, rate=sample_rate,
                        input=True, frames_per_buffer=frames_per_buffer)
        try:
            for _ in range(int(sample_rate / frames_per_buffer * audio_duration)):
                audio_frames.append(stream.read(frames_per_buffer))
            print("Finished recording.")
        finally:
            # Release the input stream even if a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # Save the recorded audio to a WAV file. The context manager closes the
    # file on error too, and the sample width comes from the module-level
    # helper so the terminated PyAudio instance is not touched.
    with wave.open(output_audio_file, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(pyaudio.get_sample_size(sample_format))
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(audio_frames))

    recorded_features = extract_features(output_audio_file)
    if recorded_features is None:
        # The recording is left on disk in this case.
        print("Error occurred while extracting features from recorded audio.")
        return

    # A single-row frame, standardised like the training data.
    recorded_df = pd.DataFrame([recorded_features])
    recorded_scaled = scaler.transform(recorded_df)

    # Feed the features to the trained PyTorch model for classification.
    model.eval()
    recorded_tensor = torch.tensor(recorded_scaled, dtype=torch.float32)
    with torch.no_grad():
        predicted_output = model(recorded_tensor.unsqueeze(1))
        _, predicted_class = torch.max(predicted_output, 1)

    # Decode the predicted class index back to its original label.
    predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
    print("Predicted Class:", predicted_class_label[0])

    # Delete the audio file
    os.remove(output_audio_file)

# Record and classify clips back to back, pausing briefly between them.
# The loop has no exit condition of its own.
pause_seconds = 3
while True:
    record_and_classify_audio()
    time.sleep(pause_seconds)

ALSA lib pcm_dsnoop.c:601:(snd_pcm_dsnoop_open) unable to open slave
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave


Recording audio for 1 minute...
Finished recording.
Error encountered while parsing file: recorded_audio.wav
Error occurred while extracting features from recorded audio.


ALSA lib pcm_dsnoop.c:601:(snd_pcm_dsnoop_open) unable to open slave
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave


Recording audio for 1 minute...
