## train the RNN model

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

features_df = pd.read_csv('features.csv')

# Extract features and corresponding labels
X = features_df.iloc[:, :-1].values  # features 
y = features_df.iloc[:, -1].values  # labels 

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Encode the labels
y = label_encoder.fit_transform(y)

# Convert to PyTorch tensors
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.int64)

# Split the dataset into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Define the RNN model
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])
        return out

input_size = X_train.shape[1]
hidden_size = 64 
num_classes = len(np.unique(y_train))

model = RNN(input_size, hidden_size, num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 300 

for epoch in range(num_epochs):
    model.train()
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    outputs = model(X_train_tensor.unsqueeze(1))  
    optimizer.zero_grad()
    loss = criterion(outputs.squeeze(), y_train)
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 1 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {loss.item():.4f}')

    # Validation
    model.eval()
    X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
    with torch.no_grad():
        val_outputs = model(X_val_tensor.unsqueeze(1))
        val_loss = criterion(val_outputs.squeeze(), y_val)
        _, val_predicted = torch.max(val_outputs, 1)
        val_accuracy = accuracy_score(y_val.numpy(), val_predicted.numpy())

    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss.item():.4f}, Validation Accuracy: {val_accuracy:.4f}')

# Evaluate the model on the test set
model.eval()
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
with torch.no_grad():
    test_outputs = model(X_test_tensor.unsqueeze(1))
    _, test_predicted = torch.max(test_outputs, 1)
    test_accuracy = accuracy_score(y_test.numpy(), test_predicted.numpy())

print("Test Accuracy:", test_accuracy)

Epoch [1/300], Training Loss: 0.6108
Epoch [1/300], Validation Loss: 0.5418, Validation Accuracy: 0.9031
Epoch [2/300], Training Loss: 0.5401
Epoch [2/300], Validation Loss: 0.4804, Validation Accuracy: 0.9416
Epoch [3/300], Training Loss: 0.4780
Epoch [3/300], Validation Loss: 0.4268, Validation Accuracy: 0.9601
Epoch [4/300], Training Loss: 0.4239
Epoch [4/300], Validation Loss: 0.3802, Validation Accuracy: 0.9679
Epoch [5/300], Training Loss: 0.3771
Epoch [5/300], Validation Loss: 0.3400, Validation Accuracy: 0.9715
Epoch [6/300], Training Loss: 0.3366
Epoch [6/300], Validation Loss: 0.3052, Validation Accuracy: 0.9729
Epoch [7/300], Training Loss: 0.3017
Epoch [7/300], Validation Loss: 0.2751, Validation Accuracy: 0.9751
Epoch [8/300], Training Loss: 0.2715
Epoch [8/300], Validation Loss: 0.2490, Validation Accuracy: 0.9751
Epoch [9/300], Training Loss: 0.2455
Epoch [9/300], Validation Loss: 0.2264, Validation Accuracy: 0.9751
Epoch [10/300], Training Loss: 0.2230
Epoch [10/300], V

Epoch [85/300], Training Loss: 0.0262
Epoch [85/300], Validation Loss: 0.0257, Validation Accuracy: 0.9907
Epoch [86/300], Training Loss: 0.0259
Epoch [86/300], Validation Loss: 0.0254, Validation Accuracy: 0.9907
Epoch [87/300], Training Loss: 0.0257
Epoch [87/300], Validation Loss: 0.0251, Validation Accuracy: 0.9907
Epoch [88/300], Training Loss: 0.0255
Epoch [88/300], Validation Loss: 0.0249, Validation Accuracy: 0.9914
Epoch [89/300], Training Loss: 0.0252
Epoch [89/300], Validation Loss: 0.0246, Validation Accuracy: 0.9914
Epoch [90/300], Training Loss: 0.0250
Epoch [90/300], Validation Loss: 0.0244, Validation Accuracy: 0.9914
Epoch [91/300], Training Loss: 0.0248
Epoch [91/300], Validation Loss: 0.0241, Validation Accuracy: 0.9914
Epoch [92/300], Training Loss: 0.0245
Epoch [92/300], Validation Loss: 0.0239, Validation Accuracy: 0.9914
Epoch [93/300], Training Loss: 0.0243
Epoch [93/300], Validation Loss: 0.0236, Validation Accuracy: 0.9914
Epoch [94/300], Training Loss: 0.0241

Epoch [180/300], Training Loss: 0.0131
Epoch [180/300], Validation Loss: 0.0109, Validation Accuracy: 0.9986
Epoch [181/300], Training Loss: 0.0131
Epoch [181/300], Validation Loss: 0.0108, Validation Accuracy: 0.9986
Epoch [182/300], Training Loss: 0.0130
Epoch [182/300], Validation Loss: 0.0108, Validation Accuracy: 0.9986
Epoch [183/300], Training Loss: 0.0129
Epoch [183/300], Validation Loss: 0.0107, Validation Accuracy: 0.9986
Epoch [184/300], Training Loss: 0.0128
Epoch [184/300], Validation Loss: 0.0106, Validation Accuracy: 0.9986
Epoch [185/300], Training Loss: 0.0127
Epoch [185/300], Validation Loss: 0.0105, Validation Accuracy: 0.9993
Epoch [186/300], Training Loss: 0.0127
Epoch [186/300], Validation Loss: 0.0104, Validation Accuracy: 0.9993
Epoch [187/300], Training Loss: 0.0126
Epoch [187/300], Validation Loss: 0.0104, Validation Accuracy: 0.9993
Epoch [188/300], Training Loss: 0.0125
Epoch [188/300], Validation Loss: 0.0103, Validation Accuracy: 0.9993
Epoch [189/300], Tr

Epoch [257/300], Validation Loss: 0.0066, Validation Accuracy: 1.0000
Epoch [258/300], Training Loss: 0.0084
Epoch [258/300], Validation Loss: 0.0066, Validation Accuracy: 1.0000
Epoch [259/300], Training Loss: 0.0084
Epoch [259/300], Validation Loss: 0.0065, Validation Accuracy: 1.0000
Epoch [260/300], Training Loss: 0.0083
Epoch [260/300], Validation Loss: 0.0065, Validation Accuracy: 1.0000
Epoch [261/300], Training Loss: 0.0083
Epoch [261/300], Validation Loss: 0.0064, Validation Accuracy: 1.0000
Epoch [262/300], Training Loss: 0.0083
Epoch [262/300], Validation Loss: 0.0064, Validation Accuracy: 1.0000
Epoch [263/300], Training Loss: 0.0082
Epoch [263/300], Validation Loss: 0.0064, Validation Accuracy: 1.0000
Epoch [264/300], Training Loss: 0.0082
Epoch [264/300], Validation Loss: 0.0063, Validation Accuracy: 1.0000
Epoch [265/300], Training Loss: 0.0081
Epoch [265/300], Validation Loss: 0.0063, Validation Accuracy: 1.0000
Epoch [266/300], Training Loss: 0.0081
Epoch [266/300], Va

## Test phase 

In [2]:
import os
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm

def extract_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # MFCC (Mel-frequency cepstral coefficients)
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        mfccs_processed = np.mean(mfccs.T, axis=0)

        # Chroma feature
        chroma_stft = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate).T, axis=0)

        # Spectral contrast
        spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=audio, sr=sample_rate).T, axis=0)

        # Spectral centroid
        spectral_centroids = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sample_rate).T, axis=0)

        # Zero-crossing rate
        zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y=audio).T, axis=0)

        # Spectral rolloff
        spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sample_rate).T, axis=0)

        # Combine all features into a 1D array
        features = np.hstack([mfccs_processed, chroma_stft, spectral_contrast, spectral_centroids, zero_crossing_rate, spectral_rolloff])

        return features
    except Exception as e:
        print(f"Error encountered while parsing file: {file_name}")
        return None

# Specify the directories containing the .mp3 files
directories = ['something']

# Create an empty DataFrame 
features_df = pd.DataFrame()

for directory in directories:
    print(f"Processing files in {directory} directory")
    for filename in tqdm(os.listdir(directory)):
        if filename.endswith('.wav'):
            file_path = os.path.join(directory, filename)
            try:
                features = extract_features(file_path)
                # Append the features to the DataFrame as a new row
                if features is not None:
                    features_series = pd.Series(features)
                    features_df = pd.concat([features_df, features_series], axis=0)  # Concatenate along rows (axis=0)
            except Exception as e:
                print(f"Error encountered while processing file: {file_path}")
                continue


Processing files in something directory


100%|█████████████████████████████████████████████| 1/1 [00:03<00:00,  3.00s/it]


In [3]:
X_new= features_df.T
X_new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
0,-224.141861,123.548615,-11.934964,35.14872,-2.978472,11.988633,-10.192609,7.214376,-13.9877,1.976113,...,20.868756,15.711451,19.730115,19.38868,19.780333,19.137458,41.547228,1692.69833,0.082245,3374.557483


## load the model

In [4]:

loaded_model = RNN( input_size, hidden_size, num_classes)


loaded_model.load_state_dict(torch.load('rnn_model.pth'))


<All keys matched successfully>

In [5]:
loaded_model.eval()  


X_new = scaler.transform(X_new) 
X_new_tensor = torch.tensor(X_new, dtype=torch.float32)

with torch.no_grad():
    new_outputs = loaded_model(X_new_tensor.unsqueeze(1))
    _, new_predicted = torch.max(new_outputs, 1)




In [6]:
print(new_predicted)

tensor([1])


In [7]:
import pyaudio
import wave
import time
import os
import pandas as pd  # Import pandas if not already imported
import torch  # Import PyTorch if not already imported


def extract_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # MFCC (Mel-frequency cepstral coefficients)
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        mfccs_processed = np.mean(mfccs.T, axis=0)

        # Chroma feature
        chroma_stft = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate).T, axis=0)

        # Spectral contrast
        spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=audio, sr=sample_rate).T, axis=0)

        # Spectral centroid
        spectral_centroids = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sample_rate).T, axis=0)

        # Zero-crossing rate
        zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y=audio).T, axis=0)

        # Spectral rolloff
        spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sample_rate).T, axis=0)

        # Combine all features into a 1D array
        features = np.hstack([mfccs_processed, chroma_stft, spectral_contrast, spectral_centroids, zero_crossing_rate, spectral_rolloff])

        return features
    except Exception as e:
        print(f"Error encountered while parsing file: {file_name}")
        return None

def record_and_classify_audio():
    audio_duration = 60  # Record audio for 1 minute
    sample_rate = 44100  # 44.1kHz
    ten_sec_frames = int(sample_rate / 1024 * 10)  # Number of frames for 10 seconds

    audio_frames = []
    p = pyaudio.PyAudio()

    stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate, input=True, frames_per_buffer=1024)
    
    # Initialize the buffer with 1-minute audio data
    print("Initializing audio buffer with 1-minute audio...")
    for _ in range(0, int(sample_rate / 1024 * audio_duration)):
        data = stream.read(1024)
        audio_frames.append(data)
    print("Initialization complete.")

    while True:
        # Slide the window by 10 seconds
        print("Sliding the window by 10 seconds...")
        audio_frames = audio_frames[ten_sec_frames:]  # Remove the first 10 seconds
        
        # Record new 10 seconds of audio data
        for _ in range(0, ten_sec_frames):
            data = stream.read(1024)
            audio_frames.append(data)

        # Save the recorded audio to a WAV file
        output_audio_file = "recorded_audio.wav"
        wf = wave.open(output_audio_file, 'wb')
        wf.setnchannels(1)
        wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(audio_frames))
        wf.close()

        # extract features 
        recorded_features = extract_features(output_audio_file)

        if recorded_features is not None:
            recorded_df = pd.DataFrame([recorded_features])
            recorded_df = scaler.transform(recorded_df)
            model.eval()
            recorded_tensor = torch.tensor(recorded_df, dtype=torch.float32)
            with torch.no_grad():
                predicted_output = model(recorded_tensor.unsqueeze(1))
                _, predicted_class = torch.max(predicted_output, 1)

            predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
            print("Predicted Class:", predicted_class_label[0])

            # Delete the audio file
            os.remove(output_audio_file)
        else:
            print("Error occurred while extracting features from recorded audio.")

        time.sleep(3)

# Entry point
if __name__ == "__main__":
    record_and_classify_audio()


ALSA lib pcm_dsnoop.c:601:(snd_pcm_dsnoop_open) unable to open slave
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave


Initializing audio buffer with 1-minute audio...
Initialization complete.
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the window by 10 seconds...
Predicted Class: 1.0
Sliding the wi

KeyboardInterrupt: 

In [7]:
import pyaudio
import wave
import time
import os
import cv2
import threading
import pandas as pd
import torch
import librosa
import numpy as np


def capture_and_display_video():
    cap = cv2.VideoCapture(0)
    while True:
        ret, frame = cap.read()
        if ret:
            cv2.imshow('Video', cv2.resize(frame, (640, 480)))
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    cap.release()
    cv2.destroyAllWindows()



def record_and_classify_audio():
    audio_duration = 60  # 1 minute
    sample_rate = 44100  
    ten_sec_frames = int(sample_rate / 1024 * 10)

    audio_frames = []
    p = pyaudio.PyAudio()

    stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate, input=True, frames_per_buffer=1024)

    # Initialize the audio buffer
    print("Initializing 1-minute audio buffer...")
    for _ in range(0, int(sample_rate / 1024 * audio_duration)):
        data = stream.read(1024)
        audio_frames.append(data)

    while True:
        print("Sliding the window by 10 seconds...")
        audio_frames = audio_frames[ten_sec_frames:]
        
        for _ in range(0, ten_sec_frames):
            data = stream.read(1024)
            audio_frames.append(data)

        output_audio_file = "recorded_audio.wav"
        wf = wave.open(output_audio_file, 'wb')
        wf.setnchannels(1)
        wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(audio_frames))
        wf.close()

        recorded_features = extract_features(output_audio_file)

        if recorded_features is not None:
            recorded_df = pd.DataFrame([recorded_features])
            recorded_df = scaler.transform(recorded_df)
            
            model.eval()
            recorded_tensor = torch.tensor(recorded_df, dtype=torch.float32)
            
            with torch.no_grad():
                predicted_output = model(recorded_tensor)
                _, predicted_class = torch.max(predicted_output, 1)
            
            predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
            print("Predicted Class:", predicted_class_label[0])

            os.remove(output_audio_file)
        else:
            print("Error in feature extraction.")

        time.sleep(3)

# Entry point
if __name__ == '__main__':
    # Start audio recording and processing in a separate thread
    audio_thread = threading.Thread(target=record_and_classify_audio)
    audio_thread.start()
    
    # Start video capture and display in the main thread
    capture_and_display_video()

    audio_thread.join()



Initializing 1-minute audio buffer...


[ WARN:0@0.008] global cap_v4l.cpp:982 open VIDEOIO(V4L2:/dev/video0): can't open camera by index
[ERROR:0@0.008] global obsensor_uvc_stream_channel.cpp:156 getStreamChannelGroup Camera index out of range
ALSA lib pcm_dsnoop.c:601:(snd_pcm_dsnoop_open) unable to open slave
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib pcm_dm

Sliding the window by 10 seconds...


  return f(*args, **kwargs)
Exception in thread Thread-6 (record_and_classify_audio):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_61232/829340840.py", line 68, in record_and_classify_audio
  File "/home/waqar/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/ipykernel_61232/2817500600.py", line 47, in forward
IndexError: too many indices for tensor of dimension 2


KeyboardInterrupt: 

In [None]:
import cv2

def test_video_capture():
    cap = cv2.VideoCapture(2)
    if not cap.isOpened():
        print("Error: Couldn't open the camera.")
        return

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Error: Couldn't read a frame.")
            break

        cv2.imshow('Test', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

test_video_capture()
