In [None]:
!pip uninstall opencv-python -y



In [None]:
# No confirmation input is needed here: the uninstall command above already passes `-y`.

In [None]:
!pip install opencv-python


## Train the RNN model

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

features_df = pd.read_csv('features.csv')

# Every column except the last one is a feature; the last column is the label.
# Cast to float32 up front so the scaler sees the same precision the model uses.
feature_matrix = features_df.iloc[:, :-1].values.astype(np.float32)
raw_labels = features_df.iloc[:, -1].values

# Encode the labels as consecutive integer ids; the fitted encoder is kept so
# predicted ids can be mapped back to the original labels later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(raw_labels)

# Tensor versions of the full dataset.
X = torch.tensor(feature_matrix, dtype=torch.float32)
y = torch.tensor(encoded_labels, dtype=torch.int64)

# Split the dataset into training (60%), validation (20%), and testing (20%) sets.
# The split is done on NumPy arrays (the input types scikit-learn documents);
# the resulting partition is the same because it only depends on the number of
# samples and random_state.
X_train, X_temp, y_train, y_temp = train_test_split(
    feature_matrix, encoded_labels, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Standardize the features; statistics come from the training split only so
# no information from validation/test data leaks into the model.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Labels are consumed as int64 tensors by the loss and the accuracy code.
y_train = torch.tensor(y_train, dtype=torch.int64)
y_val = torch.tensor(y_val, dtype=torch.int64)
y_test = torch.tensor(y_test, dtype=torch.int64)

# Define the RNN model
class RNN(nn.Module):
    """Single-layer recurrent classifier.

    The input is expected batch-first, i.e. ``(batch, sequence, input_size)``.
    The recurrent output at the final sequence position is projected to
    ``num_classes`` raw scores (logits).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        # Attribute names `rnn` and `fc` are part of the state-dict keys.
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x``."""
        hidden_sequence, _final_hidden = self.rnn(x)
        last_step = hidden_sequence[:, -1, :]
        return self.fc(last_step)

# Seed the weight initialisation so repeated runs give the same model
# (the data split already uses a fixed random_state).
torch.manual_seed(42)

input_size = X_train.shape[1]
hidden_size = 64
# Count classes from the fitted encoder rather than from the training split:
# a class that only occurs in the validation/test split would otherwise
# produce a target index outside the output layer.
num_classes = len(label_encoder.classes_)

model = RNN(input_size, hidden_size, num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# The scaled splits never change during training, so build the tensors once.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

# Each sample is fed to the RNN as a sequence of length 1:
# (batch, features) -> (batch, 1, features).
train_inputs = X_train_tensor.unsqueeze(1)
val_inputs = X_val_tensor.unsqueeze(1)
test_inputs = X_test_tensor.unsqueeze(1)

# Training loop (full-batch gradient descent)
num_epochs = 300

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(train_inputs)
    # The model already returns (batch, num_classes); squeezing here would
    # drop the batch axis when the batch holds a single sample.
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {loss.item():.4f}')

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(val_inputs)
        val_loss = criterion(val_outputs, y_val)
        _max_scores, val_predicted = torch.max(val_outputs, 1)
        val_accuracy = accuracy_score(y_val.numpy(), val_predicted.numpy())

    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss.item():.4f}, Validation Accuracy: {val_accuracy:.4f}')

# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    test_outputs = model(test_inputs)
    _max_scores, test_predicted = torch.max(test_outputs, 1)
    test_accuracy = accuracy_score(y_test.numpy(), test_predicted.numpy())

print("Test Accuracy:", test_accuracy)

## Test phase 

In [None]:
import os
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm

def extract_features(file_name):
    """Summarise one audio file as a fixed-length 1-D feature vector.

    The file is decoded with ``librosa.load`` and six frame-level descriptors
    are averaged over time, then concatenated in this order: 40 MFCCs,
    chroma (STFT), spectral contrast, spectral centroid, zero-crossing rate,
    spectral roll-off.

    Args:
        file_name: Path of the audio file to read.

    Returns:
        A 1-D NumPy array of features, or ``None`` when loading or feature
        extraction fails (the failure is printed, not raised).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # For mono audio librosa returns each feature as
        # (n_coefficients, n_frames); averaging over the frame axis yields one
        # value per coefficient regardless of the clip length.
        frame_level_features = [
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40),
            librosa.feature.chroma_stft(y=audio, sr=sample_rate),
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate),
            librosa.feature.zero_crossing_rate(y=audio),
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate),
        ]

        # Combine all features into a 1D array
        return np.hstack([np.mean(feature, axis=1) for feature in frame_level_features])
    except Exception as e:
        # Best-effort by design: report the cause and let the caller skip the file.
        print(f"Error encountered while parsing file: {file_name} ({type(e).__name__}: {e})")
        return None

# Specify the directories containing the .wav files
directories = ['something']

# Feature vectors are gathered in a list and turned into a DataFrame once at
# the end; growing a DataFrame with pd.concat inside the loop is quadratic and,
# with axis=0, stacked every file's features into one long column.
feature_vectors = []
processed_files = []  # Paths in the same order as the feature vectors.

for directory in directories:
    print(f"Processing files in {directory} directory")
    # Sorted so the order of files (and therefore of predictions) is stable.
    for filename in tqdm(sorted(os.listdir(directory))):
        if not filename.endswith('.wav'):
            continue
        file_path = os.path.join(directory, filename)
        try:
            features = extract_features(file_path)
        except Exception as e:
            print(f"Error encountered while processing file: {file_path} ({type(e).__name__}: {e})")
            continue
        if features is not None:
            feature_vectors.append(features)
            processed_files.append(file_path)

# One column per file, so that `features_df.T` yields one row per file.
if feature_vectors:
    features_df = pd.DataFrame(np.column_stack(feature_vectors))
else:
    features_df = pd.DataFrame()


In [None]:
# Swap rows and columns of the collected features and preview the result.
X_new = features_df.transpose()
X_new.head()

## Load the model

In [None]:

MODEL_PATH = 'rnn_model.pth'

# None of the visible cells writes the checkpoint, so a fresh
# Restart & Run All would fail on the load below. If the file is absent,
# persist the weights of the model trained above; an existing checkpoint is
# left untouched and loaded as before.
if not os.path.exists(MODEL_PATH):
    torch.save(model.state_dict(), MODEL_PATH)

# Rebuild the architecture with the training-time sizes, then restore weights.
loaded_model = RNN(input_size, hidden_size, num_classes)
loaded_model.load_state_dict(torch.load(MODEL_PATH))


In [None]:
loaded_model.eval()  # Inference mode


# Scale with the statistics fitted on the training split. The result gets its
# own name so `X_new` keeps the raw features and re-running this cell does not
# scale already-scaled data.
X_new_scaled = scaler.transform(X_new)
X_new_tensor = torch.tensor(X_new_scaled, dtype=torch.float32)

with torch.no_grad():
    # Each row is fed as a sequence of length 1: (batch, 1, features).
    new_outputs = loaded_model(X_new_tensor.unsqueeze(1))
    _max_scores, new_predicted = torch.max(new_outputs, 1)




In [None]:
# Show the predicted class index (position of the largest model output) for each input row.
print(new_predicted)

In [None]:
import pyaudio
import numpy as np
import librosa
import time
import torch
import pandas as pd

def extract_features(file_name, sample_rate=None):
    """Summarise audio as a fixed-length 1-D feature vector.

    Six frame-level descriptors are averaged over time and concatenated in
    this order: 40 MFCCs, chroma (STFT), spectral contrast, spectral centroid,
    zero-crossing rate, spectral roll-off.

    Args:
        file_name: Either the path of an audio file (decoded with
            ``librosa.load``) or a 1-D float NumPy array holding samples that
            are already in memory (e.g. a live capture buffer).
        sample_rate: Sampling rate of the in-memory samples. Required when an
            array is passed; ignored for file paths.

    Returns:
        A 1-D NumPy array of features, or ``None`` when loading or feature
        extraction fails (the failure is printed, not raised).
    """
    from_memory = isinstance(file_name, np.ndarray)
    try:
        if from_memory:
            if sample_rate is None:
                raise ValueError("sample_rate is required when passing raw audio samples")
            audio = file_name
            # NOTE(review): file inputs are resampled to librosa's default rate
            # by librosa.load, while in-memory audio keeps the capture rate.
            # Confirm this matches how the training features were produced.
        else:
            audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # For mono audio librosa returns each feature as
        # (n_coefficients, n_frames); averaging over the frame axis yields one
        # value per coefficient regardless of the clip length.
        frame_level_features = [
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40),
            librosa.feature.chroma_stft(y=audio, sr=sample_rate),
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate),
            librosa.feature.zero_crossing_rate(y=audio),
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate),
        ]

        # Combine all features into a 1D array
        return np.hstack([np.mean(feature, axis=1) for feature in frame_level_features])
    except Exception as e:
        source = "<in-memory audio>" if from_memory else file_name
        print(f"Error encountered while parsing file: {source} ({type(e).__name__}: {e})")
        return None

# Initialize PyAudio and settings
p = pyaudio.PyAudio()
sample_rate = 44100
channel = 1
sample_format = pyaudio.paFloat32
frame_length = 1024
audio_duration = 2  # 2 seconds
# The rolling buffer stores individual samples, so it needs
# sample_rate * audio_duration entries (dividing by frame_length made it
# smaller than a single frame and the frame assignment below failed).
buffer_size = int(sample_rate * audio_duration)

audio_buffer = np.zeros(buffer_size, dtype=np.float32)

stream = p.open(format=sample_format,
                channels=channel,
                rate=sample_rate,
                input=True,
                frames_per_buffer=frame_length)

print("Press Ctrl+C to stop...")

# Classification below assumes 'scaler', 'model' and 'label_encoder' exist
# from the training cell.
model.eval()

try:
    while True:
        # Shift the buffer left by one frame to make room for new samples
        audio_buffer[:-frame_length] = audio_buffer[frame_length:]

        # Read new audio frame and append to buffer. Processing may be slower
        # than real time, so input overflows are tolerated instead of raised.
        audio_frame = np.frombuffer(
            stream.read(frame_length, exception_on_overflow=False),
            dtype=np.float32,
        )
        audio_buffer[-frame_length:] = audio_frame

        # Feature extraction on the in-memory window
        features = extract_features(audio_buffer, sample_rate)

        if features is None or len(features) == 0:
            print("Error in feature extraction.")
            continue

        # Feature scaling
        scaled_features = scaler.transform([features])

        # Classification: one sample fed as a sequence of length 1
        tensor_input = torch.tensor(scaled_features, dtype=torch.float32)
        with torch.no_grad():
            predicted_output = model(tensor_input.unsqueeze(1))
            _max_scores, predicted_class = torch.max(predicted_output, 1)

        predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
        print(f"Predicted Class: {predicted_class_label[0]}")

        # You can add sleep here if needed
        time.sleep(0.1)
except KeyboardInterrupt:
    print("Stopped.")
finally:
    # Release the audio device on Ctrl+C and on any unexpected error alike.
    stream.stop_stream()
    stream.close()
    p.terminate()


## Sliding the window by 10 seconds

In [None]:
import pyaudio
import wave
import time
import os
import pandas as pd  
import torch


def extract_features(file_name):
    """Summarise one audio file as a fixed-length 1-D feature vector.

    The file is decoded with ``librosa.load`` and six frame-level descriptors
    are averaged over time, then concatenated in this order: 40 MFCCs,
    chroma (STFT), spectral contrast, spectral centroid, zero-crossing rate,
    spectral roll-off.

    Args:
        file_name: Path of the audio file to read.

    Returns:
        A 1-D NumPy array of features, or ``None`` when loading or feature
        extraction fails (the failure is printed, not raised).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # For mono audio librosa returns each feature as
        # (n_coefficients, n_frames); averaging over the frame axis yields one
        # value per coefficient regardless of the clip length.
        frame_level_features = [
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40),
            librosa.feature.chroma_stft(y=audio, sr=sample_rate),
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate),
            librosa.feature.zero_crossing_rate(y=audio),
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate),
        ]

        # Combine all features into a 1D array
        return np.hstack([np.mean(feature, axis=1) for feature in frame_level_features])
    except Exception as e:
        # Best-effort by design: report the cause and let the caller skip the file.
        print(f"Error encountered while parsing file: {file_name} ({type(e).__name__}: {e})")
        return None

def record_and_classify_audio(slide_seconds=10):
    """Record from the default input device and classify a sliding window.

    A 60-second window of 16-bit mono audio is captured first. After that the
    window advances by ``slide_seconds`` at a time; every new window is written
    to a temporary WAV file, converted to features with ``extract_features``,
    scaled with the notebook-level ``scaler`` and classified with the
    notebook-level ``model``. The decoded label is printed. Runs until
    interrupted (Ctrl+C), then releases the audio device.

    Args:
        slide_seconds: How far the window advances between predictions, in
            seconds (rounded down to whole 1024-sample chunks, minimum one).
    """
    audio_duration = 60  # Length of the analysis window in seconds
    sample_rate = 44100  # 44.1kHz
    chunk_size = 1024  # Samples fetched per stream.read() call
    output_audio_file = "recorded_audio.wav"

    chunks_per_second = sample_rate / chunk_size
    window_chunks = int(chunks_per_second * audio_duration)
    # At least one chunk, otherwise the loop would never read new audio.
    slide_chunks = max(1, int(chunks_per_second * slide_seconds))

    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                        input=True, frames_per_buffer=chunk_size)
        sample_width = p.get_sample_size(pyaudio.paInt16)

        # Initialize the buffer with 1-minute audio data
        print("Initializing audio buffer with 1-minute audio...")
        audio_frames = [
            stream.read(chunk_size, exception_on_overflow=False)
            for _ in range(window_chunks)
        ]
        print("Initialization complete.")

        model.eval()  # Inference only; set once instead of on every window.

        while True:
            print(f"Sliding the window by {slide_seconds} seconds...")
            # Drop the oldest chunks and append the same amount of fresh audio.
            del audio_frames[:slide_chunks]
            for _ in range(slide_chunks):
                # Processing a window may take longer than the device buffers
                # audio for, so input overflows are tolerated instead of raised.
                audio_frames.append(stream.read(chunk_size, exception_on_overflow=False))

            try:
                with wave.open(output_audio_file, 'wb') as wf:
                    wf.setnchannels(1)
                    wf.setsampwidth(sample_width)
                    wf.setframerate(sample_rate)
                    wf.writeframes(b''.join(audio_frames))

                # extract features
                recorded_features = extract_features(output_audio_file)
            finally:
                # Delete the audio file, also when writing or extraction failed.
                if os.path.exists(output_audio_file):
                    os.remove(output_audio_file)

            if recorded_features is not None:
                scaled_features = scaler.transform(pd.DataFrame([recorded_features]))
                recorded_tensor = torch.tensor(scaled_features, dtype=torch.float32)
                with torch.no_grad():
                    # One sample fed as a sequence of length 1.
                    predicted_output = model(recorded_tensor.unsqueeze(1))
                    _max_scores, predicted_class = torch.max(predicted_output, 1)

                predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
                print("Predicted Class:", predicted_class_label[0])
            else:
                print("Error occurred while extracting features from recorded audio.")

            time.sleep(0.1)
    except KeyboardInterrupt:
        print("Stopped.")
    finally:
        # Release the audio device however the loop ends.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

# Entry point
if __name__ == "__main__":
    record_and_classify_audio()


## Sliding the window by 0.1 second


In [None]:
import pyaudio
import wave
import time
import os
import pandas as pd  
import torch


def extract_features(file_name):
    """Summarise one audio file as a fixed-length 1-D feature vector.

    The file is decoded with ``librosa.load`` and six frame-level descriptors
    are averaged over time, then concatenated in this order: 40 MFCCs,
    chroma (STFT), spectral contrast, spectral centroid, zero-crossing rate,
    spectral roll-off.

    Args:
        file_name: Path of the audio file to read.

    Returns:
        A 1-D NumPy array of features, or ``None`` when loading or feature
        extraction fails (the failure is printed, not raised).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # For mono audio librosa returns each feature as
        # (n_coefficients, n_frames); averaging over the frame axis yields one
        # value per coefficient regardless of the clip length.
        frame_level_features = [
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40),
            librosa.feature.chroma_stft(y=audio, sr=sample_rate),
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate),
            librosa.feature.zero_crossing_rate(y=audio),
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate),
        ]

        # Combine all features into a 1D array
        return np.hstack([np.mean(feature, axis=1) for feature in frame_level_features])
    except Exception as e:
        # Best-effort by design: report the cause and let the caller skip the file.
        print(f"Error encountered while parsing file: {file_name} ({type(e).__name__}: {e})")
        return None

def record_and_classify_audio(slide_seconds=0.1):
    """Record from the default input device and classify a sliding window.

    A 60-second window of 16-bit mono audio is captured first. After that the
    window advances by ``slide_seconds`` at a time; every new window is written
    to a temporary WAV file, converted to features with ``extract_features``,
    scaled with the notebook-level ``scaler`` and classified with the
    notebook-level ``model``. The decoded label is printed. Runs until
    interrupted (Ctrl+C), then releases the audio device.

    Args:
        slide_seconds: How far the window advances between predictions, in
            seconds (rounded down to whole 1024-sample chunks, minimum one).
    """
    audio_duration = 60  # Length of the analysis window in seconds
    sample_rate = 44100  # 44.1kHz
    chunk_size = 1024  # Samples fetched per stream.read() call
    output_audio_file = "recorded_audio.wav"

    chunks_per_second = sample_rate / chunk_size
    window_chunks = int(chunks_per_second * audio_duration)
    # At least one chunk, otherwise the loop would never read new audio.
    slide_chunks = max(1, int(chunks_per_second * slide_seconds))

    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                        input=True, frames_per_buffer=chunk_size)
        sample_width = p.get_sample_size(pyaudio.paInt16)

        # Initialize the buffer with 1-minute audio data
        print("Initializing audio buffer with 1-minute audio...")
        audio_frames = [
            stream.read(chunk_size, exception_on_overflow=False)
            for _ in range(window_chunks)
        ]
        print("Initialization complete.")

        model.eval()  # Inference only; set once instead of on every window.

        while True:
            print(f"Sliding the window by {slide_seconds} seconds...")
            # Drop the oldest chunks and append the same amount of fresh audio.
            del audio_frames[:slide_chunks]
            for _ in range(slide_chunks):
                # Processing a window may take longer than the device buffers
                # audio for, so input overflows are tolerated instead of raised.
                audio_frames.append(stream.read(chunk_size, exception_on_overflow=False))

            try:
                with wave.open(output_audio_file, 'wb') as wf:
                    wf.setnchannels(1)
                    wf.setsampwidth(sample_width)
                    wf.setframerate(sample_rate)
                    wf.writeframes(b''.join(audio_frames))

                # extract features
                recorded_features = extract_features(output_audio_file)
            finally:
                # Delete the audio file, also when writing or extraction failed.
                if os.path.exists(output_audio_file):
                    os.remove(output_audio_file)

            if recorded_features is not None:
                scaled_features = scaler.transform(pd.DataFrame([recorded_features]))
                recorded_tensor = torch.tensor(scaled_features, dtype=torch.float32)
                with torch.no_grad():
                    # One sample fed as a sequence of length 1.
                    predicted_output = model(recorded_tensor.unsqueeze(1))
                    _max_scores, predicted_class = torch.max(predicted_output, 1)

                predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
                print("Predicted Class:", predicted_class_label[0])
            else:
                print("Error occurred while extracting features from recorded audio.")

            time.sleep(0.1)
    except KeyboardInterrupt:
        print("Stopped.")
    finally:
        # Release the audio device however the loop ends.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

# Entry point
if __name__ == "__main__":
    record_and_classify_audio()


## Sliding the window by 0.2 seconds

In [None]:
import pyaudio
import wave
import time
import os
import pandas as pd  
import torch


def extract_features(file_name):
    """Summarise one audio file as a fixed-length 1-D feature vector.

    The file is decoded with ``librosa.load`` and six frame-level descriptors
    are averaged over time, then concatenated in this order: 40 MFCCs,
    chroma (STFT), spectral contrast, spectral centroid, zero-crossing rate,
    spectral roll-off.

    Args:
        file_name: Path of the audio file to read.

    Returns:
        A 1-D NumPy array of features, or ``None`` when loading or feature
        extraction fails (the failure is printed, not raised).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # For mono audio librosa returns each feature as
        # (n_coefficients, n_frames); averaging over the frame axis yields one
        # value per coefficient regardless of the clip length.
        frame_level_features = [
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40),
            librosa.feature.chroma_stft(y=audio, sr=sample_rate),
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate),
            librosa.feature.zero_crossing_rate(y=audio),
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate),
        ]

        # Combine all features into a 1D array
        return np.hstack([np.mean(feature, axis=1) for feature in frame_level_features])
    except Exception as e:
        # Best-effort by design: report the cause and let the caller skip the file.
        print(f"Error encountered while parsing file: {file_name} ({type(e).__name__}: {e})")
        return None

def record_and_classify_audio(slide_seconds=0.2):
    """Record from the default input device and classify a sliding window.

    A 60-second window of 16-bit mono audio is captured first. After that the
    window advances by ``slide_seconds`` at a time; every new window is written
    to a temporary WAV file, converted to features with ``extract_features``,
    scaled with the notebook-level ``scaler`` and classified with the
    notebook-level ``model``. The decoded label is printed. Runs until
    interrupted (Ctrl+C), then releases the audio device.

    Args:
        slide_seconds: How far the window advances between predictions, in
            seconds (rounded down to whole 1024-sample chunks, minimum one).
    """
    audio_duration = 60  # Length of the analysis window in seconds
    sample_rate = 44100  # 44.1kHz
    chunk_size = 1024  # Samples fetched per stream.read() call
    output_audio_file = "recorded_audio.wav"

    chunks_per_second = sample_rate / chunk_size
    window_chunks = int(chunks_per_second * audio_duration)
    # At least one chunk, otherwise the loop would never read new audio.
    slide_chunks = max(1, int(chunks_per_second * slide_seconds))

    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                        input=True, frames_per_buffer=chunk_size)
        sample_width = p.get_sample_size(pyaudio.paInt16)

        # Initialize the buffer with 1-minute audio data
        print("Initializing audio buffer with 1-minute audio...")
        audio_frames = [
            stream.read(chunk_size, exception_on_overflow=False)
            for _ in range(window_chunks)
        ]
        print("Initialization complete.")

        model.eval()  # Inference only; set once instead of on every window.

        while True:
            print(f"Sliding the window by {slide_seconds} seconds...")
            # Drop the oldest chunks and append the same amount of fresh audio.
            del audio_frames[:slide_chunks]
            for _ in range(slide_chunks):
                # Processing a window may take longer than the device buffers
                # audio for, so input overflows are tolerated instead of raised.
                audio_frames.append(stream.read(chunk_size, exception_on_overflow=False))

            try:
                with wave.open(output_audio_file, 'wb') as wf:
                    wf.setnchannels(1)
                    wf.setsampwidth(sample_width)
                    wf.setframerate(sample_rate)
                    wf.writeframes(b''.join(audio_frames))

                # extract features
                recorded_features = extract_features(output_audio_file)
            finally:
                # Delete the audio file, also when writing or extraction failed.
                if os.path.exists(output_audio_file):
                    os.remove(output_audio_file)

            if recorded_features is not None:
                scaled_features = scaler.transform(pd.DataFrame([recorded_features]))
                recorded_tensor = torch.tensor(scaled_features, dtype=torch.float32)
                with torch.no_grad():
                    # One sample fed as a sequence of length 1.
                    predicted_output = model(recorded_tensor.unsqueeze(1))
                    _max_scores, predicted_class = torch.max(predicted_output, 1)

                predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
                print("Predicted Class:", predicted_class_label[0])
            else:
                print("Error occurred while extracting features from recorded audio.")

            time.sleep(0.1)
    except KeyboardInterrupt:
        print("Stopped.")
    finally:
        # Release the audio device however the loop ends.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

# Entry point
if __name__ == "__main__":
    record_and_classify_audio()


## Sliding the window by 0.3 seconds

In [None]:
import pyaudio
import wave
import time
import os
import pandas as pd  
import torch


def extract_features(file_name):
    """Summarise one audio file as a fixed-length 1-D feature vector.

    The file is decoded with ``librosa.load`` and six frame-level descriptors
    are averaged over time, then concatenated in this order: 40 MFCCs,
    chroma (STFT), spectral contrast, spectral centroid, zero-crossing rate,
    spectral roll-off.

    Args:
        file_name: Path of the audio file to read.

    Returns:
        A 1-D NumPy array of features, or ``None`` when loading or feature
        extraction fails (the failure is printed, not raised).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # For mono audio librosa returns each feature as
        # (n_coefficients, n_frames); averaging over the frame axis yields one
        # value per coefficient regardless of the clip length.
        frame_level_features = [
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40),
            librosa.feature.chroma_stft(y=audio, sr=sample_rate),
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate),
            librosa.feature.zero_crossing_rate(y=audio),
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate),
        ]

        # Combine all features into a 1D array
        return np.hstack([np.mean(feature, axis=1) for feature in frame_level_features])
    except Exception as e:
        # Best-effort by design: report the cause and let the caller skip the file.
        print(f"Error encountered while parsing file: {file_name} ({type(e).__name__}: {e})")
        return None

def record_and_classify_audio(slide_seconds=0.3):
    """Record from the default input device and classify a sliding window.

    A 60-second window of 16-bit mono audio is captured first. After that the
    window advances by ``slide_seconds`` at a time; every new window is written
    to a temporary WAV file, converted to features with ``extract_features``,
    scaled with the notebook-level ``scaler`` and classified with the
    notebook-level ``model``. The decoded label is printed. Runs until
    interrupted (Ctrl+C), then releases the audio device.

    Args:
        slide_seconds: How far the window advances between predictions, in
            seconds (rounded down to whole 1024-sample chunks, minimum one).
    """
    audio_duration = 60  # Length of the analysis window in seconds
    sample_rate = 44100  # 44.1kHz
    chunk_size = 1024  # Samples fetched per stream.read() call
    output_audio_file = "recorded_audio.wav"

    chunks_per_second = sample_rate / chunk_size
    window_chunks = int(chunks_per_second * audio_duration)
    # At least one chunk, otherwise the loop would never read new audio.
    slide_chunks = max(1, int(chunks_per_second * slide_seconds))

    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                        input=True, frames_per_buffer=chunk_size)
        sample_width = p.get_sample_size(pyaudio.paInt16)

        # Initialize the buffer with 1-minute audio data
        print("Initializing audio buffer with 1-minute audio...")
        audio_frames = [
            stream.read(chunk_size, exception_on_overflow=False)
            for _ in range(window_chunks)
        ]
        print("Initialization complete.")

        model.eval()  # Inference only; set once instead of on every window.

        while True:
            print(f"Sliding the window by {slide_seconds} seconds...")
            # Drop the oldest chunks and append the same amount of fresh audio.
            del audio_frames[:slide_chunks]
            for _ in range(slide_chunks):
                # Processing a window may take longer than the device buffers
                # audio for, so input overflows are tolerated instead of raised.
                audio_frames.append(stream.read(chunk_size, exception_on_overflow=False))

            try:
                with wave.open(output_audio_file, 'wb') as wf:
                    wf.setnchannels(1)
                    wf.setsampwidth(sample_width)
                    wf.setframerate(sample_rate)
                    wf.writeframes(b''.join(audio_frames))

                # extract features
                recorded_features = extract_features(output_audio_file)
            finally:
                # Delete the audio file, also when writing or extraction failed.
                if os.path.exists(output_audio_file):
                    os.remove(output_audio_file)

            if recorded_features is not None:
                scaled_features = scaler.transform(pd.DataFrame([recorded_features]))
                recorded_tensor = torch.tensor(scaled_features, dtype=torch.float32)
                with torch.no_grad():
                    # One sample fed as a sequence of length 1.
                    predicted_output = model(recorded_tensor.unsqueeze(1))
                    _max_scores, predicted_class = torch.max(predicted_output, 1)

                predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
                print("Predicted Class:", predicted_class_label[0])
            else:
                print("Error occurred while extracting features from recorded audio.")

            time.sleep(0.1)
    except KeyboardInterrupt:
        print("Stopped.")
    finally:
        # Release the audio device however the loop ends.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

# Entry point
if __name__ == "__main__":
    record_and_classify_audio()


## Sliding the window by 0.4 seconds

In [None]:
import pyaudio
import wave
import time
import os
import pandas as pd  
import torch


def extract_features(file_name):
    """Summarise one audio file as a fixed-length 1-D feature vector.

    The file is decoded with ``librosa.load`` and six frame-level descriptors
    are averaged over time, then concatenated in this order: 40 MFCCs,
    chroma (STFT), spectral contrast, spectral centroid, zero-crossing rate,
    spectral roll-off.

    Args:
        file_name: Path of the audio file to read.

    Returns:
        A 1-D NumPy array of features, or ``None`` when loading or feature
        extraction fails (the failure is printed, not raised).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # For mono audio librosa returns each feature as
        # (n_coefficients, n_frames); averaging over the frame axis yields one
        # value per coefficient regardless of the clip length.
        frame_level_features = [
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40),
            librosa.feature.chroma_stft(y=audio, sr=sample_rate),
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate),
            librosa.feature.zero_crossing_rate(y=audio),
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate),
        ]

        # Combine all features into a 1D array
        return np.hstack([np.mean(feature, axis=1) for feature in frame_level_features])
    except Exception as e:
        # Best-effort by design: report the cause and let the caller skip the file.
        print(f"Error encountered while parsing file: {file_name} ({type(e).__name__}: {e})")
        return None

def record_and_classify_audio(slide_seconds=0.4):
    """Record from the default input device and classify a sliding window.

    A 60-second window of 16-bit mono audio is captured first. After that the
    window advances by ``slide_seconds`` at a time; every new window is written
    to a temporary WAV file, converted to features with ``extract_features``,
    scaled with the notebook-level ``scaler`` and classified with the
    notebook-level ``model``. The decoded label is printed. Runs until
    interrupted (Ctrl+C), then releases the audio device.

    Args:
        slide_seconds: How far the window advances between predictions, in
            seconds (rounded down to whole 1024-sample chunks, minimum one).
    """
    audio_duration = 60  # Length of the analysis window in seconds
    sample_rate = 44100  # 44.1kHz
    chunk_size = 1024  # Samples fetched per stream.read() call
    output_audio_file = "recorded_audio.wav"

    chunks_per_second = sample_rate / chunk_size
    window_chunks = int(chunks_per_second * audio_duration)
    # At least one chunk, otherwise the loop would never read new audio.
    slide_chunks = max(1, int(chunks_per_second * slide_seconds))

    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                        input=True, frames_per_buffer=chunk_size)
        sample_width = p.get_sample_size(pyaudio.paInt16)

        # Initialize the buffer with 1-minute audio data
        print("Initializing audio buffer with 1-minute audio...")
        audio_frames = [
            stream.read(chunk_size, exception_on_overflow=False)
            for _ in range(window_chunks)
        ]
        print("Initialization complete.")

        model.eval()  # Inference only; set once instead of on every window.

        while True:
            print(f"Sliding the window by {slide_seconds} seconds...")
            # Drop the oldest chunks and append the same amount of fresh audio.
            del audio_frames[:slide_chunks]
            for _ in range(slide_chunks):
                # Processing a window may take longer than the device buffers
                # audio for, so input overflows are tolerated instead of raised.
                audio_frames.append(stream.read(chunk_size, exception_on_overflow=False))

            try:
                with wave.open(output_audio_file, 'wb') as wf:
                    wf.setnchannels(1)
                    wf.setsampwidth(sample_width)
                    wf.setframerate(sample_rate)
                    wf.writeframes(b''.join(audio_frames))

                # extract features
                recorded_features = extract_features(output_audio_file)
            finally:
                # Delete the audio file, also when writing or extraction failed.
                if os.path.exists(output_audio_file):
                    os.remove(output_audio_file)

            if recorded_features is not None:
                scaled_features = scaler.transform(pd.DataFrame([recorded_features]))
                recorded_tensor = torch.tensor(scaled_features, dtype=torch.float32)
                with torch.no_grad():
                    # One sample fed as a sequence of length 1.
                    predicted_output = model(recorded_tensor.unsqueeze(1))
                    _max_scores, predicted_class = torch.max(predicted_output, 1)

                predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
                print("Predicted Class:", predicted_class_label[0])
            else:
                print("Error occurred while extracting features from recorded audio.")

            time.sleep(0.1)
    except KeyboardInterrupt:
        print("Stopped.")
    finally:
        # Release the audio device however the loop ends.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

# Entry point
if __name__ == "__main__":
    record_and_classify_audio()


## Sliding the window by 0.5 seconds

In [None]:
import pyaudio
import wave
import time
import os
import pandas as pd  
import torch


def extract_features(file_name):
    """Summarise one audio file as a fixed-length 1-D feature vector.

    The file is decoded with ``librosa.load`` and six frame-level descriptors
    are averaged over time, then concatenated in this order: 40 MFCCs,
    chroma (STFT), spectral contrast, spectral centroid, zero-crossing rate,
    spectral roll-off.

    Args:
        file_name: Path of the audio file to read.

    Returns:
        A 1-D NumPy array of features, or ``None`` when loading or feature
        extraction fails (the failure is printed, not raised).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # For mono audio librosa returns each feature as
        # (n_coefficients, n_frames); averaging over the frame axis yields one
        # value per coefficient regardless of the clip length.
        frame_level_features = [
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40),
            librosa.feature.chroma_stft(y=audio, sr=sample_rate),
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate),
            librosa.feature.zero_crossing_rate(y=audio),
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate),
        ]

        # Combine all features into a 1D array
        return np.hstack([np.mean(feature, axis=1) for feature in frame_level_features])
    except Exception as e:
        # Best-effort by design: report the cause and let the caller skip the file.
        print(f"Error encountered while parsing file: {file_name} ({type(e).__name__}: {e})")
        return None

def record_and_classify_audio(slide_seconds=0.5):
    """Record from the default input device and classify a sliding window.

    A 60-second window of 16-bit mono audio is captured first. After that the
    window advances by ``slide_seconds`` at a time; every new window is written
    to a temporary WAV file, converted to features with ``extract_features``,
    scaled with the notebook-level ``scaler`` and classified with the
    notebook-level ``model``. The decoded label is printed. Runs until
    interrupted (Ctrl+C), then releases the audio device.

    Args:
        slide_seconds: How far the window advances between predictions, in
            seconds (rounded down to whole 1024-sample chunks, minimum one).
    """
    audio_duration = 60  # Length of the analysis window in seconds
    sample_rate = 44100  # 44.1kHz
    chunk_size = 1024  # Samples fetched per stream.read() call
    output_audio_file = "recorded_audio.wav"

    chunks_per_second = sample_rate / chunk_size
    window_chunks = int(chunks_per_second * audio_duration)
    # At least one chunk, otherwise the loop would never read new audio.
    slide_chunks = max(1, int(chunks_per_second * slide_seconds))

    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                        input=True, frames_per_buffer=chunk_size)
        sample_width = p.get_sample_size(pyaudio.paInt16)

        # Initialize the buffer with 1-minute audio data
        print("Initializing audio buffer with 1-minute audio...")
        audio_frames = [
            stream.read(chunk_size, exception_on_overflow=False)
            for _ in range(window_chunks)
        ]
        print("Initialization complete.")

        model.eval()  # Inference only; set once instead of on every window.

        while True:
            print(f"Sliding the window by {slide_seconds} seconds...")
            # Drop the oldest chunks and append the same amount of fresh audio.
            del audio_frames[:slide_chunks]
            for _ in range(slide_chunks):
                # Processing a window may take longer than the device buffers
                # audio for, so input overflows are tolerated instead of raised.
                audio_frames.append(stream.read(chunk_size, exception_on_overflow=False))

            try:
                with wave.open(output_audio_file, 'wb') as wf:
                    wf.setnchannels(1)
                    wf.setsampwidth(sample_width)
                    wf.setframerate(sample_rate)
                    wf.writeframes(b''.join(audio_frames))

                # extract features
                recorded_features = extract_features(output_audio_file)
            finally:
                # Delete the audio file, also when writing or extraction failed.
                if os.path.exists(output_audio_file):
                    os.remove(output_audio_file)

            if recorded_features is not None:
                scaled_features = scaler.transform(pd.DataFrame([recorded_features]))
                recorded_tensor = torch.tensor(scaled_features, dtype=torch.float32)
                with torch.no_grad():
                    # One sample fed as a sequence of length 1.
                    predicted_output = model(recorded_tensor.unsqueeze(1))
                    _max_scores, predicted_class = torch.max(predicted_output, 1)

                predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
                print("Predicted Class:", predicted_class_label[0])
            else:
                print("Error occurred while extracting features from recorded audio.")

            time.sleep(0.1)
    except KeyboardInterrupt:
        print("Stopped.")
    finally:
        # Release the audio device however the loop ends.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

# Entry point
if __name__ == "__main__":
    record_and_classify_audio()


## Sliding the window by 0.6 seconds

In [None]:
import pyaudio
import wave
import time
import os
import pandas as pd  
import torch


def extract_features(file_name):
    """Summarise one audio file as a fixed-length 1-D feature vector.

    The file is decoded with ``librosa.load`` and six frame-level descriptors
    are averaged over time, then concatenated in this order: 40 MFCCs,
    chroma (STFT), spectral contrast, spectral centroid, zero-crossing rate,
    spectral roll-off.

    Args:
        file_name: Path of the audio file to read.

    Returns:
        A 1-D NumPy array of features, or ``None`` when loading or feature
        extraction fails (the failure is printed, not raised).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # For mono audio librosa returns each feature as
        # (n_coefficients, n_frames); averaging over the frame axis yields one
        # value per coefficient regardless of the clip length.
        frame_level_features = [
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40),
            librosa.feature.chroma_stft(y=audio, sr=sample_rate),
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate),
            librosa.feature.zero_crossing_rate(y=audio),
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate),
        ]

        # Combine all features into a 1D array
        return np.hstack([np.mean(feature, axis=1) for feature in frame_level_features])
    except Exception as e:
        # Best-effort by design: report the cause and let the caller skip the file.
        print(f"Error encountered while parsing file: {file_name} ({type(e).__name__}: {e})")
        return None

def record_and_classify_audio(slide_seconds=0.6):
    """Classify microphone audio over a sliding window of about one minute.

    The window is first filled with roughly 60 seconds of audio read from a
    PyAudio input stream (mono, 16-bit, 44.1 kHz, 1024-sample chunks). Then,
    in an endless loop: the oldest ``slide_seconds`` worth of chunks is
    dropped, the same number of new chunks is recorded, the whole window is
    written to ``recorded_audio.wav``, features are extracted from that file,
    scaled, passed through the model, and the predicted label is printed.

    Relies on names defined outside this function: ``extract_features``,
    ``scaler``, ``model`` and ``label_encoder``.

    The loop has no exit condition; it runs until it is interrupted or an
    exception is raised. The stream and the PyAudio instance are released in
    either case.

    Args:
        slide_seconds: How far the window advances between predictions, in
            seconds. Defaults to 0.6.

    Raises:
        ValueError: If ``slide_seconds`` is shorter than one chunk (which
            would make the loop spin without reading audio) or longer than
            the window.
    """
    audio_duration = 60  # Length of the analysis window, in seconds
    sample_rate = 44100  # 44.1kHz
    chunk_size = 1024  # Samples per stream.read() call
    output_audio_file = "recorded_audio.wav"

    window_chunks = int(sample_rate / chunk_size * audio_duration)
    slide_chunks = int(sample_rate / chunk_size * slide_seconds)
    if not 0 < slide_chunks <= window_chunks:
        raise ValueError(
            f"slide_seconds must cover at least one {chunk_size}-sample chunk "
            f"and at most {audio_duration} seconds; got {slide_seconds!r}"
        )

    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                        input=True, frames_per_buffer=chunk_size)

        # Initialize the buffer with 1-minute audio data
        audio_frames = []
        print("Initializing audio buffer with 1-minute audio...")
        for _ in range(window_chunks):
            audio_frames.append(stream.read(chunk_size))
        print("Initialization complete.")

        # Inference only: switch the model to eval mode once, not per window.
        model.eval()

        while True:
            print(f"Sliding the window by {slide_seconds} seconds...")
            # Drop the oldest chunks and append the same number of new ones,
            # so the window length stays constant.
            del audio_frames[:slide_chunks]
            for _ in range(slide_chunks):
                audio_frames.append(stream.read(chunk_size))

            # extract_features works on a file path, so the window is
            # round-tripped through a temporary WAV file.
            with wave.open(output_audio_file, 'wb') as wf:
                wf.setnchannels(1)
                wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
                wf.setframerate(sample_rate)
                wf.writeframes(b''.join(audio_frames))

            # extract features
            recorded_features = extract_features(output_audio_file)

            if recorded_features is not None:
                recorded_df = pd.DataFrame([recorded_features])
                recorded_scaled = scaler.transform(recorded_df)
                recorded_tensor = torch.tensor(recorded_scaled, dtype=torch.float32)
                with torch.no_grad():
                    # unsqueeze(1) inserts a length-1 axis at position 1;
                    # assumes the model wants (batch, seq, features) like the
                    # batch_first RNN at the top of the notebook - TODO confirm
                    # if `model` was redefined since.
                    predicted_output = model(recorded_tensor.unsqueeze(1))
                    _, predicted_class = torch.max(predicted_output, 1)

                predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
                print("Predicted Class:", predicted_class_label[0])

                # Delete the audio file
                os.remove(output_audio_file)
            else:
                print("Error occurred while extracting features from recorded audio.")

            # NOTE(review): nothing is read from the stream while features are
            # extracted or during this sleep; if PyAudio reports input
            # overflow on the next read, consider
            # stream.read(chunk_size, exception_on_overflow=False).
            time.sleep(0.1)
    finally:
        # Always release the audio device, including on KeyboardInterrupt.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

# Entry point: start the live capture/classification loop (0.6-second slide)
# when this cell is executed; in a notebook kernel __name__ is "__main__".
if __name__ == "__main__":
    record_and_classify_audio()


## Sliding the window by 0.7 seconds


In [None]:
import pyaudio
import wave
import time
import os
import pandas as pd  
import torch


def extract_features(file_name):
    """Summarise an audio file as a single 1-D feature vector.

    The file is loaded with ``librosa.load`` (``res_type='kaiser_fast'``) and
    six descriptors are computed from it. Each descriptor matrix is transposed
    and averaged over axis 0, and the resulting means are concatenated in this
    order: MFCC (``n_mfcc=40``), chroma STFT, spectral contrast, spectral
    centroid, zero-crossing rate, spectral roll-off.

    ``librosa`` and ``np`` are not imported in this cell, so they must already
    be present in the kernel namespace from an earlier cell.

    Args:
        file_name: Path of the audio file to analyse.

    Returns:
        The concatenated feature means as a 1-D array, or ``None`` if loading
        or feature extraction raised any ``Exception`` (the error is printed).
    """

    def _mean_over_frames(feature_matrix):
        # Same reduction the features were always given: transpose, then
        # average along axis 0 (one mean per coefficient).
        return np.mean(feature_matrix.T, axis=0)

    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # MFCC (Mel-frequency cepstral coefficients)
        mfccs_processed = _mean_over_frames(
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40))

        # Chroma feature
        chroma_stft = _mean_over_frames(
            librosa.feature.chroma_stft(y=audio, sr=sample_rate))

        # Spectral contrast
        spectral_contrast = _mean_over_frames(
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate))

        # Spectral centroid
        spectral_centroids = _mean_over_frames(
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate))

        # Zero-crossing rate (computed on the waveform only, no sample rate)
        zero_crossing_rate = _mean_over_frames(
            librosa.feature.zero_crossing_rate(y=audio))

        # Spectral rolloff
        spectral_rolloff = _mean_over_frames(
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate))

        # Combine all features into a 1D array; the order must stay the same
        # as the one used to build the training features.
        return np.hstack([
            mfccs_processed,
            chroma_stft,
            spectral_contrast,
            spectral_centroids,
            zero_crossing_rate,
            spectral_rolloff,
        ])
    except Exception as e:
        print(f"Error encountered while parsing file: {file_name}")
        # Also report the cause so failures (missing file, missing import,
        # decoding error, ...) can be told apart.
        print(f"  {type(e).__name__}: {e}")
        return None

def record_and_classify_audio(slide_seconds=0.7):
    """Classify microphone audio over a sliding window of about one minute.

    The window is first filled with roughly 60 seconds of audio read from a
    PyAudio input stream (mono, 16-bit, 44.1 kHz, 1024-sample chunks). Then,
    in an endless loop: the oldest ``slide_seconds`` worth of chunks is
    dropped, the same number of new chunks is recorded, the whole window is
    written to ``recorded_audio.wav``, features are extracted from that file,
    scaled, passed through the model, and the predicted label is printed.

    Relies on names defined outside this function: ``extract_features``,
    ``scaler``, ``model`` and ``label_encoder``.

    The loop has no exit condition; it runs until it is interrupted or an
    exception is raised. The stream and the PyAudio instance are released in
    either case.

    Args:
        slide_seconds: How far the window advances between predictions, in
            seconds. Defaults to 0.7.

    Raises:
        ValueError: If ``slide_seconds`` is shorter than one chunk (which
            would make the loop spin without reading audio) or longer than
            the window.
    """
    audio_duration = 60  # Length of the analysis window, in seconds
    sample_rate = 44100  # 44.1kHz
    chunk_size = 1024  # Samples per stream.read() call
    output_audio_file = "recorded_audio.wav"

    window_chunks = int(sample_rate / chunk_size * audio_duration)
    slide_chunks = int(sample_rate / chunk_size * slide_seconds)
    if not 0 < slide_chunks <= window_chunks:
        raise ValueError(
            f"slide_seconds must cover at least one {chunk_size}-sample chunk "
            f"and at most {audio_duration} seconds; got {slide_seconds!r}"
        )

    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                        input=True, frames_per_buffer=chunk_size)

        # Initialize the buffer with 1-minute audio data
        audio_frames = []
        print("Initializing audio buffer with 1-minute audio...")
        for _ in range(window_chunks):
            audio_frames.append(stream.read(chunk_size))
        print("Initialization complete.")

        # Inference only: switch the model to eval mode once, not per window.
        model.eval()

        while True:
            print(f"Sliding the window by {slide_seconds} seconds...")
            # Drop the oldest chunks and append the same number of new ones,
            # so the window length stays constant.
            del audio_frames[:slide_chunks]
            for _ in range(slide_chunks):
                audio_frames.append(stream.read(chunk_size))

            # extract_features works on a file path, so the window is
            # round-tripped through a temporary WAV file.
            with wave.open(output_audio_file, 'wb') as wf:
                wf.setnchannels(1)
                wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
                wf.setframerate(sample_rate)
                wf.writeframes(b''.join(audio_frames))

            # extract features
            recorded_features = extract_features(output_audio_file)

            if recorded_features is not None:
                recorded_df = pd.DataFrame([recorded_features])
                recorded_scaled = scaler.transform(recorded_df)
                recorded_tensor = torch.tensor(recorded_scaled, dtype=torch.float32)
                with torch.no_grad():
                    # unsqueeze(1) inserts a length-1 axis at position 1;
                    # assumes the model wants (batch, seq, features) like the
                    # batch_first RNN at the top of the notebook - TODO confirm
                    # if `model` was redefined since.
                    predicted_output = model(recorded_tensor.unsqueeze(1))
                    _, predicted_class = torch.max(predicted_output, 1)

                predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
                print("Predicted Class:", predicted_class_label[0])

                # Delete the audio file
                os.remove(output_audio_file)
            else:
                print("Error occurred while extracting features from recorded audio.")

            # NOTE(review): nothing is read from the stream while features are
            # extracted or during this sleep; if PyAudio reports input
            # overflow on the next read, consider
            # stream.read(chunk_size, exception_on_overflow=False).
            time.sleep(0.1)
    finally:
        # Always release the audio device, including on KeyboardInterrupt.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

# Entry point: start the live capture/classification loop (0.7-second slide)
# when this cell is executed; in a notebook kernel __name__ is "__main__".
if __name__ == "__main__":
    record_and_classify_audio()


## Sliding the window by 0.8 seconds


In [None]:
import pyaudio
import wave
import time
import os
import pandas as pd  
import torch


def extract_features(file_name):
    """Summarise an audio file as a single 1-D feature vector.

    The file is loaded with ``librosa.load`` (``res_type='kaiser_fast'``) and
    six descriptors are computed from it. Each descriptor matrix is transposed
    and averaged over axis 0, and the resulting means are concatenated in this
    order: MFCC (``n_mfcc=40``), chroma STFT, spectral contrast, spectral
    centroid, zero-crossing rate, spectral roll-off.

    ``librosa`` and ``np`` are not imported in this cell, so they must already
    be present in the kernel namespace from an earlier cell.

    Args:
        file_name: Path of the audio file to analyse.

    Returns:
        The concatenated feature means as a 1-D array, or ``None`` if loading
        or feature extraction raised any ``Exception`` (the error is printed).
    """

    def _mean_over_frames(feature_matrix):
        # Same reduction the features were always given: transpose, then
        # average along axis 0 (one mean per coefficient).
        return np.mean(feature_matrix.T, axis=0)

    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # MFCC (Mel-frequency cepstral coefficients)
        mfccs_processed = _mean_over_frames(
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40))

        # Chroma feature
        chroma_stft = _mean_over_frames(
            librosa.feature.chroma_stft(y=audio, sr=sample_rate))

        # Spectral contrast
        spectral_contrast = _mean_over_frames(
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate))

        # Spectral centroid
        spectral_centroids = _mean_over_frames(
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate))

        # Zero-crossing rate (computed on the waveform only, no sample rate)
        zero_crossing_rate = _mean_over_frames(
            librosa.feature.zero_crossing_rate(y=audio))

        # Spectral rolloff
        spectral_rolloff = _mean_over_frames(
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate))

        # Combine all features into a 1D array; the order must stay the same
        # as the one used to build the training features.
        return np.hstack([
            mfccs_processed,
            chroma_stft,
            spectral_contrast,
            spectral_centroids,
            zero_crossing_rate,
            spectral_rolloff,
        ])
    except Exception as e:
        print(f"Error encountered while parsing file: {file_name}")
        # Also report the cause so failures (missing file, missing import,
        # decoding error, ...) can be told apart.
        print(f"  {type(e).__name__}: {e}")
        return None

def record_and_classify_audio(slide_seconds=0.8):
    """Classify microphone audio over a sliding window of about one minute.

    The window is first filled with roughly 60 seconds of audio read from a
    PyAudio input stream (mono, 16-bit, 44.1 kHz, 1024-sample chunks). Then,
    in an endless loop: the oldest ``slide_seconds`` worth of chunks is
    dropped, the same number of new chunks is recorded, the whole window is
    written to ``recorded_audio.wav``, features are extracted from that file,
    scaled, passed through the model, and the predicted label is printed.

    Relies on names defined outside this function: ``extract_features``,
    ``scaler``, ``model`` and ``label_encoder``.

    The loop has no exit condition; it runs until it is interrupted or an
    exception is raised. The stream and the PyAudio instance are released in
    either case.

    Args:
        slide_seconds: How far the window advances between predictions, in
            seconds. Defaults to 0.8.

    Raises:
        ValueError: If ``slide_seconds`` is shorter than one chunk (which
            would make the loop spin without reading audio) or longer than
            the window.
    """
    audio_duration = 60  # Length of the analysis window, in seconds
    sample_rate = 44100  # 44.1kHz
    chunk_size = 1024  # Samples per stream.read() call
    output_audio_file = "recorded_audio.wav"

    window_chunks = int(sample_rate / chunk_size * audio_duration)
    slide_chunks = int(sample_rate / chunk_size * slide_seconds)
    if not 0 < slide_chunks <= window_chunks:
        raise ValueError(
            f"slide_seconds must cover at least one {chunk_size}-sample chunk "
            f"and at most {audio_duration} seconds; got {slide_seconds!r}"
        )

    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                        input=True, frames_per_buffer=chunk_size)

        # Initialize the buffer with 1-minute audio data
        audio_frames = []
        print("Initializing audio buffer with 1-minute audio...")
        for _ in range(window_chunks):
            audio_frames.append(stream.read(chunk_size))
        print("Initialization complete.")

        # Inference only: switch the model to eval mode once, not per window.
        model.eval()

        while True:
            print(f"Sliding the window by {slide_seconds} seconds...")
            # Drop the oldest chunks and append the same number of new ones,
            # so the window length stays constant.
            del audio_frames[:slide_chunks]
            for _ in range(slide_chunks):
                audio_frames.append(stream.read(chunk_size))

            # extract_features works on a file path, so the window is
            # round-tripped through a temporary WAV file.
            with wave.open(output_audio_file, 'wb') as wf:
                wf.setnchannels(1)
                wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
                wf.setframerate(sample_rate)
                wf.writeframes(b''.join(audio_frames))

            # extract features
            recorded_features = extract_features(output_audio_file)

            if recorded_features is not None:
                recorded_df = pd.DataFrame([recorded_features])
                recorded_scaled = scaler.transform(recorded_df)
                recorded_tensor = torch.tensor(recorded_scaled, dtype=torch.float32)
                with torch.no_grad():
                    # unsqueeze(1) inserts a length-1 axis at position 1;
                    # assumes the model wants (batch, seq, features) like the
                    # batch_first RNN at the top of the notebook - TODO confirm
                    # if `model` was redefined since.
                    predicted_output = model(recorded_tensor.unsqueeze(1))
                    _, predicted_class = torch.max(predicted_output, 1)

                predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
                print("Predicted Class:", predicted_class_label[0])

                # Delete the audio file
                os.remove(output_audio_file)
            else:
                print("Error occurred while extracting features from recorded audio.")

            # NOTE(review): nothing is read from the stream while features are
            # extracted or during this sleep; if PyAudio reports input
            # overflow on the next read, consider
            # stream.read(chunk_size, exception_on_overflow=False).
            time.sleep(0.1)
    finally:
        # Always release the audio device, including on KeyboardInterrupt.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

# Entry point: start the live capture/classification loop (0.8-second slide)
# when this cell is executed; in a notebook kernel __name__ is "__main__".
if __name__ == "__main__":
    record_and_classify_audio()


## Sliding the window by 0.9 seconds


In [None]:
import pyaudio
import wave
import time
import os
import pandas as pd  
import torch


def extract_features(file_name):
    """Summarise an audio file as a single 1-D feature vector.

    The file is loaded with ``librosa.load`` (``res_type='kaiser_fast'``) and
    six descriptors are computed from it. Each descriptor matrix is transposed
    and averaged over axis 0, and the resulting means are concatenated in this
    order: MFCC (``n_mfcc=40``), chroma STFT, spectral contrast, spectral
    centroid, zero-crossing rate, spectral roll-off.

    ``librosa`` and ``np`` are not imported in this cell, so they must already
    be present in the kernel namespace from an earlier cell.

    Args:
        file_name: Path of the audio file to analyse.

    Returns:
        The concatenated feature means as a 1-D array, or ``None`` if loading
        or feature extraction raised any ``Exception`` (the error is printed).
    """

    def _mean_over_frames(feature_matrix):
        # Same reduction the features were always given: transpose, then
        # average along axis 0 (one mean per coefficient).
        return np.mean(feature_matrix.T, axis=0)

    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # MFCC (Mel-frequency cepstral coefficients)
        mfccs_processed = _mean_over_frames(
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40))

        # Chroma feature
        chroma_stft = _mean_over_frames(
            librosa.feature.chroma_stft(y=audio, sr=sample_rate))

        # Spectral contrast
        spectral_contrast = _mean_over_frames(
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate))

        # Spectral centroid
        spectral_centroids = _mean_over_frames(
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate))

        # Zero-crossing rate (computed on the waveform only, no sample rate)
        zero_crossing_rate = _mean_over_frames(
            librosa.feature.zero_crossing_rate(y=audio))

        # Spectral rolloff
        spectral_rolloff = _mean_over_frames(
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate))

        # Combine all features into a 1D array; the order must stay the same
        # as the one used to build the training features.
        return np.hstack([
            mfccs_processed,
            chroma_stft,
            spectral_contrast,
            spectral_centroids,
            zero_crossing_rate,
            spectral_rolloff,
        ])
    except Exception as e:
        print(f"Error encountered while parsing file: {file_name}")
        # Also report the cause so failures (missing file, missing import,
        # decoding error, ...) can be told apart.
        print(f"  {type(e).__name__}: {e}")
        return None

def record_and_classify_audio(slide_seconds=0.9):
    """Classify microphone audio over a sliding window of about one minute.

    The window is first filled with roughly 60 seconds of audio read from a
    PyAudio input stream (mono, 16-bit, 44.1 kHz, 1024-sample chunks). Then,
    in an endless loop: the oldest ``slide_seconds`` worth of chunks is
    dropped, the same number of new chunks is recorded, the whole window is
    written to ``recorded_audio.wav``, features are extracted from that file,
    scaled, passed through the model, and the predicted label is printed.

    Relies on names defined outside this function: ``extract_features``,
    ``scaler``, ``model`` and ``label_encoder``.

    The loop has no exit condition; it runs until it is interrupted or an
    exception is raised. The stream and the PyAudio instance are released in
    either case.

    Args:
        slide_seconds: How far the window advances between predictions, in
            seconds. Defaults to 0.9.

    Raises:
        ValueError: If ``slide_seconds`` is shorter than one chunk (which
            would make the loop spin without reading audio) or longer than
            the window.
    """
    audio_duration = 60  # Length of the analysis window, in seconds
    sample_rate = 44100  # 44.1kHz
    chunk_size = 1024  # Samples per stream.read() call
    output_audio_file = "recorded_audio.wav"

    window_chunks = int(sample_rate / chunk_size * audio_duration)
    slide_chunks = int(sample_rate / chunk_size * slide_seconds)
    if not 0 < slide_chunks <= window_chunks:
        raise ValueError(
            f"slide_seconds must cover at least one {chunk_size}-sample chunk "
            f"and at most {audio_duration} seconds; got {slide_seconds!r}"
        )

    p = pyaudio.PyAudio()
    stream = None
    try:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                        input=True, frames_per_buffer=chunk_size)

        # Initialize the buffer with 1-minute audio data
        audio_frames = []
        print("Initializing audio buffer with 1-minute audio...")
        for _ in range(window_chunks):
            audio_frames.append(stream.read(chunk_size))
        print("Initialization complete.")

        # Inference only: switch the model to eval mode once, not per window.
        model.eval()

        while True:
            print(f"Sliding the window by {slide_seconds} seconds...")
            # Drop the oldest chunks and append the same number of new ones,
            # so the window length stays constant.
            del audio_frames[:slide_chunks]
            for _ in range(slide_chunks):
                audio_frames.append(stream.read(chunk_size))

            # extract_features works on a file path, so the window is
            # round-tripped through a temporary WAV file.
            with wave.open(output_audio_file, 'wb') as wf:
                wf.setnchannels(1)
                wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
                wf.setframerate(sample_rate)
                wf.writeframes(b''.join(audio_frames))

            # extract features
            recorded_features = extract_features(output_audio_file)

            if recorded_features is not None:
                recorded_df = pd.DataFrame([recorded_features])
                recorded_scaled = scaler.transform(recorded_df)
                recorded_tensor = torch.tensor(recorded_scaled, dtype=torch.float32)
                with torch.no_grad():
                    # unsqueeze(1) inserts a length-1 axis at position 1;
                    # assumes the model wants (batch, seq, features) like the
                    # batch_first RNN at the top of the notebook - TODO confirm
                    # if `model` was redefined since.
                    predicted_output = model(recorded_tensor.unsqueeze(1))
                    _, predicted_class = torch.max(predicted_output, 1)

                predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
                print("Predicted Class:", predicted_class_label[0])

                # Delete the audio file
                os.remove(output_audio_file)
            else:
                print("Error occurred while extracting features from recorded audio.")

            # NOTE(review): nothing is read from the stream while features are
            # extracted or during this sleep; if PyAudio reports input
            # overflow on the next read, consider
            # stream.read(chunk_size, exception_on_overflow=False).
            time.sleep(0.1)
    finally:
        # Always release the audio device, including on KeyboardInterrupt.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

# Entry point: start the live capture/classification loop (0.9-second slide)
# when this cell is executed; in a notebook kernel __name__ is "__main__".
if __name__ == "__main__":
    record_and_classify_audio()


In [None]:
import cv2

def test_video_capture(camera_index=1):
    """Show a live camera preview in a window named 'Test' until 'q' is pressed.

    Frames are read one at a time and displayed; the loop ends when a frame
    cannot be read or when the key polled by ``cv2.waitKey(1)`` is 'q'.

    Args:
        camera_index: Device index passed to ``cv2.VideoCapture``. Defaults
            to 1.

    Returns:
        None. Problems opening the camera or reading a frame are reported
        with a printed message rather than an exception.
    """
    cap = cv2.VideoCapture(camera_index)
    if not cap.isOpened():
        print("Error: Couldn't open the camera.")
        cap.release()
        return

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Error: Couldn't read a frame.")
                break

            cv2.imshow('Test', frame)
            # Mask to the low byte before comparing the key code with 'q'.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the device and close the preview window even if the loop
        # is interrupted or a display call raises.
        cap.release()
        cv2.destroyAllWindows()

# Run the camera smoke test; this blocks until 'q' is pressed in the preview
# window or a frame cannot be read.
test_video_capture()
