In [27]:
# Importing libraries
import os
import torch
import torchaudio
from torchvision import transforms
from torchvision.models import resnet18
import sounddevice as sd
import torch.nn as nn

class MonoToColor(nn.Module):
    def __init__(self, num_channels=3):
        super(MonoToColor, self).__init__()
        self.num_channels = num_channels

    def forward(self, tensor):
        return tensor.repeat(1, self.num_channels, 1, 1)


# Setting up audio recording parameters
SAMPLE_RATE = 22050
DURATION = 1  # in seconds
CHANNELS = 1  # Mono audio

# Setting up device and loading model
device = 'cpu'
model = resnet18(pretrained=False)
model.fc = nn.Linear(512, 10)  # change to match the number of classes
model_path = os.path.join(os.getcwd(), "ResNet18_Best.pth")  # adjust model path if necessary
state_dict = torch.load(model_path, map_location=torch.device(device))
model.load_state_dict(state_dict, strict=False)
model = model.to(device)
model.eval()

# Setting up transformation
transformation = transforms.Compose([
    torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE, n_mels=128),
    torchaudio.transforms.AmplitudeToDB(stype='power', top_db=80),
    MonoToColor()
])


In [28]:
# Setting up audio recording parameters
SAMPLE_RATE = 22050
DURATION = 1  # in seconds
CHANNELS = 1  # Mono audio

# Real-time audio prediction
def predict_sound(model, device, transformation, sample_rate):
    # Record the sound
    RECORD_SECONDS = 2
    recording = sd.rec(int(sample_rate * RECORD_SECONDS), samplerate=sample_rate, channels=1)
    sd.wait()  # Wait until recording is finished

    # Reshape the recording to have (batch, channel, time) form
    recording = recording.reshape(1, 1, -1)

    print(f'Shape after reshaping: {recording.shape}')

    recording = transformation(recording)
    
    print(f'Shape after applying transformation: {recording.shape}')

    # Make the prediction
    recording = recording.to(device)
    outputs = model(recording)
    _, predicted = torch.max(outputs, 1)

    return predicted.item()


In [29]:
# Run the real-time audio prediction
print(predict_sound(model, device, transformation, SAMPLE_RATE))


Shape after reshaping: (1, 1, 44100)


TypeError: 'int' object is not callable