# Mic check

In [32]:
import sounddevice as sd


def get_default_input_device_info():
    """Print the description of the default input (recording) device.

    The first entry of ``sd.default.device`` is used as the input device ID
    and passed to ``sd.query_devices``; whatever that call returns is printed
    as-is. Nothing is returned.
    """
    # Index 0 of the default device pair is taken as the input side.
    print(sd.query_devices(sd.default.device[0]))

# Print the properties of the default input device so the microphone
# that will be used for recording can be checked by eye.
get_default_input_device_info()


{'name': 'MacBook Air 마이크', 'index': 2, 'hostapi': 0, 'max_input_channels': 1, 'max_output_channels': 0, 'default_low_input_latency': 0.0336875, 'default_low_output_latency': 0.01, 'default_high_input_latency': 0.043020833333333335, 'default_high_output_latency': 0.1, 'default_samplerate': 48000.0}


In [33]:
import sounddevice as sd

def list_input_devices():
    """Print the index and name of every device that can capture audio.

    A device counts as an input device when its ``max_input_channels`` entry
    is greater than zero. The printed number is the device's position in the
    list returned by ``sd.query_devices()``. Nothing is returned.
    """
    for index, info in enumerate(sd.query_devices()):
        # Skip output-only devices.
        if info['max_input_channels'] <= 0:
            continue
        print(f"Device #{index} name: {info['name']}")

# List available input devices (including microphones); the printed
# device numbers are the IDs that can be used as `device_id` when recording.
list_input_devices()

Device #1 name: 갤럭시 S2 마이크
Device #2 name: MacBook Air 마이크


In [34]:
import numpy as np

# Choose the device to use for recording
device_id = 2  # replace with the ID of the device you want to use (one of the "Device #N" numbers printed by list_input_devices)
duration = 3  # seconds

# Create a buffer to store the audio data:
# one float slot per sample for `duration` seconds at 44100 samples per second (single channel).
buffer = np.zeros((duration * 44100,))
# Position in `buffer` where the next block of captured samples is written;
# advanced by audio_callback.
buffer_index = 0

# Define a callback function to process the audio input
def audio_callback(indata, frames, time, status):
    """Show a live volume bar and append the captured block to ``buffer``.

    Intended to be passed as the ``callback`` of ``sd.InputStream``.

    Args:
        indata: Array of captured samples indexed as ``[frame, channel]``;
            only channel 0 is stored.
        frames: Number of frames contained in ``indata``.
        time: Timing information supplied by the stream (unused).
        status: Status flags supplied by the stream (unused).

    Side effects:
        Writes into the module-level ``buffer`` starting at ``buffer_index``
        and advances ``buffer_index`` by the number of samples stored.
        Prints a one-line volume bar to stdout.
    """
    global buffer_index
    volume_norm = np.linalg.norm(indata) * 10
    print(f'\r{"|" * int(volume_norm)}', end='')  # print a simple "volume bar"

    # The stream is not guaranteed to stop exactly when the buffer is full,
    # so a block may be larger than the space that is left. Store only what
    # fits; an unclamped slice assignment would raise a shape-mismatch
    # ValueError inside the callback.
    writable = min(frames, len(buffer) - buffer_index)
    if writable <= 0:
        return

    # Store the incoming data in the buffer
    buffer[buffer_index:buffer_index + writable] = indata[:writable, 0]
    buffer_index += writable

# Create a stream object: single-channel input from `device_id` at 44100 samples per second.
# `audio_callback` is registered as the stream callback and fills `buffer`.
stream = sd.InputStream(callback=audio_callback, device=device_id, channels=1, samplerate=44100)

# Start the stream (the stream is used as a context manager for the duration of the recording)
with stream:
    # Keep the stream open for `duration` seconds; the value passed to sd.sleep is
    # `duration * 1000`, i.e. the duration expressed in milliseconds.
    sd.sleep(duration * 1000)

# Play back the recorded sound at the same sample rate it was captured with.
# NOTE(review): no sd.wait() follows, so playback presumably continues in the
# background after this cell finishes - confirm this is intended.
sd.play(buffer, samplerate=44100)

|||||||||||||||||||

### Import models


In [35]:
import torch
import torch.nn as nn
import torchaudio
import sounddevice as sd
import torchvision.transforms as transforms
from torchvision.models import resnet18
from torch.autograd import Variable
from IPython.display import Audio

# Class names in the order of the classifier's output units.
selected_labels = ["air_conditioner","children_playing", "street_music","door_nock","glass_shatter","car_horn","dog_bark","drilling","nothing","siren","nothing2","Bicycle_bell"]

# Checkpoint holding the trained weights for the classifier below.
MODEL_PATH = 'ResNet18_02.pth'


def _strip_data_parallel_prefix(state_dict):
    """Return a copy of ``state_dict`` without a leading ``module.`` on its keys.

    Checkpoints saved from an ``nn.DataParallel`` wrapper prefix every
    parameter name with ``module.``; removing it lets the weights be loaded
    into the plain, unwrapped network. Keys without the prefix are kept as-is.
    """
    prefix = "module."
    return {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the architecture first so that `model` is always defined, even when
# loading the weights fails. `resnet18()` creates the network without
# pretrained weights.
model = resnet18()
# Replace the classification head with one output per label.
model.fc = nn.Linear(model.fc.in_features, len(selected_labels))

try:
    # NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
    state_dict = torch.load(MODEL_PATH, map_location=device)
    # Load into the *unwrapped* network. The previous version loaded the
    # checkpoint into an nn.DataParallel wrapper twice (once with and once
    # without the "module." prefix), so one of the two strict loads always
    # failed and the multi-GPU path could never succeed.
    model.load_state_dict(_strip_data_parallel_prefix(state_dict))
except Exception as exc:
    # Report the actual cause instead of hiding it behind a bare `except`.
    # `model` stays defined but keeps its random initial weights.
    print(f"Failed to load the model. Please check the model file. ({type(exc).__name__}: {exc})")
else:
    model = model.to(device).eval()
    if device.type == "cuda":
        if torch.cuda.device_count() > 1:
            # MULTI GPU: wrap only after the weights are loaded.
            model = nn.DataParallel(model).eval()
        print("Model successfully loaded. + GPU")
    else:
        print("Model successfully loaded.+CPU")





Model successfully loaded.+CPU


In [36]:
# Sample rate (samples per second) used for the mel-spectrogram transform below
# and passed to the live prediction function.
SAMPLE_RATE = 22050

class MonoToColor(nn.Module):
    """Tile a tensor ``num_channels`` times along its leading dimension.

    Used as the last step of the preprocessing pipeline to turn a
    single-channel spectrogram into a multi-channel, image-like tensor.

    Args:
        num_channels: How many times the input is repeated along the first
            dimension (default 3).
    """

    def __init__(self, num_channels=3):
        super().__init__()
        self.num_channels = num_channels

    def forward(self, tensor):
        """Return ``tensor`` repeated ``num_channels`` times on dim 0; the other two dims are unchanged."""
        repeats = (self.num_channels, 1, 1)
        return tensor.repeat(*repeats)

# Apply the same transformation as used during training
# Pipeline, applied in order to a waveform tensor:
#   1. MelSpectrogram - mel spectrogram with 128 mel bins at SAMPLE_RATE
#   2. AmplitudeToDB  - convert the power spectrogram to decibels, with top_db=80
#   3. MonoToColor    - repeat the result 3 times along the first dimension
#      (presumably so it matches the 3-channel input a ResNet expects - confirm)
transformation = transforms.Compose([
    torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE, n_mels=128),
    torchaudio.transforms.AmplitudeToDB(stype='power', top_db=80),
    MonoToColor()
])



## 2seconds / 80% upper guess

In [41]:
import torch.nn.functional as F


## Print the probabilities of all labels
def _fit_to_length(waveform, num_samples):
    """Cut or zero-pad ``waveform`` (channels, frames) to exactly ``num_samples`` frames."""
    current = waveform.shape[1]
    if current > num_samples:
        return waveform[:, :num_samples]
    if current < num_samples:
        # Pad only at the end of the last (frame) dimension.
        return nn.functional.pad(waveform, (0, num_samples - current))
    return waveform


def continuous_sound_prediction(model, device, transformation, sample_rate, target_sample_rate,
                                change_label=None, change_probability=0.5, max_iterations=101):
    """Repeatedly record from the default microphone and print per-label scores.

    Each iteration records 2 seconds of mono audio with ``sounddevice``,
    resamples it if needed, keeps or pads to exactly ``target_sample_rate``
    samples (so only the first second of audio at the target rate is
    classified), applies ``transformation``, runs ``model`` and prints
    ``sigmoid(output)`` for every entry of the module-level
    ``selected_labels`` on one line.

    Args:
        model: Classifier called on a batch of one transformed recording; it
            must return one score per entry of ``selected_labels``.
        device: Torch device the input tensor is moved to before inference.
        transformation: Callable applied to the (1, frames) waveform tensor.
        sample_rate: Sample rate used for recording.
        target_sample_rate: Sample rate the recording is resampled to; also
            the number of samples fed to ``transformation``.
        change_label: Optional label whose printed probability is reduced.
            ``None`` (default) or a label not in ``selected_labels`` leaves
            all probabilities untouched.
        change_probability: Amount subtracted from ``change_label``'s
            probability; the result is floored at 0.
        max_iterations: Number of record/predict rounds (default 101, the
            count the original hard-coded loop performed).

    Returns:
        None. Results are printed to stdout.
    """
    duration = 2.0  # seconds recorded per iteration

    # Loop-invariant setup: build the resampler once and switch to eval mode once.
    resampler = None
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sample_rate, target_sample_rate)

    # Previously `change_label` was an undefined name and the resulting
    # NameError was swallowed by a bare `except`, so the adjustment could
    # never run. It is now an explicit, optional parameter.
    adjust_index = selected_labels.index(change_label) if change_label in selected_labels else None

    model.eval()  # set model to evaluation mode

    for count in range(max_iterations):
        # Record mono audio at the specified sample rate and wait until it is complete.
        recording = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
        sd.wait()

        # Convert to PyTorch tensor and switch from (frames, channels) to (channels, frames)
        recording = torch.from_numpy(recording).float()
        recording = torch.transpose(recording, 0, 1)

        # Resample if necessary
        if resampler is not None:
            recording = resampler(recording)

        # Mix down to one channel if necessary
        if recording.shape[0] > 1:
            recording = torch.mean(recording, dim=0, keepdim=True)

        # Cut or pad to exactly `target_sample_rate` samples
        recording = _fit_to_length(recording, target_sample_rate)

        # Apply transformation
        features = transformation(recording)

        # Make the prediction without tracking gradients
        with torch.no_grad():
            outputs = model(features.to(device)[None, ...])  # add the batch dimension
            probabilities = torch.sigmoid(outputs)  # independent per-label scores

        # Optionally damp one label's probability, never going below zero.
        if adjust_index is not None:
            probabilities[0, adjust_index] = max(
                0.0, probabilities[0, adjust_index].item() - change_probability
            )

        # Print the probabilities of all labels in one line
        prob_strs = [f"{label} {probabilities[0, idx].item():.2%}" for idx, label in enumerate(selected_labels)]
        print(f"{count} / " + " / ".join(prob_strs))


In [43]:
# Call the continuous sound prediction function
#print("model : ",model)
# Show which device inference will run on.
print("device : ",device)

# Recording rate and target rate are both SAMPLE_RATE, so the function's resampling step is skipped.
continuous_sound_prediction(model, device, transformation, SAMPLE_RATE, SAMPLE_RATE)

device :  cpu
0 / door_nock 0.43% / glass_shatter 15.13% / car_horn 0.02% / dog_bark 98.94% / drilling 8.43% / nothing 12.00% / siren 15.63% / nothing2 14.77%
1 / door_nock 0.07% / glass_shatter 42.81% / car_horn 0.00% / dog_bark 98.41% / drilling 52.49% / nothing 0.95% / siren 22.18% / nothing2 27.88%
2 / door_nock 14.23% / glass_shatter 73.48% / car_horn 0.05% / dog_bark 56.12% / drilling 2.92% / nothing 15.90% / siren 0.72% / nothing2 77.02%
3 / door_nock 3.21% / glass_shatter 65.95% / car_horn 0.01% / dog_bark 74.46% / drilling 0.39% / nothing 23.93% / siren 2.00% / nothing2 72.87%
4 / door_nock 68.15% / glass_shatter 58.19% / car_horn 0.00% / dog_bark 5.37% / drilling 2.01% / nothing 25.25% / siren 0.68% / nothing2 96.93%
5 / door_nock 12.25% / glass_shatter 56.14% / car_horn 0.01% / dog_bark 78.13% / drilling 2.41% / nothing 11.26% / siren 0.75% / nothing2 75.61%
6 / door_nock 52.63% / glass_shatter 28.99% / car_horn 0.03% / dog_bark 50.64% / drilling 7.28% / nothing 10.16% / sir