# Mic check

In [75]:
import sounddevice as sd


def get_default_input_device_info():
    default_input_device = sd.default.device[0]  # get the ID of the default input device
    device_info = sd.query_devices(default_input_device)
    print(device_info)

# Get the info of the default input device
get_default_input_device_info()


{'name': 'MacBook Air 마이크', 'index': 3, 'hostapi': 0, 'max_input_channels': 1, 'max_output_channels': 0, 'default_low_input_latency': 0.0336875, 'default_low_output_latency': 0.01, 'default_high_input_latency': 0.043020833333333335, 'default_high_output_latency': 0.1, 'default_samplerate': 48000.0}


In [76]:
import sounddevice as sd

def list_input_devices():
    devices = sd.query_devices()
    for i, device in enumerate(devices):
        if device['max_input_channels'] > 0:  # this is an input device
            print(f"Device #{i} name: {device['name']}")

# List available input devices (including microphones)
list_input_devices()

Device #1 name: 갤럭시 S2 마이크
Device #2 name: USB PnP Sound Device
Device #3 name: MacBook Air 마이크


In [77]:
import numpy as np

# Choose the device to use for recording
device_id = 2  # replace with the ID of the device you want to use
duration = 3  # seconds

# Create a buffer to store the audio data
buffer = np.zeros((duration * 44100,))
buffer_index = 0

# Define a callback function to process the audio input
def audio_callback(indata, frames, time, status):
    global buffer_index
    volume_norm = np.linalg.norm(indata) * 10
    print(f'\r{"|" * int(volume_norm)}', end='')  # print a simple "volume bar"

    # Store the incoming data in the buffer
    buffer[buffer_index:buffer_index+frames] = indata[:, 0]
    buffer_index += frames

# Create a stream object
stream = sd.InputStream(callback=audio_callback, device=device_id, channels=1, samplerate=44100)

# Start the stream
with stream:
    # Record for 3 seconds
    sd.sleep(duration * 1000)

# Play back the recorded sound
sd.play(buffer, samplerate=44100)

||||||||||

### Import models


In [78]:
import os
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision.models import resnet18

def load_model_interactive(model_dir="models"):
    # List all models in the directory
    model_files = [f for f in os.listdir(model_dir) if f.endswith('.pth') and not f.startswith('._')]
    
    # Display the models to the user
    for idx, model_name in enumerate(model_files, 1):
        print(f"{idx}. {model_name}")
    
    # Get user input
    selected_idx = int(input("Enter the number corresponding to the model you wish to load: ")) - 1
    model_path = os.path.join(model_dir, model_files[selected_idx])
    model = None   
    try:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            
            # Load the resnet18 model structure
            model = resnet18(pretrained=False)
            num_ftrs = model.fc.in_features
            model.fc = nn.Linear(512, 14)  # Assuming selected_labels is globally accessible
            
            # Load the state dict
            state_dict = torch.load(model_path, map_location=device)
            new_state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
            model.load_state_dict(new_state_dict)
            
            print(f"Model loaded from {model_path}")
            model = model.to(device)
            model = model.eval()
    except Exception as e:
        print(f"Failed to load the model. Error: {e}")

    return model

# For demonstration purposes, the function will display model names but won't actually load them here.
# Please run this in your local environment for actual model loading.
model = load_model_interactive()

1. BestModel_1.pth
2. BestModel_10.pth
3. BestModel_11.pth
4. BestModel_12.pth
5. BestModel_13.pth
6. BestModel_14.pth
7. BestModel_15.pth
8. BestModel_2.pth
9. BestModel_3.pth
10. BestModel_4.pth
11. BestModel_5.pth
12. BestModel_6.pth
13. BestModel_7.pth
14. BestModel_8.pth
15. BestModel_9.pth
16. Model2_001.pth
17. Model2_002.pth
18. Model2_003.pth
19. Model2_004.pth
Model loaded from models/Model2_004.pth


In [79]:
class MinMaxNormalize(nn.Module):
    def __init__(self, min_val=None, max_val=None):
        super(MinMaxNormalize, self).__init__()
        self.min_val = min_val
        self.max_val = max_val

    def forward(self, tensor):
        if self.min_val is None or self.max_val is None:
            min_val = torch.min(tensor)
            max_val = torch.max(tensor)
        else:
            min_val = self.min_val
            max_val = self.max_val
        
        normalized_tensor = (tensor - min_val) / (max_val - min_val)
        return normalized_tensor

In [80]:
class MonoToColor(nn.Module):
    def __init__(self, num_channels=3):
        super(MonoToColor, self).__init__()
        self.num_channels = num_channels

    def forward(self, tensor):
        return tensor.repeat(self.num_channels, 1, 1)

In [81]:
SAMPLE_RATE = 22050

# Apply the same transformation as used during training
transformation = transforms.Compose([
    torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE, n_mels=40),# higher the better but more complex. For talking we use 128, for sound effect, about 40.
    torchaudio.transforms.AmplitudeToDB(stype='power', top_db=80),
    MinMaxNormalize(),
    MonoToColor()
])

In [82]:
def continuous_sound_prediction(model, device, transformation, sample_rate, target_sample_rate):
    # Define class labels
    labels = [
        "children", "nothing2", "drilling", "engine", "siren", 
        "gunshot", "aircon", "jackhammer", "carhorn", "glass", 
        "nock", "street_music", "dog_bark", "nothing1"
    ]

    count = 0
    while count <= 100:
        try:
            # Record a 2 seconds mono audio at the specified sample rate
            duration = 2.0  # seconds
            recording = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
            sd.wait()

            # Convert to PyTorch tensor and switch channels and frames
            recording = torch.from_numpy(recording).float()
            recording = torch.transpose(recording, 0, 1)

            # Resample if necessary
            if sample_rate != target_sample_rate:
                resampler = torchaudio.transforms.Resample(sample_rate, target_sample_rate)
                recording = resampler(recording)

            # Mix down if necessary
            if recording.shape[0] > 1:
                recording = torch.mean(recording, dim=0, keepdim=True)

            # Cut or pad if necessary
            if recording.shape[1] > target_sample_rate:
                recording = recording[:, :target_sample_rate]
            elif recording.shape[1] < target_sample_rate:
                num_missing_samples = target_sample_rate - recording.shape[1]
                last_dim_padding = (0, num_missing_samples)
                recording = nn.functional.pad(recording, last_dim_padding)

            # Apply transformation
            recording = transformation(recording)

            # Make the prediction
            model.eval()
            with torch.no_grad():
                recording = recording.to(device)
                outputs = model(recording[None, ...])
                probabilities = F.softmax(outputs, dim=1)
                _, predicted = torch.max(outputs, 1)

            # Get predicted label and its corresponding probability
            predicted_label = labels[predicted.item()]
            predicted_confidence = probabilities[0, predicted.item()].item()

            # Print the probabilities of all labels in one line
            prob_strs = [f"{label} {probabilities[0, idx].item():.2%}" for idx, label in enumerate(labels)]
            print(f"{count} / " + " / ".join(prob_strs))

            count += 1
        except Exception as e:
            print(f"Error during prediction: {e}")
            break

    print("Finished continuous sound prediction.")

continuous_sound_prediction

<function __main__.continuous_sound_prediction(model, device, transformation, sample_rate, target_sample_rate)>

In [85]:
def continuous_sound_prediction(model, device, transformation, sample_rate, target_sample_rate, device_id):
    # Define class labels
    labels = [
        "children", "nothing2", "drilling", "engine", "siren", 
        "gunshot", "aircon", "jackhammer", "carhorn", "glass", 
        "nock", "street_music", "dog_bark", "nothing1"
    ]

    count = 0
    while count <= 100:
        try:
            # Record a 2 seconds mono audio at the specified sample rate using the correct device
            duration = 2.0  # seconds
            recording = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, device=device_id)
            sd.wait()

            # Convert to PyTorch tensor and switch channels and frames
            recording = torch.from_numpy(recording).float()
            recording = torch.transpose(recording, 0, 1)

            # Resample if necessary
            if sample_rate != target_sample_rate:
                resampler = torchaudio.transforms.Resample(sample_rate, target_sample_rate)
                recording = resampler(recording)

            # Mix down if necessary
            if recording.shape[0] > 1:
                recording = torch.mean(recording, dim=0, keepdim=True)

            # Cut or pad if necessary
            if recording.shape[1] > target_sample_rate:
                recording = recording[:, :target_sample_rate]
            elif recording.shape[1] < target_sample_rate:
                num_missing_samples = target_sample_rate - recording.shape[1]
                last_dim_padding = (0, num_missing_samples)
                recording = nn.functional.pad(recording, last_dim_padding)

            # Apply transformation
            recording = transformation(recording)

            # Make the prediction
            model.eval()
            with torch.no_grad():
                recording = recording.to(device)
                outputs = model(recording[None, ...])
                probabilities = F.softmax(outputs, dim=1)
                _, predicted = torch.max(outputs, 1)

            # Get predicted label and its corresponding probability
            predicted_label = labels[predicted.item()]
            predicted_confidence = probabilities[0, predicted.item()].item()

            # Print the probabilities of all labels in one line
            prob_strs = [f"{label} {probabilities[0, idx].item():.2%}" for idx, label in enumerate(labels)]
            print(f"{count} / " + " / ".join(prob_strs))

            count += 1
        except Exception as e:
            print(f"Error during prediction: {e}")
            break

    print("Finished continuous sound prediction.")

# Set the device ID
device_id = 2  # Replace with the ID of the device you want to use

# Call the function
continuous_sound_prediction(model, device, transformation, SAMPLE_RATE, SAMPLE_RATE, device_id)

0 / children 1.24% / nothing2 0.01% / drilling 0.50% / engine 0.01% / siren 95.75% / gunshot 0.35% / aircon 0.02% / jackhammer 0.26% / carhorn 1.75% / glass 0.00% / nock 0.00% / street_music 0.02% / dog_bark 0.00% / nothing1 0.10%
1 / children 1.23% / nothing2 0.01% / drilling 0.50% / engine 0.01% / siren 95.74% / gunshot 0.34% / aircon 0.02% / jackhammer 0.26% / carhorn 1.77% / glass 0.00% / nock 0.00% / street_music 0.02% / dog_bark 0.00% / nothing1 0.10%
2 / children 1.24% / nothing2 0.01% / drilling 0.50% / engine 0.01% / siren 95.72% / gunshot 0.35% / aircon 0.02% / jackhammer 0.26% / carhorn 1.77% / glass 0.00% / nock 0.00% / street_music 0.02% / dog_bark 0.00% / nothing1 0.10%
3 / children 1.25% / nothing2 0.01% / drilling 0.50% / engine 0.01% / siren 95.72% / gunshot 0.35% / aircon 0.02% / jackhammer 0.26% / carhorn 1.76% / glass 0.00% / nock 0.00% / street_music 0.02% / dog_bark 0.00% / nothing1 0.10%
4 / children 1.23% / nothing2 0.01% / drilling 0.50% / engine 0.01% / siren 

KeyboardInterrupt: 

In [84]:
# Call the continuous sound prediction function
print("device : ",device)
continuous_sound_prediction(model, device, transformation, SAMPLE_RATE, SAMPLE_RATE)

device :  cpu
0 / children nan% / nothing2 nan% / drilling nan% / engine nan% / siren nan% / gunshot nan% / aircon nan% / jackhammer nan% / carhorn nan% / glass nan% / nock nan% / street_music nan% / dog_bark nan% / nothing1 nan%
1 / children nan% / nothing2 nan% / drilling nan% / engine nan% / siren nan% / gunshot nan% / aircon nan% / jackhammer nan% / carhorn nan% / glass nan% / nock nan% / street_music nan% / dog_bark nan% / nothing1 nan%
2 / children nan% / nothing2 nan% / drilling nan% / engine nan% / siren nan% / gunshot nan% / aircon nan% / jackhammer nan% / carhorn nan% / glass nan% / nock nan% / street_music nan% / dog_bark nan% / nothing1 nan%
3 / children nan% / nothing2 nan% / drilling nan% / engine nan% / siren nan% / gunshot nan% / aircon nan% / jackhammer nan% / carhorn nan% / glass nan% / nock nan% / street_music nan% / dog_bark nan% / nothing1 nan%


KeyboardInterrupt: 