In [1]:
from joblib import load

from sklearn.cluster import KMeans
import sounddevice as sd
import torch.nn as nn
import numpy as np
import torchaudio
import torch


In [2]:
class AudioTransformer(nn.Module):
    def __init__(
            self,
            num_tokens: int, dim_model: int, num_heads: int,
            num_classes: int, dim_feedforward: int = 2048,
            num_layers: int = 1, dropout: int = 0.1
            ) -> None:

        super().__init__()
        self.embedding = nn.Embedding(num_tokens, dim_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_model, nhead=num_heads,
            dim_feedforward=dim_feedforward, dropout=dropout
            )

        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
            )

        self.fc = nn.Linear(dim_model, num_classes)

    def forward(self, src):
        src = self.embedding(src)  # Replace tokens with embeddings
        output = self.transformer_encoder(src)
        output = output.mean(dim=1)
        output = self.fc(output)
        return output


In [3]:
def record_audio(duration=1.5, sr=16000):
    audio_data = sd.rec(
        int(duration * sr),
        samplerate=sr,
        channels=1,
        dtype="float32"
        )

    sd.wait()
    return audio_data.flatten()


In [4]:
num_tokens = 100  # Same as number of clusters

model = AudioTransformer(
    num_tokens=num_tokens,
    dim_model=256,
    num_heads=8,
    num_classes=4
    )

model.load_state_dict(torch.load("models/model.pth"))
model.eval()




AudioTransformer(
  (embedding): Embedding(100, 256)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=256, out_features=4, bias=True)
)

In [5]:
# import sounddevice as sd


# def record_audio(duration=5, sr=16000):
#     print("Nagrywanie rozpoczęte...")
#     audio_data = sd.rec(int(duration * sr), samplerate=sr, channels=1, dtype='float32')
#     sd.wait()
#     print("Nagrywanie zakończone.")
#     return audio_data.flatten()


# def play_audio(audio_data, sr=16000):
#     print("Odtwarzanie dźwięku...")
#     sd.play(audio_data, samplerate=sr)
#     sd.wait()
#     print("Odtwarzanie zakończone.")


# # Nagrywanie dźwięku
# audio_input = record_audio()

# # Odtwarzanie nagranego dźwięku
# play_audio(audio_input)


In [6]:
while True:
    audio_input = record_audio()
    audio_tensor = torch.tensor(audio_input).unsqueeze(0).long()

    with torch.no_grad():
        output = model.forward(audio_tensor)

    predicted_command = output.argmax().item()

    print(predicted_command)


3
3
3
3
3
3
3
3
3
3


KeyboardInterrupt: 

In [18]:
def audio_to_spectrogram(file_path: str):
    waveform, sr = torchaudio.load(file_path)

    transformer = torchaudio.transforms.MelSpectrogram(
        sample_rate=sr, n_fft=2048, hop_length=512, n_mels=64
        )

    spectrogram = transformer(waveform)
    spectrogram = torchaudio.transforms.AmplitudeToDB()(spectrogram)
    return spectrogram.squeeze(0).transpose(0, 1)


def pad_sequences(sequences, pad_value: int = 0):
    max_len = max([s.size(0) for s in sequences])

    padded_sequences = [
        torch.nn.functional.pad(s, (0, max_len - s.size(0)), value=pad_value)
        for s in sequences
        ]

    return torch.stack(padded_sequences)


def vector_quantize(features, n_clusters: int = 100):
    kmeans = KMeans(n_clusters=n_clusters)
    all_data = np.vstack([f.numpy() for f in features])
    kmeans.fit(all_data)

    quantized_features = [
        torch.tensor(kmeans.predict(f.numpy()), dtype=torch.long)
        for f in features
        ]

    return quantized_features, kmeans


def predict_single_file(file_path: str, model, kmeans, label_encoder):
    spectrogram = audio_to_spectrogram(file_path)

    all_data = np.vstack([spectrogram.numpy()])
    quantized_features = torch.tensor(
        kmeans.predict(all_data), dtype=torch.long
        )

    quantized_features_padded = pad_sequences([quantized_features])

    model.eval()
    with torch.no_grad():
        outputs = model(quantized_features_padded)
        predicted_probabilities = torch.nn.functional.softmax(outputs, dim=1)
        predicted_index = predicted_probabilities.argmax(1)
        predicted_label = label_encoder.inverse_transform([predicted_index.item()])

    return predicted_label, predicted_probabilities


kmeans = load("models/kmeans_model.joblib")
label_encoder = load("models/label_encoder.joblib")

file_path = "data/train/up/44260689_nohash_0.wav"
predicted_label, probabilities = predict_single_file(file_path, model, kmeans, label_encoder)
print(f"Predicted Label: {predicted_label}, Probabilities: {probabilities}")


Predicted Label: ['up'], Probabilities: tensor([[0.2671, 0.2788, 0.0193, 0.4348]])
