# Setup and Install Dependencies

In [None]:
!pip install torch torchaudio torchaudio-transform


import torch
import torchaudio
from torch.utils.data import DataLoader, Dataset


# Run on the GPU when PyTorch can see one; fall back to the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Data Loading and Preprocessing

In [None]:
# NOTE(review): torchaudio is also requested by the install line in the setup cell.
!pip install torchaudio
# Fetch the tutorial archive and unpack it relative to the current working directory.
# NOTE(review): DATA_DIR below points at ./data/ — confirm the archive extracts there.
!wget https://download.pytorch.org/tutorial_data.tar.gz
!tar -xf tutorial_data.tar.gz

import os
from pathlib import Path
import torchaudio
from torchaudio.transforms import Resample
from torch.utils.data import Dataset

# Directory scanned by load_data_and_labels for one sub-folder per class.
DATA_DIR = Path("./data/")
# Text file with one class name per line; the line index becomes the integer label.
LABELS_FILE = DATA_DIR / "labels.txt"
# Target sample rate passed to Resample as `new_freq`.
SAMPLE_RATE = 16000

def load_data_and_labels(data_dir, labels_file):
    """Load every ``*.wav`` found in the class sub-folders of ``data_dir``.

    ``labels_file`` lists one class name per line; the integer label of a
    class is the zero-based index of its line. Sub-folders whose name does not
    appear in that file are skipped. A clip whose own sample rate differs from
    ``SAMPLE_RATE`` is resampled to ``SAMPLE_RATE``.

    Args:
        data_dir: ``pathlib.Path`` of the directory holding one folder per class.
        labels_file: Path of the text file listing the class names.

    Returns:
        A ``(data, labels)`` tuple of two equally long lists: the waveform
        tensors returned by ``torchaudio.load`` (resampled when needed) and
        the integer label of each waveform.
    """
    with open(labels_file, "r") as handle:
        label_dict = {line.strip(): i for i, line in enumerate(handle)}

    data = []
    labels = []
    # One Resample module per distinct source rate, built on first use.
    resamplers = {}

    # Sorted so the sample order (and any seeded split of it) is reproducible
    # instead of depending on filesystem enumeration order.
    for folder in sorted(data_dir.iterdir()):
        if not folder.is_dir():
            continue

        label = label_dict.get(folder.name)
        if label is None:
            continue

        for wav_path in sorted(folder.glob("*.wav")):
            waveform, sample_rate = torchaudio.load(wav_path)

            # Compare the file's sample rate, not waveform.shape[1]: the
            # latter is the number of samples and says nothing about the rate.
            if sample_rate != SAMPLE_RATE:
                resample = resamplers.get(sample_rate)
                if resample is None:
                    resample = Resample(orig_freq=sample_rate, new_freq=SAMPLE_RATE)
                    resamplers[sample_rate] = resample
                waveform = resample(waveform)

            data.append(waveform)
            labels.append(label)

    return data, labels

class SpeechDataset(Dataset):
    """In-memory dataset that pairs each stored sample with its label."""

    def __init__(self, data, labels):
        # Both sequences are stored by reference; nothing is copied.
        self.labels = labels
        self.data = data

    def __len__(self):
        """Return how many samples are stored."""
        n_samples = len(self.data)
        return n_samples

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        sample = self.data[index]
        target = self.labels[index]
        return sample, target

# Load every labelled clip found under DATA_DIR into memory.
data, labels = load_data_and_labels(DATA_DIR, LABELS_FILE)

# Wrap the two parallel lists so they can be indexed as (waveform, label) pairs.
speech_dataset = SpeechDataset(data, labels)

# Sanity check on the first sample; raises IndexError if nothing was loaded.
sample_data, sample_label = speech_dataset[0]
print(f"Sample Data Shape: {sample_data.shape}")
print(f"Sample Label: {sample_label}")


# Model Definition and Training using VGGish

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

class VGGish(nn.Module):
    """VGG-style 2-D convolutional classifier.

    ``features`` is a stack of 3x3 convolutions with four 2x2 max-pool layers,
    so each spatial dimension is halved four times. The resulting 512-channel
    map is adaptively average-pooled to a fixed ``pooled_size`` grid before
    the fully connected classifier. This keeps the first linear layer's input
    size independent of the input height and width (the same arrangement
    torchvision's VGG uses); a hard-coded size derived from ``SAMPLE_RATE``
    could never match a 512-channel feature map.

    Args:
        num_classes: Number of output logits.
        pooled_size: ``(height, width)`` of the grid the feature map is pooled
            to. The default ``(6, 4)`` is what a 96x64 input yields after the
            four poolings.

    Input to ``forward`` must be ``(batch, 1, height, width)`` with height and
    width of at least 16 so the four max-pool layers leave a non-empty map.
    """

    def __init__(self, num_classes, *, pooled_size=(6, 4)):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0),
            nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0),
            nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0),
            nn.Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0),
        )
        # Parameter-free layer that fixes the classifier's input size.
        self.avgpool = nn.AdaptiveAvgPool2d(pooled_size)
        pooled_height, pooled_width = pooled_size
        self.classifier = nn.Sequential(
            nn.Linear(512 * pooled_height * pooled_width, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)  # keep the batch dimension, merge the rest
        x = self.classifier(x)
        return x

if not labels:
    raise ValueError("No training samples were loaded; check DATA_DIR and LABELS_FILE.")

# Labels are line indices from the label file. Size the output layer from the
# largest index rather than the count of distinct labels, so a listed class
# with no clips cannot leave a target outside the logits' range.
num_classes = max(labels) + 1
batch_size = 64
learning_rate = 0.001
epochs = 10

# Hold out 20% of the clips for validation; the fixed seed keeps the split repeatable.
train_data, val_data, train_labels, val_labels = train_test_split(
    data, labels, test_size=0.2, random_state=42
)

# NOTE(review): the default collate function stacks the waveforms, which
# requires every clip in a batch to have the same shape — confirm the clips
# are equal length or add padding/cropping.
train_loader = DataLoader(SpeechDataset(train_data, train_labels), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(SpeechDataset(val_data, val_labels), batch_size=batch_size)

model = VGGish(num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train for a fixed number of epochs and report the mean batch loss of each.
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    # Batch variables get their own names: reusing `labels` here would
    # overwrite the module-level list of dataset labels with a batch tensor.
    for batch_inputs, batch_labels in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        # NOTE(review): waveforms from torchaudio.load are (channels, time), so
        # this yields (batch, 1, channels, time) — confirm that is the 2-D
        # input the model is meant to receive.
        outputs = model(batch_inputs.unsqueeze(1))  # Add channel dim
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {average_loss:.4f}")

# Measure accuracy on the held-out validation split.
model.eval()
correct, total = 0, 0

with torch.no_grad():
    # Separate batch names keep the module-level `labels` list intact.
    for batch_inputs, batch_labels in val_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)
        outputs = model(batch_inputs.unsqueeze(1))  # Add channel dim
        predicted = outputs.argmax(dim=1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

# Accuracy is undefined without samples; report NaN instead of dividing by zero.
accuracy = correct / total if total else float("nan")
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


# Model Evaluation and Prediction

In [None]:
!wget https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_test_set_v0.02.tar.gz
!tar -xf speech_commands_test_set_v0.02.tar.gz

TEST_DATA_DIR = Path("./speech_commands_test_set_v0.02")
TEST_LABELS_FILE = TEST_DATA_DIR / "testing_list.txt"

test_data, test_labels = load_data_and_labels(TEST_DATA_DIR, TEST_LABELS_FILE)

test_loader = DataLoader(SpeechDataset(test_data, test_labels), batch_size=batch_size)

# Evaluate on test
model.eval()
correct, total = 0, 0

with torch.no_grad():
    # Separate batch names keep the module-level `labels` list intact.
    for batch_inputs, batch_labels in test_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)
        outputs = model(batch_inputs.unsqueeze(1))  # Add channel dimension
        predicted = outputs.argmax(dim=1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

# An empty test set has no defined accuracy; report NaN instead of raising
# ZeroDivisionError.
test_accuracy = correct / total if total else float("nan")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Save the trained model
torch.save(model.state_dict(), "speech_model.pth")
print("Model saved successfully.")


# Run the model on one test clip and compare its prediction with the true label.
sample_index = 0
sample_input, sample_label = test_data[sample_index], test_labels[sample_index]
# as_tensor converts without the copy-construct warning torch.tensor() emits
# when handed an existing tensor.
sample_input = torch.as_tensor(sample_input, dtype=torch.float32).unsqueeze(0).to(device)
# Inference only: no autograd graph is needed.
with torch.no_grad():
    sample_output = model(sample_input.unsqueeze(1))
_, predicted_label = torch.max(sample_output, 1)

print(f"\nExample Prediction:")
print(f"True Label: {sample_label}, Predicted Label: {predicted_label.item()}")


# Inference on New Data

In [None]:
def preprocess_audio(audio_path):
    """Load an audio file and resample it to ``SAMPLE_RATE`` when necessary.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        The waveform tensor returned by ``torchaudio.load`` (loaded with
        ``normalize=True``), resampled to ``SAMPLE_RATE`` if the file was
        stored at a different rate.
    """
    waveform, sample_rate = torchaudio.load(audio_path, normalize=True)

    # Bring new audio to the same rate the dataset loader targets, so the
    # model sees inputs on the time scale it was trained with.
    if sample_rate != SAMPLE_RATE:
        resample = Resample(orig_freq=sample_rate, new_freq=SAMPLE_RATE)
        waveform = resample(waveform)

    return waveform

# Load the saved model
loaded_model = VGGish(num_classes)
# map_location lets weights saved from a GPU session load on a CPU-only
# machine (and vice versa) instead of failing during deserialization.
loaded_model.load_state_dict(torch.load("speech_model.pth", map_location=device))
loaded_model.to(device)
loaded_model.eval()


new_audio_path = "/path/to/your/new/audio/sample.wav"
preprocessed_audio = preprocess_audio(new_audio_path)
# as_tensor converts without the copy-construct warning torch.tensor() emits
# when handed an existing tensor.
input_tensor = torch.as_tensor(preprocessed_audio, dtype=torch.float32).unsqueeze(0).to(device)
# Inference only: no autograd graph is needed.
with torch.no_grad():
    output = loaded_model(input_tensor.unsqueeze(1))  # Add channel dimension
_, predicted_label = torch.max(output, 1)

print("\nInference on New Data:")
print(f"Predicted Label: {predicted_label.item()}")


# Visualization of Model Predictions

In [None]:
import matplotlib.pyplot as plt
import numpy as np


def visualize_predictions(model, data_loader, num_samples=5):
    """Plot waveforms next to the model's predicted class probabilities.

    One figure row is drawn for each of the first ``num_samples`` batches of
    ``data_loader``, using the first sample of each batch: the waveform on the
    left (titled with its true label) and a bar chart of the softmax
    probabilities on the right (titled with the predicted label).

    Args:
        model: Classifier called as ``model(inputs.unsqueeze(1))``.
        data_loader: Iterable yielding ``(inputs, labels)`` batches.
        num_samples: Number of rows (batches) to draw.
    """
    model.eval()

    # squeeze=False keeps `axes` two-dimensional even when num_samples == 1,
    # so axes[i, 0] is always valid.
    fig, axes = plt.subplots(
        num_samples, 2, figsize=(12, 3 * num_samples), squeeze=False
    )
    fig.subplots_adjust(hspace=0.5)

    with torch.no_grad():
        for i, (inputs, labels) in enumerate(data_loader):
            if i >= num_samples:
                break

            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs.unsqueeze(1))  # Add channel dimension
            predicted = outputs.argmax(dim=1)

            # Only the first sample of the batch is shown, so index it
            # explicitly: .item() on a whole batch raises when batch size > 1.
            waveform = inputs[0].cpu().numpy()
            # Transposed so a (channels, time) array is drawn as one line per
            # channel; matplotlib plots each column of a 2-D array as a series.
            axes[i, 0].plot(waveform.T, color='blue')
            axes[i, 0].set_title(f"True Label: {labels[0].item()}")

            # predicted probabilities
            probabilities = torch.nn.functional.softmax(outputs, dim=1)[0].cpu().numpy()
            # Take the number of bars from the model output itself rather than a global.
            class_labels = list(range(len(probabilities)))
            axes[i, 1].bar(class_labels, probabilities, color='green')
            axes[i, 1].set_xticks(class_labels)
            axes[i, 1].set_title(f"Predicted Label: {predicted[0].item()}")

    plt.show()

visualize_predictions(loaded_model, test_loader, num_samples=5)
