# Imports

### Libraries

In [1]:
import gc
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import torch.optim as optim
import os
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from google.colab import auth, drive
from googleapiclient.discovery import build
import zipfile
import gdown

In [2]:
# Release GPU memory that PyTorch's caching allocator is holding but no longer using
# (e.g. left over from an earlier run in the same kernel).
torch.cuda.empty_cache()

### Paths

In [3]:
# Google Drive ID of the zipped dataset archive.
file_id = "1QFB1fNcsAoMlLcyrmPmNlQe7YZmlHPjd"

# Where the archive is stored and where its contents are unpacked.
destination = "/content/file.zip"
extract_path = "/content/extracted"

# Marker written only after a complete extraction, so re-running this cell
# (Restart & Run All) does not download and unpack several GB again.
_extract_marker = os.path.join(extract_path, ".extract_complete")


def _archive_is_usable(path):
    """Return True if `path` exists and has a readable zip end-of-archive record.

    A truncated archive (e.g. from an interrupted download) fails this check.
    """
    return os.path.isfile(path) and zipfile.is_zipfile(path)


if not os.path.exists(_extract_marker):
    if not _archive_is_usable(destination):
        # Drop any partial file so the download starts from a clean state.
        if os.path.exists(destination):
            os.remove(destination)
        gdown.download(f"https://drive.google.com/uc?id={file_id}", destination, quiet=False)
        if not _archive_is_usable(destination):
            raise RuntimeError(
                f"Download of Google Drive file {file_id} to {destination} failed or is incomplete"
            )

    os.makedirs(extract_path, exist_ok=True)
    with zipfile.ZipFile(destination, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

    # Create the (empty) marker last: it only exists if extraction finished.
    with open(_extract_marker, "w"):
        pass

Downloading...
From (original): https://drive.google.com/uc?id=1QFB1fNcsAoMlLcyrmPmNlQe7YZmlHPjd
From (redirected): https://drive.google.com/uc?id=1QFB1fNcsAoMlLcyrmPmNlQe7YZmlHPjd&confirm=t&uuid=fcf53a68-d258-4722-82fc-5edd3460b364
To: /content/file.zip
 85%|████████▍ | 6.04G/7.13G [01:29<00:16, 67.1MB/s]

KeyboardInterrupt: 

In [None]:
# Dataset variant to use; it is interpolated into the folder name below.
# Alternative variant: dataset = "12"
dataset = "200"

# All three splits live side by side under the extracted archive.
dir_main = extract_path
_dataset_root = f"{dir_main}/dataset_3s/dataset_{dataset}_sp"
dir_train = f"{_dataset_root}/train_grouped"
dir_val = f"{_dataset_root}/val_grouped"
dir_test = f"{_dataset_root}/test_grouped"

# Show each resolved path together with whether it actually exists on disk.
print(f"Train Directory: {dir_train} {os.path.exists(dir_train)}")
print(f"Validation Directory: {dir_val} {os.path.exists(dir_val)}")
print(f"Test Directory: {dir_test} {os.path.exists(dir_test)}")

### Constants

In [None]:
# --- Mel-spectrogram front-end settings (passed to torchaudio's MelSpectrogram) ---
N_SAMPLE_RATE = 48000 # Sample rate in Hz given to the mel transform.
N_FTT = 1024 # FFT window length in samples (used as `n_fft`); controls frequency resolution.
N_HOP = 280 # Hop length in samples between consecutive FFT windows; controls the frame rate.
F_MIN = 500 # Lower frequency bound (Hz) of the mel filter bank.
F_MAX = 15000 # Upper frequency bound (Hz) of the mel filter bank.

# --- Model hyperparameters (passed to AudioEncoder) ---
N_MELS = 96 # Number of mel filter banks, i.e. input feature channels of the encoder.
N_CTX = 1000 # Context length: number of time steps (frames) the model is configured for.
N_STATE = 1024 # Width of the hidden state (embedding dimension) inside the encoder.
N_HEAD = 8 # Number of attention heads in each multi-head attention layer.
N_LAYER = 24 # Number of stacked attention blocks, i.e. the depth of the encoder.
N_CLASSES = len(os.listdir(dir_train)) # Number of output classes: one per entry (of any kind) found in the training directory.

### Model

In [None]:
class Conv1d(nn.Module):
    """Thin module wrapper that delegates to a single ``nn.Conv1d`` stored as ``self.conv``."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        super().__init__()
        conv_options = {"stride": stride, "padding": padding}
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, **conv_options)

    def forward(self, x):
        """Run ``x`` through the wrapped convolution and return the result."""
        convolved = self.conv(x)
        return convolved

class LayerNorm(nn.Module):
    """Thin module wrapper that delegates to a single ``nn.LayerNorm`` stored as ``self.norm``."""

    def __init__(self, n_state):
        super().__init__()
        self.norm = nn.LayerNorm(normalized_shape=n_state)

    def forward(self, x):
        """Normalise ``x`` over its last dimension with the wrapped layer."""
        normalised = self.norm(x)
        return normalised

class ResidualAttentionBlock(nn.Module):
    """Self-attention followed by an MLP, each with a residual add and a LayerNorm applied afterwards.

    Inputs and outputs are batch-first: ``(batch, seq, n_state)``.
    """

    def __init__(self, n_state, n_head):
        super().__init__()
        self.attn = nn.MultiheadAttention(n_state, n_head, batch_first=True)
        self.ln_1 = LayerNorm(n_state)
        self.ln_2 = LayerNorm(n_state)
        # Feed-forward part: expand to 4x the width, GELU, project back.
        hidden_width = 4 * n_state
        self.fc = nn.Sequential(
            nn.Linear(n_state, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, n_state),
        )

    def forward(self, x):
        """Apply attention and the MLP to ``x``; the output has the same shape as ``x``."""
        # Self-attention: query, key and value are all `x`; the attention weights are discarded.
        attended = self.attn(x, x, x)[0]
        hidden = self.ln_1(x + attended)
        return self.ln_2(hidden + self.fc(hidden))

def sinusoids(n_ctx, n_state):
    """Build a sinusoidal positional-encoding table.

    Even feature columns hold ``sin(position * frequency)`` and odd columns hold
    ``cos(position * frequency)``, with frequencies decaying geometrically from
    1 down towards 1/10000.

    Args:
        n_ctx: Number of positions (rows) to encode.
        n_state: Feature dimension. May be odd, in which case the final column
            is a sine column without a matching cosine column.

    Returns:
        Float tensor of shape ``(1, n_ctx, n_state)``.
    """
    position = torch.arange(n_ctx).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, n_state, 2) * -(torch.log(torch.tensor(10000.0)) / n_state))
    # One column of angles per sine column: shape (n_ctx, ceil(n_state / 2)).
    angles = position * div_term

    pe = torch.zeros(n_ctx, n_state)
    pe[:, 0::2] = torch.sin(angles)
    # With an odd n_state there is one fewer cosine column than sine columns,
    # so trim the angles to avoid a shape mismatch.
    pe[:, 1::2] = torch.cos(angles[:, : n_state // 2])
    return pe.unsqueeze(0)

In [None]:
class AudioEncoder(nn.Module):
    """Transformer encoder that classifies a mel spectrogram.

    Pipeline: two 1-D convolutions over time (the second uses stride 2 and so
    roughly halves the number of frames) -> additive positional embedding ->
    ``n_layer`` transformer encoder layers -> classification head applied to
    the last time step.

    Args:
        n_mels: Number of mel bins (input channels of the first convolution).
        n_ctx: Maximum number of frames after the convolutions; sizes the
            positional embedding.
        n_state: Hidden width of the encoder.
        n_head: Number of attention heads per encoder layer.
        n_layer: Number of encoder layers.
        n_classes: Number of output classes.
    """

    def __init__(self, n_mels, n_ctx, n_state, n_head, n_layer, n_classes):
        super().__init__()
        self.conv1 = nn.Conv1d(n_mels, n_state, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)

        # Fixed (non-trainable) random positional embedding. It is a buffer, so it
        # is saved with the state dict and moves with the module across devices.
        # Sized by `n_ctx` rather than a hard-coded length.
        self.register_buffer("positional_embedding", torch.randn(1, n_ctx, n_state))

        # batch_first=True is required: forward() feeds (batch, frames, n_state).
        # With the default (sequence-first) layout, attention would be computed
        # across the samples of a batch instead of across time.
        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=n_state, nhead=n_head, activation="gelu", batch_first=True
            )
            for _ in range(n_layer)
        ])

        self.ln_post = nn.LayerNorm(n_state)
        self.fc1 = nn.Linear(n_state, n_state // 2)
        self.fc2 = nn.Linear(n_state // 2, n_classes)
        self.dropout = nn.Dropout(0.3)
        # Kept so callers can turn logits into probabilities explicitly
        # (`model.activation(logits)`); it is NOT applied inside forward().
        self.activation = nn.Softmax(dim=1)

    def forward(self, x):
        """Compute class logits for a batch of mel spectrograms.

        Args:
            x: Tensor of shape ``(batch, channels, n_mels, frames)`` or
                ``(batch, n_mels, frames)``. Multiple channels are averaged.

        Returns:
            Tensor of shape ``(batch, n_classes)`` holding raw, unnormalised
            logits, which is what ``nn.CrossEntropyLoss`` expects.

        Raises:
            ValueError: If ``x`` is not 3-D/4-D, or if it yields more frames
                than the positional embedding covers.
        """
        if x.dim() == 4:
            # Collapse the channel axis. For a single channel this is exactly a
            # squeeze; for several channels it is a mono downmix (a plain
            # reshape would interleave mel bins into the time axis).
            x = x.mean(dim=1)
        elif x.dim() != 3:
            raise ValueError(
                f"Expected a 3-D or 4-D input, got a tensor with shape {tuple(x.shape)}"
            )

        x = F.gelu(self.conv1(x))
        x = F.gelu(self.conv2(x))
        x = x.permute(0, 2, 1)  # -> (batch, frames, n_state)

        n_frames = x.shape[1]
        max_frames = self.positional_embedding.shape[1]
        if n_frames > max_frames:
            raise ValueError(
                f"Input yields {n_frames} frames but the positional embedding only covers {max_frames}"
            )
        pos_emb = self.positional_embedding[:, :n_frames, :]
        x = (x + pos_emb).to(x.dtype)

        for block in self.blocks:
            x = block(x)

        # Only the last time step feeds the classifier, so select it before the
        # per-token LayerNorm / linear layers instead of computing them for
        # every frame and discarding all but one.
        x = self.ln_post(x[:, -1, :])
        x = self.dropout(F.gelu(self.fc1(x)))
        # NOTE(review): GELU is applied a second time here, as in the original
        # head; confirm that the double activation is intentional.
        return self.fc2(F.gelu(x))

In [None]:
# Build the encoder from the notebook-level hyperparameters and print its layer structure.
model = AudioEncoder(
    n_mels=N_MELS,
    n_ctx=N_CTX,
    n_state=N_STATE,
    n_head=N_HEAD,
    n_layer=N_LAYER,
    n_classes=N_CLASSES,
)
print(model)

# Training Model

### Classes and Functions

In [None]:
def preprocess_audio(audio_path, target_frames=217):
    """Load an audio file and convert it to a fixed-size mel spectrogram.

    Steps: load -> downmix to mono -> resample to ``N_SAMPLE_RATE`` if needed ->
    peak-normalise -> mel spectrogram -> bilinear resize to
    ``(N_MELS, target_frames)``.

    Args:
        audio_path: Path of the audio file to load.
        target_frames: Number of time frames in the returned spectrogram.

    Returns:
        Float tensor of shape ``(1, N_MELS, target_frames)``.

    Raises:
        ValueError: If the file contains no samples.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    if waveform.numel() == 0:
        raise ValueError(f"Audio file contains no samples: {audio_path}")

    # Downmix multi-channel audio so every item has exactly one channel.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # The mel filter bank below is built for N_SAMPLE_RATE; feeding audio at a
    # different rate would silently shift every frequency band.
    if sample_rate != N_SAMPLE_RATE:
        waveform = torchaudio.functional.resample(waveform, sample_rate, N_SAMPLE_RATE)

    # Peak-normalise to [-1, 1]; skip for an all-zero (silent) clip, where the
    # division would produce NaNs.
    peak = waveform.abs().max()
    if peak > 0:
        waveform = waveform / peak

    transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=N_SAMPLE_RATE,
        n_fft=N_FTT,
        hop_length=N_HOP,
        n_mels=N_MELS,
        f_min=F_MIN,
        f_max=F_MAX
    )
    mel_spectrogram = transform(waveform)

    # interpolate() needs a leading batch axis: (1, channels, n_mels, frames).
    mel_spectrogram = mel_spectrogram.unsqueeze(0)
    mel_spectrogram = torch.nn.functional.interpolate(
        mel_spectrogram, size=(N_MELS, target_frames), mode="bilinear", align_corners=False
    )
    mel_spectrogram = mel_spectrogram.squeeze(0)
    return mel_spectrogram

class AudioDataset(Dataset):
    """Map-style dataset that turns audio files into (mel spectrogram, label) pairs on access.

    ``file_paths`` and ``labels`` are parallel sequences: item ``i`` of one
    belongs to item ``i`` of the other.
    """

    def __init__(self, file_paths, labels):
        self.file_paths, self.labels = file_paths, labels

    def __len__(self):
        """Number of audio files in the dataset."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load and preprocess item ``idx``; the label is returned as a ``torch.long`` tensor."""
        path, target = self.file_paths[idx], self.labels[idx]
        features = preprocess_audio(path)
        return features, torch.tensor(target, dtype=torch.long)

def load_data(data_dir, class_mapping=None):
    """Collect audio file paths and integer labels from a class-per-folder directory.

    Every non-hidden sub-directory of ``data_dir`` is one class, and every
    non-hidden regular file inside it is one sample. Stray files in
    ``data_dir`` and hidden entries (for example ``.ipynb_checkpoints``) are
    ignored, so they neither receive a class index nor get loaded as audio.

    Args:
        data_dir: Directory containing one sub-directory per class.
        class_mapping: Optional ``{class_name: index}`` dict. Pass the same
            mapping for every split to guarantee identical label indices. When
            omitted, classes are indexed by their sorted folder names.

    Returns:
        ``(file_paths, labels)``: two parallel lists, ordered by class name
        and then by file name.

    Raises:
        KeyError: If a class folder is missing from a supplied ``class_mapping``.
    """
    class_names = sorted(
        name
        for name in os.listdir(data_dir)
        if not name.startswith(".") and os.path.isdir(os.path.join(data_dir, name))
    )
    if class_mapping is None:
        class_mapping = {cls_name: i for i, cls_name in enumerate(class_names)}

    file_paths = []
    labels = []
    for cls_name in class_names:
        if cls_name not in class_mapping:
            raise KeyError(f"Class folder {cls_name!r} in {data_dir!r} is missing from class_mapping")
        cls_path = os.path.join(data_dir, cls_name)
        for file_name in sorted(os.listdir(cls_path)):
            file_path = os.path.join(cls_path, file_name)
            # Skip hidden files and nested directories; only real files are samples.
            if file_name.startswith(".") or not os.path.isfile(file_path):
                continue
            file_paths.append(file_path)
            labels.append(class_mapping[cls_name])

    return file_paths, labels

def train(model, train_loader, criterion, optimizer, device, train_size):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module to train; it is switched to training mode.
        train_loader: Iterable of ``(inputs, labels)`` batches that supports ``len()``.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device that each batch is moved to.
        train_size: Number of samples used as the accuracy denominator.

    Returns:
        ``(train_loss, train_acc)``: the mean of the per-batch losses and the
        number of correct predictions divided by ``train_size``.
    """
    model.train()
    loss_sum = 0
    n_correct = 0

    for batch_inputs, batch_labels in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        scores = model(batch_inputs)
        batch_loss = criterion(scores, batch_labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        predicted = scores.argmax(1)
        n_correct += (predicted == batch_labels).sum().item()

    return loss_sum / len(train_loader), n_correct / train_size

@torch.no_grad()
def evaluate(model, val_loader, criterion, device, val_size):
    """Measure loss and accuracy on ``val_loader`` without updating the model.

    The model is switched to evaluation mode (and left in it), and gradient
    tracking is disabled for the whole call.

    Args:
        model: Module to evaluate.
        val_loader: Iterable of ``(inputs, labels)`` batches that supports ``len()``.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device that each batch is moved to.
        val_size: Number of samples used as the accuracy denominator.

    Returns:
        ``(val_loss, val_acc)``: the mean of the per-batch losses and the
        number of correct predictions divided by ``val_size``.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0

    for batch_inputs, batch_labels in val_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        scores = model(batch_inputs)
        loss_sum += criterion(scores, batch_labels).item()
        n_correct += (scores.argmax(1) == batch_labels).sum().item()

    return loss_sum / len(val_loader), n_correct / val_size

### Load Dataset

In [None]:
# Collect unreachable Python objects first, then ask PyTorch to release the
# GPU memory its caching allocator is no longer using.
gc.collect()
torch.cuda.empty_cache()

# Number of samples per mini-batch, used for both the training and validation loaders.
BATCH_SIZE = 128

In [None]:
train_files, train_labels = load_data(dir_train)
val_files, val_labels = load_data(dir_val)

if not train_files or not val_files:
    raise ValueError(
        f"No audio files found (train: {len(train_files)}, val: {len(val_files)}); "
        "check the dataset directories"
    )


def _label_to_class_name(files, labels):
    """Map each label index to the class folder (parent directory name) of its files."""
    return {label: os.path.basename(os.path.dirname(path)) for path, label in zip(files, labels)}


# Labels are derived separately for each split. If the splits do not contain the
# same class folders, the same index can end up meaning different classes, which
# would make every validation metric meaningless. Fail loudly instead.
_train_label_names = _label_to_class_name(train_files, train_labels)
_val_label_names = _label_to_class_name(val_files, val_labels)
_label_conflicts = {
    label: (_train_label_names.get(label), name)
    for label, name in _val_label_names.items()
    if _train_label_names.get(label) != name
}
if _label_conflicts:
    raise ValueError(
        "Train/validation label indices disagree (label: (train class, val class)): "
        f"{dict(list(_label_conflicts.items())[:5])}"
    )

train_dataset = AudioDataset(train_files, train_labels)
val_dataset = AudioDataset(val_files, val_labels)

# Shuffle only the training data; validation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

### Training

In [None]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# NOTE: nn.CrossEntropyLoss applies log-softmax itself, so the model's forward()
# must return raw logits rather than softmax probabilities.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
dataset_size = len(train_loader.dataset)
val_size = len(val_loader.dataset)

for epoch in range(num_epochs):
    # evaluate() leaves the model in eval mode at the end of every epoch, so
    # training mode (dropout active) has to be restored explicitly each time.
    model.train()

    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=True)
    total_loss, correct, batch_count, seen = 0, 0, 0, 0
    for inputs, labels in train_loader_tqdm:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()
        batch_count += 1
        seen += labels.size(0)
        # Running accuracy over the samples seen so far. Dividing by the full
        # dataset size would under-report accuracy until the last batch.
        train_loader_tqdm.set_postfix(loss=total_loss/batch_count, acc=correct/seen)

    val_loss, val_acc = evaluate(model, val_loader, criterion, device, val_size)
    # max(..., 1) keeps the summary printable even if the loader produced no batches.
    print(f"Epoch {epoch+1}: Train Loss={total_loss/max(batch_count, 1):.4f}, Train Acc={correct/max(dataset_size, 1):.4f}, Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")