In [2]:
from transformers import ASTFeatureExtractor, ASTForAudioClassification
import torch.nn as nn
from data_loading import TorchTensorFolderDataset
from torch.utils.data import Dataset, DataLoader
import os

# Load the pretrained AST checkpoint and create a feature extractor.
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
# NOTE(review): 16000 looks like a sample count; the extractor's max_length is
# presumably measured in spectrogram frames — confirm this value is intended.
extractor = ASTFeatureExtractor(max_length=16000)

# Swap only the final projection for a 30-class layer. Replacing the whole
# `model.classifier` module would discard everything else the pretrained head
# contains besides `dense`.
in_features = model.classifier.dense.in_features
model.classifier.dense = nn.Linear(in_features, 30)

# Build the dataset paths from components so they resolve on any OS
# (a literal "\\" separator only works on Windows).
train_path = os.path.join(os.getcwd(), "data", "preprocessed", "raw", "train")
val_path = os.path.join(os.getcwd(), "data", "preprocessed", "raw", "validation")
train_dataset = TorchTensorFolderDataset(train_path)
val_dataset = TorchTensorFolderDataset(val_path)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
# Validation metrics do not depend on sample order, so keep iteration deterministic.
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=16)


In [9]:
# Reload the AudioSet checkpoint, asking for a 30-label classification head.
# ignore_mismatched_sizes=True lets loading proceed when checkpoint tensors do
# not match the requested shapes; those tensors are newly initialized instead.
model = ASTForAudioClassification.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593",
    num_labels=30,
    ignore_mismatched_sizes=True
)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([30]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([30, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from transformers import ASTForAudioClassification, ASTFeatureExtractor

def train(model, dataloader, feature_extractor, device, epochs=10, lr=1e-4, sampling_rate=16000):
    """Fine-tune a classifier on raw waveforms using an external feature extractor.

    Args:
        model: Module called as ``model(input_values=...)`` whose result exposes
            a ``.logits`` tensor of shape (batch, num_classes).
        dataloader: Iterable of batches where ``batch[0]`` holds the waveforms
            and ``batch[1]`` the integer class labels. It does not need to
            support ``len()``.
        feature_extractor: Callable invoked with a list of numpy waveforms plus
            ``sampling_rate``, ``return_tensors="pt"`` and ``padding=True``;
            must return a mapping containing ``'input_values'``.
        device: Torch device the model, features and labels are moved to.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate for the AdamW optimizer.
        sampling_rate: Sampling rate forwarded to ``feature_extractor``.

    Returns:
        None. The model is updated in place; per-epoch average batch loss and
        accuracy are printed.
    """
    model = model.to(device)
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0  # counted manually so length-less iterables also work

        pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
        for batch in pbar:
            waveforms = batch[0]
            labels = batch[1].to(device)
            # The extractor is fed one numpy array per example; squeeze drops
            # any singleton dimensions of the individual waveform tensors.
            waveform_list = [w.cpu().numpy().squeeze() for w in waveforms]
            inputs = feature_extractor(
                waveform_list,
                sampling_rate=sampling_rate,
                return_tensors="pt",
                padding=True
            )
            input_values = inputs['input_values'].to(device)

            # Forward
            outputs = model(input_values=input_values)
            logits = outputs.logits
            loss = criterion(logits, labels)

            # Backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Metrics (computed on detached logits; no graph is needed here)
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1
            preds = torch.argmax(logits.detach(), dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            pbar.set_postfix(loss=batch_loss, acc=100 * correct / max(total, 1))

        if num_batches == 0 or total == 0:
            # Nothing was processed: report it instead of dividing by zero.
            print(f"[Epoch {epoch+1}] No training samples were provided; skipping metrics.")
            continue

        print(f"[Epoch {epoch+1}] Avg loss: {total_loss / num_batches:.4f} | Accuracy: {100 * correct / total:.2f}%")


In [4]:
from transformers import ASTForAudioClassification, ASTFeatureExtractor

# Number of target classes for the fine-tuned classification head.
NUM_CLASSES = 30
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained checkpoint with a NUM_CLASSES-label head; tensors whose
# shapes do not match the checkpoint are newly initialized instead of failing.
model = ASTForAudioClassification.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593",
    num_labels=NUM_CLASSES,
    ignore_mismatched_sizes=True
)

# Feature extractor configured from the same checkpoint as the model.
feature_extractor = ASTFeatureExtractor.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593"
)

# Assume a DataLoader already exists:
# batch = (waveform_tensor_batch, label_tensor_batch)
# waveform_tensor_batch: shape (B, 16000)
# label_tensor_batch: shape (B,)

train(model, train_loader, feature_extractor, device, epochs=10, lr=1e-4)


Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([30]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([30, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10:   1%|          | 17/3193 [07:34<23:36:20, 26.76s/it, acc=1.47, loss=3.6] 


KeyboardInterrupt: 

In [5]:
import torch
import torch.nn as nn
import torchaudio
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class AudioTransformer(nn.Module):
    """Log-mel spectrogram -> small CNN -> Transformer encoder -> classifier.

    Args:
        num_classes: Size of the output logit vector.
        n_mels: Number of mel bins produced by the spectrogram front end.
        transformer_dim: Model width of the Transformer encoder.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        max_frames: Number of learned positional slots, i.e. the longest
            spectrogram (in frames) the model accepts. The default of 101
            keeps the positional table the same size as before.
    """

    def __init__(self, num_classes=30, n_mels=64, transformer_dim=256, num_heads=4, num_layers=4,
                 max_frames=101):
        super().__init__()

        self.melspec = torchaudio.transforms.MelSpectrogram(
            sample_rate=16000,
            n_fft=400,
            hop_length=160,
            n_mels=n_mels
        )
        self.db = torchaudio.transforms.AmplitudeToDB()

        # CNN feature extractor; padding=1 with 3x3 kernels keeps (n_mels, T) unchanged.
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )

        # Each time frame is flattened to (64 CNN channels * n_mels) features
        # and projected to the transformer width.
        self.patch_proj = nn.Linear(n_mels * 64, transformer_dim)  # 64: CNN output channels

        # One learned position vector per time frame.
        self.pos_embedding = nn.Parameter(torch.randn(1, max_frames, transformer_dim))

        # batch_first=True: forward() feeds (B, T, D). With the default
        # (sequence-first) layout the encoder would attend across the batch
        # dimension instead of across time.
        encoder_layer = TransformerEncoderLayer(
            d_model=transformer_dim,
            nhead=num_heads,
            dim_feedforward=512,
            dropout=0.1,
            batch_first=True
        )
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers)

        self.cls_head = nn.Sequential(
            nn.LayerNorm(transformer_dim),
            nn.Linear(transformer_dim, num_classes)
        )

    def forward(self, x):
        """Classify a batch of waveforms.

        Args:
            x: Waveforms shaped (B, num_samples) or (B, 1, num_samples).

        Returns:
            Logits of shape (B, num_classes).

        Raises:
            ValueError: If the spectrogram has more frames than positional slots.
        """
        if x.dim() == 2:
            # Add the single channel dimension that Conv2d requires.
            x = x.unsqueeze(1)      # (B, 1, num_samples)

        x = self.melspec(x)         # (B, 1, n_mels, T)
        x = self.db(x)              # (B, 1, n_mels, T)

        x = self.cnn(x)             # (B, 64, n_mels, T)
        B, C, H, W = x.shape
        if W > self.pos_embedding.size(1):
            raise ValueError(
                f"Input produces {W} time frames but only "
                f"{self.pos_embedding.size(1)} positional embeddings are available"
            )
        x = x.reshape(B, C * H, W)  # (B, C*H, T)
        x = x.permute(0, 2, 1)      # (B, T, C*H)

        x = self.patch_proj(x)      # (B, T, transformer_dim)
        x = x + self.pos_embedding[:, :W, :]

        x = self.transformer_encoder(x)  # (B, T, transformer_dim)
        x = x.mean(dim=1)                # average over time -> (B, transformer_dim)

        return self.cls_head(x)          # (B, num_classes)


In [None]:
import torch
import torch.nn.functional as F
from torch import optim
from tqdm import tqdm

def train_model(model, train_loader, val_loader, epochs=20, lr=1e-4, device=None):
    """Train ``model`` with Adam and return it restored to its best validation epoch.

    Args:
        model: Module mapping an input batch ``x`` to class logits.
        train_loader: Iterable of ``(x, y)`` training batches.
        val_loader: Iterable of ``(x, y)`` validation batches.
        epochs: Number of training epochs.
        lr: Learning rate for Adam.
        device: Torch device; defaults to CUDA when available, else CPU.

    Returns:
        The same ``model`` object, loaded with the weights from the epoch that
        achieved the highest validation accuracy (or its final weights if no
        epoch exceeded an accuracy of 0).
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    best_val_acc = 0.0
    best_model_state = None

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0

        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

        for x, y in loop:
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            # Weight the batch loss by batch size so the epoch figure is a
            # per-sample average even when the last batch is smaller.
            total_loss += loss.item() * x.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

            loop.set_postfix(loss=loss.item())

        # Guard against empty loaders instead of raising ZeroDivisionError.
        train_acc = correct / total if total else 0.0
        avg_loss = total_loss / total if total else 0.0

        # Validation
        model.eval()
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for x_val, y_val in val_loader:
                x_val, y_val = x_val.to(device), y_val.to(device)
                outputs = model(x_val)
                preds = outputs.argmax(dim=1)
                val_correct += (preds == y_val).sum().item()
                val_total += y_val.size(0)

        val_acc = val_correct / val_total if val_total else 0.0

        print(f"Epoch {epoch+1}: Train Loss: {avg_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

        # Save best model. state_dict() returns tensors that share storage with
        # the live parameters, so they must be cloned; otherwise later optimizer
        # steps overwrite the "best" snapshot.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    print(f"Best Val Acc: {best_val_acc:.4f}")
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model


In [11]:
from data_loading import TorchTensorFolderDataset
from torch.utils.data import Dataset, DataLoader
import os
# Build the dataset paths from components so they resolve on any OS
# (a literal "\\" separator only works on Windows).
train_path = os.path.join(os.getcwd(), "data", "preprocessed", "raw", "train")
val_path = os.path.join(os.getcwd(), "data", "preprocessed", "raw", "validation")
train_dataset = TorchTensorFolderDataset(train_path)
val_dataset = TorchTensorFolderDataset(val_path)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
# Validation accuracy does not depend on sample order, so keep iteration deterministic.
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=16)

# Train the custom spectrogram/Transformer classifier on the 30-class task.
model = AudioTransformer(num_classes=30)

trained_model = train_model(model, train_loader, val_loader, epochs=20)


                                                                         

KeyboardInterrupt: 