In [1]:
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, ASTForAudioClassification, ASTConfig, ASTModel
from torchinfo import summary
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset, DownloadConfig
import combined_utils
import os
from outsource import AudioSetAST, create_label_mapping, collate_fn, create_multihot_labels

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Move the working directory two levels up from where the kernel started.
# The start directory is remembered on first execution so that re-running
# this cell does not keep climbing further up the tree (a bare relative
# os.chdir("../..") is applied to whatever the cwd currently is).
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, "../.."))
# os.getcwd()

# All paths now need to be relative to the project dir

In [3]:
from crnn.code import engine

# Training

In [None]:
class ASetFineAnyAST(nn.Module):
    """AudioSet-pretrained AST backbone adapted to a new input size and label set.

    The pretrained classification head is replaced by a fresh
    LayerNorm + Linear head with ``n_classes`` outputs, and the pretrained
    positional embeddings are bilinearly resized from the checkpoint's patch
    grid to the patch grid implied by ``inp_t`` x ``inp_f``.

    Args:
        inp_t: Number of time frames in the input spectrogram.
        inp_f: Number of frequency (mel) bins in the input spectrogram.
        n_classes: Number of output classes of the new head.

    Raises:
        ValueError: If ``inp_t`` or ``inp_f`` is smaller than the patch size,
            or if the checkpoint's positional embeddings do not match the
            grid described by its config.
    """

    # The HuggingFace AST embedding layer prepends two special tokens
    # (CLS, then distillation) in front of the patch tokens, so the first two
    # positional embeddings belong to them and must not be interpolated.
    _N_PREFIX_TOKENS = 2

    def __init__(self, inp_t=512, inp_f=128, n_classes=50):
        super().__init__()

        self.backbone = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
        cfg = self.backbone.config
        hidden = cfg.hidden_size  # 768 for this checkpoint

        # Replacing the old head for the new classes. Kept as an
        # nn.Sequential so parameter names (classifier.0.*, classifier.1.*)
        # stay compatible with previously saved state dicts.
        self.backbone.classifier = nn.Sequential(
            nn.LayerNorm(normalized_shape=(hidden,), eps=1e-12, elementwise_affine=True),
            nn.Linear(in_features=hidden, out_features=n_classes, bias=True)
        )

        # Patch-grid geometry is read from the checkpoint config instead of
        # being hard-coded (patch 16, strides 10 for this checkpoint).
        patch = cfg.patch_size
        f_stride = cfg.frequency_stride
        t_stride = cfg.time_stride
        if inp_t < patch or inp_f < patch:
            raise ValueError(
                f"Input of size (t={inp_t}, f={inp_f}) is smaller than the patch size {patch}"
            )

        # Grids are (frequency, time): the patch embedding convolves a
        # (mel, time) image and flattens it row-major, i.e. frequency-major.
        old_grid = (
            (cfg.num_mel_bins - patch) // f_stride + 1,
            (cfg.max_length - patch) // t_stride + 1,
        )
        new_grid = (
            (inp_f - patch) // f_stride + 1,
            (inp_t - patch) // t_stride + 1,
        )

        embeddings = self.backbone.audio_spectrogram_transformer.embeddings
        new_pos = self._resize_position_embeddings(
            embeddings.position_embeddings.detach(),
            old_grid,
            new_grid,
            self._N_PREFIX_TOKENS,
        )

        # Overwrite the original embeddings
        embeddings.position_embeddings = nn.Parameter(new_pos, requires_grad=True)

    @staticmethod
    def _resize_position_embeddings(old_pos, old_grid, new_grid, n_prefix_tokens=2):
        """Resize patch positional embeddings to a new (freq, time) grid.

        Args:
            old_pos: Tensor of shape [1, n_prefix_tokens + F_old * T_old, D].
            old_grid: ``(F_old, T_old)`` patch grid of ``old_pos``.
            new_grid: ``(F_new, T_new)`` target patch grid.
            n_prefix_tokens: Leading special-token positions copied unchanged.

        Returns:
            Tensor of shape [1, n_prefix_tokens + F_new * T_new, D].

        Raises:
            ValueError: If ``old_pos`` does not have the length implied by
                ``old_grid`` and ``n_prefix_tokens``.
        """
        f_old, t_old = old_grid
        f_new, t_new = new_grid
        expected_len = n_prefix_tokens + f_old * t_old
        if old_pos.shape[1] != expected_len:
            raise ValueError(
                f"Expected {expected_len} positional embeddings, got {old_pos.shape[1]}"
            )

        # Slicing (rather than indexing) keeps the token axis, so no
        # unsqueeze is needed before concatenating.
        prefix_pe = old_pos[:, :n_prefix_tokens, :]
        patch_pe = old_pos[:, n_prefix_tokens:, :]

        # [1, F_old*T_old, D] -> [1, D, F_old, T_old] for 2-D interpolation
        dim = patch_pe.size(-1)
        patch_pe = patch_pe.reshape(1, f_old, t_old, dim).permute(0, 3, 1, 2)
        patch_pe = F.interpolate(
            patch_pe,
            size=(f_new, t_new),
            mode='bilinear',
            align_corners=False
        )
        # [1, D, F_new, T_new] -> [1, F_new*T_new, D]
        patch_pe = patch_pe.permute(0, 2, 3, 1).reshape(1, f_new * t_new, dim)

        return torch.cat([prefix_pe, patch_pe], dim=1)

    def forward(self, x):
        """Return the classification logits of the backbone for input ``x``."""
        # Call the module (not .forward) so registered hooks still run.
        return self.backbone(x).logits
        
        



In [None]:
# wav_dir = "AST/ast-master/egs/esc50/data/ESC-50-master/audio"
# config_path = "AST/ast-master/egs/esc50/data/datafiles/esc50_config.json"

# Experiment settings handed to combined_utils.train_model
config_dict = {
    # Input geometry
    "num_mel_bins": 128,
    "target_length": 512,  # {'audioset':1024, 'esc50':512, 'speechcommands':128}
    "loss": "CE",
    "mode": "train",
    # Normalisation statistics
    "mean": -6.6268077,  # ESC -6.6268077, AUDIOSET -4.2677393
    "std": 5.358466,  # ESC 5.358466, AUDIOSET 4.5689974
    # Patch strides and model input dimensions
    "fstride": 10,
    "tstride": 10,
    "input_fdim": 128,
    "input_tdim": 512,
    # Pretraining flags / model variant
    "imagenet_pretrain": True,
    "audioset_pretrain": True,
    "model_size": "base384",
    # Optimisation
    "epochs": 5,
    "lr": 1e-5,  # if audioset pretrain is false, then value one order up (1e-4)
    "weight_decay": 5e-7,
    "betas": (0.95, 0.999),
    "lrscheduler_start": 5,
    "lrscheduler_step": 1,
    "lrscheduler_decay": 0.85,
    # Logging / output location
    "print_freq": 100,
    "exp_dir": "./exp/landing",
}

# Data files (relative to the current working directory)
label_csv = "AST/finetuned/data/esc_class_labels_indices.csv"
train_json = "AST/finetuned/data/datafiles_fbank/esc_train_data_1.json"
eval_json = "AST/finetuned/data/datafiles_fbank/esc_eval_data_1.json"

# Datasets and their loaders; only the training loader shuffles
train_dataset = combined_utils.FbankDataset(train_json, label_csv=label_csv)
eval_dataset = combined_utils.FbankDataset(eval_json, label_csv=label_csv)

loader_options = dict(batch_size=64, num_workers=8, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
eval_loader = DataLoader(eval_dataset, shuffle=False, **loader_options)

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
device
# temp_train_dataset = combined.FbankDataset(train_json_path, csv_path, config_dict)

# os.makedirs(config_dict['exp_dir'], exist_ok=True)

'cuda'

In [None]:
# Build the model from the same input dimensions the config declares, so the
# positional-embedding grid cannot drift from the data configuration.
# n_classes has no entry in config_dict; 50 is kept as in the original call
# (presumably the ESC-50 label count — confirm against label_csv).
model = ASetFineAnyAST(
    inp_t=config_dict['input_tdim'],
    inp_f=config_dict['input_fdim'],
    n_classes=50,
)
# optimizer = torch.optim.Adam(model.parameters(), config_dict['lr'], weight_decay=config_dict['weight_decay'], betas=config_dict['betas'])
# loss_fn = nn.CrossEntropyLoss()
# accuracy_multi_class = lambda pred, tar: (torch.argmax(pred, dim=1) == torch.argmax(tar, dim=1)).float().mean()


In [None]:
# Train
# Make sure the experiment output directory exists before training starts
# (exist_ok=True makes this safe to re-run).
os.makedirs(config_dict['exp_dir'], exist_ok=True)
# Hand the model, both loaders and the settings dict to the project's training
# routine. Per the captured log below it reports the device in use and
# per-epoch train/validation loss and validation accuracy.
combined_utils.train_model(model, train_loader, eval_loader, config_dict)

Using device: cuda

Epoch 1/5
Initiating training batches
Epoch: [0][0/400]	Time 0.543 (0.543)	Data 0.009 (0.009)	Loss 4.2201 (4.2201)
Epoch: [0][100/400]	Time 0.634 (0.625)	Data 0.010 (0.013)	Loss 2.1011 (3.7279)
Epoch: [0][200/400]	Time 0.629 (0.633)	Data 0.009 (0.013)	Loss 1.3429 (3.0048)
Epoch: [0][300/400]	Time 0.336 (0.579)	Data 0.016 (0.013)	Loss 2.1317 (2.4411)
Validation: [0/100]	Time 0.137 (0.137)	Loss 1.0083 (1.0083)




Train Loss: 2.0757
Val Loss: 0.6812
Val Accuracy: 0.9100

Epoch 2/5
Initiating training batches
Epoch: [1][0/400]	Time 0.696 (0.696)	Data 0.000 (0.000)	Loss 0.2690 (0.2690)
Epoch: [1][100/400]	Time 0.384 (0.400)	Data 0.032 (0.024)	Loss 0.1021 (0.4951)
Epoch: [1][200/400]	Time 0.408 (0.407)	Data 0.034 (0.028)	Loss 0.0991 (0.4043)
Epoch: [1][300/400]	Time 0.946 (0.478)	Data 0.013 (0.026)	Loss 0.3112 (0.3390)
Validation: [0/100]	Time 0.135 (0.135)	Loss 0.2563 (0.2563)
Train Loss: 0.3097
Val Loss: 0.3435
Val Accuracy: 0.9350

Epoch 3/5
Initiating training batches
Epoch: [2][0/400]	Time 0.509 (0.509)	Data 0.000 (0.000)	Loss 0.0640 (0.0640)
Epoch: [2][100/400]	Time 0.441 (0.398)	Data 0.038 (0.023)	Loss 0.0394 (0.1084)
Epoch: [2][200/400]	Time 0.931 (0.485)	Data 0.012 (0.024)	Loss 0.0399 (0.0980)
Epoch: [2][300/400]	Time 0.486 (0.626)	Data 0.002 (0.019)	Loss 0.0532 (0.0814)
Validation: [0/100]	Time 0.161 (0.161)	Loss 0.4465 (0.4465)
Train Loss: 0.0751
Val Loss: 0.2575
Val Accuracy: 0.9450

Ep

In [None]:
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code — only load checkpoints from a trusted source.
# map_location remaps the stored tensors onto `device`, so the checkpoint
# also loads on a machine without CUDA.
check = torch.load("exp/best_5epoch.pth", map_location=device)


In [None]:
# Copy the weights stored under the checkpoint's 'model_state_dict' key into
# `model`; load_state_dict reports any missing/unexpected keys.
model.load_state_dict(check['model_state_dict'])

<All keys matched successfully>

In [None]:
# Generate soft labels for the training split.
# Uses the `device` selected earlier instead of a hard-coded 'cuda', so the
# cell also runs on a CPU-only machine.
combined_utils.run_inference(model, train_json, label_csv, 'exp/landing/fold1_train_soft_labels.csv', device)


Running inference: 100%|██████████| 1600/1600 [01:00<00:00, 26.32it/s]


Inference complete!
Overall accuracy: 1.0000
Results saved to: exp/landing/fold1_train_soft_labels.csv





1.0

In [None]:
# Generate soft labels for the evaluation split.
# Uses the `device` selected earlier instead of a hard-coded 'cuda', so the
# cell also runs on a CPU-only machine.
combined_utils.run_inference(model, eval_json, label_csv, 'exp/landing/fold1_eval_soft_labels.csv', device)