In [1]:
import wandb
import torch
from torch import optim
from torch.utils.data import DataLoader
from clap import Clap, ClapAudioClassifier
from clap.training import create_scheduler, ClapFinetuner
from clap.datasets import ClapDataset
from clap.utils import get_target_device, load_clap_config, set_random_seed

# Fine-tune ClapAudioClassifier on ESC-50

In [2]:
# Load config for audio processing and get target device
audio_encoder = "htsat-tiny"
text_encoder = "gpt2"
cfg_version = 1
ckpt_version = 2
config = load_clap_config(audio_encoder=audio_encoder, text_encoder=text_encoder, version=cfg_version)
device = get_target_device()

In [3]:
# Load Datasets
seed = set_random_seed(None)
train_dataset = ClapDataset(config=config, kinds=["train"], datasets=["ESC50"])
val_dataset = ClapDataset(config=config, kinds=["val"], datasets=["ESC50"])
test_dataset = ClapDataset(config=config, kinds=["test"], datasets=["ESC50"])

Random seed set as 2608568488


In [4]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mleonakkad[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
wandb.init(
    # Set the wandb project where this run will be logged 
    project='CLAP-Fine-tuning',
    name="First fine-tuning run",
    # Track hyperparameters
    config=config
)
config = wandb.config

In [6]:
# Define data loaders
train_loader = DataLoader(train_dataset, batch_size=config["fine-tuning"]["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config["fine-tuning"]["batch_size"])
test_loader = DataLoader(test_dataset, batch_size=config["fine-tuning"]["batch_size"])

In [7]:
# Define model, optimizer, scheduler and loss function
clap = Clap.from_ckpt(audio_encoder=audio_encoder, text_encoder=text_encoder, ckpt_version=ckpt_version, cfg_version=cfg_version)
clap_clf = ClapAudioClassifier(clap=clap, config=config).to(device)
print(f"Number of parameters to train: {sum(p.numel() for p in clap_clf.parameters())}")
optimizer = optim.Adam(clap.parameters(), lr=config["fine-tuning"]["learning_rate"])
scheduler = create_scheduler(optimizer, warmup_steps=31, T_max=len(train_loader)*config["fine-tuning"]["epochs"], milestones=[31])
loss_fn = torch.nn.CrossEntropyLoss()
trainer = ClapFinetuner(
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    model=clap_clf,
    optimizer=optimizer,
    scheduler=scheduler,
    loss_fn=loss_fn,
    epochs=config["fine-tuning"]["epochs"]
)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Number of parameters to train: 158435530


In [8]:
train_metrics, val_metrics, test_metrics = trainer.finetune_and_eval(audio_encoder=audio_encoder, text_encoder=text_encoder, version=1, early_stopping=False)


Starting to finetune Classifier


Training epoch 0: 100%|██████████| 25/25 [00:30<00:00,  1.22s/it]
Evaluating model on val/test set: 100%|██████████| 4/4 [00:03<00:00,  1.23it/s]



Epoch: 0 || Training loss: 3.8961 || Validation loss: 3.8534 || Training accuracy: 0.1100 || Validation accuracy: 0.4762

Model saved to C:\Users\leon\Documents\ML_Projects\Custom-CLAP\clap\checkpoints\clf_htsat-tiny_gpt2_v1.ckpt



Training epoch 1: 100%|██████████| 25/25 [00:25<00:00,  1.01s/it]
Evaluating model on val/test set: 100%|██████████| 4/4 [00:02<00:00,  1.50it/s]



Epoch: 1 || Training loss: 3.7964 || Validation loss: 3.7185 || Training accuracy: 0.6737 || Validation accuracy: 0.8006

Model saved to C:\Users\leon\Documents\ML_Projects\Custom-CLAP\clap\checkpoints\clf_htsat-tiny_gpt2_v1.ckpt



Training epoch 2: 100%|██████████| 25/25 [00:25<00:00,  1.00s/it]
Evaluating model on val/test set: 100%|██████████| 4/4 [00:02<00:00,  1.66it/s]



Epoch: 2 || Training loss: 3.6738 || Validation loss: 3.6169 || Training accuracy: 0.8594 || Validation accuracy: 0.8405

Model saved to C:\Users\leon\Documents\ML_Projects\Custom-CLAP\clap\checkpoints\clf_htsat-tiny_gpt2_v1.ckpt



Training epoch 3: 100%|██████████| 25/25 [00:25<00:00,  1.03s/it]
Evaluating model on val/test set: 100%|██████████| 4/4 [00:02<00:00,  1.55it/s]



Epoch: 3 || Training loss: 3.5926 || Validation loss: 3.5548 || Training accuracy: 0.9038 || Validation accuracy: 0.8812

Model saved to C:\Users\leon\Documents\ML_Projects\Custom-CLAP\clap\checkpoints\clf_htsat-tiny_gpt2_v1.ckpt



Training epoch 4: 100%|██████████| 25/25 [00:26<00:00,  1.04s/it]
Evaluating model on val/test set: 100%|██████████| 4/4 [00:02<00:00,  1.49it/s]



Epoch: 4 || Training loss: 3.5517 || Validation loss: 3.5366 || Training accuracy: 0.9313 || Validation accuracy: 0.8769

Model saved to C:\Users\leon\Documents\ML_Projects\Custom-CLAP\clap\checkpoints\clf_htsat-tiny_gpt2_v1.ckpt



Evaluating model on val/test set: 100%|██████████| 4/4 [00:02<00:00,  1.58it/s]


Final loss: 3.5365676879882812 || Final test accuracy: 0.8769

Done!





In [9]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▃▅▆█
test/batch accuracy,▅▁▇█
test/batch loss,▅█▅▁
test/step,▁▃▆█
train/accuracy,▁▆▇██
train/batch accuracy,▁▁▁▁▁▂▂▃▄▅▆▆▆▆▆▆▇▇▇▇█▇▇▇▇▇█▇▇███▇██▇███▇
train/batch loss,██████▇▇▇▇▆▆▆▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁
train/loss,█▆▃▂▁
train/step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val/accuracy,▁▇▇██

0,1
epoch,4.0
test/batch accuracy,0.91379
test/batch loss,3.52291
test/step,3.0
train/accuracy,0.93125
train/batch accuracy,0.92188
train/batch loss,3.5414
train/loss,3.55167
train/step,124.0
val/accuracy,0.87689
