In [7]:
import os
from torchvision.datasets import ImageFolder
import torch
from torchvision import transforms, models
from torch import nn
from collections import OrderedDict
import matplotlib.pyplot as plt

In [2]:
# Root directory that holds the pre-computed mel-spectrogram images.
output_folder = "data/birdclef-2023/mel_spectrograms"

# One sub-directory per dataset split (training, validation, testing).
train_folder, val_folder, test_folder = (
    os.path.join(output_folder, split_name)
    for split_name in ("train", "validation", "test")
)

In [3]:
# Channel statistics of ImageNet; the pretrained torchvision ViT weights expect
# inputs normalised with these values.
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# Training pipeline: a random resized crop acts as augmentation.
# Horizontal flipping stays disabled, as in the original pipeline.
data_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(imagenet_mean, imagenet_std),
])

# Evaluation pipeline: deterministic resize to the 224x224 input of ViT-B/16.
# Random crops must not be applied here, otherwise validation/test metrics
# change from run to run for the very same model.
eval_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(imagenet_mean, imagenet_std),
])

# Load the datasets with ImageFolder (one sub-directory per class).
train_data = ImageFolder(train_folder, data_transforms)
valid_set = ImageFolder(val_folder, eval_transforms)
test_set = ImageFolder(test_folder, eval_transforms)

# Wrap the datasets in dataloaders; only the training data is shuffled.
# NOTE: `valid_data` and `test_data` are DataLoaders, while `train_data` is a Dataset.
trainloader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
valid_data = torch.utils.data.DataLoader(valid_set, batch_size=64)
test_data = torch.utils.data.DataLoader(test_set, batch_size=64)

# Map class index -> class name (the folder name found by ImageFolder).
classes = dict(enumerate(train_data.classes))

In [4]:
# Start from torchvision's ViT-B/16 initialised with its default pretrained weights.
pretrained_weights = models.ViT_B_16_Weights.DEFAULT
model = models.vit_b_16(weights=pretrained_weights)
model

VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

In [5]:
# Freeze every pretrained parameter; only the new head defined below is trained.
for param in model.parameters():
    param.requires_grad = False

# New feed-forward classification head.
# Every key in the OrderedDict must be unique: a repeated key ('relu',
# 'dropout') overwrites the earlier entry in place instead of adding a layer,
# which would leave fc2 and fc3 back to back with no activation or dropout
# between them.
# 768 is the ViT-B/16 embedding width (see the printed model); 247 is the
# output size -- presumably the number of classes, verify against
# len(train_data.classes).
classifier = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(768, 400)),
    ('relu1', nn.ReLU()),
    ('dropout1', nn.Dropout(0.5)),
    ('fc2', nn.Linear(400, 350)),
    ('relu2', nn.ReLU()),
    ('dropout2', nn.Dropout(0.5)),
    ('fc3', nn.Linear(350, 247)),
    ('output', nn.LogSoftmax(dim=1)),  # log-probabilities, to be paired with NLL loss
]))

# Replace the pretrained head; the freshly created layers are trainable
# because new nn.Linear parameters default to requires_grad=True.
model.heads = classifier
print(model)

VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

In [6]:
def create_backup(filename, train_losses, valid_losses, valid_acc):
    """Save the training history, the current model and two progress plots.

    Files written (relative to the working directory; the directories are
    created when missing):
      * backups/transformers/train_losses_<filename>
      * backups/transformers/valid_losses_<filename>
      * backups/transformers/valid_acc_<filename>
      * models/transformers/model_<filename>.pth
      * fig/transformers/plot_loss_<filename>.jpg
      * fig/transformers/plot_acc_<filename>.jpg

    Args:
        filename: Suffix shared by every file written by this call.
        train_losses: Per-epoch training losses (0-dim tensors or numbers).
        valid_losses: Per-epoch validation losses (0-dim tensors or numbers).
        valid_acc: Per-epoch validation accuracies (0-dim tensors or numbers).

    Note:
        The model that gets saved is the module-level ``model`` variable,
        not an argument of this function.
    """
    for directory in ("backups/transformers", "models/transformers", "fig/transformers"):
        os.makedirs(directory, exist_ok=True)

    torch.save(train_losses, "backups/transformers/train_losses_"+filename)
    torch.save(valid_losses, "backups/transformers/valid_losses_"+filename)
    torch.save(valid_acc, "backups/transformers/valid_acc_"+filename)
    torch.save(model, 'models/transformers/model_'+filename+'.pth')

    # Convert to plain floats so plotting works for CPU tensors, GPU tensors
    # and ordinary numbers alike.
    train_curve = [float(value) for value in train_losses]
    valid_curve = [float(value) for value in valid_losses]
    acc_curve = [float(value) for value in valid_acc]

    # Each plot gets its own figure that is closed afterwards. Drawing on the
    # implicit "current" pyplot figure would paint the loss curves on top of
    # the accuracy curve left over from the previous call.
    loss_fig, loss_ax = plt.subplots()
    loss_ax.plot(train_curve, label='Training loss')
    loss_ax.plot(valid_curve, label='Validation loss')
    loss_ax.legend(frameon=False)
    loss_ax.set_ylabel("Loss")
    loss_ax.set_xlabel("Epochs")
    loss_fig.savefig('fig/transformers/plot_loss_'+filename+'.jpg')
    plt.close(loss_fig)

    acc_fig, acc_ax = plt.subplots()
    acc_ax.plot(acc_curve, label='Validation Accuracy')
    acc_ax.legend(frameon=False)
    acc_ax.set_ylabel("Accuracy")
    acc_ax.set_xlabel("Epochs")
    acc_fig.savefig('fig/transformers/plot_acc_'+filename+'.jpg')
    plt.close(acc_fig)

    print("Created tensor backup")

In [8]:
def validation(model, valid_data, criterion):
    """Evaluate ``model`` on every batch of ``valid_data``.

    Gradient tracking and train/eval mode are NOT changed here; the caller is
    expected to call ``model.eval()`` and wrap the call in ``torch.no_grad()``.

    Args:
        model: Network whose output holds one score per class along dim 1.
        valid_data: Iterable yielding ``(images, labels)`` batches.
        criterion: Callable ``criterion(output, labels)`` returning a scalar
            loss tensor.

    Returns:
        ``(test_loss, accuracy)``: the SUM over batches of the batch loss
        (float) and the SUM over batches of the batch accuracy (CPU tensor,
        or ``0`` when there are no batches). Divide both by the number of
        batches to obtain averages.
    """
    # Evaluate on whatever device the model lives on. A hard-coded 'cuda'
    # fails when the model is on the CPU or when no GPU is available.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    test_loss = 0
    accuracy = 0
    for images, labels in valid_data:
        if device is not None:
            images, labels = images.to(device), labels.to(device)

        output = model.forward(images)
        test_loss += criterion(output, labels).item()

        # The class with the highest score is the prediction; exponentiating
        # the scores first would not change which index is the largest.
        predictions = output.argmax(dim=1)
        equality = (labels == predictions)
        # torch.FloatTensor is a CPU type, so the running accuracy stays on the CPU.
        accuracy += equality.type(torch.FloatTensor).mean()

    return test_loss, accuracy

In [9]:
# Per-epoch history; kept in its own cell so that re-running the training
# cell appends to the existing history instead of resetting it.
train_losses = []
valid_losses = []
valid_acc = []


In [None]:
# --- Hyper-parameters -------------------------------------------------------
learning_rate = 0.000005
epochs = 500
offset = 0            # added to the epoch number in checkpoint file names
step_of_backup = 10   # write a checkpoint every `step_of_backup` epochs

# Use the GPU when one is available; model and batches must share a device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Only hand the optimizer the parameters that are actually trained.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=learning_rate)
# NLL loss operates on log-probabilities.
criterion = nn.functional.nll_loss

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    n_batches = 0
    for X, y in trainloader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()      # clear gradients from the previous step
        # NOTE: the original author reported errors with `model(X)`, so
        # forward() is still called directly.
        output = model.forward(X)
        loss = criterion(output, y)
        loss.backward()            # back-propagation
        optimizer.step()           # parameter update
        running_loss += loss.item()
        n_batches += 1

    # Evaluation mode for inference; gradients are not needed for validation.
    model.eval()
    with torch.no_grad():
        test_loss, accuracy = validation(model, valid_data, criterion)

    # Mean training loss over the epoch (not just the last mini-batch).
    # Stored as a CPU tensor because create_backup may call `.to('cpu')` on
    # each entry of the history.
    epoch_train_loss = torch.tensor(running_loss / max(n_batches, 1))
    # validation() returns sums over batches, so divide by the batch count.
    epoch_valid_loss = test_loss / len(valid_data)
    epoch_valid_acc = accuracy / len(valid_data)

    print("Epoch: {}/{}...".format(epoch+1, epochs),
          "Train Loss: {:.4f}".format(epoch_train_loss.item()),
          "Valid Loss: {:.3f}.. ".format(epoch_valid_loss),
          "Valid Accuracy: {:.3f}".format(epoch_valid_acc))

    train_losses.append(epoch_train_loss)
    valid_losses.append(epoch_valid_loss)
    valid_acc.append(epoch_valid_acc)

    if (epoch+1) % step_of_backup == 0:
        create_backup('checkpoint_epoch_'+str(epoch+1+offset)+'_lr_'+str(learning_rate)+'.txt', train_losses, valid_losses, valid_acc)