In [1]:
# @title Imports

import os
import glob
import torch
import time
import copy
import random

import numpy as np
import matplotlib.pyplot as plt
import pickle
from tqdm.notebook import tqdm

import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
from torch.optim import lr_scheduler

In [2]:
# Mount Google Drive if Necessary
# from google.colab import drive
# drive.mount('/content/drive') #it will ask you for a verification code

In [2]:
# Set path to datasets and models
datapath = os.path.abspath('./data')
#datapath = os.path.abspath('/content/drive/MyDrive/LEVERHULME/COURSES/NEUROMATCH2021/HallucinatingGANs/Code/data')
print(datapath)

/home/jon/Drive/LEVERHULME/COURSES/NEUROMATCH2021/HallucinatingGANs/Code/data


In [3]:
# @title Set device (GPU or CPU)
# NMA code
# inform the user if the notebook uses GPU or CPU.

def set_device():
  device = "cuda" if torch.cuda.is_available() else "cpu"
  if device != "cuda":
    print("WARNING: For this notebook to perform best, "
        "if possible, in the menu under `Runtime` -> "
        "`Change runtime type.`  select `GPU` ")
  else:
    print("GPU is enabled in this notebook.")

  return device

def set_seed(seed=None, seed_torch=True):
  if seed is None:
    seed = np.random.choice(2 ** 32)
  random.seed(seed)
  np.random.seed(seed)
  if seed_torch:
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

  print(f'Random seed {seed} has been set.')

# In case that `DataLoader` is used
def seed_worker(worker_id):
  worker_seed = torch.initial_seed() % 2**32
  np.random.seed(worker_seed)
  random.seed(worker_seed)

def scale_minmax(X):
    mini = X.min()
    maxi = X.max()
    X_scaled = (X - mini) / (maxi - mini)
    return X_scaled, mini, maxi


set_seed(seed=2021)
DEVICE = set_device()

Random seed 2021 has been set.
GPU is enabled in this notebook.


In [4]:
# Test pickle loaders

def pickle_loader_mel(file):
  with open(file, 'rb') as f:
    data = pickle.load(f) # load pickle file from disk a tensor
    # Why?
    data = data.permute(1, 2, 0)
    # Scale frequency amplitudes to the range [0.0, 1.0]
    data,_,_ = scale_minmax(data)
    
    # Create 3 channel images by copying the image to each channel
    # TODO: Try different use of RGB channels, e.g. time/frequency chopping the image and putting each slice on a different channel
    # This approach will actually make 3 dreams, one per channel..
    data = torch.cat((data, data, data), axis=2)

  return(data)

def pickle_loader_stft_real(file):
  with open(file, 'rb') as f:
      datain = pickle.load(f) # load this pickle file, expected a tensor
      #print(f"Data In: {datain.shape}")
    
      # the loaded tensors are 1x513x2586(2585)x2 >> 1xBINSxFRAMESxREAL/COMPLEX
      # not sure why the extra dim 1 at the beginning, this removes it
      data = datain.squeeze(dim=0)
      
      # NOTE: the model expects these images as 3xBINSxFRAMES where 3=channels
        
      # Annoyingly, the spectrograms are not consistent in length
      # most are 2586 or 2585, but I've also seen 2592, 2599, 2640, 2626, 2579
      # Clip all of them to 2502 to be safe (which is still divisible by 3)
      # TODO: Try clipping to 861 (10s) ... or even smaller
      data = data[:,:2502,:]
    
      # real and imaginary parts are scaled independently
      # TODO: also try log scaling
      data[:,:,0],_,_ = scale_minmax(data[:,:,0])
      data[:,:,1],_,_ = scale_minmax(data[:,:,1])

      data = data[:,:,0].reshape(3, 513, -1)
      # add an extra dim for 3-chan concatenation
      # real is now BINSxFRAMESx1
      #real = data[:,:,0].unsqueeze(2)
        
      # copy the real (magnitude) data 3 times for each channel
      #data = torch.cat((real, real, real), axis=2)
        
#      print(f"Data In: {datain.shape} -- Data Out: {data.shape}")
  return(data)

# Load mel (NOT YET TESTED IN TRAINING)
pkfile = os.path.join(datapath, 'spectrograms/mel/1024_256_128/blues/blues.00000.pkl')
smp = pickle_loader_mel(pkfile)
print("Mel Loader (Mels not tested in training!): ", smp.shape, type(smp), smp.dtype)

# Load stft
print()
pkfile = os.path.join(datapath, 'spectrograms/stft/1024_256/classical/classical.00000.pkl')
smp = pickle_loader_stft_real(pkfile)
print("STFT Loader: ", smp.shape, type(smp), smp.dtype)

# TODO: print out the spectrograms to make sure they look correct

Mel Loader (Mels not tested in training!):  torch.Size([128, 2586, 3]) <class 'torch.Tensor'> torch.float32

STFT Loader:  torch.Size([3, 513, 834]) <class 'torch.Tensor'> torch.float32


In [5]:
# Load dataset

data_dir = os.path.join(datapath, 'spectrograms', 'stft', '1024_256')
print("Loading Dataset: ", data_dir)

#dims = (513, 2586) # 30s audio clips
dims = (513, 862) 
data_transforms = transforms.Compose([transforms.ToTensor()])
pickle_loader = pickle_loader_stft_real

dataset = torchvision.datasets.DatasetFolder(
    root = data_dir,
#    transform = data_transforms, 
    loader = pickle_loader, 
    extensions='.pkl', 
)

class2label, label2class = dataset.find_classes(data_dir)
print("\nFound Class Labels: ", label2class)

# Make train-val-test split
n_classes = len(class2label) 

items_per_class = 100
ratios = [0.8, 0.1, 0.1]
train_size = ratios[0] * items_per_class
val_size = ratios[1] * items_per_class
test_size = ratios[2] * items_per_class

test_ix, val_ix, train_ix = np.array([]),np.array([]),np.array([])

# Take the appropriate ratio of examples from each class
for i in range(n_classes): 
    class_ix = items_per_class * i # 0 index for samples from class i
    train_ix = np.append(train_ix, np.arange(train_size) + class_ix)
    val_ix = np.append(val_ix, np.arange(train_size, train_size + val_size) + class_ix)
    test_ix = np.append(test_ix, np.arange(train_size + val_size, train_size + val_size + test_size) + class_ix)

# Create train-val-test subsets of the dataset
subsets = {
    'train': torch.utils.data.Subset(dataset, train_ix.astype(int)),
    'val': torch.utils.data.Subset(dataset, val_ix.astype(int)),
    'test': torch.utils.data.Subset(dataset, test_ix.astype(int))
}

Loading Dataset:  /home/jon/Drive/LEVERHULME/COURSES/NEUROMATCH2021/HallucinatingGANs/Code/data/spectrograms/stft/1024_256

Found Class Labels:  {'blues': 0, 'classical': 1, 'country': 2, 'disco': 3, 'hiphop': 4, 'jazz': 5, 'metal': 6, 'pop': 7, 'reggae': 8, 'rock': 9}


In [6]:
# Load pretrained model INCEPTION3

model_path = os.path.join(datapath, 'models/inception3.pth')

# See: https://pytorch.org/vision/stable/_modules/torchvision/models/inception.html
# And: https://pytorch.org/hub/pytorch_vision_inception_v3/

inception3 = models.inception_v3(pretrained=False)
inception3.load_state_dict(torch.load(model_path))

# Freeze the network, we are only training the final classifier layer
for param in inception3.parameters():
    param.requires_grad = False

# Add our own classifier layer, replacing the ImageNet classifier, on to the end of inception3
inception3.fc = nn.Sequential(
    nn.Linear(2048, 256, bias=True),
    nn.ReLU(), 
    nn.Linear(256, n_classes, bias=True),                   
    nn.LogSoftmax(dim=1)
)

inception3.to(DEVICE)





Inception3(
  (Conv2d_1a_3x3): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2a_3x3): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2b_3x3): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Conv2d_3b_1x1): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_4a_3x3): BasicConv2d(
    (conv): Conv2d(80, 192, kernel_size=(3, 3), stri

In [7]:
#model training parameters
n_epochs = 2
n_workers = 4
batch_size = 16

#audio transforms parrameters
n_fft = 1024
n_mels = 128
hop_length = 256 # smaller hop size leads to better reconstruction but takes longer to compute
power = 2.0 # squared power spectrogram
samplerate =  22050


learning_rate = 1e-03
momentum=0.9

#model = vgg16
model = inception3
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

# Use a LR scheduler: decay LR by a factor of 0.1 every 7 epochs
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)


In [None]:
# Load pre-trained model VGG16

# # WARNING: This model is big! (over 1.4GB on the GPU!)
# For a comparison of model sizes and training speeds, see:
#     https://www.analyticsvidhya.com/blog/2020/08/top-4-pre-trained-models-for-image-classification-with-python-code/

# model_path = os.path.join(datapath, 'models/vgg16.pth')
# vgg16 = torchvision.models.vgg16(pretrained=False)
# vgg16_state_dict = torch.load(model_path)
# vgg16.load_state_dict(vgg16_state_dict)

# vgg16.to(DEVICE)

In [8]:
# create dataloaders
from torch.utils.data import DataLoader

print("Item from subset:",subsets['train'][0][0].shape)
#batch_size=2
dataloaders = {
    'train': DataLoader(subsets['train'], batch_size=batch_size, shuffle=True, drop_last=True),
    'val': DataLoader(subsets['val'], batch_size=batch_size, shuffle=True, drop_last=True)
}

dl = dataloaders['train']
inputs, labels = next(iter(dl))
print("One Batch:", inputs.shape, labels.shape)

#model.eval()
model.train()
with torch.no_grad():
    out = model(inputs.to(DEVICE))
    print(f"\nOutput on batch is {type(out)} \n  with Output[0]: {out[0].shape}  and Output[1]: {out[1].shape}")
    del out
    del inputs

# TODO: print out the spectrograms to make sure they look correct

Item from subset: torch.Size([3, 513, 834])
One Batch: torch.Size([16, 3, 513, 834]) torch.Size([16])


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)



Output on batch is <class 'torchvision.models.inception.InceptionOutputs'> 
  with Output[0]: torch.Size([16, 10])  and Output[1]: torch.Size([16, 1000])


In [14]:
# Go through batches
# i=0
# for inputs, labels in dataloaders['train']:
#     print(f"Batch {i} - {inputs.shape} : {labels.shape}")
#     i+=1

# i=0
# for inputs, labels in dataloaders['val']:
#     print(f"Batch {i} - {inputs.shape} : {labels.shape}")
#     i+=1

# Explicit memory cleanup if needed
del inputs
del labels
del outputs
del loss
del preds


NameError: name 'inputs' is not defined

In [10]:
# For useful tips on memory usage, see: https://pytorch.org/docs/stable/notes/faq.html
# Training...

# Training Loop
since = time.time()

# Keep track of weights with best loss performance...
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
# Keep track of train-val accuracy stats
train_acc_list, val_acc_list = [], []

for epoch in tqdm(range(n_epochs)):
    print(f'Epoch {epoch}/{n_epochs-1}')
    print('-' * 10)

    # For each epoch, do a training and evaluation (against the validation set) phase
    for phase in ['train', 'val']:
        if phase == 'train':
            model.train()  # Set model to training mode
        elif phase == 'val':
            model.eval()   # Set model to evaluate mode, especially important for dropout layers!

        running_loss = 0.0
        running_corrects = 0
        num_examples = 0

        # Iterate over data.
        for inputs, labels in dataloaders[phase]:
            
            outputs = None
            loss = None
            preds = None
            maxpreds = None
            
            num_examples += int(inputs.data.shape[0])

            
            inputs = inputs.to(DEVICE)
            labels = labels.to(DEVICE)
                

            if phase == 'train':
                
                # zero the parameter gradients
                optimizer.zero_grad()
                
                # forward
                outputs,_ = model(inputs)
                maxvalues, maxidx = torch.max(outputs.cpu().data, axis=1)
#                 print("Output size: ", outputs.shape)
#                 print("Labels size: ", labels.shape, labels.data)
#                 print("Predictions size: ", maxidx.shape, maxidx.data)
                loss = criterion(outputs, labels)
                    
                # backward
                loss.backward()
                optimizer.step()
                scheduler.step()
                    
            elif phase == 'val':
                                
                with torch.inference_mode(True):
                    # forward
                    outputs = model(inputs)
                    maxvalues, maxidx = torch.max(outputs.cpu().data, axis=1)
                    loss = criterion(outputs, labels)
            
            # statistics
            last_labels = labels.cpu().data
            last_preds = maxidx.data
            running_loss += float(loss.cpu().data) * inputs.cpu().data.shape[0]
            running_corrects += torch.sum(last_preds == last_labels)
                
            if num_examples % 100 == 0:
                print(f' ..{phase}-{num_examples}.. ', end='')
            
            # Explicit memory cleanup
            del inputs
            del labels
            del outputs
            del loss
            del preds
            del maxpreds
            
        print("Predicted  : ", last_preds)
        print("Real Labels: ", last_labels)
        print('number of examples trained on = ', num_examples)
        print(f'RUNNING LOSS: {running_loss}, RUNNING CORRECT PREDS: {running_corrects}')

        epoch_loss = running_loss / num_examples
        print()
        epoch_acc = running_corrects.double() / num_examples

        if phase == 'train':
            train_acc_list.append(epoch_acc)
        elif phase == 'val':
            val_acc_list.append(epoch_acc)
          
        print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

        # A new record for best accuracy on the validation set
        #  Keep a record of the weights and the new epoch accuracy
        if phase == 'val' and epoch_acc > best_acc:
            best_acc = epoch_acc
            best_model_wts = copy.deepcopy(model.state_dict())

        print()

time_elapsed = time.time() - since
print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
print('Best val Acc: {:4f}'.format(best_acc))

# load best model weights
model.load_state_dict(best_model_wts)


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch 0/1
----------
 ..train-400..  ..train-800.. Predicted  :  tensor([5, 5, 9, 9, 9, 8, 9, 9, 9, 6, 1, 5, 5, 8, 9, 5])
Real Labels:  tensor([2, 5, 7, 4, 1, 6, 7, 2, 3, 0, 5, 3, 9, 4, 7, 8])
number of examples trained on =  800
RUNNING LOSS: 1845.4993171691895, RUNNING CORRECT PREDS: 79

train Loss: 2.3069 Acc: 0.0988

Predicted  :  tensor([1, 1, 9, 1, 1, 1, 1, 9, 1, 5, 1, 1, 9, 1, 1, 1])
Real Labels:  tensor([7, 5, 2, 0, 8, 7, 9, 7, 2, 7, 3, 6, 6, 2, 6, 6])
number of examples trained on =  96
RUNNING LOSS: 221.07543563842773, RUNNING CORRECT PREDS: 10

val Loss: 2.3029 Acc: 0.1042

Epoch 1/1
----------
 ..train-400..  ..train-800.. Predicted  :  tensor([1, 9, 9, 1, 1, 1, 5, 1, 8, 1, 5, 1, 5, 9, 9, 1])
Real Labels:  tensor([0, 7, 8, 2, 4, 9, 7, 8, 5, 7, 2, 3, 3, 2, 9, 4])
number of examples trained on =  800
RUNNING LOSS: 1847.7895240783691, RUNNING CORRECT PREDS: 76

train Loss: 2.3097 Acc: 0.0950

Predicted  :  tensor([1, 1, 1, 9, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 1, 1])
Real Labels:  

<All keys matched successfully>

In [None]:
# save model for later
acc_dict = {'train_acc': train_acc_list, 'val_acc': val_acc_list}
print('writing file: ' + filename)
with open(acc_file, 'wb') as f:
    pickle.dump(acc_dict, f, pickle.HIGHEST_PROTOCOL)

#save model
torch.save(model_ft.state_dict(), model_file)

In [None]:
# @title Mount Google Drive

if mount_drive:
  from google.colab import drive
drive.mount('/content/drive') #it will ask you for a verification code

In [None]:
# @title Main Loop

for transform in transforms_list:
  #get settings for the transform to be performed
  params, pickle_loader, data_transforms = get_cfg_transform(transform, augment)

  #path of the corresponding spectrograms
  data_dir = os.path.join(os.path.abspath(path), 'spectrograms', transform, params)
  print(data_dir)

  #names of ouput files
  if augment:
    label = '{}AUG_{}_()'.format(transform, params)
  else:
    label = '{}_{}_()'.format(transform, params)

  acc_file = outpath + label + '.pkl'
  model_file = outpath + label + '.pt'

  #load dataset
  dataset = torchvision.datasets.DatasetFolder(root=data_dir,
                                              transform = data_transforms, 
                                              loader=pickle_loader, 
                                              extensions='.pkl', 
                                              )
  #get genres and number of classes in the dataset
  genres = list(os.listdir(data_dir))
  n_classes = len(genres) 

  #generate training, validation and test sets
  subsets = make_sets(classes=n_classes, 
                                        items_per_class=100,
                                        ratios=[.8, .1, .1])

  #create dataloaders
  dataloaders = {x: torch.utils.data.DataLoader(subsets[x], batch_size=minibatch_size,
                                              shuffle=True, num_workers=n_workers)
                for x in ['train', 'val']}

  # Load pretrained VGG
  # code extracted from:
  # https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html#convnet-as-fixed-feature-extractor
  # https://pytorch.org/vision/stable/models.html

  vgg16 = models.vgg16(pretrained=True)

  # Freeze the network except the last layer / unfreeze layers to allow finetuning
  for param in vgg16.parameters():
      param.requires_grad = True # If True it will train

  # Parameters of newly constructed modules have requires_grad=True by default
  # Add on classifier

  vgg16.classifier[6] = nn.Sequential(
                        nn.Linear(vgg16.classifier[3].in_features, 256),
                        nn.ReLU(), 
                        nn.Linear(256, n_classes),                   
                        nn.LogSoftmax(dim=1))

  criterion = nn.CrossEntropyLoss()

  # Observe that only parameters of final layer are being optimized as
  # opposed to before.
  optimizer_conv = optim.SGD(vgg16.parameters(), lr=0.001, momentum=0.9)

  # Decay LR by a factor of 0.1 every 7 epochs
  exp_lr_scheduler = lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)

  # model trains - change num_epochs to increase training time
  vgg16 = vgg16.float()
  model_ft, train_acc_list, val_acc_list = train_model(vgg16.to(device), criterion, optimizer_conv, exp_lr_scheduler,
                        num_epochs=epochs)

  #save accuracies from training procedure
  acc_dict = {'train_acc': train_acc_list, 'val_acc': val_acc_list}
  print('writing file: ' + filename)
  with open(acc_file, 'wb') as f:
      pickle.dump(acc_dict, f, pickle.HIGHEST_PROTOCOL)

  #save model
  torch.save(model_ft.state_dict(), model_file)
