In [1]:
from torchvision import datasets, transforms
import math
import torch
import torch.nn.functional as nnf
import torch.nn as nn
import os
import time
import torch.optim as optim 
import datetime
import csv
import numpy as np
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt


#### Model

In [2]:
# Load DenseNet-161 with pretrained weights for transfer learning. Only its
# `.features` sub-module is reused by the classifier defined below.
# NOTE(review): newer torchvision releases replace the `pretrained=` flag with
# a `weights=` argument — confirm against the installed version.
import torchvision.models as models
densenet = models.densenet161(pretrained=True)

class transfer_music_classifer(nn.Module):
  """Spectrogram classifier: pretrained DenseNet features + a small MLP head.

  The convolutional feature extractor comes from the module-level ``densenet``
  network; the head maps the 2208 pooled feature channels to 5 class scores.

  ``forward`` returns raw, unnormalised class scores (logits). The head used
  to end in ``nn.Sigmoid()``, but the training code in this notebook feeds the
  output to ``nn.CrossEntropyLoss``, which applies log-softmax itself and
  expects logits. Squashing the scores into (0, 1) first caps how confident
  the softmax can become and weakens the gradients. Sigmoid is monotonic and
  has no parameters, so dropping it leaves arg-max predictions and the
  state-dict keys of existing checkpoints unchanged.
  """

  def __init__(self):
    super(transfer_music_classifer, self).__init__()
    # This is a reference to the shared module-level network, not a copy:
    # every instance of this class uses (and fine-tunes / loads weights into)
    # the same feature-extractor parameters.
    self.featureExtract = densenet.features

    self.classifier = nn.Sequential(
      nn.Linear(2208, 780),
      nn.ReLU(),
      nn.Dropout(0.2),
      nn.Linear(780, 240),
      nn.ReLU(),
      nn.Dropout(0.2),
      nn.Linear(240, 5),
    )

  def forward(self, spectrogram):
    """Return class logits of shape (batch, 5) for a batch of spectrogram images."""
    features = self.featureExtract(spectrogram)

    ######
    # This code is normally built into densenet, but since we are splitting
    # the features and classifier it must be put here
    # Taken from here:
    # https://pytorch.org/vision/stable/_modules/torchvision/models/densenet.html

    # inplace=True modifies the input instead of allocating memory for an
    # output, which saves memory
    out = nnf.relu(features, inplace=True)

    # adaptive average pooling picks stride and kernel size automatically so
    # the output is (batch, channels, 1, 1) regardless of the input size
    out = nnf.adaptive_avg_pool2d(out, (1, 1))

    # flatten to (batch, channels)
    out = torch.flatten(out, 1)
    ######

    out = self.classifier(out)
    return out

In [3]:
# Show the layer structure of the DenseNet feature extractor.
print(densenet.features)

Sequential(
  (conv0): Conv2d(3, 96, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (norm0): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu0): ReLU(inplace=True)
  (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (denseblock1): _DenseBlock(
    (denselayer1): _DenseLayer(
      (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu1): ReLU(inplace=True)
      (conv1): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (norm2): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu2): ReLU(inplace=True)
      (conv2): Conv2d(192, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    )
    (denselayer2): _DenseLayer(
      (norm1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu1): ReLU(inplace=True)
      (conv1): Conv2d(144, 192, 

#### Data loading

In [14]:
# Seed torch's global RNG so runs are repeatable.
torch.manual_seed(1000)

# Dataset root directory -- machine specific, point this at your own copy.
root_path = r'C:\Users\sjd30\Docnosync\spectrograms2'
#root_path = r'C:\Users\sjd30\Docnosync\overfit'

# Resize to 224x224, convert to a tensor, then apply the channel-wise
# normalisation the pretrained densenet expects.
preprocess = transforms.Compose([
  transforms.Resize([224,224]),
  transforms.ToTensor(),
  transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# One sub-folder per class; images are transformed to tensors on load.
dataset = datasets.ImageFolder(root_path, transform=preprocess)
total_images = len(dataset)
print("Total num images:", total_images)

# 70% train / 15% validation; the test split takes everything that is left,
# so images lost to rounding down end up in the test set.
num_train_samples = math.floor(total_images*0.7)
num_val_samples = math.floor(total_images*0.15)
num_test_samples = total_images - num_train_samples - num_val_samples
split_sizes = [num_train_samples, num_val_samples, num_test_samples]
print("number of images in train,val,test,total: ", *split_sizes, sum(split_sizes))

# Random split driven by its own fixed-seed generator, for reproducibility.
split_generator = torch.Generator().manual_seed(100)
train_data, val_data, test_data = torch.utils.data.random_split(dataset, split_sizes, generator=split_generator)

Total num images: 108738
number of images in train,val,test,total:  76116 16310 16312 108738


#### Functions

In [15]:
def _append_csv_row(csv_path, row):
  """Append a single row to the CSV log at ``csv_path``."""
  # The csv module requires files to be opened with newline=''; without it
  # every record is followed by an empty line on Windows.
  with open(csv_path, 'a', newline='') as file:
    csv.writer(file).writerow(row)


def train_net(model, train_data, val_data, batch_size=32, learning_rate=1e-3, num_epochs=5, save=False, save_dir=r'C:\Users\sjd30\Downloads', weight_decay=1e-2, net_name='default'):
  """Fine-tune ``model`` on ``train_data`` and validate after every epoch.

  Args:
    model: network exposing ``featureExtract`` and ``classifier`` sub-modules.
    train_data, val_data: pytorch datasets yielding (spectrogram, label).
    batch_size, learning_rate, num_epochs: training hyperparameters. The
      ``featureExtract`` parameters are trained at ``learning_rate / 3``.
    save: when True, the model's state dict is written to ``save_dir`` after
      every epoch (unconditionally, not only when validation improves).
    save_dir: directory in which checkpoints are saved.
    weight_decay: weight-decay coefficient passed to the AdamW optimizer.
    net_name: label embedded in the checkpoint file name.

  Side effects:
    Appends one row per mini-batch ``[loss, batch accuracy, iteration, nan]``
    and one row per epoch ``[nan, nan, nan, validation accuracy]`` to the
    hard-coded ``csvPath`` (``save_dir`` does not affect it). The logged loss
    is the batch-mean loss exactly as returned by the criterion.

  Returns:
    None.
  """
  csvPath = r'C:\Users\sjd30\Downloads\records.csv'
  # CE Loss for multi-class; default reduction averages over the mini-batch
  criterion = nn.CrossEntropyLoss()

  # the pretrained features of densenet are trained at 1/3 the learning rate
  # for fine tuning purposes.
  optimizer = optim.AdamW([
                {'params': model.featureExtract.parameters(), 'lr': learning_rate / 3.0},
                {'params': model.classifier.parameters() }
            ], lr=learning_rate, weight_decay = weight_decay)

  trainacc, iterations, lossrecords = [], [], []

  # create the dataloaders from the passed data
  train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
                                          num_workers=0, shuffle=True)
  val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size,
                                            num_workers=0, shuffle=True)
  total_batches = num_epochs * len(train_loader)
  batch_num = 0
  start = time.time()
  for epoch in range(num_epochs):
    # make sure dropout / batch-norm are in training mode, whatever state the
    # model was handed over in or left in by evaluation
    model.train()

    # loops through each minibatch
    for spectrograms, labels in train_loader:
      # Uses GPU if available.
      if torch.cuda.is_available():
        spectrograms = spectrograms.cuda()
        labels = labels.cuda()

      # standard pytorch training step: clear old gradients, forward pass,
      # loss, backpropagation, weight update
      optimizer.zero_grad()
      out = model(spectrograms)

      loss = criterion(out, labels)
      loss.backward()
      optimizer.step()

      pred = out.max(1, keepdim=True)[1]
      mini_batch_correct = pred.eq(labels.view_as(pred)).sum().item()
      mini_batch_total = spectrograms.shape[0]

      # the criterion already returns the mean over the batch, so the value
      # is recorded as-is (dividing by batch_size again would under-report it)
      lossrecords.append(loss.item())
      trainacc.append(mini_batch_correct / mini_batch_total)
      iterations.append(batch_num)

      # every 50 iterations give basic progress info
      if batch_num % 50 == 0:
        print('Batch Number {0}  Total Time (s): {1}  Progress: {2}%  Trainacc: {3}'.format(batch_num, time.time()-start, (100 * batch_num) / total_batches, trainacc[-1]))
      # progress to next batch
      batch_num += 1

      # written per batch so the log survives an interrupted run
      _append_csv_row(csvPath, [lossrecords[-1], trainacc[-1], iterations[-1], np.nan])

    # saves the file with a descriptive name, including a timestamp to help
    # prevent overwriting files in case the net_name was not changed
    if save:
      print('saving model')
      save_file = os.path.join(save_dir,'name-{0}_epoch-{1}_bs-{2}_lr-{3}_time-{4}'.format(net_name, epoch+1, batch_size, learning_rate, datetime.datetime.now().strftime("%H-%M-%S")))
      torch.save(model.state_dict(), save_file)

    valacc = eval_model(model, val_loader)

    _append_csv_row(csvPath, [np.nan, np.nan, np.nan, valacc])

    # each epoch, give more detailed accuracy information
    print('Epoch number: {0}  Train acc: {1}  Val acc: {2}   Loss{3}\nTotal Time (s): {4}   Time per Epoch (s): {5}  \n\n'.format(
        epoch+1, 'n/a', valacc, lossrecords[-1], time.time() - start, (time.time() - start) / (1 + epoch) ))

In [16]:
def eval_model(model, loader_to_evaluate):
  """Return the classification accuracy of ``model`` over a data loader.

  Args:
    model: network whose output has one row per sample and one column per
      class; the predicted class is the column with the largest score.
    loader_to_evaluate: iterable of ``(spectrograms, labels)`` batches.

  Returns:
    Fraction of correctly classified samples, or ``nan`` when the loader
    yields no samples (the accuracy is undefined in that case).

  The model is switched to eval mode for the duration of the call and is put
  back into whatever mode (train or eval) it was in beforehand, even if
  evaluation raises.
  """
  was_training = model.training
  model.eval()
  correct = 0
  total = 0
  try:
    with torch.no_grad():
      for spectrograms, labels in loader_to_evaluate:
        # use GPU
        if torch.cuda.is_available():
          spectrograms = spectrograms.cuda()
          labels = labels.cuda()

        # max over dim 1 (the classes) gives, per sample, the index of the
        # highest-scoring class; keepdim leaves it as a (batch, 1) column
        classPrediction = model(spectrograms).max(1, keepdim=True)[1]
        # count how many labels equal the prediction; view_as lines the
        # labels up with the (batch, 1) prediction column
        correct += classPrediction.eq(labels.view_as(classPrediction)).sum().item()
        total += spectrograms.shape[0]
  finally:
    # restore the caller's mode instead of forcing train mode
    model.train(was_training)
  if total == 0:
    return float('nan')
  return correct / total

#### Training

In [17]:
# Start from a fresh classifier. To resume from a saved checkpoint instead,
# uncomment the two lines below the constructor.
model = transfer_music_classifer()
#state = torch.load(r'C:\Users\sjd30\Downloads\name-optimal2_epoch-32_bs-16_lr-0.0003_time-12-30-31')
#model.load_state_dict(state)

# Move the model to the GPU when one is present and report the choice.
use_gpu = torch.cuda.is_available()
print('Using GPU' if use_gpu else 'Using CPU, consider using a GPU runtime to speed up training')
if use_gpu:
    model.cuda()

train_net(
    model,
    train_data,
    val_data,
    batch_size=16,
    learning_rate=1.5e-4,
    num_epochs=20,
    save=True,
    weight_decay=1e-2,
    net_name='microlr',
)

Using GPU
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])
Batch Number 0  Total Time (s): 0.356428861618042  Progress: 0.0%  Trainacc: 0.3125
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])
torch.Size([16, 2208, 7, 7])
torch.Size([16, 2208, 1, 1])


KeyboardInterrupt: 

In [None]:
# Run the final checkpoint over the test split and collect the true labels and
# predicted classes used to build the confusion matrix.
model = transfer_music_classifer()
# map_location='cpu' lets a checkpoint saved from a GPU model load on a
# CPU-only machine too; the model is moved to the GPU below when one exists.
state = torch.load(r"C:\Users\sjd30\Downloads\retrain\FINAL MODEL.00015_time-17-59-12", map_location='cpu')
model.load_state_dict(state)
model.eval()
# Evaluate on GPU when available
if torch.cuda.is_available():
    print('Using GPU')
    model.cuda()
else:
    print('Using CPU, consider using a GPU runtime to speed up training')
test_loader = torch.utils.data.DataLoader(test_data, batch_size=16,
                                    num_workers=0, shuffle=True)
truth = []
pred = []
# no_grad: inference only, so skip building autograd graphs (saves memory)
with torch.no_grad():
    for spec, label in test_loader:
        if torch.cuda.is_available():
            spec = spec.cuda()
        # index of the highest-scoring class for every sample in the batch
        pred.extend(model(spec).max(1)[1].tolist())
        truth.extend(label.tolist())
print('done')
#print(eval_model(model, test_loader))
# back to train mode; the trailing ';' keeps the notebook from printing the
# whole model as the cell's output
model.train();

In [None]:
# Tick labels for the confusion matrix.
# NOTE(review): assumes class indices 0-4 map to these genres in this order —
# confirm against dataset.classes.
axisLabels = ['Electronic', 'Folk', 'Hip-Hop', 'Pop', 'Rock']

# seaborn is imported as `sn` in this notebook (`sns` is undefined)
sn.set(font_scale=1.4)

# labels=... pins the matrix to all five classes so rows/columns always line
# up with axisLabels, even if some class never appears in truth or pred
class_indices = list(range(len(axisLabels)))
chart = sn.heatmap(confusion_matrix(truth, pred, labels=class_indices), annot=True, annot_kws={"size": 12}, fmt='g', cbar = False,
           xticklabels = axisLabels, yticklabels = axisLabels)
chart.set_ylabel('Actual Truth')
chart.set_xlabel('Predicted Value')


In [21]:
# NOTE(review): appears to be a leftover scratch cell — consider removing it.
print('f')

f
