# Optical Music Recognition - PyTorch implementation

In this notebook I train a convolution neural network to be able to correctly identify the pitch of music notes. 

The dataset I am using was created by DeepScores, and is available at https://tuggeluk.github.io/downloads/.

In [632]:
# Import dependencies

import pandas as pd
import cv2
import numpy as np
import os
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import efficientnet_pytorch
import torchvision
from pathlib import Path
from sklearn.utils import shuffle
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from random import randrange
from torchvision.models.detection import FasterRCNN
from xml.dom import minidom
from torchvision.models.detection.rpn import AnchorGenerator
from utils import collate_fn

## 1. Constructing data loaders

In [666]:
train_df = pd.read_csv('data/deep_scores_dense/train_names.csv', index_col=0, header = None)
test_df = pd.read_csv('data/deep_scores_dense/test_names.csv', index_col=0, header = None)
classes = pd.read_csv('data/deep_scores_dense/class_names.csv', index_col = 0, header = None)
class_map = classes.to_dict()[1]
class_map = {v: k for k, v in class_map.items()}


class DeepScores(torch.utils.data.Dataset):
    
    def __init__(self, df):
        self.fns = df[1]
        self.ROOT = 'data/deep_scores_dense/'
    
    def __len__(self):
        return len(self.fns)
    
    def get_bboxes(self, fn, scale_factors):
        xml = ET.parse(f'{self.ROOT}xml_annotations/{fn}.xml')
        root = xml.getroot()
        elems = root.findall('object/bndbox')
        bboxes = []
        x_scale, y_scale = scale_factors        
        for e in elems:
            bbox = [float(b) for b in e.itertext()]
            # XML format is (x0, x1, y0, y1)
            # We need (x0, y0, x1, y1)
            x0y0x1y1 = [0, 2, 1, 3]
            bbox = ([bbox[i] for i in x0y0x1y1])

            bbox[0] = bbox[0] * x_scale
            bbox[1] = bbox[1] * y_scale
            bbox[2] = bbox[2] * x_scale
            bbox[3] = bbox[3] * y_scale
            
            bboxes.append(bbox)
        return bboxes
    
    def get_labels(self, fn):
        xml = ET.parse(f'{self.ROOT}xml_annotations/{fn}.xml')
        root = xml.getroot()
        bbox_labels = []
        for tag in root.findall('object/name'):
            lab = tag.text
            lab = class_map[lab]
            bbox_labels.append(lab)
        return bbox_labels
    
    def get_bbox_area(self, bboxes):
        areas = [(x1 - x0) * (y1 - y0) for x0, y0, x1, y1 in bboxes]
        return areas
    

    def __getitem__(self, idx):
        file_name = self.fns[idx]
        fp = os.path.join(self.ROOT, 'images_png', file_name + '.png')
        img = cv2.imread(fp)
        img = img / 255.
        
        x_scale = 1000 
        y_scale = 1420 
        
        scale_factors = (x_scale, y_scale)
        
        img = cv2.resize(img, (x_scale, y_scale))
        img = torch.FloatTensor(img)
        img = img.permute(2, 0, 1)
        
        bboxes = self.get_bboxes(file_name, scale_factors)
        bboxes = torch.as_tensor(bboxes)
        labels = torch.tensor(self.get_labels(file_name), dtype=torch.int64)
        area = torch.FloatTensor(self.get_bbox_area(bboxes))
        iscrowd = torch.zeros(len(labels), dtype=torch.int64)
        target = {}
        target["boxes"] = bboxes
        target["labels"] = labels
        target["area"] = area
        target["iscrowd"] = iscrowd
        
        # Copy data to gpu if available
        img = img.to(device)
        target = target.to(device)
        
        # Then get the
        return img, target
    
training_data = DeepScores(train_df)

training_data_generator = torch.utils.data.DataLoader(training_data, 
                                                      batch_size = 4, 
                                                      collate_fn = collate_fn)

## 2. Training the model

In [667]:
backbone = torchvision.models.mobilenet_v2(pretrained=True).features
backbone.out_channels = 1280

anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                   aspect_ratios=((0.5, 1.0, 2.0),))

roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                output_size=7,
                                                sampling_ratio=2)

model = FasterRCNN(backbone, 
                   num_classes = 124, 
                   rpn_anchor_generator=anchor_generator, 
                   box_roi_pool = roi_pooler)

In [668]:
params = [p for p in model.parameters() if p.requires_grad]
optimiser = torch.optim.SGD(params, lr=0.005,
                            momentum=0.9, weight_decay=0.0005)

lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)

num_epochs = 10
device = 'cpu'

for epoch in range(num_epochs):
    engine.train_one_epoch(model, optimizer, training_data_generator, device, epoch, print_freq=1)
    lr_scheduler.step()
    
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimiser.state_dict()
    }, 'ckpts/')

AttributeError: 'dict' object has no attribute 'to'

In [665]:
from matplotlib.patches import Rectangle

plt.figure(num=None, figsize=(15, 10), dpi=100, facecolor='w', edgecolor='k')


item = training_data.__getitem__(3)
bboxes = list(item[1]['boxes'])
plt.imshow(item[0].permute(1, 2, 0))


for box in bboxes:
    x0, y0, x1, y1 = box.tolist()
    w = x1 - x0
    h = y1 - y0
    
    rect = Rectangle((x0, y0), w, h, fill=False, color='red')
    plt.axes().add_patch(rect)

plt.show()      

AttributeError: 'dict' object has no attribute 'to'

<Figure size 1500x1000 with 0 Axes>

In [652]:
item = training_data.__getitem__(1)



In [653]:
print(item[1]['boxes'].shape)
print(item[1]['labels'].shape)

torch.Size([285, 4])
torch.Size([285])
