In [1]:
import numpy as np
import math
import librosa
import glob

import IPython.display as ipd

import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

import torch
from torch import nn as nn
import matplotlib.pyplot as plt


from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score


from scipy.special import expit, logit


plt.style.use("dark_background")

pd.set_option("display.max_columns", 2500)
pd.set_option("display.max_rows", 50)

plt.style.use("dark_background")

%load_ext lab_black

In [2]:
### Data Preprocessing function with librosa
N_BINS = 60
HOP_LENGTH = 512


def get_logC(audio_file):
    y, sr = librosa.load(audio_file)
    C = librosa.cqt(
        y, sr=sr, fmin=librosa.note_to_hz("C1"), n_bins=N_BINS, hop_length=HOP_LENGTH
    )
    logC = librosa.amplitude_to_db(np.abs(C))
    df = pd.DataFrame(logC)
    return df


### Accuracy function
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()
    acc = correct / len(y_true) * 100

    return acc


### Batch function


def split_batches(x, batch_size):
    return torch.split(x, batch_size)


### dict function
def get_keys_from_value(d, val):
    return [k for k, v in d.items() if v == val]


### set device
device = "cuda" if torch.cuda.is_available() else "cpu"

### get audio files
audio_files = glob.glob("audio_files/**/*.mp3", recursive=True)

In [3]:
### Assign classes

artist_dict = {
    "Gould": 0,
    "Ishizaka": 1,
    "Richter": 2,
    "Schiff": 3,
    "Tureck": 4,
    "Tharaud": 5,
    "Moravec": 6,
    "Rubinstein": 7,
    "Pogorelich": 8,
    "Nikolayeva": 9,
    "Horowitz": 10,
    "Crochet": 11,
}

for n in audio_files[0:]:
    artist = n.split("/")[1].split()[0]
    if artist not in artist_dict.keys():
        print(artist)

In [4]:
### Get data

SECONDS_PER_SAMPLE = 10
chunk_size = SECONDS_PER_SAMPLE * 45  # 45 samples per second
X_list = []  # List to store X arrays
y_list = []  # List to store y arrays

for file in audio_files[0:]:
    t = get_logC(file)

    artist_mapping = file.split("/")[1].split()[0]

    array_list = []

    for n in range(len(t.columns) // chunk_size):
        arr = t[range(n * chunk_size, (n + 1) * chunk_size, 1)].T.to_numpy()
        array_list.append(arr)

    arrays = np.array(array_list)
    X_list.append(torch.tensor(arrays))
    y_list.append(torch.full((arrays.shape[0],), artist_dict[artist_mapping]))

X = torch.cat(X_list, dim=0)
y = torch.cat(y_list, dim=0)
y = y.to(torch.float32)

In [5]:
X.shape, y.shape

(torch.Size([7410, 450, 60]), torch.Size([7410]))

In [4]:
# torch.save(X, "X.pt")
# torch.save(y, "y.pt")

X = torch.load("X.pt")
y = torch.load("y.pt")

In [5]:
X.shape, y.shape

(torch.Size([7410, 450, 60]), torch.Size([7410]))

One hot encode y 

In [6]:
num_classes = len(torch.unique(y))
identity_matrix = torch.eye(num_classes)

one_hot_encoded = identity_matrix[y.to(torch.int64)]

y = one_hot_encoded

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
X_train.shape[2], y_train.shape[1]

(60, 12)

In [9]:
torch.manual_seed(42)

INPUT_SIZE = X_train.shape[2]
HIDDEN_SIZE = 12
OUTPUT_SIZE = y_train.shape[1]


class SimpleRNN(nn.Module):
    def __init__(self):
        # This just calls the base class constructor
        super().__init__()
        # Neural network layers assigned as attributes of a Module subclass
        # have their parameters registered for training automatically.
        self.rnn = torch.nn.RNN(
            INPUT_SIZE, HIDDEN_SIZE, nonlinearity="relu", batch_first=True
        )

        self.linear = torch.nn.Linear(HIDDEN_SIZE, OUTPUT_SIZE)

    def forward(self, x):
        # The RNN also returns its hidden state but we don't use it.
        # While the RNN can also take a hidden state as input, the RNN
        # gets passed a hidden state initialized with zeros by default.
        h = self.rnn(x)[0]
        x = self.linear(h)
        return x

In [10]:
model_1 = SimpleRNN().to(device)

# model_1 = torch.compile(model_1)

# loss
loss_fn = nn.CrossEntropyLoss()


# optimizer

# optimizer = torch.optim.SGD(model_1.parameters(), lr=0.001)
# optimizer = torch.optim.Adam(model_1.parameters(), lr=0.001)
optimizer = torch.optim.RMSprop(model_1.parameters(), lr=0.001)

sigmoid = torch.nn.Sigmoid()

In [11]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# set number of epochs
EPOCHS = 50

# put data on device
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

# batch size
BATCH_SIZE = 8

train_num_batches = len(X_train) // BATCH_SIZE
test_batch = 0

X_train_batches = split_batches(X_train, BATCH_SIZE)

y_train_batches = split_batches(y_train, BATCH_SIZE)

X_test_batches = split_batches(X_test, BATCH_SIZE)

y_test_batches = split_batches(y_test, BATCH_SIZE)

In [12]:
# Build training and evaluation loops
for epoch in range(EPOCHS):
    for batch in range(train_num_batches):
        X_train_batch = X_train_batches[batch]

        y_train_batch = y_train_batches[batch]

        model_1.train()

        y_logits = model_1(X_train_batch)

        y_logits = y_logits[:, -1, :]

        # print(y_logits.shape)

        y_pred = torch.argmax(sigmoid(torch.softmax(y_logits, dim=1)), dim=1)

        # print(y_pred.shape)

        y_train_batch = torch.argmax(y_train_batch, dim=1)

        loss = loss_fn(y_logits, y_train_batch)

        acc = f1_score(y_true=y_train_batch.cpu(), y_pred=y_pred.cpu(), average="micro")

        optimizer.zero_grad()

        loss.backward()

        optimizer.step()

    # test
    if batch % 4 == 0:
        test_batch += 1

        model_1.eval()
        with torch.inference_mode():
            y_test_logits = model_1(X_test_batches[test_batch])

            y_test_logits = y_test_logits[:, -1, :]

            y_test_pred = torch.argmax(
                sigmoid(torch.softmax(y_test_logits, dim=1)), dim=1
            )

            y_test_true = torch.argmax(y_test_batches[test_batch], dim=1)

            test_loss = loss_fn(y_test_logits, y_test_true)

            test_acc = f1_score(
                y_true=y_test_true.cpu(), y_pred=y_test_pred.cpu(), average="micro"
            )

    if (epoch) % 10 == 0:
        print(
            f"Epoch: {epoch} | Train Loss: {loss:.5f} | Train Acc: {acc*100:.1f}% | Test Loss: {test_loss:.5f} | Test Acc: {test_acc*100:.1f}%"
        )

Epoch: 0 | Train Loss: 2.16442 | Train Acc: 37.5% | Test Loss: 1.96553 | Test Acc: 37.5%
Epoch: 10 | Train Loss: 1.06949 | Train Acc: 62.5% | Test Loss: 1.75715 | Test Acc: 37.5%
Epoch: 20 | Train Loss: 0.92192 | Train Acc: 75.0% | Test Loss: 1.08574 | Test Acc: 75.0%
Epoch: 30 | Train Loss: 1.03677 | Train Acc: 75.0% | Test Loss: 1.41016 | Test Acc: 50.0%
Epoch: 40 | Train Loss: 2.12544 | Train Acc: 25.0% | Test Loss: 1.17906 | Test Acc: 50.0%


torch.Size([1482, 12])

In [13]:
y_train_batch

tensor([7, 1, 4, 5, 7, 7, 7, 0], device='cuda:0')

In [14]:
y_test_pred

tensor([ 0,  6,  7, 11,  8,  6,  4,  4], device='cuda:0')

In [15]:
y_test_true

tensor([ 2, 11,  2,  0,  5,  6,  8,  4], device='cuda:0')

# eval on unseen data

In [21]:
SECONDS_PER_SAMPLE = 10
chunk_size = SECONDS_PER_SAMPLE * 45  # 45 samples per second


files = [
    "test_audio/Rubinstein - Chopin Nocturne Op. 48 in C Minor.mp3",
    "test_audio/Nikolayeva - Shostakovich  P and F B2.mp3",
    "test_audio/Gould - Goldberg Variations, Aria.mp3",
    "test_audio/Moravec - Bach Chromatic Fantasia BVW 903.mp3",
    "test_audio/Crochet - Goldberg Variation.mp3",
    "test_audio/Pogorelich - Chopin 4 scherzi.mp3",
]
for file in files:
    t = get_logC(file)
    X_list = []  # List to store X arrays
    y_list = []  # List to store y arrays

    artist_mapping = file.split("/")[1].split()[0]

    array_list = []

    for n in range(len(t.columns) // chunk_size):
        arr = t[range(n * chunk_size, (n + 1) * chunk_size, 1)].T.to_numpy()
        array_list.append(arr)

    arrays = np.array(array_list)
    X_list.append(torch.tensor(arrays))
    y_list.append(torch.full((arrays.shape[0],), artist_dict[artist_mapping]))

    x_test_1 = torch.cat(X_list, dim=0)
    y_test_1 = torch.cat(y_list, dim=0)
    x_test_1 = x_test_1.to(device)

    with torch.inference_mode():
        test_1_logits = model_1(x_test_1)

        test_1_logits = test_1_logits[:, -1, :]

        test_1_preds = torch.argmax(sigmoid(torch.softmax(test_1_logits, dim=1)), dim=1)

        test_1_pred = torch.mode(
            torch.argmax(sigmoid(torch.softmax(test_1_logits, dim=1)), dim=1)
        ).values

    print(
        f"Guess is: {get_keys_from_value(artist_dict, pd.Series(test_1_pred.cpu()).value_counts().index[0] )[0]}({pd.Series(test_1_pred.cpu()).value_counts().index[0]}), True is: {get_keys_from_value(artist_dict, pd.Series(y_test_1)[0])[0]}({pd.Series(y_test_1)[0]})"
    )

    pd.Series(test_1_preds.cpu()).value_counts()

Guess is: Moravec(6), True is: Rubinstein(7)
Guess is: Crochet(11), True is: Nikolayeva(9)
Guess is: Tureck(4), True is: Gould(0)
Guess is: Rubinstein(7), True is: Moravec(6)
Guess is: Tureck(4), True is: Crochet(11)
Guess is: Moravec(6), True is: Pogorelich(8)


Guess is: Moravec(6), True is: Pogorelich(8)


6     89
1     46
7     34
11    16
10    13
2     10
0     10
4     10
5      8
8      1
dtype: int64

In [80]:
test_1_pred

tensor(6, device='cuda:0')

In [77]:
y_test_1

tensor([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8])