# Pianist classification using melodic and rhythmic features

## Import dependencies and set constants, etc.

In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from src import utils
from src.detect.midi_utils import *
from src.features.melody_features import *
from src.features.rhythm_features import *

In [None]:
import joblib
joblib.parallel_backend = 'threading'
print(joblib.parallel_backend)

In [None]:
smallest_n_gram = 3
largest_n_gram = 10
MAX_INTERVAL = 12

## Pre-processing

In [None]:
# Get filepaths for processing
root = f'{utils.get_project_root()}/data/cambridge-jazz-trio-database-v01/corpus_chronology'
files = [os.path.join(root, f) for f in os.listdir(root)]

In [None]:
# Load in onsets and beats for each track
oms = [utils.load_track_from_files(f) for f in files]

In [None]:
# Extract melody for each track
mms = [MelodyMaker(os.path.join(fp, 'piano_midi.mid'), om) for fp, om in zip(files, oms)]

## Create histogram of pitch classes for one track

In [None]:
# Create a histogram for the pitch classes in one track
# Extract the melody for our first track
mel = list(mms[201].extract_melody())
# Count the pitch classes
pitch_classes = Counter([m.note for m in mel])
# Sort the pitch classes in order of the piano keys
sorted_pitches = {i: pitch_classes[i] for i in utils.ALL_PITCHES}
# Create the bar chart
plt.bar(sorted_pitches.keys(), sorted_pitches.values())
plt.title(oms[201].item['track_name'] + ' — ' + oms[200].item['pianist'])
plt.show()

## N-gram extraction

### Extract n-grams which appear in more than 3 solos

In [None]:
res = {}
for melody, onsets in tqdm(zip(mms, oms)):
    mel = list(melody.extract_melody())
    sequence = list(melody.extract_intervals(mel))
    if len(sequence) < 2:
        continue
    for start in range(len(sequence)):
        for n in range(smallest_n_gram, largest_n_gram + 1):
            n_gram = tuple([i.interval for i in sequence[start: start + n]])
            if any(abs(interval) > MAX_INTERVAL for interval in n_gram):
                continue
            if len(n_gram) == n:
                if str(n_gram) not in res.keys():
                    res[str(n_gram)] = {
                        'tracks': set(),
                        'pianists': set(),
                        'uses': 0,
                        'n': len(n_gram)
                    }
                res[str(n_gram)]['tracks'].add(onsets.item['mbz_id'])
                res[str(n_gram)]['pianists'].add(onsets.item['pianist'])
                res[str(n_gram)]['uses'] += 1

In [None]:
valid_ngrams = set(ng for ng, vals in res.items() if len(vals['tracks']) >= 3)

### Extract only valid n-grams from entire solos (no chunks)

In [None]:
ngrams = []
valid_dict = {ng: 0 for ng in valid_ngrams}
for melody, onsets in tqdm(zip(mms, oms)):
    mel = list(melody.extract_melody())
    sequence = list(melody.extract_intervals(mel))
    track_results = {'pianist': onsets.item['pianist'], 'track_name': onsets.item['track_name'], 'mbz_id': onsets.item['mbz_id'], **valid_dict}
    for start in range(len(sequence)):
        for n in range(smallest_n_gram, largest_n_gram + 1):
            n_gram = tuple([i.interval for i in sequence[start: start + n]])
            if len(n_gram) == n:
                n_gram = str(n_gram)
                if n_gram not in valid_ngrams:
                    continue
                track_results[n_gram] += 1
    ngrams.append(track_results)

    
    # using chunks
    # for chunk in melody.chunk_melody(sequence, overlapping_chunks=False, chunk_measures=4):
    #     if len(chunk) < 2:
    #         continue
    #     track_results = {'pianist': onsets.item['pianist'], 'track_name': onsets.item['track_name'], **valid_dict}
    #     for start in range(len(chunk)):
    #         for n in range(smallest_n_gram, largest_n_gram + 1):
    #             n_gram = tuple([i.interval for i in chunk[start: start + n]])
    #             if len(n_gram) == n:
    #                 n_gram = str(n_gram)
    #                 if n_gram not in valid_ngrams:
    #                     continue
    #                 track_results[n_gram] += 1
    #     ngrams.append(track_results)

To ensure repeatable outputs, we need to ensure that our samples and features always follow the same order

In [None]:
tracks = pd.DataFrame.from_dict(ngrams).sort_values(by=['pianist', 'track_name']).set_index(['pianist', 'track_name', 'mbz_id'])
# tracks = pd.read_csv('track_results.csv', index_col=0).sort_values(by=['pianist', 'track_name']).set_index(['pianist', 'track_name', 'mbz_id'])
tracks = tracks.reindex(sorted(tracks.columns), axis=1).reset_index(drop=False)

In [None]:
print(tracks.shape)

In [None]:
print(tracks.iloc[:5, :5])

### Model fitting

Tree-based methods
- Random Forest ✅
- Gradient-Boosted Trees ✅

Non-Tree based ML:
- SVM ✅
- Naive Bayes
- Multiclass logistic regression

DL
- Perceptron (ANN)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Split the data into predictors and response
X = tracks.iloc[:, 3:].to_numpy()
y = tracks.iloc[:, 0].to_numpy()
# Express n-gram numbers as percentage of total track
X = np.true_divide(X, X.sum(axis=1, keepdims=True))

In [None]:
def initial_fit(estimator):
    accs = []
    for train_idx, test_idx in tqdm(cv.split(X, y)):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        estimator.fit(X_train, y_train)
        y_predict = estimator.predict(X_test)
        acc = accuracy_score(y_test, y_predict)
        accs.append(acc)
    return np.mean(accs)

In [None]:
cv = StratifiedKFold(random_state=42, n_splits=5, shuffle=True)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1, verbose=False)

In [None]:
# Testing using default parameters
rf_acc = initial_fit(rf)
print(rf_acc)

- 58% using only n-grams contained in at least 3 solos
- 50% using all n-grams!

### Gradient boosted trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbc = GradientBoostingClassifier(random_state=42, verbose=0, max_features='sqrt')

In [None]:
# Testing using default parameters
gbc_acc = initial_fit(gbc)
print(gbc_acc)

### Support Vector Machines

In [None]:
from sklearn.svm import SVC

In [None]:
svc = SVC(random_state=42, decision_function_shape='ovo', verbose=0)

In [None]:
# Need to scale the data for svms (i.e. Z-score)
from sklearn.preprocessing import StandardScaler
X_scale = StandardScaler().fit_transform(X, y)

In [None]:
# Testing using default parameters
svc_acc = initial_fit(svc)
print(svc_acc)

Without tuning, RF outperforms gradient boosted trees and SVC

## Initial hyperparameter optimisation
Find some sensible values to use prior to RF feature selection

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# These are the parameters we'll sample from when optimizing
test_params = dict(
    # the loss function to use when splitting a node
    criterion=['gini', 'entropy', 'log_loss'],
    # The number of trees to grow in the forest
    n_estimators=[i for i in range(10, 200, 10)],
    # Max number of features considered for splitting a node
    max_features=[None, 'sqrt', 'log2'],
    # Max number of levels in each tree
    max_depth=[None, *[i for i in range(1, 51, 10)]],
    # Minimum number of samples required to split a node
    min_samples_split=[i for i in range(2, 11)],
    # Minimum number of samples required at each leaf node
    min_samples_leaf=[i for i in range(1, 11)],
    # Whether to sample data points with our without replacement
    bootstrap=[True, False],
)

In [None]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1, verbose=False)
rs = RandomizedSearchCV(rf, param_distributions=test_params, cv=cv, random_state=42, n_iter=1000, verbose=5)

In [None]:
rs.fit(X, y)

In [None]:
rs.best_params_

In [None]:
# {'n_estimators': 170,
 # 'min_samples_split': 10,
 # 'min_samples_leaf': 2,
 # 'max_features': 'sqrt',
 # 'max_depth': 21,
 # 'criterion': 'gini',
 # 'bootstrap': False}

In [None]:
# Hard-coded results from running the above
initial_params = {'n_estimators': 190,
                  'min_samples_split': 8,
                  'min_samples_leaf': 3,
                  'max_features': 'sqrt',
                  'max_depth': 41,
                  'criterion': 'gini',
                  'bootstrap': False}

## Feature selection from random forest
Considering only the *n*th percentile of most important features 

In [None]:
rf_initial = RandomForestClassifier(random_state=42, n_jobs=-1, verbose=False, **initial_params)

In [None]:
def get_importances(X_, y_, est = None) -> float:
    if est is None:
        est = rf_initial
    imports = []
    for train_idx, test_idx in cv.split(X_, y_):
        X_train, y_train = X_[train_idx], y_[train_idx]
        X_test, y_test = X_[test_idx], y_[test_idx]
        rf_initial.fit(X_train, y_train)
        imports.append(rf_initial.feature_importances_)
    return imports

def get_most_important_feature_idxs(imports, n: int = 95):
    means = np.mean(np.vstack(imports), axis=0)
    order = means.argsort()
    ranks = order.argsort()
    perc = (len(ranks) / 100) * n
    to_keep = np.argwhere(ranks > perc)[:, 0]
    return to_keep

def get_accuracy(X_d, y_):
    accs = []
    for train_idx, test_idx in cv.split(X_d, y_):
        X_train, y_train = X_d[train_idx], y_[train_idx]
        X_test, y_test = X_d[test_idx], y_[test_idx]
        rf_initial.fit(X_train, y_train)
        y_predict = rf_initial.predict(X_test)
        acc = accuracy_score(y_test, y_predict)
        accs.append(acc)
    return np.mean(accs)


measures = []
imports = get_importances(X, y)
for n in tqdm(range(0, 95, 1)):
    n_imports = get_most_important_feature_idxs(imports, n)
    measures.append((100 - n, get_accuracy(X[:, n_imports], y)))

In [None]:
measures_df = pd.DataFrame(measures)
plt.plot(measures_df[0], measures_df[1])
plt.ylabel('Mean accuracy across all folds ($k$=5)')
plt.xlabel('% of all features considered')
plt.title('Values show smoothed averages (window=10%)')

In [None]:
perc_features_to_keep = measures_df.iloc[measures_df[1].sort_values(ascending=False).index[1], 0]
# perc_features_to_keep = 34
n_imports = get_most_important_feature_idxs(imports, 100 - perc_features_to_keep)

In [None]:
X_min_features = X[:, n_imports]
print(get_accuracy(X_min_features, y), perc_features_to_keep)

We only need to include the ~33% most important features to obtain similar accuracy when compared to including all predictors

## Final hyperparameter optimisation

**TODO**

## Plot confusion matrix

In [None]:
preds = []
acts = []
ids = []
for train_idx, test_idx in tqdm(cv.split(X_min_features, y)):
    X_train, y_train = X_min_features[train_idx], y[train_idx]
    X_test, y_test = X_min_features[test_idx], y[test_idx]
    rf_initial.fit(X_train, y_train)
    y_predict = rf_initial.predict(X_test)
    preds.extend(y_predict)
    acts.extend(y_test)
    ids.extend(tracks.iloc[test_idx, 2].values)

In [None]:
pred_df = (
    pd.concat([pd.Series(ids), pd.Series(acts), pd.Series(preds)], axis=1)
    .rename(columns={0: 'mbz_id', 1: 'actual', 2: 'predicted'})
    .apply(lambda x: [i.split(' ')[-1] for i in x])
    .sort_values(by='actual')
    .reset_index(drop=True)
)

In [None]:
import seaborn as sns
labs = pred_df['actual'].unique()
cm = confusion_matrix(pred_df['actual'], pred_df['predicted'], normalize='true')
g = sns.heatmap(cm, cmap='Reds', annot=True, fmt='.2f')
g.set_xticks(g.get_xticks(), labels=labs, rotation=90)
g.set_yticks(g.get_yticks(), labels=labs, rotation=00)
g.set(xlabel='Predicted', ylabel='Actual')

## Most indicative n-gram for each particular pianist
- Fit binary classifier - i.e. is it John Hicks vs someone else? - and extract feature importances
    - How to get direction (i.e. this n-gram definitely makes it Hicks, or means its definitely not Hicks?)
    - Possibly -- extract *n* most important n-grams from binary random forest, fit logistic regression using these, check log odds?

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
pianist_labels, y_int = np.unique(y, return_inverse=True)

In [None]:
from joblib import Parallel, delayed
from sklearn.exceptions import ConvergenceWarning
import warnings

# We don't care that the model won't converge now and will bruteforce this later by increasing max_iter
warnings.simplefilter('ignore', ConvergenceWarning, )

fs = []
accs = []
c_range = range(10, 100)


def predict(c_):
    lr = LogisticRegression(C=c_, penalty='l2', random_state=42, solver='lbfgs', multi_class='ovr', n_jobs=1, max_iter=100)
    accs = []
    fs = []
    for train_idx, test_idx in cv.split(X_min_features, y_int):
        X_train, y_train = X_min_features[train_idx], y_int[train_idx]
        X_test, y_test = X_min_features[test_idx], y_int[test_idx]
        lr.fit(X_train, y_train)
        yhat = lr.predict(X_test)
        f = f1_score(y_test, yhat, average='macro')
        acc = accuracy_score(y_test, yhat)
        fs.append(f)
        accs.append(acc)
    return c_, np.mean(f), np.mean(acc)

with Parallel(n_jobs=-1, verbose=5) as par:
    cv_res = par(delayed(predict)(c) for c in range(5, 100))

In [None]:
best_c = 58
print(predict(c_=best_c))

In [None]:
cols = tracks.columns[3:]
for p in range(10):
    pianist = pianist_labels[p]
    best = cols[n_imports[np.argmax(np.square(lr.coef_[p, :]))]]
    worst = cols[n_imports[np.argmin(np.square(lr.coef_[p, :]))]]
    print(pianist, best, worst)
    print(tracks[tracks['pianist'] == pianist][best].sum())
    print(tracks[best].sum())
    print(tracks[tracks['pianist'] == pianist][worst].sum())
    print(tracks[worst].sum())
    # print(X_min_features[np.argmin(np.square(lr.coef_[p, :]))])

In [None]:
lr = LogisticRegression(C=best_c, penalty='l2', random_state=42, solver='lbfgs', multi_class='multinomial', n_jobs=1, max_iter=100)
lr.fit(X_min_features, y_int)

## Integrating rhythmic features


In [None]:
rhythm = pd.read_csv(f'{utils.get_project_root()}/notebooks/prediction/rhythm_features.csv', index_col=0).reset_index(drop=True)
rhythm['mbz_id'].dtype

In [None]:
X_sort = np.hstack((
    tracks.sort_values('mbz_id').reset_index(drop=True).iloc[:, 3:].to_numpy()[:, n_imports],
    rhythm.iloc[:, 3:].to_numpy()
))
y_sort = tracks.sort_values('mbz_id').reset_index(drop=True).iloc[:, 0].to_numpy()

In [None]:
imports = get_importances(X_sort, y_sort)
measures = []
for n in tqdm(range(0, 95, 1)):
    n_imports = get_most_important_feature_idxs(imports, n)
    ac = get_accuracy(X_sort[:, n_imports], y_sort)
    print(n, ac)
    measures.append((100 - n, ac))

In [173]:
from joblib import Parallel, delayed
from sklearn.exceptions import ConvergenceWarning
import warnings

# We don't care that the model won't converge now and will bruteforce this later by increasing max_iter
warnings.simplefilter('ignore', ConvergenceWarning, )

fs = []
accs = []
c_range = range(10, 100)


def predict(c_):
    lr = LogisticRegression(C=c_, penalty='l2', random_state=42, solver='lbfgs', multi_class='ovr', n_jobs=1, max_iter=100)
    accs = []
    fs = []
    for train_idx, test_idx in cv.split(X_min_features, y_int):
        X_train, y_train = X_min_features[train_idx], y_int[train_idx]
        X_test, y_test = X_min_features[test_idx], y_int[test_idx]
        lr.fit(X_train, y_train)
        yhat = lr.predict(X_test)
        f = f1_score(y_test, yhat, average='macro')
        acc = accuracy_score(y_test, yhat)
        fs.append(f)
        accs.append(acc)
    return c_, np.mean(f), np.mean(acc)

with Parallel(n_jobs=-1, verbose=5) as par:
    cv_res = par(delayed(predict)(c) for c in range(5, 100))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 187, in _run_module_as_main
  File "/usr/lib/python3.10/runpy.py", line 187, in _run_module_as_main
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "/usr/lib/python3.10/runpy.py", line 110, in _get_module_details
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "/usr/lib/python3.10/runpy.py", line 110, in _get_module_details
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 187, in _run_module_as_main
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 187, in _run_module_as_main
    __import__(pkg_name)
  File "/home/hwc31/.virtualenvs/Cambridge-Jazz-Trio-Database/lib/python3.10/site-packages/joblib/__init__.py", line 113, in <module>
    __import__(pkg_name)
  File "/home/hwc31/.virtuale

KeyboardInterrupt: 

In [179]:
best_c = 58
print(predict(c_=best_c))

(58, 0.6666883116883117, 0.6666666666666666)


In [180]:
cols = tracks.columns[3:]
for p in range(10):
    pianist = pianist_labels[p]
    best = cols[n_imports[np.argmax(np.square(lr.coef_[p, :]))]]
    worst = cols[n_imports[np.argmin(np.square(lr.coef_[p, :]))]]
    print(pianist, best, worst)
    print(tracks[tracks['pianist'] == pianist][best].sum())
    print(tracks[best].sum())
    print(tracks[tracks['pianist'] == pianist][worst].sum())
    print(tracks[worst].sum())
    # print(X_min_features[np.argmin(np.square(lr.coef_[p, :]))])

Ahmad Jamal (-1, -1, -1) (3, 2, 3, -1, -2)
162
925
1
6
Bill Evans (1, 1, 1) (-3, 8, -1)
230
1451
1
19
Bud Powell (1, 1, 1) (4, 3, -3)
286
1451
12
152
John Hicks (1, 1, 1) (-13, -2, -2)
253
1451
1
8
Junior Mance (12, -12, 12) (1, 2, 4, -3)
231
364
1
8
Keith Jarrett (1, 1, 1) (-5, 14, -1)
136
1451
1
3
Kenny Barron (0, 0, 0) (-9, 5, 4, 1)
42
637
1
3
McCoy Tyner (1, 1, 1) (-17, -3, 3)
5
1451
1
13
Oscar Peterson (1, 1, 1) (-7, 4, -7)
20
1451
1
12
Tommy Flanagan (-2, -1, -2) (-6, -2, 7)
152
841
1
6


In [181]:
lr = LogisticRegression(C=best_c, penalty='l2', random_state=42, solver='lbfgs', multi_class='multinomial', n_jobs=1, max_iter=100)
lr.fit(X_min_features, y_int)

## Integrating rhythmic features


In [173]:
rhythm = pd.read_csv(f'{utils.get_project_root()}/notebooks/prediction/rhythm_features.csv', index_col=0).reset_index(drop=True)
rhythm['mbz_id'].dtype

dtype('O')

In [190]:
X_sort = np.hstack((
    tracks.sort_values('mbz_id').reset_index(drop=True).iloc[:, 3:].to_numpy()[:, n_imports],
    rhythm.iloc[:, 3:].to_numpy()
))
y_sort = tracks.sort_values('mbz_id').reset_index(drop=True).iloc[:, 0].to_numpy()

In [None]:
imports = get_importances(X_sort, y_sort)
measures = []
for n in tqdm(range(0, 95, 1)):
    n_imports = get_most_important_feature_idxs(imports, n)
    ac = get_accuracy(X_sort[:, n_imports], y_sort)
    print(n, ac)
    measures.append((100 - n, ac))