# Incremental Evaluation

## 1 - Load the data

In [None]:
# Imports

import json
import os
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Set this to the path where the data is 
DATA_ROOT = 'C:\openmic-2018\openmic-2018'

if not os.path.exists(DATA_ROOT):
    raise ValueError('Did you forget to set `DATA_ROOT`?')


In [None]:
# Loading the data
OPENMIC = np.load(os.path.join(DATA_ROOT, 'openmic-2018.npz'), allow_pickle=True)

# Make direct variable names for everything
X, Y_true, Y_mask, sample_key = OPENMIC['X'], OPENMIC['Y_true'], OPENMIC['Y_mask'], OPENMIC['sample_key']


In [None]:
# Map class indices to names
with open(os.path.join(DATA_ROOT, 'class-map.json'), 'r') as f:
    class_map = json.load(f)

In [None]:
class_map

## 2 - Load the splits
### Creating splits for train, test, and the unlabeled data.
###### Adapted from the original OpenMIC notebook

In [None]:
# Split the data into train, test, and unlabeled sets
# Use squeeze=True here to return a single array for each, rather than a full DataFrame

split_train = pd.read_csv(os.path.join(DATA_ROOT, 'partitions/split01_train.csv'), 
                          header=None).squeeze("columns")
split_test = pd.read_csv(os.path.join(DATA_ROOT, 'partitions/split01_test.csv'), 
                         header=None).squeeze("columns")

# Create partition CSV for unlabeled
split_unlabeled = pd.read_csv(os.path.join(DATA_ROOT, 'partitions/split01_unlabeled.csv'), 
                         header=None).squeeze("columns")

In [None]:
# The breakdown of the data is roughly 50% training, 25% test, 25% unlabeled
# The percentage breakdowns can be adjusted by adjusting the partitions csv's above 
print('# Train: {},  # Test: {}, # Unlabeled: {}'.format(len(split_train), len(split_test), len(split_unlabeled)))

In [None]:
train_set = set(split_train)
test_set = set(split_test)
unlabeled_set = set(split_unlabeled)

In [None]:
# Split the data into arrays

idx_train, idx_test, idx_unlabeled = [], [], []

for idx, n in enumerate(sample_key):
    if n in train_set:
        idx_train.append(idx)
    elif n in test_set:
        idx_test.append(idx)
    elif n in unlabeled_set:
        idx_unlabeled.append(idx)
    else:
        raise RuntimeError('Unknown sample key={}! Abort!'.format(sample_key[n]))

# Cast the idx_* arrays to numpy structures
idx_train = np.asarray(idx_train)
idx_test = np.asarray(idx_test)
idx_unlabeled = np.asarray(idx_unlabeled)

In [None]:
# Finally, we use the split indices to partition the features, labels, and masks
X_train = X[idx_train]
X_test = X[idx_test]
X_unlabeled = X[idx_unlabeled]

Y_true_train = Y_true[idx_train]
Y_true_test = Y_true[idx_test]
Y_true_unlabeled = Y_true[idx_unlabeled]

Y_mask_train = Y_mask[idx_train]
Y_mask_test = Y_mask[idx_test]
Y_mask_unlabeled = Y_mask[idx_unlabeled]

In [None]:
# Validate shapes of slices
print(X_train.shape)
print(X_test.shape)
print(X_unlabeled.shape)

## 3 - Fit the models
### The below has been updated from the original OpenMIC notebook to include both Random Forest and KNN models

In [None]:
models = dict()

for instrument in class_map:

    # get column num from instrument name
    inst_num = class_map[instrument]

    # isolate data that has been labeled as this instrument
    train_inst = Y_mask_train[:, inst_num]
    test_inst = Y_mask_test[:, inst_num]

    # gets training data with labels for this instrument
    X_train_inst = X_train[train_inst]

    # averages features over time
    X_train_inst_sklearn = np.mean(X_train_inst, axis=1)

    # labels instrument as present if value over 0.5
    Y_true_train_inst = Y_true_train[train_inst, inst_num] >= 0.5

    # Repeat slicing for test
    X_test_inst = X_test[test_inst]
    X_test_inst_sklearn = np.mean(X_test_inst, axis=1)
    Y_true_test_inst = Y_true_test[test_inst, inst_num] >= 0.5

    # Initialize a new classifier
    rfc = RandomForestClassifier(max_depth=8, n_estimators=100, random_state=0)
    knn = KNeighborsClassifier(n_neighbors=10)   #TODO: BUMP UP # OF NEIGHBORS

    # Fit model
    rfc.fit(X_train_inst_sklearn, Y_true_train_inst)
    knn.fit(X_train_inst_sklearn, Y_true_train_inst)

    # Evaluate the model
    Y_pred_train_rfc = rfc.predict(X_train_inst_sklearn)
    Y_pred_test_rfc = rfc.predict(X_test_inst_sklearn)

    Y_pred_train_knn = knn.predict(X_train_inst_sklearn)
    Y_pred_test_knn = knn.predict(X_test_inst_sklearn)

    print('-' * 52)
    print(instrument)
    print('\tTRAIN RFC')
    print(classification_report(Y_true_train_inst, Y_pred_train_rfc))
    print('\tTEST RFC')
    print(classification_report(Y_true_test_inst, Y_pred_test_rfc))
    print('\tTRAIN knn')
    print(classification_report(Y_true_train_inst, Y_pred_train_knn))
    print('\tTEST knn')
    print(classification_report(Y_true_test_inst, Y_pred_test_knn))
    
    # Store the classifier in our dictionary
    models[instrument] = [rfc, knn]

## Algorithmic Disagreement

#### In the algorithimc disagreement process, two models evaluate the same piece of data and their evaluations are compared. If they disagree on their evaluation, that track is deemed to be a priority for annotation.
##### Let's start with an example

In [None]:
# We need soundfile to load audio data
import soundfile as sf
import sys

# For audio playback
from IPython.display import Audio

from compare import *

In [26]:
# Run algorithmic disagreement process

# Populate skipIndices with empty arrays to be filled
# skipIndices = {}
# for i in class_map:
#     skipIndices[i] = []

X_copy = X_unlabeled

uncertaintyScores, allPredictions = compare(X_copy, models)

In [27]:
uncertaintyScores["voice"]

{263: 0.7032401642476506,
 191: 0.6029405576586641,
 204: 0.5972618261871,
 42: 0.5667487843478562,
 136: 0.5634261738790503,
 368: 0.5559008482133847,
 128: 0.5551919629750715,
 69: 0.5531436215247285,
 448: 0.5428168135971605,
 352: 0.5269734079548318,
 288: 0.5255056069610935,
 132: 0.5245899452689443,
 369: 0.5219697801589775,
 148: 0.5193005410781559,
 459: 0.5191293080403767,
 169: 0.5109415123226353,
 361: 0.5010411920082221,
 467: 0.5008214525089805,
 30: 0.49825387401100585,
 413: 0.4960143218192622,
 64: 0.49543969972792984,
 293: 0.49411720515046453,
 138: 0.49061213930254166,
 306: 0.48518842912479043,
 437: 0.4831510780261504,
 110: 0.46343159070014994,
 493: 0.46276483082273867,
 432: 0.45875693752051216,
 386: 0.4572306568503114,
 480: 0.45707705870636717,
 202: 0.4565325016777728,
 285: 0.4547861475214538,
 324: 0.4532967809296794,
 439: 0.4530026602143541,
 450: 0.45278751205160667,
 464: 0.4509581195483127,
 465: 0.4489852190964013,
 381: 0.44661591133217055,
 164: 0.

In [28]:
list(uncertaintyScores["voice"].items())[:5]

[(263, 0.7032401642476506),
 (191, 0.6029405576586641),
 (204, 0.5972618261871),
 (42, 0.5667487843478562),
 (136, 0.5634261738790503)]

In [29]:
# Get ID of the most uncertain track for voice
highest_idx = list(uncertaintyScores.get("voice").items())[0][0]
trackID = split_unlabeled[highest_idx]

In [30]:
# And the most uncertain track is...
trackID

'003792_46080'

Let's hear what this difficult-to-label track sounds like!

In [31]:
# Replace below .ogg file with trackID
audio, rate = sf.read(os.path.join(DATA_ROOT, 'audio/003/003792_46080.ogg'))

In [32]:
# Let's listen to the example
Audio(data=audio.T, rate=rate)

In [33]:
# this displays the instruments that the models disagreed on
allPredictions.get("voice").get(highest_idx)

[0.19675983575234945, 0.9]

### Incremental Evaluation -- Simulate the Annotation Process

In [34]:
# First, train a model on the fully annotated dataset. This is what we will be measuring efficiency against. 
combined_models = dict()

for instrument in class_map:

    # get column num from instrument name
    inst_num = class_map[instrument]

    # isolate data that has been labeled as this instrument
    train_inst = Y_mask_train[:, inst_num]
    test_inst = Y_mask_test[:, inst_num]
    unlabeled_inst  = Y_mask_unlabeled[:, inst_num]

    # gets training data with labels for this instrument
    X_train_inst = X_train[train_inst]
    X_unlabeled_inst = X_unlabeled[unlabeled_inst]

    # combine training and unlabeled sets
    X_combined = np.append(X_train_inst, X_unlabeled_inst, axis=0)
    Y_true_combined = np.append(Y_true_train, Y_true_unlabeled, axis=0)

    # averages features over time
    X_train_inst_sklearn = np.mean(X_combined, axis=1)

    # labels instrument as present if value over 0.5
    Y_true_train_inst = Y_true_train[train_inst, inst_num] >= 0.5
    Y_true_unlabeled_inst = Y_true_unlabeled[unlabeled_inst, inst_num] >= 0.5

    Y_true_combined = np.append(Y_true_train_inst, Y_true_unlabeled_inst, axis=0)



    # Repeat slicing for test
    X_test_inst = X_test[test_inst]
    X_test_inst_sklearn = np.mean(X_test_inst, axis=1)
    Y_true_test_inst = Y_true_test[test_inst, inst_num] >= 0.5

    # Initialize a new classifier
    rfc = RandomForestClassifier(max_depth=8, n_estimators=100, random_state=0)

    # Fit model
    rfc.fit(X_train_inst_sklearn, Y_true_combined)

    # Evaluate the model
    Y_pred_train_rfc = rfc.predict(X_train_inst_sklearn)
    Y_pred_test_rfc = rfc.predict(X_test_inst_sklearn)

    combined_models[instrument] = rfc


In [35]:
NUM_TO_LABEL = 90  # Batch size
NUM_RANDOM = 10      # Number of random tracks to include 
EPOCHS = 3          # Number of times to annotate

skipIndices = {}
accuracies_over_time = {}
ctrl_accuracies = {}

# populates dictionaries with instrument keys and empty lists
for i in class_map:
    skipIndices[i] = []
    accuracies_over_time[i] = []
    ctrl_accuracies[i] = []


for i in range(EPOCHS):
    # get instrument predictions
    sorted_trx, allInstProbs = compare(X_copy, models, batch=500)

    for instr in sorted_trx:    
        X_labeled = []           # Track IDs for labeled tracks
        Y_true_labeled = []      # True values for labeled tracks
        Y_mask_labeled = []      # Boolean values for labeled tracks

        # Retrieve the top tracks for annotation
        track_indices = list(sorted_trx[instr].items())[:NUM_TO_LABEL]
        track_indices = [i[0] for i in track_indices]   # isolates the indices

        # add random tracks to be annotated
        track_indices = addRandomTracks(NUM_RANDOM, len(sorted_trx[instr]), track_indices)

        # add track IDs to the labeled list
        for trk in track_indices:
            X_labeled.append(X_unlabeled[trk])
            Y_true_labeled.append(Y_true_unlabeled[trk])
            Y_mask_labeled.append(Y_mask_unlabeled[trk])

        # Update indices to skip here
        X_copy = np.delete(X_copy, track_indices, axis=0)

        # Now to train a new model on the annotated data
        
        ###########################################################################
        inst_num = class_map[instr]
        model = trainModel(inst_num, X_train, X_test, X_labeled, Y_true_train, Y_true_test, np.array(Y_true_labeled), Y_mask_train, Y_mask_test, np.array(Y_mask_labeled))

        # Graph the accuracy results
        control_model = combined_models[instr]

        test_inst = Y_mask_test[:, inst_num]

        # Repeat slicing for test
        X_test_inst = X_test[test_inst]
        X_test_inst_sklearn = np.mean(X_test_inst, axis=1)
        Y_true_test_inst = Y_true_test[test_inst, inst_num] >= 0.5

        Y_pred_test_ctrl = control_model.predict(X_test_inst_sklearn)
        Y_pred_test_rfc = model.predict(X_test_inst_sklearn)

        # Graph the accuracy results
        control_model = combined_models[instr]

        test_inst = Y_mask_test[:, inst_num]

        # Repeat slicing for test
        X_test_inst = X_test[test_inst]
        X_test_inst_sklearn = np.mean(X_test_inst, axis=1)
        Y_true_test_inst = Y_true_test[test_inst, inst_num] >= 0.5

        Y_pred_test_ctrl = control_model.predict(X_test_inst_sklearn)
        Y_pred_test_rfc = model.predict(X_test_inst_sklearn)

        acc = classification_report(Y_true_test_inst, Y_pred_test_rfc, output_dict=True)["accuracy"]  
        ctrl_acc = classification_report(Y_true_test_inst, Y_pred_test_ctrl, output_dict=True)["accuracy"]  

        accuracies_over_time[instr].append(acc)
        ctrl_accuracies[instr].append(ctrl_acc)


IndexError: index 448 is out of bounds for axis 0 with size 422

In [None]:
X_unlabeled.shape

(4922, 10, 128)

In [None]:
import matplotlib.pyplot as plt

In [None]:
len(X_unlabeled)

In [None]:
for i in class_map:
    plt.plot(accuracies_over_time[i], linestyle='dotted')
    plt.plot(ctrl_accuracies[i])
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.title(i)
    plt.show()

Update
* Annotation process is simulated

To Do
* Evaluation - train model on partially-annotated dataset
* Update README