# Start to finish
Rough draft, to be implemented in the PyCharm project.

In [1]:
from pydub import AudioSegment
import librosa
import numpy as np
import torch
from pathlib import Path
#import pandas as pd

## Select tap sound

In [None]:
# Choose the audio clip to classify: keep exactly one `clip` assignment active.

# Option 1: prepared, reserved test clip, recorded the same way as the training clips
#clip = '../../../Source/Clean_train_clips/Test_pad/Ball_change/1/15.wav'

# Option 2: audio extracted from YouTube
clip = '../../../Source/Shuffle/4/1.wav'

# Path form of the same location. get_label() inspects the third-from-last
# path component ('Shuffle' for the active clip) to derive the true label,
# so the <label>/<n>/<file>.wav directory layout matters.
clip_path = Path(clip)

In [None]:
# Play sound
# Loads the selected WAV with pydub. As the last expression of the cell the
# AudioSegment is displayed by the notebook (presumably as an audio player --
# confirm in the running notebook).
AudioSegment.from_wav(clip)

## Processing Pipeline

In [None]:
# Parameters (set in training and validation)
# These are read as module-level globals by the feature functions below.
clip_length = 20772  # target clip length in samples; resize_signal() pads or cuts to this
n_mfcc = 20  # number of MFCCs requested from librosa.feature.mfcc
frame_length = 256  # window size in samples for the ZCR, energy and RMSE features
hop_length = 128  # step in samples between successive windows

In [None]:
def resize_signal(path, length):
    """Load an audio file and force it to exactly ``length`` samples.

    Args:
        path: Location of the audio file, passed straight to ``librosa.load``.
        length: Desired number of samples in the returned signal.

    Returns:
        A ``(signal, sample_rate)`` tuple. Signals shorter than ``length``
        are zero-padded at the end, longer ones are cut after ``length``
        samples, and signals of exactly ``length`` samples are returned as
        loaded.
    """
    samples, sample_rate = librosa.load(path)
    shortfall = length - len(samples)
    if shortfall > 0:
        # Too short: append trailing silence.
        return np.pad(samples, (0, shortfall), 'constant'), sample_rate
    if shortfall < 0:
        # Too long: keep only the leading samples.
        return samples[:length], sample_rate
    return samples, sample_rate

def get_features_mfcc(samples, sample_rate):
    """Return the time-averaged MFCC vector of ``samples``.

    Args:
        samples: Audio signal handed to ``librosa.feature.mfcc`` as ``y``.
        sample_rate: Sampling rate of ``samples``.

    Returns:
        The mean of each MFCC coefficient taken over all frames; the number
        of coefficients comes from the module-level ``n_mfcc``.
    """
    coefficients = librosa.feature.mfcc(y=samples, sr=sample_rate, n_mfcc=n_mfcc)
    # Transpose so frames run along axis 0, then average the frames away.
    frames_first = coefficients.T
    return np.mean(frames_first, axis=0)

def get_features_zcr(samples, sample_rate):
    """Return the zero-crossing rate of ``samples``.

    Args:
        samples: Audio signal to analyse.
        sample_rate: Unused; kept so all feature functions share a signature.

    Returns:
        Whatever ``librosa.feature.zero_crossing_rate`` returns for the
        module-level ``frame_length`` and ``hop_length`` (the caller
        transposes it before stacking).
    """
    return librosa.feature.zero_crossing_rate(
        y=samples,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    
def get_features_energy(samples, sample_rate, frame_size=None, hop_size=None):
    """Return the short-time energy of ``samples``.

    One value is produced per window start ``0, hop, 2*hop, ...`` up to the
    end of the signal. Windows that run past the end of the signal are
    simply shorter (no padding), matching the original behaviour.

    Args:
        samples: 1-D audio signal (NumPy array or any array-like).
        sample_rate: Unused; kept so all feature functions share a signature.
        frame_size: Window length in samples. Defaults to the module-level
            ``frame_length`` used for training and validation.
        hop_size: Step between window starts in samples. Defaults to the
            module-level ``hop_length``.

    Returns:
        1-D float64 array holding the sum of squared magnitudes per window.
    """
    if frame_size is None:
        frame_size = frame_length
    if hop_size is None:
        hop_size = hop_length

    signal = np.asarray(samples)
    # np.sum runs in native code; the builtin sum() iterated sample by sample.
    return np.array(
        [
            np.sum(np.abs(signal[start:start + frame_size]) ** 2, dtype=np.float64)
            for start in range(0, len(signal), hop_size)
        ],
        dtype=np.float64,
    )

def get_features_rmse(samples, sample_rate):
    """Return the per-frame root-mean-square energy of ``samples``.

    Args:
        samples: Audio signal to analyse.
        sample_rate: Unused; kept so all feature functions share a signature.

    Returns:
        The first row of librosa's RMS output, computed with the
        module-level ``frame_length`` and ``hop_length`` and centred frames.
    """
    # Newer librosa releases expose this feature as ``rms``; older ones only
    # have ``rmse``. Use whichever the installed version provides.
    rms_fn = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse
    # The signal is passed by keyword because recent librosa releases no
    # longer accept it positionally.
    rmse = rms_fn(y=samples,
                  frame_length=frame_length,
                  hop_length=hop_length,
                  center=True)
    return rmse[0]

def get_features_bpm(samples, sample_rate):
    """Estimate a single (static) tempo for ``samples``.

    Args:
        samples: Audio signal to analyse.
        sample_rate: Sampling rate of ``samples``.

    Returns:
        1-D array with the tempo estimate, as returned by librosa's tempo
        estimator. Assumes a static tempo; for a dynamic estimate librosa
        takes ``aggregate=None``.
    """
    # The signal is passed by keyword because recent librosa releases no
    # longer accept it positionally.
    onset_env = librosa.onset.onset_strength(y=samples, sr=sample_rate)
    # Newer librosa releases provide the estimator as ``feature.tempo``;
    # fall back to ``beat.tempo`` on older installations.
    tempo_fn = getattr(librosa.feature, 'tempo', None) or librosa.beat.tempo
    tempo = tempo_fn(onset_envelope=onset_env, sr=sample_rate)
    # Callers index the result with [:, np.newaxis], so keep it at least 1-D.
    return np.atleast_1d(tempo)

def get_label(path):
    """Derive the ground-truth class of a clip from its location.

    Args:
        path: ``pathlib`` path of the clip; its third-from-last component
            is taken as the class directory.

    Returns:
        ``1`` when that component is ``'Shuffle'``, otherwise ``0``.
    """
    class_dir = path.parts[-3]
    return 1 if class_dir == 'Shuffle' else 0

In [None]:
# Pad with ending silence or cut to set length
# y is the signal forced to clip_length samples; sr is the sample rate
# reported by librosa.load inside resize_signal().
y, sr = resize_signal(clip_path, clip_length)


In [None]:
# Extract features
# Every feature is turned into a column (values along axis 0) so that the
# features can later be stacked on top of each other.
mfcc = np.expand_dims(get_features_mfcc(y, sr), axis=1)
zcr = np.transpose(get_features_zcr(y, sr))
energy = np.expand_dims(get_features_energy(y, sr), axis=1)
rmse = np.expand_dims(get_features_rmse(y, sr), axis=1)
bpm = np.expand_dims(get_features_bpm(y, sr), axis=1)


## Select features

In [None]:
# Features fed to the model, in stacking order. The selection presumably has
# to match the features the chosen model was trained on -- confirm against
# the model file name.
feature_set = (mfcc, zcr, energy,rmse, bpm)


In [None]:
# Combine features to form input for model
# The feature columns are stacked along axis 0 into one long column, then
# transposed so the clip becomes a single row.
inputs = np.concatenate(feature_set, axis=0).T


## Select model

In [None]:
# Trained models
# Keep exactly one assignment active. The parentheses are only grouping, so
# `model` is a plain path string (not a tuple); it is later passed to
# torch.load().

# Models whose file names list all five features (mfcc, zcr, energy, rmse, bpm)
#model = ('../src/train/trained_models/one_hidden_mfcc_zcr_energy_rmse_bpm.pt')
#model = ('../src/train/trained_models/one_hidden_mfcc_zcr_energy_rmse_bpm_128.pt')
model = ('../src/train/trained_models/one_hidden_mfcc_zcr_energy_rmse_bpm_256.pt')
#model = ('../src/train/trained_models/two_hidden_mfcc_zcr_energy_rmse_bpm.pt')

# Models whose file names list a reduced feature set (mfcc only, or mfcc + bpm)
#model = ('../src/train/trained_models/one_hidden_mfcc_128.pt')
#model = ('../src/train/trained_models/two_hidden_mfcc.pt')
#model = ('../src/train/trained_models/one_hidden_mfcc_bpm_128.pt')
#model = ('../src/train/trained_models/two_hidden_mfcc_bpm.pt')



## Run and predict

In [None]:
# Run the selected model on the prepared feature row and compare the
# prediction with the label derived from the clip's directory.
dtype = torch.float
device = torch.device('cpu')

# as_tensor accepts both the NumPy feature row and an already converted
# tensor, so re-running this cell neither warns nor fails.
inputs = torch.as_tensor(inputs, device=device, dtype=dtype)

# Load model
# NOTE(review): torch.load unpickles the file and can therefore execute
# arbitrary code -- only load model files from a trusted source. Recent
# PyTorch releases may additionally require weights_only=False to load a
# fully pickled model; confirm against the installed version.
# `model` starts out as the path chosen above and is rebound to the loaded
# network; the guard keeps the cell re-runnable after that rebinding.
if isinstance(model, (str, Path)):
    # map_location lets a model saved on a GPU load on this CPU-only setup.
    model = torch.load(model, map_location=device)

# Inference only: switch off training-time behaviour and gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(inputs)

# Index of the largest output over the flattened result (1 = Shuffle,
# 0 = Ball change, matching get_label()).
y_pred = torch.argmax(outputs).numpy()

true = get_label(clip_path)

print("What's on tap?")
print()
if y_pred == 1:
    print('Predicted: Shuffle')
elif y_pred == 0:
    print('Predicted: Ball change')
print()
if true == 1:
    print('It was a Shuffle.')
elif true == 0:
    print('It was a Ball change.')


In [None]:
# What did that sound like again?
# Reloads the same clip so the notebook shows it once more next to the
# prediction printed by the previous cell.
AudioSegment.from_wav(clip)

In [5]:
# Parameters (set in training and validation)
# Same values as the parameter cell earlier in this notebook; here they are
# passed explicitly to the Features constructor.
clip_length = 20772  # target clip length in samples
n_mfcc = 20  # number of MFCCs to extract
frame_length = 256  # window size in samples for the framed features
hop_length = 128  # step in samples between successive windows




In [95]:


# Feature generating functions

class Features:
    """Build the model's input vector for one audio clip.

    The individual ``get_features_*`` methods each return one feature as a
    column of shape ``(k, 1)``; ``get_feature_array`` joins the features
    named in ``feature_set`` into one flat vector.

    Args:
        samples: 1-D audio signal.
        sample_rate: Sampling rate of ``samples``.
        clip_length: Target clip length in samples. Stored only; none of the
            methods below read it.
        n_mfcc: Number of MFCCs to extract.
        frame_length: Window size in samples for ZCR, energy and RMS.
        hop_length: Step in samples between successive windows.
        feature_set: Iterable of feature names, in output order. Recognised
            names are ``'mfcc'``, ``'zcr'``, ``'energy'``, ``'rmse'`` and
            ``'bpm'``; any other entry is ignored.
    """

    def __init__(self, samples, sample_rate, clip_length, n_mfcc, frame_length, hop_length, feature_set):
        self.samples = samples
        self.sample_rate = sample_rate
        self.clip_length = clip_length
        self.n_mfcc = n_mfcc
        self.frame_length = frame_length
        self.hop_length = hop_length
        self.feature_set = feature_set

    def get_features_mfcc(self):
        """Return the MFCCs averaged over all frames, as an ``(n_mfcc, 1)`` column."""
        coefficients = librosa.feature.mfcc(y=self.samples,
                                            sr=self.sample_rate,
                                            n_mfcc=self.n_mfcc)
        mfccs = np.mean(coefficients.T, axis=0)
        return mfccs[:, np.newaxis]

    def get_features_zcr(self):
        """Return the zero-crossing rate, transposed so frames run along axis 0."""
        zcr = librosa.feature.zero_crossing_rate(self.samples,
                                                 frame_length=self.frame_length,
                                                 hop_length=self.hop_length)
        return zcr.T

    def get_features_energy(self):
        """Return the short-time energy as a column with one row per window.

        Windows start every ``hop_length`` samples; windows that run past
        the end of the signal are shorter rather than padded.
        """
        signal = np.asarray(self.samples)
        # np.sum runs in native code; the builtin sum() iterated per sample.
        energy = np.array(
            [
                np.sum(np.abs(signal[start:start + self.frame_length]) ** 2,
                       dtype=np.float64)
                for start in range(0, len(signal), self.hop_length)
            ],
            dtype=np.float64,
        )
        return energy[:, np.newaxis]

    def get_features_rmse(self):
        """Return the per-frame RMS energy as a column."""
        # Newer librosa releases expose this feature as ``rms``; older ones
        # only have ``rmse``. The signal is passed by keyword because recent
        # releases no longer accept it positionally.
        rms_fn = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse
        rmse = rms_fn(y=self.samples,
                      frame_length=self.frame_length,
                      hop_length=self.hop_length,
                      center=True)
        return rmse[0][:, np.newaxis]

    def get_features_bpm(self):
        """Return the static tempo estimate as a column.

        Assumes a static tempo; for a dynamic estimate librosa takes
        ``aggregate=None``.
        """
        onset_env = librosa.onset.onset_strength(y=self.samples,
                                                 sr=self.sample_rate)
        # Newer librosa releases provide the estimator as ``feature.tempo``;
        # fall back to ``beat.tempo`` on older installations.
        tempo_fn = getattr(librosa.feature, 'tempo', None) or librosa.beat.tempo
        tempo = tempo_fn(onset_envelope=onset_env, sr=self.sample_rate)
        return np.atleast_1d(tempo)[:, np.newaxis]

    def get_feature_array(self):
        """Return the selected features joined into one flat float64 vector.

        Features appear in the order given by ``feature_set``; a name listed
        twice is included twice and unrecognised names are skipped. With no
        recognised names the result is an empty array.
        """
        extractors = {
            'mfcc': self.get_features_mfcc,
            'zcr': self.get_features_zcr,
            'energy': self.get_features_energy,
            'rmse': self.get_features_rmse,
            'bpm': self.get_features_bpm,
        }
        parts = [extractors[name]().ravel()
                 for name in self.feature_set
                 if name in extractors]
        if not parts:
            return np.empty(0, dtype=np.float64)
        # A single concatenate replaces the seed-row-then-delete approach;
        # float64 is kept as the output dtype.
        return np.concatenate(parts).astype(np.float64, copy=False)


In [96]:
# Load the clip at librosa's default settings and request MFCCs only.
# NOTE(review): unlike the function-based pipeline earlier in this notebook,
# the signal is not passed through resize_signal() here, and clip_length is
# only stored by Features -- confirm whether padding/cutting should happen
# before feature extraction.
samples, sample_rate = librosa.load('../../../Source/Shuffle/4/1.wav')
feature_set = ['mfcc']



# Bundle the signal and the extraction parameters into one Features object.
new_features = Features(samples=samples,
                       sample_rate=sample_rate,
                       clip_length=clip_length,
                       n_mfcc=n_mfcc,
                       frame_length=frame_length,
                       hop_length=hop_length,
                       feature_set=feature_set)


In [97]:
# Build the flat feature vector for the clip and show its shape.
# get_feature_array() already returns a flattened array, so the extra
# .flatten() here only makes another copy.
inputs = new_features.get_feature_array().flatten()
inputs.shape

(20,)

In [98]:
inputs

array([-2.89198211e+02,  9.24704301e+01,  4.30348416e+00,  3.81261584e+01,
        8.64961635e+00,  1.31438377e+01, -4.60695814e+00,  2.24177103e+00,
       -2.67799977e+00,  3.44075743e+00, -5.87519003e+00,  3.71596183e+00,
       -9.14437899e-01, -3.78216182e+00, -7.21278726e+00,  4.23203655e+00,
       -3.32227059e+00, -2.75906450e+00, -6.45973520e+00,  2.88711983e-01])

In [76]:
inputs

array([], dtype=float64)