# Start to finish
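Rough draft of the end-to-end prediction pipeline (load a clip, extract features, run a trained model), to be implemented in the PyCharm project.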

In [1]:
from pydub import AudioSegment
import librosa
import numpy as np
import torch
from pathlib import Path

## Select tap sound

In [15]:
# Prepared, reserved test clip recorded the same way as the training clips.
# NOTE: the path is relative to the notebook's working directory.
clip = '../../../Source/Clean_train_clips/Test_pad/Ball_change/1/15.wav'

# Alternative input: audio extracted from YouTube (uncomment to use it instead)
#clip = '../../../Source/Shuffle/4/1.wav'

# Path view of the same clip; get_label() reads the third-from-last path
# component (e.g. 'Ball_change' or 'Shuffle') to determine the true label.
clip_path = Path(clip)

In [16]:
# Play sound: the AudioSegment is the cell's last expression, so the notebook
# displays it (presumably as an inline audio player via pydub's rich repr —
# verify in your environment).
AudioSegment.from_wav(clip)

## Processing Pipeline

In [17]:
# Parameters (set in training and validation)
# These are read as module-level globals by the feature functions below.
clip_length = 20772   # target length in samples; resize_signal pads or trims every clip to this
n_mfcc = 20           # number of MFCC coefficients requested in get_features_mfcc
frame_length = 256    # analysis window (samples) for the zcr, energy and rmse features
hop_length = 128      # step (samples) between successive frames for zcr, energy and rmse

In [18]:
def resize_signal(path, length):
    """Load an audio file and force it to exactly `length` samples.

    The file is read with librosa's default load settings. Clips shorter than
    `length` are zero-padded at the end, longer clips are truncated, and clips
    that already match are returned untouched.

    Parameters:
        path: location of the audio file, passed straight to librosa.load.
        length: desired number of samples in the returned signal.

    Returns:
        (signal, sample_rate) where signal is a NumPy array of `length` samples.
    """
    samples, sample_rate = librosa.load(path)
    shortfall = length - len(samples)

    if shortfall > 0:
        # Too short: append trailing silence.
        return np.pad(samples, (0, shortfall), 'constant'), sample_rate
    if shortfall < 0:
        # Too long: keep only the leading `length` samples.
        return samples[:length], sample_rate
    return samples, sample_rate

def get_features_mfcc(samples, sample_rate):
    """Return the time-averaged MFCC vector of a signal.

    Computes `n_mfcc` (module-level setting) coefficients with librosa and
    averages each coefficient across all frames, yielding one value per
    coefficient.
    """
    coefficients = librosa.feature.mfcc(y=samples, sr=sample_rate, n_mfcc=n_mfcc)
    # Transpose so frames run along axis 0, then average over the frames.
    return np.mean(coefficients.T, axis=0)

def get_features_zcr(samples, sample_rate):
    """Return librosa's per-frame zero-crossing rate of the signal.

    Framing uses the module-level `frame_length` and `hop_length`.
    `sample_rate` is accepted only so every feature function shares one
    signature; it is not used here.
    """
    return librosa.feature.zero_crossing_rate(
        samples,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    
def get_features_energy(samples, sample_rate):
    """Return the short-time energy (sum of squared samples) of each frame.

    Frames start every `hop_length` samples and span up to `frame_length`
    samples; frames near the end of the signal are simply shorter (no
    padding). `sample_rate` is unused and kept for a uniform signature.
    """
    frame_energies = []
    for start in range(0, len(samples), hop_length):
        frame = samples[start:start + frame_length]
        frame_energies.append(sum(abs(frame**2)))
    return np.array(frame_energies)

def get_features_rmse(samples, sample_rate):
    """Return the per-frame root-mean-square energy of the signal.

    Framing uses the module-level `frame_length` and `hop_length`, with
    centered frames. `sample_rate` is unused and kept for a uniform signature.

    Returns:
        1-D NumPy array with one RMS value per frame.
    """
    # librosa renamed `feature.rmse` to `feature.rms` (the old name was removed
    # in 0.7); prefer the new name and fall back so older installs still work.
    rms_fn = getattr(librosa.feature, 'rms', None)
    if rms_fn is None:
        rms_fn = librosa.feature.rmse

    # Pass the signal by keyword: recent librosa releases reject it positionally.
    rmse = rms_fn(y=samples, frame_length=frame_length, hop_length=hop_length, center=True)

    # librosa returns shape (1, n_frames); drop the leading singleton axis.
    return rmse[0]

def get_features_bpm(samples, sample_rate):
    """Estimate a single global tempo (beats per minute) for the signal.

    Assumes a static tempo; for a dynamic (per-frame) estimate the tempo
    function would need `aggregate=None`.

    Parameters:
        samples: 1-D audio signal.
        sample_rate: sampling rate of `samples` in Hz.

    Returns:
        The array returned by librosa's tempo estimator (one tempo value
        when a static tempo is assumed).
    """
    # Keyword `y=`: recent librosa releases no longer accept the signal positionally.
    onset_env = librosa.onset.onset_strength(y=samples, sr=sample_rate)

    # The estimator moved from `librosa.beat.tempo` to `librosa.feature.tempo`;
    # use the new location when present and fall back for older installs.
    tempo_fn = getattr(librosa.feature, 'tempo', None)
    if tempo_fn is None:
        tempo_fn = librosa.beat.tempo

    tempo = tempo_fn(onset_envelope=onset_env, sr=sample_rate)
    return tempo

def get_label(path):
    """Derive the true class of a clip from its location on disk.

    The third-from-last path component is compared with 'Shuffle'.

    Parameters:
        path: pathlib path with at least three components.

    Returns:
        1 if that component is exactly 'Shuffle', otherwise 0.
    """
    return int(path.parts[-3] == 'Shuffle')

In [19]:
# Bring the clip to the fixed training length: trailing silence is appended
# to short clips and long clips are cut.
y, sr = resize_signal(path=clip_path, length=clip_length)


In [20]:
# Extract every candidate feature and give each a trailing axis of length 1,
# so any subset can later be stacked along axis 0.
mfcc = get_features_mfcc(y, sr)[:, None]
zcr = get_features_zcr(y, sr).T  # zcr is already 2-D, so it is transposed instead
energy = get_features_energy(y, sr)[:, None]
rmse = get_features_rmse(y, sr)[:, None]
bpm = get_features_bpm(y, sr)[:, None]


## Select features

In [21]:
# Tuple of the feature arrays to feed the model. The trailing comma matters:
# `(mfcc)` is just the array itself, which made np.concatenate iterate over its
# rows and produce a 1-D input, whereas a multi-feature tuple such as
# `(mfcc, zcr, energy, rmse, bpm)` produces a (1, n) row. With a real tuple the
# input shape is consistent no matter how many features are selected.
feature_set = (mfcc,)


In [22]:
# Stack the selected features along axis 0, then transpose the result to form
# the model input.
inputs = np.transpose(np.concatenate(feature_set, axis=0))


## Select model

In [23]:
# Path of the saved model to evaluate
model = '../src/train/trained_models/one_hidden_mfcc_128.pt'

# Models to compare (uncomment one to swap it in; judging by the file names,
# the selected feature set presumably has to match the one used in training)

#model = ('../src/train/trained_models/one_hidden_mfcc_zcr_energy_rmse_bpm.pt')
#model = ('../src/train/trained_models/one_hidden_mfcc_zcr_energy_rmse_bpm_128.pt')
#model = ('../src/train/trained_models/one_hidden_mfcc_zcr_energy_rmse_bpm_256.pt')
#model = ('../src/train/trained_models/two_hidden_mfcc_zcr_energy_rmse_bpm.pt')

#model = ('../src/train/trained_models/one_hidden_mfcc_128.pt')
#model = ('../src/train/trained_models/two_hidden_mfcc.pt')
#model = ('../src/train/trained_models/one_hidden_mfcc_bpm_128.pt')
#model = ('../src/train/trained_models/two_hidden_mfcc_bpm.pt')



## Run and predict

In [24]:
dtype = torch.float
device = torch.device('cpu')

# Convert the NumPy features under a NEW name. Rebinding `inputs` (and `model`
# below) would make this cell fail or misbehave when it is run a second time.
inputs_tensor = torch.tensor(inputs, device=device, dtype=dtype)

# Load model
# NOTE(security): torch.load unpickles the file and can execute arbitrary code,
# so only load model files you trust. Recent PyTorch releases may also require
# `weights_only=False` to load a fully pickled model — verify for your version.
# map_location keeps the load working even if the model was saved from a GPU.
net = torch.load(model, map_location=device)

# Inference mode: disables dropout / batch-norm updates so the prediction is
# deterministic, and skips gradient tracking, which is not needed here.
net.eval()
with torch.no_grad():
    outputs = net(inputs_tensor)

# Index of the highest-scoring class over the (flattened) model output.
y_pred = torch.argmax(outputs).numpy()

# Ground truth derived from the clip's folder name.
true = get_label(clip_path)

print("What's on tap?")
print()
if y_pred == 1:
    print('Predicted: Shuffle')
elif y_pred == 0:
    print('Predicted: Ball change')
print()
if true == 1:
    print('It was a Shuffle.')
elif true == 0:
    print('It was a Ball change.')


What's on tap?

Predicted: Ball change

It was a Ball change.


In [25]:
# What did that sound like again?
# Re-display the clip next to the prediction so the two can be compared by ear
# (as the cell's last expression, the AudioSegment is shown by the notebook).
AudioSegment.from_wav(clip)