In [2]:
import numpy as np
import nussl

In [1]:
import os
import json
import pandas as pd

In [None]:
import musdb
import museval
import numpy as np
import librosa


In [57]:
np.hstack([np.zeros(3),np.ones(7)])

array([0., 0., 0., 1., 1., 1., 1., 1., 1., 1.])

In [59]:
A = np.array([np.array([1,1,0]),np.array([1,0,1]),np.array([2,1,1]),np.array([3,1,2])]).T
print(f"A: {A}")
Q, R = np.linalg.qr(A)
print(f"Q: {Q}")
print(f"R: {R}")

A: [[1 1 2 3]
 [1 0 1 1]
 [0 1 1 2]]
Q: [[-0.70710678  0.40824829 -0.57735027]
 [-0.70710678 -0.40824829  0.57735027]
 [-0.          0.81649658  0.57735027]]
R: [[-1.41421356 -0.70710678 -2.12132034 -2.82842712]
 [ 0.          1.22474487  1.22474487  2.44948974]
 [ 0.          0.          0.          0.        ]]


In [64]:
R==np.zeros((1,4))

array([[False, False, False, False],
       [ True, False, False, False],
       [ True,  True,  True,  True]])

In [66]:
R-np.zeros((1,4))

array([[-1.41421356, -0.70710678, -2.12132034, -2.82842712],
       [ 0.        ,  1.22474487,  1.22474487,  2.44948974],
       [ 0.        ,  0.        ,  0.        ,  0.        ]])

In [70]:
Q[np.linalg.norm(R-np.zeros((1,4)),axis=1)>1e-8]

array([[-0.70710678,  0.40824829, -0.57735027],
       [-0.70710678, -0.40824829,  0.57735027]])

In [82]:
class HPSS_evaluation_on_MUSDB18():
    def __init__(self, HPSS_function, train_dir = "/home/nithish/.nussl/musdb18/train/",test_dir = "/home/nithish/.nussl/musdb18/test/", estimates_dir="./hpss_estimates", output_dir="./hpss_scores"):
        self.HPSS_function = HPSS_function
        self.train_dir = train_dir
        self.test_dir = test_dir
        self.estimates_dir = estimates_dir
        self.output_dir = output_dir
        os.makedirs(estimates_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

    def projection_matrices(self,estimated_ground_truth_signal,interferring_ground_truth_signal,noise,distortion_mode='TVF',kernel_type='rectangle',kernel_length=3,length_of_filter=3):
        if distortion_mode=='TVF':
            if kernel_type=='rectangle':
                L_prime = kernel_length # Length of kernel
                T_prime = L_prime # hoplength of kernel
                kernel = np.ones(L_prime)

            L = length_of_filter
            T = estimated_ground_truth_signal.size
            U = (T+L-1)//T_prime

            estimated_ground_truth_vectors = [] #using the kernel, T_prime, L_prime and ground truth signals contruct a matrix whose columns for the spanning set for each of the vector spaces
            signal_space_vectors = []
            signal_noise_space_vectors = [] 

            for u in range(U):
                v = np.hstack([np.zeros(u*T_prime),kernel,np.zeros(L-(u*T_prime+L_prime))])
                for tau in range(L):
                    shifted_signal = np.hstack([np.zeros(tau),estimated_ground_truth_signal[tau:],np.zeros((L-(tau+T)))])
                    spanning_vector = np.multiply(v,shifted_signal)
                    estimated_ground_truth_vectors.append(spanning_vector)
                    signal_space_vectors.append(spanning_vector)
                    signal_noise_space_vectors.append(spanning_vector)

                    shifted_signal = np.hstack([np.zeros(tau),interferring_ground_truth_signal[tau:],np.zeros(L-(tau+T))])
                    spanning_vector = np.multiply(v,shifted_signal)
                    estimated_ground_truth_signal.append(spanning_vector)
                    signal_noise_space_vectors.append(spanning_vector)

                    shifted_signal = np.hstack([np.zeros(tau),noise[tau:],np.zeros(L-(tau+T))])
                    spanning_vector = np.multiply(v,shifted_signal)
                    signal_noise_space_vectors.append(spanning_vector)

                    
            estimated_ground_truth_vectors = np.array(estimated_ground_truth_vectors)
            signal_space_vectors = np.array(signal_space_vectors)
            signal_noise_space_vectors = np.array(signal_noise_space_vectors)

            est_GT_orth_matrix, R = np.linalg.qr(estimated_ground_truth_vectors.T)
            est_GT_orth_matrix = est_GT_orth_matrix[np.linalg.norm(R-np.zeros((1,R.shape[1])),axis=1)>1e-8]
            
            SS_orth_matrix, R = np.linalg.qr(signal_space_vectors.T)
            SS_orth_matrix = SS_orth_matrix[np.linalg.norm(R-np.zeros((1,R.shape[1])),axis=1)>1e-8]
            
            SNS_orthogonal_matrix, R = np.linalg.qr(signal_noise_space_vectors.T)
            SNS_orth_matrix = SNS_orth_matrix[np.linalg.norm(R-np.zeros((1,R.shape[1])),axis=1)>1e-8]
            
            return est_GT_orth_matrix@est_GT_orth_matrix.T , SS_orth_matrix@SS_orth_matrix.T , SNS_orthogonal_matrix@SNS_orthogonal_matrix.T
        
    def extract_components(self,estimate,estimated_ground_truth_signal,interferring_ground_truth_signal,noise):
        estimtated_ground_truth_projection_matrix, signal_space_projection_matrix, signal_noise_space_projection_matrix = self.projection_matrices(estimated_ground_truth_signal,interferring_ground_truth_signal,noise)
        
        s_target = estimtated_ground_truth_projection_matrix@estimate
        e_interference = signal_space_projection_matrix@estimate - s_target
        e_noise = estimate - signal_space_projection_matrix@estimate
        
        return s_target, e_interference, e_noise
    
    def evaluate_metrics(self,estimate,estimated_ground_truth_signal,interferring_ground_truth_signal,noise):
        s_target, e_interference, e_noise = self.extract_components(estimate,estimated_ground_truth_signal,interferring_ground_truth_signal,noise)
        SDR = 10*np.log10((np.linalg.norm(s_target)**2)/(np.linalg.norm(e_interference+e_noise)**2))
        SIR = 10*np.log10((np.linalg.norm(s_target)**2)/(np.linalg.norm(e_interference)**2))
        SNR = 10*np.log10((np.linalg.norm(s_target+e_interference)**2)/(np.linalg.norm(e_noise)**2))
        return SDR, SIR, SNR

    def evaluate_algorithm(self):
        musdb_train_test = nussl.datasets.MUSDB18(subsets=['train','test'])
        for idx in musdb_train_test.items[:1]:
            item = musdb_train_test[idx]
            Percussive_ground_truth = item['sources']['drums'].to_mono()
            Harmonic_ground_truth = item['sources']['vocals'].to_mono()+item['sources']['other'].to_mono()+item['sources']['bass'].to_mono()
            mixture_signal = item['mix'].to_mono()
            Harmonic_estimate, Percussive_estimate = self.HPSS_function(mixture_signal,window_length=1024,hop_length=512,window_type='hamming') # Fill in other hyperparameters needed for the function
            # We assume that the above estimates are nussl audio signals
            print("Harmonic estimate shape: ",Harmonic_estimate.audio_data.shape)
            Harmonic_estimation_metrics = self.evaluate_metrics(Harmonic_estimate.audio_data[0,:],Harmonic_ground_truth.audio_data[0,:],Percussive_ground_truth.audio_data[0,:],mixture_signal.audio_data[0,:]-(Harmonic_ground_truth.audio_data[0,:]+Percussive_ground_truth.audio_data[0,:]))
            Percussive_estimation_metrics = self.evaluate_metrics(Percussive_estimate.audio_data[0,:],Percussive_ground_truth.audio_data[0,:],Harmonic_ground_truth.audio_data[0,:],mixture_signal.audio_data[0,:]-(Harmonic_ground_truth.audio_data[0,:]+Percussive_ground_truth.audio_data[0,:]))
                    

    def get_results_df(self):
        """
        Aggregates the evaluation results into a pandas DataFrame for analysis.
        """
        # This part requires reading the generated JSON files
        scores_path = self.output_dir
        if not os.path.exists(scores_path):
            print("Scores directory not found. Run evaluation first.")
            return None
        
        # museval provides tools to load the results
        # This might need a custom function to parse all json files
        
        # A simple way: use a helper from museval or related library to aggregate
        # For now, let's assume we can load them manually or point the user to the `museval` documentation.
        # A full implementation would aggregate the JSONs into a DataFrame.

        # Example of loading results (conceptually):
        # results = museval.load_results(scores_path)
        # df = results.agg_scores()
        # return df

        print("For result aggregation, please refer to the museval documentation or implement a JSON parsing helper.")
        return None

# Example of a user-defined HPSS separation function that can be passed to the evaluator
# This is a basic librosa implementation and does not produce all MUSDB targets
# For a real evaluation, a more sophisticated function that returns 4 sources is needed
def basic_hpss_separator(track):
    """
    A placeholder for a user's actual source separation function.
    Uses librosa's HPSS to demonstrate. Note: This will not work directly 
    with `museval` without further processing to match the 4 sources (vocals, etc.)
    """
    # Load audio, convert to mono for simple librosa HPSS
    y = librosa.to_mono(track.audio)
    # Perform HPSS
    H, P = librosa.hpss(y)
    
    # In a real scenario, the estimates would need to be structured as a dictionary 
    # with keys 'vocals', 'drums', 'bass', 'other', and be the same shape as track.audio
    # This example cannot be run directly with the class as provided.

    # A better function would use a custom HPSS model to produce the actual sources
    # estimates = {
    #     'vocals': estimated_vocals,
    #     'drums': estimated_drums,
    #     'bass': estimated_bass,
    #     'other': estimated_other,
    # }
    # return estimates
    pass




In [83]:
def Complementary_Diffusion(audio_signal,window_length,hop_length,window_type,gamma=0.3,alpha=0.3,num_iters=50):
    audio_signal.stft_params = nussl.STFTParams(window_length=window_length,hop_length=hop_length,window_type=window_type)
    stft = audio_signal.stft()
    W = np.power(np.abs(stft),2*gamma)
    H = 0.5*np.copy(W)
    P = 0.5*np.copy(W)
    num_rows = W.shape[0]
    num_cols = W.shape[1]
    # print("W shape: ",W.shape)
    for k in range(num_iters):
        H_L_shift = np.concatenate([H[:,1:,:],np.expand_dims(np.expand_dims(np.zeros(num_rows),axis=1),axis=1)],axis=1)
        H_R_shift = np.concatenate([np.expand_dims(np.expand_dims(np.zeros(num_rows),axis=1),axis=1),H[:,:-1,:]],axis=1)
        P_U_shift = np.concatenate([P[1:,:,:],np.expand_dims(np.expand_dims(np.zeros(num_cols),axis=1),axis=0)],axis=0)
        P_L_shift = np.concatenate([np.expand_dims(np.expand_dims(np.zeros(num_cols),axis=1),axis=0),P[:-1,:,:]],axis=0)
        delta = alpha*(H_L_shift+H_R_shift-2*H)/4 - (1-alpha)*(P_L_shift+P_U_shift-2*P)/4
        H = np.minimum(np.maximum(H+delta,np.zeros_like(W)),W)
        P = W - H
    
    H = np.multiply(W,H>=P)
    P = np.multiply(W,H<P)
    
    harmonic_component = audio_signal.make_copy_with_stft_data(np.multiply(H,np.exp(1j*np.angle(stft))))
    percussive_component = audio_signal.make_copy_with_stft_data(np.multiply(P,np.exp(1j*np.angle(stft))))
    harmonic_component.istft()
    percussive_component.istft()
    return harmonic_component, percussive_component

In [84]:
evaluation_class = HPSS_evaluation_on_MUSDB18(HPSS_function=Complementary_Diffusion)

In [85]:
evaluation_class.evaluate_algorithm()

Harmonic estimate shape:  (1, 300032)


ValueError: negative dimensions are not allowed

In [47]:
noise = np.random.normal(0,1e-5,size=(1,9))
evaluation_class.evaluate_metrics(np.array([[1,1,1,2,2,2,3,3,3]])+0.5*noise,np.ones((1,9)),np.array([[0,0,0,1,1,1,0,0,0]]),noise)

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [37]:
train_set = nussl.datasets.MUSDB18(subsets=['train'])

In [44]:
train_set[0]['mix'].to_mono().audio_data.shape

(1, 300032)