In [19]:
# load dataset
import dataloader
import utilities
import transcribe
import madmom
import os
import features
import numpy as np
from tqdm import tqdm

# Settings for one transcription run. The whole dict is also forwarded as
# keyword arguments to transcribe.transcribeChromagram further down.
parameters = {
    # Chroma feature: "crp" and "dcp" are matched explicitly by the pipeline,
    # any other value (e.g. "cqt") selects the CQT chroma.
    "chroma_type": "crp",
    # Chord vocabulary: majmin, triads, triads_extended or majmin_sevenths.
    "vocabulary": "majmin",
    # Scheme name handed to transcribe.evaluateTranscription.
    "eval_scheme": "majmin",
    # Source separation: "hprs" or "demucs"; anything else (here the string
    # "None") leaves the audio untouched.
    "source_separation": "None",
    # Prefilter applied before chord estimation: None or "median".
    "prefilter": "median",
    # Prefilter length N (unit not visible here -- TODO confirm in transcribe).
    "prefilter_length": 10,
    # Postfilter: None, "hmm" or "median".
    "postfilter": "hmm",
    # Transition probability, in the range 0..1.
    "transition_prob": 0.2,
    # Postfilter length N.
    "postfilter_length": 1,
    # Dataset name; "rwc_popular" is the alternative noted by the author.
    "dataset": "beatles",
}

# Cross-validated chord transcription over all 8 dataset splits.
#
# For every track: load the audio (optionally source-separated), compute RMS
# and a chromagram, transcribe the chromagram into chord intervals/labels and
# score the estimate against the reference annotation. The mean scores over
# all tracks are printed at the end.

# NOTE(review): machine-specific absolute paths; adjust when running elsewhere.
PATH = "/home/max/ET-TI/Masterarbeit/mirdata/"
output_path = "/home/max/ET-TI/Masterarbeit/results/"  # not used in this cell

# One (title, score, segmentation score) tuple per transcribed track.
results = []

# The settings do not change during the run, so read them once up front.
dataset_name = parameters.get("dataset", "beatles")
source_separation = parameters.get("source_separation", False)
chroma_type = parameters.get("chroma_type", "crp")
eval_scheme = parameters.get("eval_scheme", "majmin")

print("Starting Transcription!")
for fold in range(1, 9):
    # Placeholders that are never accumulated in this cell; kept only so the
    # notebook-global names continue to exist for any later cell.
    fold_score = 0.0
    fold_seg_score = 0.0
    dataset = dataloader.MIRDataset(dataset_name, basepath=PATH, split_nr=fold)
    for track_id in tqdm(dataset.getTrackList(), desc=f"fold {fold}/8"):
        try:
            name = dataset.getTitle(track_id).split("_-_")[-1]
        except KeyError:
            # `name` was not assigned when the title lookup failed, so report
            # the track id instead (the old message raised NameError on the
            # first track or showed the previous song's title).
            print(f"error loading {track_id}.. skipping song")
            continue
        track_results = {}  # currently unused
        audiopath, (ref_intervals, ref_labels) = dataset[track_id]

        ## source separation ##
        if source_separation == "hprs":
            # Keep only the harmonic component of the HPR separation and wrap
            # it in a madmom Signal with the original timing metadata.
            time_vector, signal = utilities.loadAudio(audiopath)
            y_harm, y_perc, y_res = transcribe.harmonicPercussiveResidualSeparation(
                signal, beta=3, n_fft=4096
            )
            signal = madmom.audio.signal.Signal(
                y_harm,
                sample_rate=signal.sample_rate,
                num_channels=1,
                start=signal.start,
                stop=signal.stop,
            )
        elif source_separation == "demucs":
            # Load the instrumental stem expected next to the original file:
            # <dir>/instrumentals/<name>_instrumental.mp3
            basepath, filename = os.path.split(audiopath)
            filename = filename.rsplit('.', 1)[0]
            audiopath = os.path.join(basepath, "instrumentals", f"{filename}_instrumental.mp3")
            time_vector, signal = utilities.loadAudio(audiopath)
        else:
            time_vector, signal = utilities.loadAudio(audiopath)

        # RMS is always taken from the signal that is transcribed (i.e. after
        # any source separation), identical for all three branches above.
        rms = features.rms(signal, hop_length=4096)

        ## chroma features ##
        if chroma_type == "crp":
            t_chroma, chroma = features.crpChroma(signal)
        elif chroma_type == "dcp":
            t_chroma, chroma = features.deepChroma(signal, split_nr=fold)
        else:
            t_chroma, chroma = features.cqtChroma(signal)

        ## transcription and evaluation ##
        est_intervals, est_labels = transcribe.transcribeChromagram(t_chroma, chroma, rms, **parameters)
        score, seg_score = transcribe.evaluateTranscription(
            est_intervals, est_labels, ref_intervals, ref_labels, eval_scheme
        )
        results.append((name, score, seg_score))

# Average both scores over every transcribed track of all folds.
mean_score = np.mean(np.array([entry[1] for entry in results]))
mean_seg_score = np.mean(np.array([entry[2] for entry in results]))
print(f"{eval_scheme}: {round(mean_score, 2)}, segmentation: {round(mean_seg_score, 2)}")


Starting Transcription!


fold 1/8:   0%|          | 0/23 [00:04<?, ?it/s]
fold 2/8:   0%|          | 0/22 [00:02<?, ?it/s]
fold 3/8:   0%|          | 0/22 [00:05<?, ?it/s]
fold 4/8:   0%|          | 0/22 [00:05<?, ?it/s]
fold 5/8:   0%|          | 0/22 [00:02<?, ?it/s]
fold 6/8:   0%|          | 0/22 [00:03<?, ?it/s]
fold 7/8:   0%|          | 0/23 [00:03<?, ?it/s]
fold 8/8:   0%|          | 0/23 [00:03<?, ?it/s]


majmin: 0.52, segmentation: 0.71


Reference Model: Deep chroma processor

In [18]:
import numpy as np
# Reference run: transcribe every track of all 8 splits with the deep chroma
# features and transcribe.transcribeDeepChroma, then report the mean scores.
# Relies on `parameters`, `PATH` and the project imports from the cell above.
results = []
dataset_name = parameters.get("dataset", "beatles")
eval_scheme = parameters.get("eval_scheme", "majmin")

for fold in range(1, 9):
    dataset = dataloader.MIRDataset(dataset_name, basepath=PATH, split_nr=fold)
    progress = tqdm(dataset.getTrackList(), desc=f"fold {fold}/8")
    for track_id in progress:
        # The song title is the last "_-_"-separated part of the track title.
        name = dataset.getTitle(track_id).split("_-_")[-1]
        audiopath, (ref_intervals, ref_labels) = dataset[track_id]

        time_vector, signal = utilities.loadAudio(audiopath)
        # The fold number is passed on as split_nr to the deep chroma extractor.
        t_chroma, chroma = features.deepChroma(signal, split_nr=fold)
        est_intervals, est_labels = transcribe.transcribeDeepChroma(t_chroma, chroma)

        score, seg_score = transcribe.evaluateTranscription(
            est_intervals, est_labels, ref_intervals, ref_labels, eval_scheme
        )
        results.append((name, score, seg_score))

# Each entry is (title, score, segmentation score); average over all tracks.
mean_score = np.mean(np.array([entry[1] for entry in results]))
mean_seg_score = np.mean(np.array([entry[2] for entry in results]))
print(f"{eval_scheme}: {round(mean_score, 2)}, segmentation: {round(mean_seg_score, 2)}")


fold 1/8: 100%|██████████| 23/23 [00:24<00:00,  1.07s/it]
fold 2/8: 100%|██████████| 22/22 [00:30<00:00,  1.38s/it]
fold 3/8: 100%|██████████| 22/22 [00:28<00:00,  1.28s/it]
fold 4/8: 100%|██████████| 22/22 [00:37<00:00,  1.72s/it]
fold 5/8: 100%|██████████| 22/22 [00:26<00:00,  1.22s/it]
fold 6/8: 100%|██████████| 22/22 [00:29<00:00,  1.36s/it]
fold 7/8: 100%|██████████| 23/23 [00:42<00:00,  1.84s/it]
fold 8/8: 100%|██████████| 23/23 [00:33<00:00,  1.44s/it]

parameters.get('eval_scheme','majmin'): 0.84, segmentation: 0.89



