# Working with STFT/note/scalar inputs (symbolic and audio) to combine the notebooks in a multimodal Neural Net architecture

## 0. imports

In [1]:
# ! pip install torcheval==0.0.7

In [3]:
import scipy
import scipy.io.wavfile
from scipy import signal

import numpy as np
import pandas as pd
import glob
import os 

import librosa

from collections import defaultdict

import sys

import torch

import itertools # creating list of hyperparameter choices

# for creating models:
import torch.nn as nn
import torch.nn.functional as F

# for dataset split:
import sklearn
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer

from torcheval.metrics.functional import r2_score


In [4]:
sys.path.insert(0, '../../stage-1/overall_used_tools')
import requirements_check as rc

import torcheval
rc.check(sys, [scipy, np, pd, glob, os, librosa, torch , sklearn, torcheval], multimodal_prediction=True)



Your version identical with original version 
-> scipy 
-> 1.10.1


Your version identical with original version 
-> numpy 
-> 1.21.6


Your version identical with original version 
-> pandas 
-> 2.0.3


For 
glob 
python version needs to fit 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]


For 
os 
python version needs to fit 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]


Your version identical with original version 
-> librosa 
-> 0.10.0.post2


Possibly different versions: 
-> torch 
-> original version 2.0.1+cu117 
-> your version 1.13.1


Your version identical with original version 
-> sklearn 
-> 1.3.0


Your version identical with original version 
-> torcheval 
-> 0.0.7


Possibly different versions: 
-> sys 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]




**ATTENTION: When you already have in the notebook's directory the directory 'created_input_tensors' containing the files**
- categorical_labels_tensor.pt
- mel_tensor.pt
- regressive_labels_tensor.pt
- scalar_tensor.pt
- symbolic_input_tensor.pt 
- midibert_input_tensor<br><br>
**you can directly jump to the notebook's section '2. Create Neural Network architecture' and go on from there.**

## 1. Prepare inputs for net

### 1.1. Import categorical and regressive labels

In [3]:
sys.path.insert(0, '../unimodal_scalarFeatures_model') # to import labels.py from another folder
import labels

Possibly different versions: 
-> pandas 
-> original version 1.3.5 
-> your version 2.0.3


For 
collections 
python version needs to fit 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]


Your version identical with original version 
-> numpy 
-> 1.21.6


No match for your module 
matplotlib 
found in the requirements.


Possibly different versions: 
-> sys 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]




In [4]:
categorical_labels = labels.get_categorical_labels()
categorical_labels.sort_index(inplace=True)
# Change names to MP3 code:
if '_accompaniment' in categorical_labels.index[0]: 
    categorical_labels.index = ['_'.join(i.split('_')[:-1])[1:] for i in list(categorical_labels.index)]
    
regres_labels = labels.get_regressive_labels()
regres_labels.sort_index(inplace=True)
# Change names to MP3 code:
if '_accompaniment' in regres_labels.index[0]: 
    regres_labels.index = ['_'.join(i.split('_')[:-1])[1:] for i in list(regres_labels.index)]
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df['MP3-Code']= label_df['MP3-Code'] + '_accompaniment' # add to each cell a string in a certain column
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={'MP3-Code':'sample_id'},inplace=True) # rename column
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.dropna(inplace=True)


We import a csv file where we need the columns 'MP3-Code','sublimity','vitality','unease'. 'MP3-Code' describes the ID of each sample.
proportion of amount of samples (out of 370) with... ...zero ...one ...two ...three labels:
0.0 0.7753424657534247 0.2136986301369863 0.010958904109589041
We import a csv file where we need the columns 'MP3-Code','sublimity','vitality','unease'. 'MP3-Code' describes the ID of each sample.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df['final_label'] = label_sample_list.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df['sample_id'] = "'" + label_df['sample_id'] + ".wav'"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df['MP3-Code']= label_df['MP3-Code'] + '_accompaniment' # add to each cell a string 

### 1.2. Symbolic features: using note_collector as 2 D input (rows = voices, column = time aspect) of a LSTM model

In [5]:
sys.path.insert(0, '../../stage2_feature_extraction/symbolicFeatureExtraction')
import basic_info_extraction

Your version identical with original version 
-> numpy 
-> 1.21.6


Possibly different versions: 
-> pandas 
-> original version 1.3.5 
-> your version 2.0.3


Possibly different versions: 
-> mido 
-> original version 1.2.10 
-> your version <module 'mido.version' from '/home/c/anaconda3/envs/multimodal_env/lib/python3.8/site-packages/mido/version.py'>


Possibly different versions: 
-> sys 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]




In [6]:
def reduce_symbolic_col_size(array, final_length=300):
    
    len_snippet = 30 # number of neighbouring notes which should be kept in shortened new array 
    number_snippets = int(final_length/len_snippet)
    
    piece_length = array.shape[1]
    ind_distance = int(piece_length/number_snippets)
    
    current_ind = 0
    
    while current_ind <= (piece_length - len_snippet - 1):

        snippet = array[:, current_ind:current_ind+len_snippet]
        
        if current_ind == 0:
            shortened_array = snippet

        else:
            shortened_array = np.concatenate((shortened_array, snippet),axis=1)

        current_ind += ind_distance # ind_distance includes len_snippet by this usage

    return shortened_array

In [7]:
l1 = ['H_Jay-Z_HardKno', 'K_Saint-Seans_Carniva','K_Schumann_EtudesS']
l2 = ['H_JayZ_HardKno', 'K_Saint-Saens_Carniva', 'K_Schumann_EtudeS']

In [8]:
midi_path = '../../stage1_data_collecting_phase/audio2midi_converter/audio2midi_Wang/GeneratedMIDI_Wang/'
midi_files = glob.glob(midi_path + '*.mid')
row_len_list = []
col_len_list = []

symbolic_input_dict = dict()
midi_name_list = [] # needed when extracting mel filterbank from .wav files. We don't want to do it for all
# 404 files but only for the ones which are also available in midi.

for midi_path in midi_files:
    midi_name = midi_path.split('/')[-1].split('.')[0] # e.g. P_BeachBoys_GoodVib_accompaniment
    
    midi_name_list.append('_'.join(midi_name.split('_')[:-1]))
    
    print('current midi: ', midi_name)
    midi_path_path = '/'.join(midi_path.split('/')[:-1]) +'/'
    
    pause_collector, nd, note_collector, note_collector_all,SAL,CD,time_passed_ar,creation_success = basic_info_extraction.basic_tools_panda(file_name = midi_name, directory_path_with_tracks=midi_path_path, start_time=float('inf'),end_time=float('-inf'))
    # --> only note_collector of interest
    
    # change nan-values/-1 to 0:
    note_collector[np.where(note_collector==-1)] = 0
    
    # filter out rows containing no notes:
    ind_list = []
    for ind,row in enumerate(note_collector):
        if np.all(row==0) == False:
            ind_list.append(ind)
    row_mask = np.array(ind_list)
    note_collector = note_collector[row_mask, :]
    
    # reduce note collector in size when it is too huge in column size:
    aimed_col_length = 300
    
    if note_collector.shape[1] > aimed_col_length:

        note_collector = reduce_symbolic_col_size(note_collector, final_length=aimed_col_length)
    
    
    # collecting the size to determine the biggest note_collector later, such that every array will have
    # same size:
    row_len_list.append(note_collector.shape[0])
    col_len_list.append(note_collector.shape[1])
        
    # normalize note values:
    note_collector_normal =  (note_collector - note_collector.mean()) / note_collector.std()
    
    # to tensor:
    note_collector_normal = torch.tensor(note_collector_normal)
    
    mp3_code = '_'.join(midi_name.split('_')[:-1]) # e.g. P_BeachBoys_GoodVib
    if mp3_code in l1:
        mp3_code = l2[l1.index(mp3_code)]
    symbolic_input_dict[mp3_code] = note_collector_normal

current midi:  P_BOBfeatBrunoMars_Nothing_accompaniment
information extraction (and midi snippet generation) done


ValueError: too many values to unpack (expected 8)

In [10]:
# Find out the biggest size and zeropad files with lower shape[1], i. e. samples which are shorter in time 
# shall be expanded by 0-s:
max_row = max(row_len_list)
max_col = max(col_len_list)

for key,value in  symbolic_input_dict.items():
            
    if value.shape[1] < max_col or value.shape[0] < max_row:
        
        value_tensor = value
        
        if value.shape[1] < max_col:
            difference = max_col - value.shape[1]
            to_add = torch.zeros((value_tensor.shape[0],difference))
            value_tensor = torch.cat((value_tensor, to_add),axis=1)
            
        if value.shape[0] < max_row:
            difference = max_row - value.shape[0]
            to_add = torch.zeros((difference, value_tensor.shape[1]))
            value_tensor =  torch.cat((value_tensor, to_add),axis=0)
            
        symbolic_input_dict[key] = value_tensor
        
# sort by key:
symbolic_input_dict = dict(sorted(symbolic_input_dict.items()))

print('symbolic extraction done')

symbolic extraction done


### 1.3. Transfer audio files to mel filtered STFT-s

In [11]:
def mel_filterbank(sample_rate, hop_size, segment_times, STFT):

    # from http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/#eqn1
    # and https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html

    STFT11 = np.copy(STFT)

    # 0. finding out the sample rate for one window:
    len_window = int(segment_times[1])

    in_sec_window = (len_window-1) * hop_size/ sample_rate
    sample_rate2 = int(in_sec_window * sample_rate) # that's the number of samples in one window
    

    # 1. lower border of human hearing https://www.audiologyresearch.org/human-hearing-range: 
    # 20 Hz:
    # get borders mel band:
    variable = 2959 # values seen in lecture and internet so far: 2959, 1125, 2595
    
    mel_lowest_f = (variable * np.log10(1 + (20) / 700))
    mel_highest_f = (variable * np.log10(1 + (sample_rate2 / 2) / 700))  # Convert Hz to Mel

    # 2. creating 40 bins/bands from which 38 are equally distributed between the lowest and
    # highest mel value:
    number_filters = 20 #40 # normal: 20-40, standard: 26
    # with 20 no final_bins with same value and in filterbank no row with set={0.0}

    distance_mel = (mel_lowest_f + mel_highest_f)/(number_filters+1)
    mel_binDistances = np.array([mel_lowest_f + i * distance_mel for i in range(0, number_filters+2)])

    # 3. Converting the the mel_bins into frequencies:
    f_binDistances = (700 * (10**(mel_binDistances / variable) - 1)) #700*(np.exp(mel_binDistances/2959)-1)
    # because log_10(b)=x leads to 10**x=b

    # 4. Missing freq. resolution. Therefore, round the found freq. to nearest DFT bin.
    # Need of number of DFT bins and sample_rate:

    final_bins = np.floor((STFT.shape[0]+1)*f_binDistances/sample_rate2) # last bin relates
    # to mel_highest_f

    # 5. Create filterbanks: 
    # Each filterbank starts at point i, has the peak at i+1 and is zero at i+2.
    # formula H_m(k) from http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/
    # where k=...-th_filter, f=final_bins. Sentence before describes formula
    filterbank = np.zeros((number_filters,int(np.floor(STFT.shape[0] ))))
    # rows: number used filterbanks/individual filters, columns: values for certain frequencies

    for m in range(1,(number_filters+1)):

        left_val = int(final_bins[m-1])
        middle_val = int(final_bins[m])
        right_val = int(final_bins[m+1])

        for k in range(left_val, middle_val): # number of fitlers m between final bins
            filterbank[m-1,k] = (k-final_bins[m-1])/(final_bins[m]-final_bins[m-1])
        for k in range(middle_val, right_val):
            filterbank[m-1,k] = (final_bins[m+1]-k)/(final_bins[m+1]-final_bins[m])

    # 6. lecture content L04 slide 54: the magnitude approach:
    # Use the filterbank for each DFT magnitude to reduce it in bin size:
    
    shape_border = int(STFT.shape[1]/2)
    S1 = (filterbank @ abs(STFT[:,:shape_border]))
    S2 = (filterbank @ abs(STFT[:,:STFT.shape[1]]))
    S = np.concatenate((S1,S2),axis=1)
    S = 10 * np.log10(S+1) # rows: number of filters, column: like in STFT the 


    return S

In [12]:
def using_mel_filterbank(file, number_overlap = 100):    

    sample_rate, data = scipy.io.wavfile.read(file)
    try:
        data = (data[:,0] + data[:,1])/2
    except: # data has only one channel as we want it
        pass

    # with resampling the outcoming STFT tensor will have less columns:
    #resampled = data
    resampled = signal.resample(data, num=30000, t=None, axis=0, window=None)
    
    sample_freq, segment_times, STFT = signal.stft(resampled,noverlap=number_overlap) # all have the same size already!
    # noverlap: the number of samples two neighbouring DFT have in common / reverse of hop size

    hop_size = int(segment_times[1] - number_overlap)

    mel = mel_filterbank(sample_rate, hop_size, segment_times, STFT)
    # of course its the filtered STFT
    
    return mel, sample_rate, hop_size

In [9]:
path =  '../../stage1_data_collecting_phase/audio2midi_converter/track_preparation/accompaniment/'

filenames = glob.glob(path + "*.wav") # read all the files with extension .wav

samples_mel_dict = dict() 
samples_mel_sizes = []

for ind, file in enumerate(filenames):
    
    # Getting title of file:
    mp3_code = (file.split('/')[-1]).replace('_accompaniment.wav','') 
    
    if mp3_code in midi_name_list:
        
        print('current file: ', file)
        sample_rate, data = scipy.io.wavfile.read(file)

        mel, sample_rate, hop_size = using_mel_filterbank(file, number_overlap = 100)
        
        if mp3_code in l1:
            mp3_code = l2[l1.index(mp3_code)]

        # Typing result into dictionary:
        samples_mel_dict[mp3_code] = torch.tensor(mel)
        samples_mel_sizes.append(mel.shape[1])
print('mel extraction done')

current file:  ../../stage1_data_collecting_phase/audio2midi_converter/track_preparation/accompaniment/P_BOBfeatBrunoMars_Nothing_accompaniment.wav


NameError: name 'using_mel_filterbank' is not defined

In [14]:
# Find out the biggest size and zeropad files with lower shape[1], i. e. samples which are shorter in time 
# shall be expanded by 0-s. So, we get for sure same batch sizes.:
max_sample_len = max(samples_mel_sizes)

for key,value in samples_mel_dict.items():
    
    # normalize mel:
    value_normal = (value - value.mean()) / value.std()
        
    if value.shape[1] < max_sample_len:
        difference = max_sample_len - value.shape[1]
        to_add = torch.zeros((value.shape[0],difference))
        samples_mel_dict[key] = torch.concatenate((value_normal, to_add),axis=1)
    else:
        samples_mel_dict[key] = value_normal
        
# sort by key:
samples_mel_dict = dict(sorted(samples_mel_dict.items()))

### 1.4. Scalar features

#### 1.4.1. Import already built scalar dataset

In [15]:
# We don't care whether categorical or regressive dataset as the labels gets dropped:
scalar_df = pd.read_csv('../unimodal_scalarFeatures_model/dataframes/audioAndSymbolic_dataframe_categorical_audio2midiWang.csv')

# Change names to MP3 code:
if '_accompaniment' in scalar_df['name'][0]: 
    name_list_scalar = ['_'.join(i.split('_')[:-1])[1:] for i in list(scalar_df['name'])]
    scalar_df['name'] = name_list_scalar
      
scalar_df.set_index('name', inplace=True)

# drop label column:
scalar_df.drop(['final_label'], axis=1, inplace=True) 
scalar_df.sort_index(inplace=True)

scalar_df.head()

Unnamed: 0_level_0,midi_Number of Pitches,midi_Number of Pitch Classes,midi_Number of Common Pitches,midi_Number of Common Pitch Classes,midi_Range,midi_Importance of Bass Register,midi_Importance of Middle Register,midi_Importance of High Register,midi_Dominant Spread,midi_Strong Tonal Centres,...,audio_W(239)_lld,audio_W(240)_lld,audio_W(241)_lld,audio_W(242)_lld,audio_W(243)_lld,audio_W(244)_lld,audio_W(245)_lld,audio_W(246)_lld,audio_W(247)_lld,audio_W(248)_lld
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
H_2Pac_AllEyez,-0.69105,0.00687,1.431812,-0.8625,-1.95603,0.173771,0.553268,-0.748694,-0.215021,1.896439,...,-0.395981,-0.475631,-0.360202,-0.244258,-0.222566,3.228343,-0.538287,-1.170944,0.701196,-0.224492
H_2Pac_KeepYaH,-0.992124,-0.54689,1.431812,0.567901,-1.644949,0.121261,0.606396,-0.748694,-0.215021,0.19429,...,0.308935,-0.475631,-0.360202,-1.270476,-0.222566,-0.328788,-0.538287,-1.170944,0.701196,-0.224492
H_50Cent_CandySh,0.061634,0.00687,0.62139,-0.8625,-0.089542,1.697301,-1.752025,0.035989,-0.215021,0.19429,...,0.308935,0.384911,-0.360202,-0.244258,-0.222566,1.915512,-0.538287,0.42162,0.701196,-0.224492
H_50Cent_InDaClu,0.513245,1.114391,0.62139,0.567901,0.37708,-0.007348,-0.834989,0.86511,-0.215021,0.19429,...,-0.395981,-0.475631,2.653326,-0.244258,-0.222566,3.228343,-0.538287,-1.170944,0.701196,-0.224492
H_ASAPRocky_Fashion,-1.142661,-1.10065,0.62139,0.567901,-1.80049,0.576342,0.145953,-0.748694,-0.949373,0.19429,...,-0.395981,0.6787,2.653326,-0.244258,-0.222566,-0.328788,-0.538287,-1.170944,0.701196,-0.224492


#### 1.4.2. Keep only top k features of scalar_df

In [10]:
print('start finding top k scalar features...')
user_input = input('Do you want to use the already provided top 30 features?[y/n]')

if user_input.lower() == 'y':
    scalar_df = scalar_df[['audio_tonal.hpcp_entropy.stdev', 'audio_pcm_fftMag_spectralRollOff90.0_sma_leftctime_f2', 'audio_audspec_lengthL1norm_sma_de_stddevRisingSlope_f2', 'audio_F0final_sma_de_flatness_f2', 'audio_pcm_fftMag_spectralSlope_sma_de_maxSegLen_f2', 'audio_logHNR_sma_de_lpgain_f2', 'audio_jitterDDP_sma_linregc2_f2', 'audio_pcm_fftMag_spectralSkewness_sma_peakDistStddev_f2', 'audio_pcm_fftMag_spectralEntropy_sma_de_iqr2-3_f2', 'audio_lowlevel.dissonance.mean', 'audio_F0_sma_maxPos', 'audio_logHNR_sma_lpc4_f2', 'audio_tonal.hpcp_entropy.mean', 'audio_pcm_fftMag_spectralVariance_sma_de_iqr2-3_f2', 'audio_audspec_lengthL1norm_sma_lpc3_f2', 'audio_pcm_fftMag_spectralSlope_sma_de_quartile2_f2', 'audio_pcm_fftMag_spectralSkewness_sma_lpc2_f2', 'audio_pcm_fftMag_spectralSlope_sma_de_minRangeRel_f2', 'audio_pcm_fftMag_spectralEntropy_sma_de_quartile3_f2', 'audio_audspec_lengthL1norm_sma_de_iqr2-3_f2', 'audio_pcm_fftMag_spectralEntropy_sma_de_iqr1-2_f2', 'audio_shimmerLocal_sma_de_upleveltime90_f2', 'audio_tonal.tuning_diatonic_strength', 'audio_lowlevel.spectral_entropy.mean', 'audio_audspec_lengthL1norm_sma_de_upleveltime50_f2', 'audio_audspec_lengthL1norm_sma_iqr2-3_f2', 'audio_audspec_lengthL1norm_sma_de_iqr1-3_f2', 'midi_Average Note Duration', 'audio_audspec_lengthL1norm_sma_de_quartile1_f2', 'audio_audspec_lengthL1norm_sma_de_iqr1-2_f2']]
    
else:
    sys.path.insert(0, '../unimodal_scalarFeatures_model')
    import feature_reduction as fr

    instance = fr.PipelinedScalarTraining()
    instance.x = scalar_df

    mask_for_reg_labels_top = regres_labels.index.isin(scalar_df.index) 
    regres_labels_top = regres_labels[mask_for_reg_labels_top]
    regres_labels_top[['y_sublim','y_ease','y_vital']] = pd.DataFrame(regres_labels_top.final_label.tolist(), index=regres_labels_top.index)
    del regres_labels_top['final_label']

    instance.y = np.array(regres_labels_top)
    instance.select_top_features(feature_number_approach1=20, feature_number_approach2=20, categorical=False)
    scalar_df = instance.x
    scalar_df.shape

print('...done finding top k scalar features')

start finding top k scalar features...
Do you want to use the already provided top 30 features?[y/n]y


NameError: name 'scalar_df' is not defined

### 1.5. Prepare inputs for model

#### 1.5.1. Bring dataset parts together. Remove samples which are not represented in all dataset parts

In [17]:
# mask the other feature inputs of the same dataset:
# for mel filterbank samples:
mask_for_scalar_df = scalar_df.index.isin(symbolic_input_dict) & scalar_df.index.isin(samples_mel_dict) & scalar_df.index.isin(regres_labels.index) & scalar_df.index.isin(categorical_labels.index)
mask_for_cat_labels = categorical_labels.index.isin(symbolic_input_dict) & categorical_labels.index.isin(samples_mel_dict) & categorical_labels.index.isin(scalar_df.index) & categorical_labels.index.isin(regres_labels.index)
mask_for_reg_labels = regres_labels.index.isin(symbolic_input_dict) & regres_labels.index.isin(samples_mel_dict) & regres_labels.index.isin(scalar_df.index) & regres_labels.index.isin(categorical_labels.index)


# convert dictionary keys to df to filter them. Then use them as mask for the dictionary:
mel_dict = pd.DataFrame(columns=['name'])
mel_dict['name'] = list(samples_mel_dict.keys())
mel_dict.set_index('name', inplace=True)

mask_for_mel_dict = mel_dict.index.isin(symbolic_input_dict) & mel_dict.index.isin(regres_labels.index) & mel_dict.index.isin(scalar_df.index) & mel_dict.index.isin(categorical_labels.index)
mel_filter = mel_dict[mask_for_mel_dict]

samples_mel_list = list(map(samples_mel_dict.get, mel_filter.index))

# use the masks for the dataframes:
scalar_df = scalar_df[mask_for_scalar_df]
categorical_labels = categorical_labels[mask_for_cat_labels]
regres_labels = regres_labels[mask_for_reg_labels]


#### 1.5.2. Convert dataset parts to usable inputs of the multimodal model

In [18]:
# for symbolic note array samples:
# convert dictionary keys to df to filter them. Then use them as mask for the dictionary:
note_dict = pd.DataFrame(columns=['name'])
note_dict['name'] = list(symbolic_input_dict.keys())
note_dict.set_index('name', inplace=True)

mask_for_note_dict = note_dict.index.isin(samples_mel_dict) & note_dict.index.isin(regres_labels.index) & note_dict.index.isin(scalar_df.index) & note_dict.index.isin(categorical_labels.index)
note_filter = note_dict[mask_for_note_dict]

symbolic_input_list = list(map(symbolic_input_dict.get, note_filter.index))

# stack all single note_arrays in symbolic_input_list together such that each tensor has same shape:
max_width, max_height = 0,0
for ele in symbolic_input_list:
    width = ele.shape[1]
    height = ele.shape[0]
    
    if max_width==0:
        max_width = width
        max_height = height
        
    elif max_width < width:
        max_width = width
        
    elif max_height < height:
        max_height = height

tensor_list = []
for sample_tensor in symbolic_input_list:
    ground_tensor = torch.zeros((max_height, max_width))
    ground_tensor[:sample_tensor.shape[0], :sample_tensor.shape[1]] = sample_tensor
    tensor_list.append(ground_tensor)
    
symbolic_input_tensor = torch.stack(tensor_list, dim=0)

In [19]:
# convert the dataframes:
scalar_tensor = torch.tensor(scalar_df.values, dtype=torch.float32) # float32 because the network 
# weights are also defined with that data type


# split the list label entities into several columns:
categorical_labels[['y_sublim','y_ease','y_vital']] = pd.DataFrame(categorical_labels.final_label.tolist(), index=categorical_labels.index)
categorical_labels.drop('final_label', inplace=True, axis=1)

categorical_labels_tensor = torch.tensor(categorical_labels.values, dtype=torch.int32)


regres_labels[['y_sublim','y_ease','y_vital']] = pd.DataFrame(regres_labels.final_label.tolist(), index=regres_labels.index)
regres_labels.drop('final_label', inplace=True, axis=1)

regres_labels_tensor = torch.tensor(regres_labels.values, dtype=torch.float32)


# convert lists of tensors into 3D tensor:
samples_mel_tensor = torch.stack([sample_tensor for sample_tensor in samples_mel_list], dim=0)


### 1.6 Generate inputs based on MIDI files for the case that the MusicBERT model is used instead of the note array as symbolic track of the multi-modal model

#### Load musicBERT model with all its pre-trained weights

In [1]:
import MIDIBert.my_musicBERT_loader as berter
music_bert_model = berter.load()
music_bert_model = music_bert_model.cpu()

disable_cp = False
mask_strategy = ['bar']
convert_encoding = OCTMIDI
crop_length = None
Your version identical with original version 
-> fairseq 
-> 0.10.2


For 
ast 
python version needs to fit 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]


For 
collections 
python version needs to fit 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]


For 
contextlib 
python version needs to fit 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]


For 
inspect 
python version needs to fit 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]


For 
logging 
python version needs to fit 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]


For 
os 
python version needs to fit 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]

#### Make predictions

In [2]:
import MIDIBert.musicBERT_predictions as bert_predictor
filtered_mp3code_list = (note_filter.index).tolist()
midibert_predictions = bert_predictor.predictor(filtered_mp3code_list, music_bert_model, device='cpu', path_to_midi ='../../stage1_data_collecting_phase/audio2midi_converter/audio2midi_Wang/GeneratedMIDI_Wang')

Your version identical with original version 
-> music21 
-> 7.3.3


Your version identical with original version 
-> miditoolkit 
-> 0.1.16


For 
random 
python version needs to fit 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]


Possibly different versions: 
-> time 
-> original version 3.1.2 
-> your version  is not clearly visible. Go to your used python folder path .../python3.7/site-packages to investigate the version of your package. Also possible: have a look at the shell commands "pip show module_name" and "apt show module_name"


For 
math 
python version needs to fit 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]


Possibly different versions: 
-> sys 
-> original version 3.7.3 
-> your version 3.8.10 (default, Jun  4 2021, 15:09:15) 
[GCC 7.5.0]




NameError: name 'note_filter' is not defined

In [22]:
samples_mel_tensor.shape, symbolic_input_tensor.shape, scalar_tensor.shape, categorical_labels_tensor.shape, regres_labels_tensor.shape, midibert_predictions.shape

(torch.Size([360, 20, 291]),
 torch.Size([360, 1, 300]),
 torch.Size([360, 30]),
 torch.Size([360, 3]),
 torch.Size([360, 3]),
 torch.Size([360, 13]))

#### 1.5.3. Store tensors

In [23]:
'''
input_folder = 'created_input_tensors'
os.makedirs(input_folder, exist_ok=True)

torch.save(samples_mel_tensor, input_folder+'/mel_tensor.pt')
torch.save(symbolic_input_tensor, input_folder+'/symbolic_input_tensor.pt')
torch.save(scalar_tensor, input_folder+'/scalar_tensor.pt')

torch.save(categorical_labels_tensor, input_folder+'/categorical_labels_tensor.pt')
torch.save(regres_labels_tensor, input_folder+'/regressive_labels_tensor.pt')

torch.save(midibert_predictions, input_folder+'/midibert_input_tensor.pt')
'''

## 2. Create Neural Network architecture

### 2.1. Build up LSTM boxes for audio part - mel filtered STFT AND symbolic part can also use the same LSTM architecture

In [5]:
class StackedBiLSTM(torch.nn.Module):
    
    def __init__(self, num_feat, hidden_dim, out_dim):
        # row_col_id for symbolic input note_collector meaning: all notes played at the same time neighbourhood
        # (more or less, note_collector columns aren't exactly building up times, i.e. first note of voice1
        # is in the same column as first note of voice2). 
        # row_col_id for audio input mel filtered STFT meaning: we have fixed time windows from which each one
        # will be one input.
        # ROWS ARE MEANT OF INPUT
        super().__init__() # super(NextCharLSTM, self).__init__()
        
        #self.embedding = nn.Embedding(row_col_id, embedding_dim) # size dict embeddings; size each 
        # embedding vector
        self.lstm = nn.LSTM(input_size=num_feat, hidden_size=hidden_dim, bidirectional=True, 
                            num_layers=3, dropout=0.2, batch_first=True,
                            proj_size=out_dim) # number features; number features in hidden state
        # bidirectional LSTM and dropout=0.2 worked in m_c.pdf the best
        # num_layers mean how many LSTM to stack
        # batch_first=True gives (batchsize, seq_len, out_len) as output instead of (seq_len, batchsize, out_len)
        # proj_size = when 0 (default) then output has hidden size, hidden_size should be by nature of size of output
        # https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        #self.linear = nn.Linear(hidden_dim, alphabet_size) # ???? change size
    
    def forward(self, x):
        
        # change x shape from (batch_size x rows/features x columns/sequence)
        # to (batch_size x columns/sequence x rows/features) which is the input of nn.LSTM
        x = torch.swapdims(x, 1, 2)
        
        #x = self.embedding(x) 
        output, (final_hidden_state, final_cell_state) = self.lstm(x)
        #logits = self.linear(final_hidden_state) # when arguemt x has an shape of 
        # [sample_character_string, int_encoding] the logits have output  
        # [sample_character_string, int_encoding, alphabet_size] 
        
        return output#final_hidden_state

### 2.2. Build MLP which will be integrated in the multimodal net used for the scalar features

In [6]:
class ScalarMLP(nn.Module):
    def __init__(self, num_feat: int, hidden_dim, out_dim):
        super().__init__() 
        
        self.model = nn.Sequential(
            
            nn.Linear(in_features=num_feat, out_features=hidden_dim, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=hidden_dim, out_features=hidden_dim, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=hidden_dim, out_features=out_dim, bias=True),
            nn.ReLU()
            
        )
        
    def forward(self, x):
        output = self.model(x)
        return output
    

In [4]:
'''
# Input the scalar features into the net:
scalar_net = ScalarMLP(scalar_df.shape[1])
scalar_tensor = torch.tensor(scalar_df.values, dtype=torch.float32)
scalar_hidden_out = scalar_net.forward(scalar_tensor)
'''

'\n# Input the scalar features into the net:\nscalar_net = ScalarMLP(scalar_df.shape[1])\nscalar_tensor = torch.tensor(scalar_df.values, dtype=torch.float32)\nscalar_hidden_out = scalar_net.forward(scalar_tensor)\n'

### 2.3. Overall model which connects the different inputs with each other by integrating the individual neural models

The model architecture idea comes from the paper 'MULTI-MODAL EMOTION RECOGNITION ON IEMOCAP WITH NEURAL NETWORKS' (Tripathi et al., n.d., https://arxiv.org/abs/1804.05788).

In [7]:
class FullModel(nn.Module): 
    
    def __init__(self,conv_mode=True,
                 num_feat_stft=-1, time_stft=-1, hidden_dim_stft=256, out_dim_stft=0, bi_direct_stft=True,
                 kernel_size_stft=(5,5), channel_size_stft=6,
                 midibert_mode=True, num_feat_note=-1, time_note=-1, hidden_dim_note=256, out_dim_note=0, bi_direct_note=True,
                 dense_out_stft=256, 
                 dense_out_note=13,#256,
                 final_hidden_dense_out=256,
                 num_feat_scalar:int =-1, hidden_dim_scalar=-1, out_dim_scalar=-1,
                 dense_out_scalar=256,
                 device='cpu',
                 label_type_categorical=False):
                 # None/-1 when we put in values later
        
        super().__init__() 
        self.midibert_mode = midibert_mode
        self.conv_mode = conv_mode
        self.label_type_categorical = label_type_categorical
        
        # compute number of remaining height (features) after each convolution
        if conv_mode and num_feat_stft % 2 != 0:
            num_feat_stft += 1
            
        if conv_mode and time_stft % 2 != 0:  
            time_stft += 1
            
        conv_out_comp = lambda x,k: int(((x - (k-1)*2 -1 + 2*0)/1 +1)/2)
        
        
        # height audio input:
        stage1_h = conv_out_comp(num_feat_stft, kernel_size_stft[0])
        
        if stage1_h % 2 != 0:
            pool_stage1_h = 1
        else:
            pool_stage1_h = 0
        
        stage2_h = conv_out_comp(stage1_h+pool_stage1_h, kernel_size_stft[0])
       
        
        # width audio input:
        stage1_w = conv_out_comp(time_stft, kernel_size_stft[1])
        
        if stage1_w % 2 != 0:
            pool_stage1_w = 1
        else:
            pool_stage1_w = 0
        
        stage2_w = conv_out_comp(stage1_w+pool_stage1_w, kernel_size_stft[1])


        self.stft_block = nn.Sequential(
            
        nn.Conv2d(in_channels=1, out_channels=channel_size_stft, kernel_size=(kernel_size_stft[0],kernel_size_stft[1]), stride=1, padding=0, dilation=2, groups=1, bias=True, padding_mode='zeros'),
        nn.MaxPool2d((2,2)),
        nn.BatchNorm2d(channel_size_stft, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
        nn.ZeroPad2d((pool_stage1_w, 0, pool_stage1_h, 0)), # left, right, top, bottom

        nn.Conv2d(in_channels=channel_size_stft, out_channels=channel_size_stft, kernel_size=(kernel_size_stft[0],kernel_size_stft[1]), stride=1, padding=0, dilation=2, groups=1, bias=True, padding_mode='zeros'),
        nn.MaxPool2d((2,2)),
        nn.BatchNorm2d(channel_size_stft, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
             
        nn.Flatten(start_dim=1, end_dim=2), # assume batched input then [32,1,5,5]-->[32,5,5]
            
        StackedBiLSTM(stage2_h*channel_size_stft, hidden_dim_stft, out_dim_stft)
        ).to(device)
        
        self.stft_lstm = StackedBiLSTM(num_feat_stft, hidden_dim_stft, out_dim_stft).to(device)
        
        if bi_direct_stft == True and out_dim_stft>0:
            in_feat_stft = out_dim_stft*2
        elif bi_direct_stft == True and out_dim_stft==0:
            in_feat_stft = hidden_dim_stft*2
        elif bi_direct_stft == True and out_dim_stft>0:
            in_feat_stft = out_dim_stft
        elif bi_direct_stft == False and out_dim_stft==0:
            in_feat_stft = hidden_dim_stft
        
        if conv_mode:
            time_stft = stage2_w
            
        self.stft_dense_layer = nn.Sequential(
            nn.Flatten(start_dim=1, end_dim=- 1),
            nn.Linear(in_features=in_feat_stft*time_stft, out_features=dense_out_stft, bias=True),
            nn.ReLU()).to(device)

        
        
        self.note_lstm = StackedBiLSTM(num_feat_note, hidden_dim_note, out_dim_note).to(device)
        
        if bi_direct_note == True and out_dim_note>0:
            in_feat_note = out_dim_note*2
        elif bi_direct_note == True and out_dim_note==0:
            in_feat_note = hidden_dim_note*2
        elif bi_direct_note == True and out_dim_note>0:
            in_feat_note = out_dim_note
        elif bi_direct_note == False and out_dim_note==0:
            in_feat_note = hidden_dim_note
        
        self.note_dense_layer = nn.Sequential(
            nn.Flatten(start_dim=1, end_dim=- 1),
            nn.Linear(in_features=in_feat_note*time_note, out_features=dense_out_note, bias=True),
            nn.ReLU()).to(device)

        
        self.scalar_mlp = ScalarMLP(num_feat_scalar, hidden_dim_scalar, out_dim_scalar).to(device)
        
        self.scalar_dense_layer = nn.Sequential(
            nn.Linear(in_features=out_dim_scalar, out_features=dense_out_scalar, bias=True),
            nn.ReLU()).to(device)
            
        
        self.final_part = nn.Sequential(
            nn.Linear(in_features=dense_out_stft+dense_out_note+dense_out_scalar, out_features=final_hidden_dense_out, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=final_hidden_dense_out, out_features=3, bias=True) # 3 labels (GEMS factors to predict)
        ).to(device)
        
        self.device = device
        self.label_type_categorical = label_type_categorical
        if label_type_categorical: # number outputs for categorical case
            out_feat = 2
        else: # number of outputs for regressive case
            out_feat = 1
        
        self.dim1 = nn.Sequential(
            nn.Linear(in_features=dense_out_stft+dense_out_note+dense_out_scalar, out_features=final_hidden_dense_out, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=final_hidden_dense_out, out_features=out_feat, bias=True)
        ).to(device)
        
        self.dim2 = nn.Sequential(
            nn.Linear(in_features=dense_out_stft+dense_out_note+dense_out_scalar, out_features=final_hidden_dense_out, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=final_hidden_dense_out, out_features=out_feat, bias=True)
        ).to(device)
            
        self.dim3 = nn.Sequential(
            nn.Linear(in_features=dense_out_stft+dense_out_note+dense_out_scalar, out_features=final_hidden_dense_out, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=final_hidden_dense_out, out_features=out_feat, bias=True)
        ).to(device)
        
        self.regressive_tanh = nn.Tanh().to(device)              
        
    def forward(self, x_stft, x_symbolic_note_ar, x_scalar):
        
        if self.conv_mode:
            # when input is odd for convolutional layers:
            padding = True
            # shape of x_stft is batchxfeaturextime
            if x_stft.shape[1] % 2 != 0 and x_stft.shape[2] % 2 != 0:
                padder = nn.ZeroPad2d((1, 0, 1, 0)) # left, right, top, bottom
            elif x_stft.shape[1] % 2 != 0:
                 padder = nn.ZeroPad2d((0, 0, 1, 0))
            elif x_stft.shape[2] % 2 != 0:
                padder = nn.ZeroPad2d((1, 0, 0, 0))
            else:
                padding = False

            if padding:
                x_stft = padder(x_stft)
                
            # add channel:
            x_stft = x_stft.unsqueeze(dim=1) # [4,20,290]-->[4,1,20,290]

            h1_stft = self.stft_block(x_stft.to(torch.float32).to(self.device))
        else:                       
            h1_stft = self.stft_lstm(x_stft.to(torch.float32).to(self.device))
                                  
        h2_stft = self.stft_dense_layer(h1_stft)#.reshape(h1_stft.shape[0],-1))
        if self.midibert_mode:
            h2_note = x_symbolic_note_ar
        else:
            h1_note = self.note_lstm(x_symbolic_note_ar.to(self.device))
            h2_note = self.note_dense_layer(h1_note)#.reshape(h1_note.shape[0],-1))
        
        h1_scalar = self.scalar_mlp(x_scalar.to(self.device))
        h2_scalar = self.scalar_dense_layer(h1_scalar)

        
        input_final_part = torch.cat((h2_stft, h2_note, h2_scalar), dim=1) # sample number stays same
        #logits = self.final_part(input_final_part) # predictions for sublimity, ease, vitality
        logits1 = self.dim1(input_final_part)
        logits2 = self.dim2(input_final_part)
        logits3 = self.dim3(input_final_part)
        
        if not self.label_type_categorical:
            logits1 = self.regressive_tanh(logits1)*3
            logits2 = self.regressive_tanh(logits2)*3
            logits3 = self.regressive_tanh(logits3)*3
            
        return logits1, logits2, logits3
              

## 3. Training process

### 3.1. Create trainer

In [8]:
class Training:
    
    def __init__(self, full_model, criterion, optimizer, device):
        self.criterion = criterion
        self.optimizer = optimizer
        self.full_model = full_model
        self.device = device
        
        self.number_epochs = None
        self.train_loss = False
        self.val_loss = False
        self.test_loss = False
        
        self.train_acc = False
        self.val_acc = False
        self.test_acc = False

        self.train_r_squared = False
        self.val_r_squared = False
        self.test_r_squared = False

        self.sig = nn.Sigmoid()

    def train(self, 
              train_batches_stft, train_batches_symbolic, train_batches_scalar, train_y, 
              val_batches_stft, val_batches_symbolic, val_batches_scalar, val_y, 
              number_epochs):

        self.number_epochs = number_epochs
        
        
        # evaluate first:
        train_loss, train_acc, train_r_squared = self.evaluate(train_batches_stft, train_batches_symbolic, train_batches_scalar, train_y)
        val_loss, val_acc, val_r_squared = self.evaluate(val_batches_stft, val_batches_symbolic, val_batches_scalar, val_y)

        train_loss_total = train_loss.reshape(1,-1)
        val_loss_total = val_loss.reshape(1,-1)
        train_r_squared_total = [train_r_squared]
        val_r_squared_total = [val_r_squared]
        for epoch_ind in range(number_epochs):

            train_loss, train_acc,train_r_squared = self.update(train_batches_stft, train_batches_symbolic, train_batches_scalar, train_y)
            val_loss, val_acc, val_r_squared = self.evaluate(val_batches_stft, val_batches_symbolic, val_batches_scalar, val_y)

            train_loss_total = torch.cat((train_loss_total,train_loss.reshape(1,-1)),dim=0)
            val_loss_total = torch.cat((val_loss_total,val_loss.reshape(1,-1)),dim=0)
            train_r_squared_total.append(train_r_squared)
            val_r_squared_total.append(val_r_squared)
            
            
        self.train_loss, self.val_loss = train_loss_total[-1], val_loss_total[-1]
        self.train_r_squared, self.val_r_squared = train_r_squared_total[-1], val_r_squared_total[-1]
        self.train_acc, self.val_acc = train_acc, val_acc

        return train_loss_total, val_loss_total,train_r_squared,val_r_squared
        
    def update(self, batches_stft, batches_symbolic, batches_scalar, y):
        
        # bring model into training mode:
        self.full_model.train()
               
        c_loss = 0
        loss_total = 0
        r_squared = 0
        rc_loss = 0
        counted_correct_pred_total = 0
               
        for x_stft, x_symbolic_note_ar, x_scalar, y_ in zip(batches_stft, batches_symbolic, batches_scalar, y):
        
            if y_.shape[1]>3: # we have categorical case
                y1 = y_[:,0:2]
                y2 = y_[:,2:4]
                y3 = y_[:,4:]
                act = lambda x: x

            else: # regressive y
                y1 = y_[:,0].reshape((-1,1))
                y2 = y_[:,1].reshape((-1,1))
                y3 = y_[:,2].reshape((-1,1))
                tanh_ = torch.nn.Tanh()
                act = lambda x: tanh_(x)*5 # range [-5, 5] because regressive labels were transfered 
                    # into z-score

            x_stft, x_symbolic_note_ar, x_scalar, y1,y2,y3= x_stft.to(self.device), x_symbolic_note_ar.to(self.device), x_scalar.to(self.device), y1.to(self.device), y2.to(self.device), y3.to(self.device)
            
            logits1, logits2, logits3 = self.full_model(x_stft, x_symbolic_note_ar, x_scalar)
              
            logits = torch.cat((logits1, logits2, logits3), dim=1) 
            y = torch.cat((y1,y2,y3), dim=1) 
                
            loss1 = self.criterion(act(logits1), y1)
            loss2 = self.criterion(act(logits2), y2)
            loss3 = self.criterion(act(logits3), y3)

            loss = loss1 + loss2 + loss3
            
            c_loss += loss.shape[0]
            loss_total += torch.sum(loss, dim=0)


            counted_correct_pred = self.my_acc_tool(act(logits), y)
            counted_correct_pred_total += counted_correct_pred

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
            if y.shape[0]>1:
                rc_loss += 1
                r_squared += r2_score(logits, y).item()

        avg_loss = loss_total/c_loss
        
        avg_acc = counted_correct_pred_total/c_loss
        r_squared_mean = r_squared/rc_loss
        return avg_loss, avg_acc, r_squared_mean
    
    @torch.no_grad()
    def my_acc_tool(self, logits, y):

        #if 'BCE' in str(self.criterion): # binary labels
        if len(set(y.flatten().tolist())) < 3: # binary labels
            predictions1 = (self.sig(logits[:,0]) >= 0.5).int().reshape((-1,1))
            predictions2 = (self.sig(logits[:,2]) >= 0.5).int().reshape((-1,1))
            predictions3 = (self.sig(logits[:,4]) >= 0.5).int().reshape((-1,1))
            y1 = y[:,0].reshape((-1,1))
            y2 = y[:,2].reshape((-1,1))
            y3 = y[:,4].reshape((-1,1))
            
            predictions = torch.cat((predictions1,predictions2,predictions3),dim=1)
            y = torch.cat((y1,y2,y3),dim=1)
            
        else:
            predictions = torch.round(logits, decimals=1)
            y = torch.round(y, decimals=1)

        wrong_values = abs(predictions-y)
        wrong_values[wrong_values > 0] = 1 # we only count correct (0) and incorrect (1) here

        wrong_values_sum = torch.sum(wrong_values, dim=0)
        counted_correct_pred = (predictions.shape[0] - wrong_values_sum).int()
        return counted_correct_pred
        
    @torch.no_grad()
    def evaluate(self, batches_stft, batches_symbolic, batches_scalar, y, val=True):
        
        self.full_model.eval()
          
        c_loss = 0
        loss_total = 0
        counted_correct_pred_total = 0
        r_squared_sum = 0
        rc_loss = 0
        for x_stft, x_symbolic_note_ar, x_scalar, y_ in  zip(batches_stft, batches_symbolic, batches_scalar, y): # (x,y)=single batch
             
            if y_.shape[1]>3: # we have categorical case
                y1 = y_[:,0:2]
                y2 = y_[:,2:4]
                y3 = y_[:,4:]
                act = lambda x: x
                     
            else: # regressive y

                y1 = y_[:,0].reshape((-1,1))
                y2 = y_[:,1].reshape((-1,1))
                y3 = y_[:,2].reshape((-1,1))
                tanh_ = torch.nn.Tanh()
                act = lambda x: tanh_(x)*5 # range [-5, 5] because regressive labels were transfered 
                    # into z-score*5  
                    
            x_stft, x_symbolic_note_ar, x_scalar = x_stft.to(self.device), x_symbolic_note_ar.to(self.device), x_scalar.to(self.device)
            y1, y2, y3 = y1.to(self.device), y2.to(self.device), y3.to(self.device)
            logits1, logits2, logits3 = self.full_model(x_stft, x_symbolic_note_ar, x_scalar)

            
            logits = torch.cat((logits1, logits2, logits3), dim=1) 
            y = torch.cat((y1,y2,y3), dim=1) 
            
            loss1 = self.criterion(act(logits1), y1)
            loss2 = self.criterion(act(logits2), y2)
            loss3 = self.criterion(act(logits3), y3)
                        
            loss = loss1 + loss2 + loss3
            
            
            c_loss += loss.shape[0]
            loss_total += torch.sum(loss, dim=0)
                                       
            counted_correct_pred = self.my_acc_tool(logits, y)
            counted_correct_pred_total += counted_correct_pred

            if y.shape[0]>1:
                r_squared = r2_score(logits, y)
                r_squared_sum += r_squared.item()
                rc_loss += 1

        avg_loss = loss_total/c_loss
        avg_acc = counted_correct_pred_total/c_loss
        r_squared_mean = r_squared_sum/rc_loss
        if val == False:
            self.test_loss = avg_loss    
            self.test_acc = avg_acc       
            self.test_r_squared = r_squared_mean

        return avg_loss, avg_acc, r_squared_mean
        
    def state_dict(self):

        state_dict = {
            "test_loss": self.test_loss,
            "val_loss": self.val_loss,
            "train_loss": self.train_loss,
            
            "test_acc": self.test_acc,
            "val_acc": self.val_acc,
            "train_acc": self.train_acc,

            "test_r_squared": self.test_r_squared,
            "train_r_squared": self.train_r_squared,
            "val_r_squared": self.val_r_squared,

            "model": self.full_model.state_dict(),
            "objective": self.criterion.state_dict(),
            "optimizer": self.optimizer,#.state_dict(),
            "number_epochs": self.number_epochs,
            "criterion": self.criterion
        }
        
        return state_dict
        

### 3.2. Load tensors and MusicBERT model for MIDI files

#### Use convolution to work with the mel scale?

In [9]:
conv_mode = True

#### Use MusicBERT model instead of note array as symbolic track of the multi-modal model?

In [10]:
midibert_mode = True

This is only needed when '1. Prepare inputs for net' section of this notebook wasn't run to generate the inputs as the inputs already exist in the directory 'created_input_tensors'.

In [14]:
input_folder = 'created_input_tensors'

samples_mel_tensor = torch.load(input_folder+'/mel_tensor.pt')

if midibert_mode:
    symbolic_input_tensor = torch.load(input_folder+'/midibert_input_tensor.pt')
    symbolic_input_tensor.requires_grad = False
else:
    symbolic_input_tensor = torch.load(input_folder+'/symbolic_input_tensor.pt')

scalar_tensor = torch.load(input_folder+'/scalar_tensor.pt')

categorical_labels_tensor = torch.load(input_folder+'/categorical_labels_tensor.pt')
regres_labels_tensor = torch.load(input_folder+'/regressive_labels_tensor.pt')

In [15]:
print(samples_mel_tensor.shape, symbolic_input_tensor.shape, scalar_tensor.shape, categorical_labels_tensor.shape, regres_labels_tensor.shape)


torch.Size([360, 20, 291]) torch.Size([360, 13]) torch.Size([360, 30]) torch.Size([360, 3]) torch.Size([360, 3])


#### Reformulate categorical_labels_tensor

In [16]:
def one_hotter(full_y_arr):
    for i in range(full_y_arr.shape[1]):
        cat_label = full_y_arr[:,i].reshape((-1,1))
        reverse = abs(cat_label - torch.ones((full_y_arr.shape[0],1),dtype=torch.int))
        yield torch.cat((reverse, cat_label),dim=1)

In [17]:
one_hot_sub, one_hot_vital, one_hot_un = one_hotter(categorical_labels_tensor)

In [18]:
one_hot_y = torch.cat((one_hot_sub, one_hot_vital, one_hot_un),dim=1)

#### Reformulate regressive values

In [19]:
one_sub, one_vital, one_un = regres_labels_tensor[:,0],regres_labels_tensor[:,1],regres_labels_tensor[:,2]

### 3.3. Create training loop: Unnested CV with Hyperparameter variation

In [20]:
# set all hyperparameters:
# 8748 options
lr = [1e-5, 1e-3, 1e-5]
batch_size = [16, 4, 32]
number_epochs = [800]

# optimizer:
betas = [(0.98, 0.999)] 

# hyperparameters for network architecture of scalar features dataset:
hidden_dim_scalar = [50, 500, 750]
out_dim_scalar = [10]

kernel_size_stft=[(3,3),(3,5)]
channel_size_stft=[3,6,9]
# given by Tripathi, Tripathi, Beigi paper:
hidden_dim_stft=[56]
out_dim_stft=[0]
bi_direct_stft=[True]



dense_out_stft= [5, 10, 100] 
if midibert_mode:
    dense_out_note= [13]
    hidden_dim_note=[56]
    out_dim_note=[0]
    bi_direct_note=[True]
else:
    dense_out_note= [10, 20, 56] 
    hidden_dim_note=[56]
    out_dim_note=[0]
    bi_direct_note=[True]
    
dense_out_scalar= [20, 30, 200] 

final_hidden_dense_out = [50, 100, 500] 

# set the label to categorical or regressive:
label_type_categorical = [False, True]

device = ['cuda'] if torch.cuda.is_available() else ['cpu']

# create dictionary which is filled with all hyperparameters:
global hyper_dict
hyper_dict = {}

def fill_hyper_dict(*var):
    
    global_variables = globals()
    var = list(var)
    var_id_list = [id(var_ele) for var_ele in var]
    
    for name, value in global_variables.items():
        
        if id(value) in var_id_list:
            found_ind = var_id_list.index(id(value))
            hyper_dict[name] = var[found_ind]
            
            var_id_list.pop(found_ind)
            var.pop(found_ind)

        if len(var_id_list)==0:
            return None
        
    return None

fill_hyper_dict(lr, batch_size, number_epochs,
               betas,
               hidden_dim_scalar, out_dim_scalar,
               hidden_dim_stft, out_dim_stft, bi_direct_stft, kernel_size_stft, channel_size_stft,
               hidden_dim_note, out_dim_note, bi_direct_note,
               dense_out_stft, 
               dense_out_note,
               final_hidden_dense_out,
               dense_out_scalar, 
               label_type_categorical,
               device)

# combine all hyperparameter options with each other by cartesian product/create grid:
hyper_cartesian_prod = [dict(zip(hyper_dict.keys(), values)) for values in itertools.product(*hyper_dict.values())]
#hyper_cartesian_prod

### 3.4. Actual training loop with cross validation split

In [21]:
# text file in which the loss scores of model choices are saved:
txt_title = 'scores_of_best_hyperparameter_choices_20230901.txt'

# create dataloader with same sampling/shuffling mode for each dataset:
g = torch.Generator()
g.manual_seed(19923286)


# split datasets into train, test and validation sets:
kfold_test = StratifiedKFold(n_splits=5, shuffle=True, random_state=np.random.seed(19923286))
kfold_val = StratifiedKFold(n_splits=5, shuffle=True, random_state=np.random.seed(19923286))

# create folder with best models:
folder = 'best_multimodal_models'
os.makedirs(folder, exist_ok=True)


for hyper_ind, hyper_choice in enumerate(hyper_cartesian_prod):
        
    print(f'start hyper choice loop {hyper_ind}...')
    best_test_loss_in_hyper_scenario = float('inf')
    if hyper_choice['label_type_categorical']==True:
        y_split1 = categorical_labels_tensor[:,2].squeeze()
    else:
        y_split1 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile').fit_transform(regres_labels_tensor[:,2].reshape(-1,1)).squeeze()
    for rest_id, test_id in kfold_test.split(scalar_tensor,y_split1):

        X_symbolic_rest, X_symbolic_test = symbolic_input_tensor[rest_id], symbolic_input_tensor[test_id]
        X_mel_rest, X_mel_test = samples_mel_tensor[rest_id], samples_mel_tensor[test_id]
        X_scalar_rest, X_scalar_test = scalar_tensor[rest_id], scalar_tensor[test_id]

        if hyper_choice['label_type_categorical']==True:
            
            y_rest, y_test = one_hot_y[rest_id], one_hot_y[test_id]

            y_rest = y_rest.type(torch.float32)
            y_test = y_test.type(torch.float32)

        else:
            
            y_rest, y_test = regres_labels_tensor[rest_id], regres_labels_tensor[test_id]

        best_val_loss = float('inf')
        if hyper_choice['label_type_categorical']==True:
            y_split2 = categorical_labels_tensor[:,2][rest_id].squeeze()
        else:
            y_split2 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile').fit_transform(y_rest[:,2].reshape(-1,1)).squeeze()
        for train_id, val_id in kfold_val.split(X_scalar_rest, y_split2):

            X_symbolic_train, X_symbolic_val = X_symbolic_rest[train_id], X_symbolic_rest[val_id]
            X_mel_train, X_mel_val =  X_mel_rest[train_id],  X_mel_rest[val_id]
            X_scalar_train, X_scalar_val = X_scalar_rest[train_id], X_scalar_rest[val_id]

            y_train, y_val = y_rest[train_id], y_rest[val_id]

            # create dataloaders:
            s_train = torch.utils.data.RandomSampler(X_symbolic_train, generator=g)
            s_val = torch.utils.data.RandomSampler(X_symbolic_val, generator=g)

            X_symbolic_trainloader = torch.utils.data.DataLoader(X_symbolic_train, batch_size=hyper_choice['batch_size'], sampler=s_train,num_workers=4)
            X_symbolic_valloader = torch.utils.data.DataLoader(X_symbolic_val, batch_size=hyper_choice['batch_size'], sampler=s_val,num_workers=4)

            X_mel_trainloader = torch.utils.data.DataLoader(X_mel_train, batch_size=hyper_choice['batch_size'], sampler=s_train,num_workers=4)
            X_mel_valloader = torch.utils.data.DataLoader(X_mel_val, batch_size=hyper_choice['batch_size'], sampler=s_val, num_workers=4)

            X_scalar_trainloader = torch.utils.data.DataLoader(X_scalar_train, batch_size=hyper_choice['batch_size'], sampler=s_train, num_workers=4)
            X_scalar_valloader = torch.utils.data.DataLoader(X_scalar_val, batch_size=hyper_choice['batch_size'], sampler=s_val, num_workers=4)

            y_trainloader = torch.utils.data.DataLoader(y_train, batch_size=hyper_choice['batch_size'], sampler=s_train, num_workers=4)
            y_valloader = torch.utils.data.DataLoader(y_val, batch_size=hyper_choice['batch_size'], sampler=s_val, num_workers=4)

            # initialize model:
            if midibert_mode:
                time_note = 1
            else: 
                time_note = X_symbolic_train.shape[2]
            
            full_model = FullModel(conv_mode=conv_mode, num_feat_stft=X_mel_train.shape[1], time_stft=X_mel_train.shape[2], hidden_dim_stft=hyper_choice['hidden_dim_stft'], out_dim_stft=hyper_choice['out_dim_stft'], bi_direct_stft=hyper_choice['bi_direct_stft'],
                     # num_feat_stft: the rows represent one bin/time window (features per sample)
                     kernel_size_stft=hyper_choice['kernel_size_stft'], channel_size_stft=hyper_choice['channel_size_stft'],
                     midibert_mode=midibert_mode, num_feat_note=X_symbolic_train.shape[1], time_note=time_note, hidden_dim_note=hyper_choice['hidden_dim_note'], out_dim_note=hyper_choice['out_dim_note'], bi_direct_note=hyper_choice['bi_direct_note'],
                     # num_feat_note: the rows represent one time part (features per sample)
                     dense_out_stft=hyper_choice['dense_out_stft'], 
                     dense_out_note=hyper_choice['dense_out_note'],
                     final_hidden_dense_out=hyper_choice['final_hidden_dense_out'],
                     num_feat_scalar=X_scalar_train.shape[-1], hidden_dim_scalar=hyper_choice['hidden_dim_scalar'], out_dim_scalar=hyper_choice['out_dim_scalar'],
                     dense_out_scalar=hyper_choice['dense_out_scalar'],
                     device=hyper_choice['device'],
                     label_type_categorical=hyper_choice['label_type_categorical'])

            # start training (evaluation and optimization):
            # loss criterion:
            if hyper_choice['label_type_categorical']==True:
                # multilabel classification: https://machinelearningmastery.com/multi-label-classification-with-deep-learning/
                criterion = nn.BCEWithLogitsLoss(reduction="none").to(hyper_choice['device']) #, nn.CrossEntropyLoss(reduction='mean').to(config.dev) # BinaryCrossEntropy
            else:
                criterion = nn.MSELoss(reduction="none").to(hyper_choice['device']) #, nn.CrossEntropyLoss(reduction='mean').to(config.dev)

            # Optimizer:
            optimizer = torch.optim.Adam(full_model.parameters(), lr=hyper_choice['lr'], betas=hyper_choice['betas'], eps=1e-08, weight_decay=0, amsgrad=False, foreach=None, maximize=False, capturable=False, differentiable=False, fused=None)

            # training:
            my_trainer = Training(full_model=full_model, criterion=criterion, optimizer=optimizer, device=hyper_choice['device'])
            train_loss_list, val_loss_list, train_r_squared, val_r_squared = my_trainer.train(train_batches_stft=X_mel_trainloader, train_batches_symbolic=X_symbolic_trainloader, train_batches_scalar=X_scalar_trainloader, train_y=y_trainloader, 
                  val_batches_stft=X_mel_valloader, val_batches_symbolic=X_symbolic_valloader, val_batches_scalar=X_scalar_valloader, val_y=y_valloader, 
                  number_epochs=hyper_choice['number_epochs'])

            if val_loss_list[-1].mean() < best_val_loss:

                best_val_loss = val_loss_list[-1].mean()
                best_state_dict = my_trainer.state_dict()
                best_model = full_model

        # evaluate performance on test set:
        # dataloader on test set:
        s_test = torch.utils.data.RandomSampler(X_symbolic_test, generator=g)

        X_symbolic_testloader = torch.utils.data.DataLoader(X_symbolic_test, batch_size=hyper_choice['batch_size'], sampler=s_test, num_workers=4)
        X_mel_testloader = torch.utils.data.DataLoader(X_mel_test, batch_size=hyper_choice['batch_size'], sampler=s_test, num_workers=4)
        X_scalar_testloader = torch.utils.data.DataLoader(X_scalar_test, batch_size=hyper_choice['batch_size'], sampler=s_test, num_workers=4)
        y_testloader = torch.utils.data.DataLoader(y_test, batch_size=hyper_choice['batch_size'], sampler=s_test, num_workers=4)

        my_trainer_test = Training(full_model=best_model, criterion=best_state_dict['criterion'], optimizer=None, device=hyper_choice['device'])

        test_loss, test_acc, test_r_squared = my_trainer_test.evaluate(batches_stft=X_mel_testloader, batches_symbolic=X_symbolic_testloader, batches_scalar=X_scalar_testloader, y=y_testloader, val=False)
        if test_loss.mean() < best_test_loss_in_hyper_scenario:

            best_test_loss_in_hyper_scenario = test_loss.mean()

            # recreate state dict:
            my_trainer_test.val_loss = best_state_dict['val_loss']
            my_trainer_test.train_loss = best_state_dict['train_loss']

            my_trainer_test.val_r_squared = best_state_dict['val_r_squared']
            my_trainer_test.train_r_squared = best_state_dict['train_r_squared']

            my_trainer_test.val_acc = best_state_dict['val_acc']
            my_trainer_test.train_acc = best_state_dict['train_acc']

            my_trainer_test.objective = best_state_dict['objective']
            my_trainer_test.optimizer = best_state_dict['optimizer']
            my_trainer_test.number_epochs = best_state_dict['number_epochs']

            best_state_dict_test = my_trainer_test.state_dict()

            print('better model found')

    # store best test loss in each combination of hyper-parameter scenario:    


    if True:
        testl = best_state_dict_test['test_loss']
        testa = best_state_dict_test['test_acc'] 
        testr = best_state_dict_test['test_r_squared'] 
        print('test acc:', testa, 'test r', testr)

        save_check = False
        if hyper_choice['label_type_categorical']==True:
        
            b = testa.clone().detach()
            max_ = torch.max(b).item()
            if max_ >= 0.8:
                save_check = True 
            elif max_ >= 0.6:
                max_ind = torch.argmax(b).item()
                b[max_ind] = 0
                max2_ = torch.max(b).item()
                if max2_ >= 0.60:
                    save_check = True
        
        elif (hyper_choice['label_type_categorical']==False and (testl.item() < 2.1 or torch.any(testa>0.1).item()) or testr>=-1):
            save_check = True
       
        if save_check==True:
        
            path_to_store = folder + f'/multimodal_model_dict_hyper_round{hyper_ind}'
            torch.save(best_state_dict_test, path_to_store + '.pth')
    
            with open(txt_title, 'a') as f:

                tl = best_state_dict_test['train_loss'].tolist()#.item()
                vl = best_state_dict_test['val_loss'].tolist()#.item()
                testl = testl.tolist()
                ta = best_state_dict_test['train_acc'].tolist()#.item()
                va = best_state_dict_test['val_acc'].tolist()#.item()
                testa = testa.tolist()

                tr = best_state_dict_test["train_r_squared"]
                vr = best_state_dict_test["val_r_squared"]
                testr = best_state_dict_test["test_r_squared"]

                f.write(f'############# Hyper ind: {hyper_ind}; categorical: {hyper_choice["label_type_categorical"]} ##############')
                f.write(f'test accuracy: {testa}, test loss: {testl}, r^2 : {testr}')
                f.write(str(hyper_choice))
                f.write(f"train acc: {ta}, val acc: {va}, train loss: {tl}, val loss: {vl}, r^2 train: {tr}, r^2 val: {vr}\n\n")

                
    print('...done hyper choice loop')
        


start hyper choice loop 0...


Traceback (most recent call last):
  File "/home/c/anaconda3/envs/multimodal_env/lib/python3.8/multiprocessing/queues.py", line 245, in _feed
    send_bytes(obj)
  File "/home/c/anaconda3/envs/multimodal_env/lib/python3.8/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/c/anaconda3/envs/multimodal_env/lib/python3.8/multiprocessing/connection.py", line 411, in _send_bytes
    self._send(header + buf)
  File "/home/c/anaconda3/envs/multimodal_env/lib/python3.8/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe

KeyboardInterrupt



In [None]:
print('NOTEBOOK FINISHED')