In [17]:
import os
# import environment variables from direnv
from dotenv import load_dotenv
load_dotenv()
RAW_DATA_PATH = os.getenv('RAW_DATA_PATH')
raw_data_path = RAW_DATA_PATH
directory =  raw_data_path + '/Bass monophon/Lists/Chorus'
files = os.listdir(directory)
for file in files:
    print(file)

B51-00000-3311-023.xml
B21-00000-3312-028.xml
B53-00000-3311-024.xml
B23-00000-3312-030.xml
B31-00000-3313-043.xml
B22-00000-3312-029.xml
B51-00000-3312-035.xml
B53-00000-3313-048.xml
B31-00000-3311-019.xml
B21-00000-3313-040.xml
B12-00000-3313-038.xml
B12-00000-3311-014.xml
B22-00000-3311-017.xml
B31-00000-3312-031.xml
B21-00000-3311-016.xml
B41-00000-3312-033.xml
B41-00000-3311-021.xml
B12-00000-3312-026.xml
B33-00000-3311-020.xml
B22-00000-3313-041.xml
B43-00000-3312-034.xml
B53-00000-3312-036.xml
B33-00000-3312-032.xml
B13-00000-3312-027.xml
B11-00000-3312-025.xml
B13-00000-3311-015.xml
B13-00000-3313-039.xml
B51-00000-3313-047.xml
B11-00000-3311-013.xml
B23-00000-3311-018.xml
B43-00000-3311-022.xml
B23-00000-3313-042.xml
B41-00000-3313-045.xml
B11-00000-3313-037.xml
B43-00000-3313-046.xml
B33-00000-3313-044.xml


In [18]:
# each file is an xml file with a list of audiofile
#    <audiofile>
#       <fileID>B11-28100-3311-00625</fileID>
#       <instrument>B</instrument>
#       <instrumentsetting>1</instrumentsetting>
#       <playstyle>1</playstyle>
#       <midinr>28</midinr>
#       <string>1</string>
#       <fret>00</fret>
#       <fxgroup>3</fxgroup>
#       <fxtype>31</fxtype>
#       <fxsetting>1</fxsetting>
#       <filenr>00625</filenr>
#    </audiofile>


# for each file, read the xml and extract the audiofile properties
import xml.etree.ElementTree as ET
import pandas as pd

data = []

for file in files:
    tree = ET.parse(os.path.join(directory, file))
    root = tree.getroot()
    for audiofile in root.findall('audiofile'):
        fileID = audiofile.find('fileID').text
        instrument = audiofile.find('instrument').text
        instrumentsetting = audiofile.find('instrumentsetting').text
        playstyle = audiofile.find('playstyle').text
        midinr = audiofile.find('midinr').text
        string = audiofile.find('string').text
        fret = audiofile.find('fret').text
        fxgroup = audiofile.find('fxgroup').text
        fxtype = audiofile.find('fxtype').text
        fxsetting = audiofile.find('fxsetting').text
        filenr = audiofile.find('filenr').text

        # save in a dataframe
        data.append({
            'fileID': fileID,
            'instrument': instrument,
            'instrumentsetting': instrumentsetting,
            'playstyle': playstyle,
            'midinr': midinr,
            'string': string,
            'fret': fret,
            'fxgroup': fxgroup,
            'fxtype': fxtype,
            'fxsetting': fxsetting,
            'filenr': filenr
        })

In [19]:
df = pd.DataFrame(data)
display(df.head())
df.info()

Unnamed: 0,fileID,instrument,instrumentsetting,playstyle,midinr,string,fret,fxgroup,fxtype,fxsetting,filenr
0,B51-28100-3311-01145,B,5,1,28,1,0,3,31,1,1145
1,B51-29101-3311-01146,B,5,1,29,1,1,3,31,1,1146
2,B51-30102-3311-01147,B,5,1,30,1,2,3,31,1,1147
3,B51-31103-3311-01148,B,5,1,31,1,3,3,31,1,1148
4,B51-32104-3311-01149,B,5,1,32,1,4,3,31,1,1149


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1872 entries, 0 to 1871
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   fileID             1872 non-null   object
 1   instrument         1872 non-null   object
 2   instrumentsetting  1872 non-null   object
 3   playstyle          1872 non-null   object
 4   midinr             1872 non-null   object
 5   string             1872 non-null   object
 6   fret               1872 non-null   object
 7   fxgroup            1872 non-null   object
 8   fxtype             1872 non-null   object
 9   fxsetting          1872 non-null   object
 10  filenr             1872 non-null   object
dtypes: object(11)
memory usage: 161.0+ KB


In [20]:
# dictionary to map bass string and fret to note name
note_names = {
    0: 'C', 1: 'C#', 2: 'D', 3: 'D#', 4: 'E', 5: 'F',
    6: 'F#', 7: 'G', 8: 'G#', 9: 'A', 10: 'A#', 11: 'B'
}
def get_note_name(string, fret):
    # bass standard tuning E1 A1 D2 G2
    string_tuning = {
        '1': 28,  # E1
        '2': 33,  # A1
        '3': 38,  # D2
        '4': 43   # G2
    }
    midi_number = string_tuning[string] + int(fret)
    note_index = midi_number % 12
    octave = (midi_number // 12) - 1
    return f"{note_names[note_index]}{octave}"

df['note_name'] = df.apply(lambda row: get_note_name(row['string'], row['fret']), axis=1)
display(df.head())

Unnamed: 0,fileID,instrument,instrumentsetting,playstyle,midinr,string,fret,fxgroup,fxtype,fxsetting,filenr,note_name
0,B51-28100-3311-01145,B,5,1,28,1,0,3,31,1,1145,E1
1,B51-29101-3311-01146,B,5,1,29,1,1,3,31,1,1146,F1
2,B51-30102-3311-01147,B,5,1,30,1,2,3,31,1,1147,F#1
3,B51-31103-3311-01148,B,5,1,31,1,3,3,31,1,1148,G1
4,B51-32104-3311-01149,B,5,1,32,1,4,3,31,1,1149,G#1


In [23]:
# add file path. They should be in raw_data/Bass monophon/Audio/Samples/<fileID>.wav
df['file_path'] = df['fileID'].apply(lambda x: os.path.join('Bass monophon', 'Audio', 'Samples', f"{x}.wav"))
display(df.head())

Unnamed: 0,fileID,instrument,instrumentsetting,playstyle,midinr,string,fret,fxgroup,fxtype,fxsetting,filenr,note_name,file_path
0,B51-28100-3311-01145,B,5,1,28,1,0,3,31,1,1145,E1,Bass monophon/Audio/Samples/B51-28100-3311-011...
1,B51-29101-3311-01146,B,5,1,29,1,1,3,31,1,1146,F1,Bass monophon/Audio/Samples/B51-29101-3311-011...
2,B51-30102-3311-01147,B,5,1,30,1,2,3,31,1,1147,F#1,Bass monophon/Audio/Samples/B51-30102-3311-011...
3,B51-31103-3311-01148,B,5,1,31,1,3,3,31,1,1148,G1,Bass monophon/Audio/Samples/B51-31103-3311-011...
4,B51-32104-3311-01149,B,5,1,32,1,4,3,31,1,1149,G#1,Bass monophon/Audio/Samples/B51-32104-3311-011...


In [24]:
# save the dataframe to a csv file in data/preprocessed/chorus_bass_list.csv
output_dir = os.path.join('data', 'preprocessed')
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'chorus_bass_list.csv')
df.to_csv(output_file, index=False)
print(f"Saved chorus bass list to {output_file}")

Saved chorus bass list to data/preprocessed/chorus_bass_list.csv
