In [2]:
import music21
import os
import glob
import json
import csv
import pandas as pd



In [3]:
#Elaborate on all the filters, provide the results
base_dir1 = '../CPDL NO REPEAT_1'
base_dir2 = '../CPDL NO REPEAT_2'
all_score_files_dir1 = os.listdir(base_dir1)
all_score_files_dir2 = os.listdir(base_dir2)

In [4]:
len(all_score_files_dir1), len(all_score_files_dir2)

(353, 536)

## We find a total of 353 scores in directory1 and 536 scores in directory 2 i.e. a toal of 889 score files

### Score Filters Criteria

### (1) Filter all scores that are not parsable i.e. cannot be read using xml reader using music21

### Parse for directory 1

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
#Script to check the files that can be opened sucessfully using music21!
possible_file_type_extensions = ['*.xml', '*.musicxml']
total_correct_files = 0
correct_file_names = []
for score_file in all_score_files_dir1:
    if score_file=='.DS_Store':
        continue
    base_path = os.path.join(base_dir1, score_file)
    
    for file_extension in possible_file_type_extensions:
        musicxml_file = sorted(glob.glob(os.path.join(base_path, file_extension)))
        if len(musicxml_file)!= 0:
            xml_file_path = musicxml_file[0] 
    try:
        score = music21.converter.parse(xml_file_path)
        correct_file_names.append(xml_file_path)
    except:
        pass
        #print("Could not parse", musicxml_file)

In [7]:
len(correct_file_names)

296

### Parse for directory 2

In [8]:
#PArse directory 2
possible_file_type_extensions = ['*.xml', '*.musicxml']
total_correct_files_dir2 = 0
correct_file_names_dir2 = []
for score_file in all_score_files_dir2:
    if score_file=='.DS_Store':
        continue
    base_path = os.path.join(base_dir2, score_file)
    for file_extension in possible_file_type_extensions:
        musicxml_file = sorted(glob.glob(os.path.join(base_path, file_extension)))
        if len(musicxml_file)!= 0:
            xml_file_path = musicxml_file[0] 
    try:
        score = music21.converter.parse(xml_file_path)
        correct_file_names_dir2.append(xml_file_path)
    except:
        pass
        #print("Could not parse", musicxml_file)

In [9]:
len(correct_file_names_dir2)

531

# Results
### We obtain a total of 296 files from folder 1 and 531 files for folder 2 i.e. a total of 827 folders

## Back to more filters ...
### (2) Filter all scores for which the count of the number of streams in ‘voices’ folder is not the same as the number of streams in the xml file read using music21

## Parse for directory 1

In [10]:
#Of correct file names, get the number of scores that have the same count of files in the voices folder
#We observe that sometimes the streams for accompaniment are counted as separate streams in music21, 
#but they are synthesized as one accompaniment in the voices folder. To account for that, we convert the stream 
#information to a set and compare using the set length
correct_files_with_all_voices_dir1 = []
for music_xml_file in correct_file_names:
    score = music21.converter.parse(music_xml_file)
    parts_list = []
    for part in score.parts:
        parts_list.append(part.partName)
    num_unique_streams = set(parts_list)    
    # Get the number of streams in the parsed score
    #num_streams = len(score.getElementsByClass(music21.stream.Stream))
    #print("Number of streams in the parsed score:", num_streams)
    #Test if the number of streams is the same as the number of voices in the voices folder
    voices_folder_path = os.path.join(os.path.dirname(music_xml_file), 'voices')
    all_voices = os.listdir(voices_folder_path)
    if len(all_voices) == len(num_unique_streams):
        correct_files_with_all_voices_dir1.append(music_xml_file)

In [11]:
len(correct_files_with_all_voices_dir1)

250

## Parse for directory 2

In [12]:
#Of correct file names, get the number of scores that have the same count of files in the voices folder
#We observe that sometimes the streams for accompaniment are counted as separate streams in music21, 
#but they are synthesized as one accompaniment in the voices folder. To account for that, we convert the stream 
#information to a set and compare using the set length

correct_files_with_all_voices_dir2 = []
for music_xml_file in correct_file_names_dir2:
    score = music21.converter.parse(music_xml_file)
    parts_list = []
    for part in score.parts:
        parts_list.append(part.partName)
    num_unique_streams = set(parts_list)   
    #print("Number of streams in the parsed score:", num_streams)
    #Test if the number of streams is the same as the number of voices in the voices folder
    voices_folder_path = os.path.join(os.path.dirname(music_xml_file), 'voices')
    all_voices = os.listdir(voices_folder_path)
    if len(all_voices) == len(num_unique_streams):
        correct_files_with_all_voices_dir2.append(music_xml_file)

In [16]:
len(correct_files_with_all_voices_dir2)

451

# Results
###  We obtain a total of len 250 files for directory 1 and 451 files from directory 2 i.e. a total of 701 files

## Back to more filters ...

## We now apply some musical criteria that was found by listening to several  songs (greater than 300 from the list) from both the folders by a musician 

## (3) Filter out all files with 'Tr..mp3' in the voices folder - this criteria is specific to the dataset
#### Automatically discard all the folders that name the first voice (often named as Soprano or Cantus or Voice or Voice 1) “Treble” or “Tr”; they correspond to audio files “treble.mp3”, “tr..mp3”, “Tr.mp3” and for this DATASET they are instrumental generated instead of voice generated, even though they should be voice generated because they carry the leading voice


## Parse directory 1

In [13]:
#select the voices folder again and check for all files with Tr..mp3
correct_files_with_all_voices_dir1_filter_treble = []
for music_xml_file_path in correct_files_with_all_voices_dir1:
    score = music21.converter.parse(music_xml_file_path)
    # Get the number of streams in the parsed score
    num_streams = len(score.getElementsByClass(music21.stream.Stream))
    #print("Number of streams in the parsed score:", num_streams)
    #Test if the number of streams is the same as the number of voices in the voices folder
    voices_folder_path = os.path.join(os.path.dirname(music_xml_file_path), 'voices')
    all_voices = os.listdir(voices_folder_path)
    if 'Tr..mp3' in all_voices:
        continue
    else:
        correct_files_with_all_voices_dir1_filter_treble.append(music_xml_file_path)

In [14]:
len(correct_files_with_all_voices_dir1_filter_treble)

223

## Parse directory 2

In [15]:
#select the voices folder again and check for all files with Tr..mp3
correct_files_with_all_voices_dir2_filter_treble = []
for music_xml_file_path in correct_files_with_all_voices_dir2:
    score = music21.converter.parse(music_xml_file_path)
    # Get the number of streams in the parsed score
    #num_streams = len(score.getElementsByClass(music21.stream.Stream))
    #print("Number of streams in the parsed score:", num_streams)
    #Test if the number of streams is the same as the number of voices in the voices folder
    voices_folder_path = os.path.join(os.path.dirname(music_xml_file_path), 'voices')
    all_voices = os.listdir(voices_folder_path)
    if 'Tr..mp3' in all_voices:
        continue
    else:
        correct_files_with_all_voices_dir2_filter_treble.append(music_xml_file_path)

In [16]:
len(correct_files_with_all_voices_dir2_filter_treble)

423

# Results
### We obtain a total of 223 files from folder 1 and 423 files for folder 2 i.e. a total of 646 folders
## Back to more filters ...

## We now apply some more musical criteria for both the folders by the musician

#### (4) Automatically discard all the folders for which, one or more voices in the score, have not at all lyrics, when should have (some editors avoid to repeat the same lyric for all voices, assuming that the musicians will relate to the first voice to read the corresponding verse). For these cases, the audio files are instrumental generated instead of voice generated. So, it is possible to automatically discard these folders by detecting in one or more than one audio files, this instrumental generation when the file name indicate a voice channel (‘soprano’, ‘alto’, ‘tenor’ ‘bass’, ‘cantus’, ‘quintus’, ‘altus’, ‘bassus’, ‘voice#’)


## Parse directory 1

In [17]:
def preprocess_string(s):
    # Remove hyphens and blank spaces
    return s.replace('-', '').replace(' ', '')

In [18]:
#Get the same number of files with score directory 1, the name of accompaniment can be different 
#in different scores. We filter out data using the file "info.json"
possible_candidates_with_wrong_lyrics_dir1 = []
for music_xml_file_path in correct_files_with_all_voices_dir1_filter_treble:
    missing_lyrics = False
    #music_xml_file_path = music_xml_file['Parsable Files']
    score = music21.converter.parse(music_xml_file_path)
    all_lyrics = []
    #Get if the file has accompaniment, if yes, then fetch the name from json file and remove it from lyrics list
    json_path = os.path.join(os.path.dirname(music_xml_file_path), 'info.json')
    with open(json_path, "r") as json_file:
    # Load the JSON data from the file
        data = json.load(json_file)
    #default accompaniment name
    accompaniment_name = 'Piano'
    if data['parts'][-1]['type'] == 'accompaniment':
        accompaniment_name = data['parts'][-1]['name']    
    
    for part in score.parts:
        if part.partName == accompaniment_name:
            continue
        
        lyrics_for_part = ''
        for note in part.recurse().getElementsByClass('Note'):            
            if note.lyric:
                lyrics_for_part += note.lyric
        processed_string = preprocess_string(lyrics_for_part)
        all_lyrics.append(str(processed_string))
    #print(all_lyrics)
    missing_lyrics = [s=='' for s in all_lyrics]
    if any(missing_lyrics):            
        possible_candidates_with_wrong_lyrics_dir1.append(music_xml_file_path)

In [19]:
len(possible_candidates_with_wrong_lyrics_dir1)

10

In [20]:
#Remove the identified songs from the final list
for item in correct_files_with_all_voices_dir1_filter_treble:
    if item in possible_candidates_with_wrong_lyrics_dir1:
        correct_files_with_all_voices_dir1_filter_treble.remove(item)

In [22]:
len(correct_files_with_all_voices_dir1_filter_treble)

213

## Parse directory 2

In [23]:
#Get the same number of files with score directory 1, the name of accompaniment can be different 
#in different scores. We filter out data using the file "info.json"
possible_candidates_with_wrong_lyrics_dir2 = []
for music_xml_file_path in correct_files_with_all_voices_dir2_filter_treble:
    missing_lyrics = False
    #music_xml_file_path = music_xml_file['Parsable Files']
    score = music21.converter.parse(music_xml_file_path)
    all_lyrics = []
    #Get if the file has accompaniment, if yes, then fetch the name from json file and remove it from lyrics list
    json_path = os.path.join(os.path.dirname(music_xml_file_path), 'info.json')
    with open(json_path, "r") as json_file:
    # Load the JSON data from the file
        data = json.load(json_file)
    if data['parts'][-1]['type'] == 'accompaniment':
        accompaniment_name = data['parts'][-1]['name']    
    
    for part in score.parts:
        if part.partName == accompaniment_name:
            continue
        
        lyrics_for_part = ''
        for note in part.recurse().getElementsByClass('Note'):            
            if note.lyric:
                lyrics_for_part += note.lyric
        processed_string = preprocess_string(lyrics_for_part)
        all_lyrics.append(str(processed_string))
    #print(all_lyrics)
    missing_lyrics = [s=='' for s in all_lyrics]
    if any(missing_lyrics):            
        possible_candidates_with_wrong_lyrics_dir2.append(music_xml_file_path)

In [31]:
len(possible_candidates_with_wrong_lyrics_dir2)

51

In [33]:
#Remove the identified songs from the final list
for item in correct_files_with_all_voices_dir2_filter_treble:
    if item in possible_candidates_with_wrong_lyrics_dir2:
        correct_files_with_all_voices_dir2_filter_treble.remove(item)

In [34]:
len(correct_files_with_all_voices_dir2_filter_treble)

373

In [24]:
# Save the files in csv 
#Save the filtered data based on lyrics in a csv
file_path_CPDL1 = 'correct_file_names_removed_treble_wrong_lyrics_cpdl_repeat1.csv'
file_path_CPDL2 = 'correct_file_names_removed_treble_wrong_lyrics_cpdl_repeat2.csv'
# Open the file in write mode
with open(file_path_CPDL1, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Parsable Files'])
    for file_name in correct_files_with_all_voices_dir1_filter_treble:
        writer.writerow([file_name])

# Confirm that the list has been saved
print("All data saved successfully!")

# Open the file in write mode
with open(file_path_CPDL2, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Parsable Files'])
    for file_name in correct_files_with_all_voices_dir2_filter_treble:
        writer.writerow([file_name])

# Confirm that the list has been saved
print("All data saved successfully!")


All data saved successfully!
All data saved successfully!


# Results
### We obtain a total of 213 files from folder 1 and 373 files for folder 2 i.e. a total of 586 folders from which we chose 20 songs and mark them as accepted based on manually listening to the accepted files. It is to be noted that a large number of files are rejected in the manual listening experiment 

# Future Work

We can add more musically informed criteria to filter out more renditions, however, since the data is not very consistent, it is hard to implement the identified criteria. The other musical criteria for consideration includes:

(5) Automatically discard all the folders that, corresponding to strophic chorale songs, have no indication of repetition (repetition bars, da capo or da segno). This could be addressed by simple math calculation knowing the pulse, the time signature, the number of measures and the number of stanzas

Comments: Its not easy to identify how many parts are meant for Repeat and so on, and how many times the score is to be repeated

(6) Automatically discard all the folders that, instead of relating each note with its corresponding syllable of the verse, put the whole verse continuously in just the first note. As a consequence of this, the voice generation consists on the melody sung by a continuous sound “ah” for all the channels. So, it is possible to automatically discard these folders by detecting in one or more than one audio files, this onomatopoeic generation 

Comments: This criteria has been implemented by using lyrics information. If the lyrics are absent in any of the score parts, we filter out that score. 

In the analysis of musicxml scores, it was found that the lyrics information is sometimes included in place of TextExpression at the beginning of the score. The lyrics were not tied to notes in that case, or the first note. We tried implementing a way to filter out using the length of TextExpression, assuming that if the lyrics are indeed included as part of it, the length of TextExpression would be more than say a specific threshold (maybe 30), however, many times the TextExpression consists of some details of the score like the composer etc. Hence, a basic thresholding based criteria does not provide sufficient information to filter based on TextExpression. We decided to go for the lyrics approach thereafter.
 