In [84]:
# Importing libraries
import os
import music21 as m21

In [85]:
# Loading the settings from the application YAML config
import yaml

# Parse the YAML settings file into a plain Python mapping.
with open("../config/application.yaml", mode="r") as file:
    config = yaml.safe_load(file)

# Pull out the three settings the rest of the notebook relies on.
dataset_path, acceptable_durations, save_path = (
    config["dataset_path"],
    config["acceptable_durations"],
    config["save_path"],
)

In [86]:
# Function to load every ".krn" file of the dataset
def load_songs_kern_dataset(dataset_path):
    """Parse every ``.krn`` file found under ``dataset_path`` with music21.

    The directory tree is walked recursively with ``os.walk``. Directories and
    files are visited in sorted name order so the order of the returned list
    does not depend on the file system.

    Args:
        dataset_path: Root directory to search.

    Returns:
        A new list holding the object returned by ``m21.converter.parse`` for
        each ``.krn`` file. The list is empty when no such file is found.
    """
    # Build the list locally: a shared module-level list would keep growing
    # every time this function is called again in the same kernel.
    songs = []
    for path, subdirs, files in os.walk(dataset_path):
        # Sorting in place makes os.walk descend into the subdirectories in a
        # deterministic order.
        subdirs.sort()

        # Only the ".krn" files are part of the dataset
        for file in sorted(files):
            if file.endswith('.krn'):
                songs.append(m21.converter.parse(os.path.join(path, file)))
    return songs

In [87]:
# Function to transpose a song to another scale
def transpose(song):
    """Transpose a song to C major or A minor.

    The key is read from element 4 of the first measure of the first part.
    When that element is missing, is not a ``m21.key.Key``, or has a mode
    other than major/minor, the key is estimated with ``song.analyze('key')``.

    Args:
        song: Object providing ``getElementsByClass``, ``analyze`` and
            ``transpose`` (presumably a music21 score; verify against caller).

    Returns:
        The result of ``song.transpose(interval)``, where ``interval`` leads
        from the key's tonic to C (major keys) or A (minor keys).

    Raises:
        ValueError: If even the estimated key is neither major nor minor.
    """
    key = None

    # Look for the key in the song, guarding every index so that a song
    # without parts, without measures or with a short first measure falls
    # through to the estimation below instead of raising.
    parts = song.getElementsByClass(m21.stream.Part)
    if len(parts) > 0:
        measures = parts[0].getElementsByClass(m21.stream.Measure)
        # NOTE(review): index 4 is assumed to be where this dataset stores
        # the key in the first measure - confirm against the data.
        if len(measures) > 0 and len(measures[0]) > 4:
            key = measures[0][4]

    # If no usable key is present then estimate the key using music21
    if not isinstance(key, m21.key.Key) or key.mode not in ('major', 'minor'):
        key = song.analyze('key')

    # Get the interval for transposition (example: BMaj to CMaj)
    if key.mode == 'major':
        target = m21.pitch.Pitch('C')
    elif key.mode == 'minor':
        target = m21.pitch.Pitch('A')
    else:
        raise ValueError(f"Cannot transpose a song in mode {key.mode!r}")
    interval = m21.interval.Interval(key.tonic, target)

    # Transpose the song
    return song.transpose(interval)

Here we encode each song into a machine-readable form. A note is represented by its MIDI pitch number, a rest by 'r', and every additional time step the note or rest lasts by '_'. The encode_song function converts a song to this encoded form.

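For example, with the default time step of 0.25, a note with pitch 60 and duration 1.0 spans four time steps and is encoded as:
[60, "\_", "\_", "\_"], which encode_song returns joined by spaces as "60 \_ \_ \_".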

In [88]:
# Function to encode pitch and duration of song to machine-readable format
def encode_song(song, time_step=0.25):
    """Encode a song as a space-separated time-series string.

    Each note is written as its MIDI pitch number and each rest as ``'r'``,
    followed by one ``'_'`` for every further ``time_step`` the event lasts.
    With the default step of 0.25, a note of pitch 60 and quarter length 1.0
    becomes ``"60 _ _ _"``. An event shorter than one step produces no symbol.

    Args:
        song: Object whose ``flat.notesAndRests`` yields the events to encode.
        time_step: Length of one step, in quarter lengths.

    Returns:
        The encoded symbols joined by single spaces.

    Raises:
        ValueError: If an event is neither a ``m21.note.Note`` nor a
            ``m21.note.Rest``.
    """
    encoded_song = []
    for event in song.flat.notesAndRests:
        if isinstance(event, m21.note.Note):
            symbol = event.pitch.midi
        elif isinstance(event, m21.note.Rest):
            symbol = 'r'
        else:
            # Fail loudly: without this branch the previous event's symbol
            # would be silently reused for the unsupported event.
            raise ValueError(
                f"Cannot encode event of type {type(event).__name__}; "
                "only notes and rests are supported"
            )

        # Convert the note or rest into time series notation: the symbol
        # itself on the first step, then '_' for each step it is held.
        steps = int(event.duration.quarterLength / time_step)
        if steps > 0:
            encoded_song.append(symbol)
            encoded_song.extend('_' * (steps - 1))

    # Cast the symbols to strings and join them into one line
    return " ".join(map(str, encoded_song))

In [89]:

# Function to preprocess the songs dataset and prepare the data for our model

def preprocess_songs(dataset_path):
    """Load, filter, transpose, encode and save every song of the dataset.

    Songs are loaded with ``load_songs_kern_dataset``. Each song that passes
    ``has_acceptable_duration`` (checked against the module-level
    ``acceptable_durations``) is transposed with ``transpose``, encoded with
    ``encode_song`` and written to ``song_{i}.txt`` inside the module-level
    ``save_path``, where ``i`` is the song's position in the loaded list.
    Rejected songs are skipped, so the file numbering can have gaps.

    Args:
        dataset_path: Root directory of the dataset to load.

    Returns:
        None. The encoded songs are written to disk.
    """
    # Load the songs from the dataset
    print("Loading songs from dataset...")
    songs = load_songs_kern_dataset(dataset_path)
    print(f"Loaded {len(songs)} songs from the dataset.")

    # Create the output directory up front; opening a file for writing
    # inside a missing directory raises FileNotFoundError.
    os.makedirs(save_path, exist_ok=True)

    # Filter songs based on acceptable durations
    print("Filtering songs based on acceptable durations...")
    for i, song in enumerate(songs):
        if not has_acceptable_duration(song, acceptable_durations):
            continue

        # Transpose song to C major or A minor
        song = transpose(song)

        # Encode songs with music time series representation
        encoded_song = encode_song(song)

        # Save the encoded song to a file in save path
        saved_path = os.path.join(save_path, f"song_{i}.txt")
        with open(saved_path, 'w') as fp:
            fp.write(encoded_song)

In [90]:
# Function to check whether a song has acceptable duration
def has_acceptable_duration(song, acceptable_durations):
    """Tell whether every note and rest of ``song`` has an allowed length.

    Args:
        song: Object whose ``flat.notesAndRests`` yields events exposing
            ``duration.quarterLength``.
        acceptable_durations: Container of the allowed quarter lengths.

    Returns:
        True when each event's quarter length is in ``acceptable_durations``
        (also for a song without events), False as soon as one is not.
    """
    return all(
        event.duration.quarterLength in acceptable_durations
        for event in song.flat.notesAndRests
    )

In [91]:
# Run the whole preprocessing pipeline on the configured dataset when this
# code is executed as the main module.
if __name__ == "__main__":
    preprocess_songs(dataset_path)

Loading songs from dataset...
Loaded 12 songs from the dataset.
Filtering songs based on acceptable durations...
