In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Read the index CSV (data/billboard-2.0-index.csv) into a DataFrame.
# NOTE(review): metadata_df is not referenced again in the cells visible here.
metadata_df = pd.read_csv("data/billboard-2.0-index.csv")

In [None]:
# Column names used as keys of every row dict built by the parser below.
SONG_ID = "song_id"
LINE_NUMBER = "line_id"
MEASURE_NUMBER = "measure_id"
CHORD_NUMBER = "chord_id"
CHORD = "chord"
SUFFIX = "instrument"
TYPE = "section_type"
TIME = "time"
STRUCTURE = "section_structure"

def immutable_merge(dic1, dic2):
    """Return a new dict holding the entries of dic1 overridden by those of dic2.

    Neither argument is modified: the merge is performed on a shallow copy
    of dic1, so keys present in both end up with the value from dic2.
    """
    merged = dic1.copy()
    merged.update(dic2)
    return merged

def create_row(persistent_attributes, line_attributes, measure_number = None, chord_number = None, chord = None):
    """Build one output row from song-level and line-level attributes.

    Args:
        persistent_attributes: dict of attributes that apply to the whole song.
        line_attributes: dict of attributes of the current line; on a key
            clash these override persistent_attributes.
        measure_number: value stored under MEASURE_NUMBER, or None.
        chord_number: value stored under CHORD_NUMBER, or None.
        chord: value stored under CHORD, or None.

    Returns:
        A new dict (the inputs are left untouched). The three chord-related
        keys are added only when at least one of measure_number,
        chord_number or chord is given; otherwise the row carries only the
        merged attributes.
    """
    result = immutable_merge(persistent_attributes, line_attributes)

    # Each of the three chord fields must be inspected: checking only two of
    # them would silently drop a chord passed without its counters.
    has_chord_data = any(
        value is not None for value in (measure_number, chord_number, chord)
    )

    if has_chord_data:
        result[MEASURE_NUMBER] = measure_number
        result[CHORD_NUMBER] = chord_number
        result[CHORD] = chord

    return result

def process_line_metadata(header, line_counter, old_line_attributes, suffix = ""):
    """Derive the line-level attributes of one line of a chord file.

    Args:
        header: text to tokenize on whitespace. The first token is stored
            under TIME; the remaining tokens describe the section.
        line_counter: value stored under LINE_NUMBER.
        old_line_attributes: dict returned for the previous line (or an
            empty dict); used to carry the section and the instrument over.
        suffix: trailing text of the line. When it is non-empty and not a
            bare newline it is stored under SUFFIX after stripping the
            newline, commas and surrounding whitespace.

    Returns:
        A new dict with LINE_NUMBER and TIME always set, plus:
        - SUFFIX, when a new suffix is given, or when the previous one can
          be carried over (see the experimental rule below);
        - TYPE and STRUCTURE copied from old_line_attributes when the header
          has at most one token (section continued);
        - TYPE only, when the header has two tokens;
        - STRUCTURE and TYPE, when the header has three tokens.
        Headers with more than three tokens set neither TYPE nor STRUCTURE.
        TIME is None when the header contains no token at all.
    """
    result = {}

    # Suffix (main instrument). str() turns a missing value into "None",
    # which is filtered out by the "nan"/"none" test below.
    old_suffix = str(old_line_attributes.get(SUFFIX))

    # New suffix
    if len(suffix) > 0 and suffix != "\n":
        result[SUFFIX] = suffix.strip("\n").strip(",").strip()

    # Main instrument continued (experimental): a previous suffix that was
    # not closed by ")" is carried over, without its opening parenthesis.
    elif not old_suffix.endswith(")") and old_suffix.lower() not in ["nan","none"] and len(old_suffix)>0:
        result[SUFFIX] = old_suffix.strip("(")

    # Line number
    result[LINE_NUMBER] = line_counter

    # Header
    header_items = header.split()

    # A header without any token has no timestamp; indexing it would raise
    # IndexError, so record the time as missing instead.
    result[TIME] = header_items[0] if header_items else None

    # Case where a section is continued (timestamp only, or nothing at all)
    if len(header_items) <= 1:
        result[TYPE] = old_line_attributes.get(TYPE)
        result[STRUCTURE] = old_line_attributes.get(STRUCTURE)

    # Case where a section has no structure (silence, end, fadeout)
    elif len(header_items) == 2:
        result[TYPE] = header_items[1].strip().strip(",")

    # Case where a section begins.
    elif len(header_items) == 3:
        result[STRUCTURE] = header_items[1].strip().strip(",")
        result[TYPE] = header_items[2].strip().strip(",")

    return result

In [None]:
def parse_song_to_dict(song_id, path):
    """Parse one chord file into a list of row dicts.

    Lines are handled as follows:
    - blank or whitespace-only lines are skipped (they still advance the
      line counter);
    - lines starting with "#" and containing ":" update the song-level
      attributes ("# name: value"); "#" lines without ":" are ignored;
    - lines without "|" produce a single row holding only the line metadata;
    - other lines are split on "|": the first field is the header, the last
      one the suffix, and every whitespace-separated token of the fields in
      between produces one row.

    Args:
        song_id: value stored under SONG_ID in every row.
        path: path of the file to read.

    Returns:
        List of dicts, in file order. Measure and chord counters start at 0
        and run over the whole file; the line counter is the 0-based index
        of the physical line.
    """
    rows = []
    persistent_attributes = {SONG_ID: song_id}

    measure_counter = 0
    chord_counter = 0
    line_attributes = {}

    with open(path,"r") as file:
        for line_counter, line in enumerate(file):

            # Whitespace-only lines carry no data; letting them through would
            # hand an empty header to process_line_metadata.
            if not line.strip():
                continue

            # Attribute lines
            if line.startswith("#"):
                # partition never raises, unlike unpacking split(":", 1) on a
                # line that has no colon.
                attribute, separator, value = line.strip("#").partition(":")
                if separator:
                    persistent_attributes[attribute.strip(" ")] = value.strip(" ").strip("\n")
                continue

            line_items = line.split("|")

            # Special lines
            if len(line_items) <= 1:
                line_attributes = process_line_metadata(line, line_counter, line_attributes)
                rows.append(create_row(persistent_attributes, line_attributes))
                continue

            # Standard lines
            header = line_items[0]
            suffix = line_items[-1]
            measures = line_items[1:-1]

            line_attributes = process_line_metadata(header, line_counter, line_attributes, suffix)

            for measure in measures:
                for chord in measure.split():
                    rows.append(create_row(persistent_attributes, line_attributes,
                                           measure_counter, chord_counter, chord))
                    chord_counter += 1

                measure_counter += 1

    return rows

In [None]:
# Smoke test: parse a single chord file and view its rows as a DataFrame.
# NOTE(review): the song_id passed here is 0 although the file sits in folder
# "0004"; create_whole_collection_df uses the folder number as the id, so
# confirm whether 4 was intended.
test = pd.DataFrame(parse_song_to_dict(0,"data/McGill-Billboard/0004/salami_chords.txt"))

In [None]:
def create_whole_collection_df(path = "data/McGill-Billboard/", file_name = "/salami_chords.txt", upper_bound = 1300):
    """Parse every available song file and return all rows as one DataFrame.

    For each song number from 0 to upper_bound (inclusive) the file
    ``path + <number zero-padded to 4 digits> + file_name`` is parsed with
    parse_song_to_dict when it exists; missing files are skipped.

    Args:
        path: directory prefix, including its trailing separator.
        file_name: file name appended after the song folder, including its
            leading separator.
        upper_bound: highest song number to probe (inclusive).

    Returns:
        DataFrame with one row per parsed row dict. The 'measure_id' and
        'chord_id' columns are cast to the nullable 'Int64' dtype when they
        are present. The frame is empty when no file was found.
    """
    whole_collection = []

    for song_number in range(upper_bound + 1):
        full_path = path + str(song_number).zfill(4) + file_name

        if os.path.exists(full_path):
            whole_collection += parse_song_to_dict(song_number, full_path)

    whole_collection_df = pd.DataFrame(whole_collection)

    # Rows without chord data leave these columns partly empty, hence the
    # nullable integer dtype. Only cast columns that actually exist: astype
    # raises KeyError for a mapping key that is not a column, which happens
    # when nothing (or no chord row) was parsed.
    nullable_int_columns = {
        column: "Int64"
        for column in ("measure_id", "chord_id")
        if column in whole_collection_df.columns
    }

    if not nullable_int_columns:
        return whole_collection_df

    return whole_collection_df.astype(nullable_int_columns)

In [None]:
# Parse every available song file into a single DataFrame.
collection_df = create_whole_collection_df()

In [None]:
# Preview up to 10 random rows. The fixed random_state keeps the preview
# identical across re-runs, and capping n avoids the ValueError that sample
# raises when the frame holds fewer rows than requested.
collection_df.sample(n=min(10, len(collection_df)), random_state=0)

In [None]:
#BUGS: a "." (dot) token seems to mean that the previous chord is repeated? To be clarified, then handled in the parser.