In [1]:
import pandas as pd
import numpy as np
import os

## Metadata

In [2]:
# Load the Billboard index table from its CSV file into a DataFrame.
metadata_df = pd.read_csv("data/billboard-2.0-index.csv")

In [None]:
# Total number of rows in the index table, regardless of missing fields.
print('There are %d entries in the index table.' %len(metadata_df))

In [None]:
# Count the rows whose title is present. notna().sum() counts them directly
# instead of indexing the value_counts() result by 0.
print('There are %d entries with a given title.' % metadata_df.title.notna().sum())

In [None]:
# Count the rows whose artist is present. notna().sum() counts them directly
# instead of indexing the value_counts() result by 0.
print('There are %d entries with a given artist.' % metadata_df.artist.notna().sum())

In [None]:
# Count the rows whose chart date is present. notna().sum() counts them
# directly instead of indexing the value_counts() result by 0.
print('There are %d entries with a given chart date.' % metadata_df.chart_date.notna().sum())

In [None]:
# Zero-padded month number (as written in a 'YYYY-MM-DD' date) -> English month name.
months = {'01':'January',
          '02':'February',
          '03':'March',
          '04':'April',
          '05':'May',
          '06':'June',
          '07':'July',
          '08':'August',
          '09':'September',
          '10':'October',
          '11':'November',
          '12':'December'}
def format_date(date):
    """Turn a 'YYYY-MM-DD' date string into text such as 'August 4th, 1958'.

    Parameters
    ----------
    date : str
        Date whose first four characters are the year, whose characters 5-6
        are the zero-padded month and whose last two characters are the day.

    Returns
    -------
    str
        '<Month name> <day><ordinal suffix>, <year>', with the day written
        without a leading zero.

    Raises
    ------
    KeyError
        If the month part is not one of '01'..'12'.
    ValueError
        If the last two characters are not a number.
    """
    year = date[:4]
    month = date[5:7]
    day = int(date[-2:])

    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3; every other
    # day is decided by its last digit (so 21st, 22nd, 23rd, 31st).
    if 11 <= day <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')

    date_string = months[month] + ' ' + str(day) + suffix + ', ' + year
    return date_string

#Test
format_date('1958-08-04')

In [None]:
# Earliest and latest chart dates, rendered with format_date. format_date slices
# its argument as a 'YYYY-MM-DD' string, so chart_date is presumably stored as
# text and min()/max() order the values as text -- TODO confirm the column dtype.
print('The songs range from %s to %s.' %(format_date(metadata_df.chart_date.min()), format_date(metadata_df.chart_date.max())))

## Parser 

In [3]:
# Keys used in the parsed row dicts (and therefore the DataFrame column names).
# The constant name and the label do not always match: SUFFIX is stored under
# "instrument" and the *_NUMBER constants under "*_id".
SONG_ID = "song_id"
LINE_NUMBER = "line_id"
MEASURE_NUMBER = "measure_id"
CHORD_NUMBER = "chord_id"
CHORD = "chord"
SUFFIX = "instrument"
TYPE = "section_type"
TIME = "time"
STRUCTURE = "section_structure"

def immutable_merge(dic1, dic2):
    """Return a new dict holding the entries of dic1 overlaid with those of dic2.

    Neither argument is modified. When a key is present in both, the value
    from dic2 is the one kept.
    """
    merged = dic1.copy()
    merged.update(dic2)
    return merged

def create_row(persistent_attributes, line_attributes, measure_number = None, chord_number = None, chord = None):
    """Build one output row from song-level and line-level attributes.

    Parameters
    ----------
    persistent_attributes : dict
        Attributes shared by every row of the song.
    line_attributes : dict
        Attributes of the current line; they override persistent attributes
        that use the same key.
    measure_number, chord_number, chord : optional
        Position and label of a chord. Leave all three as None for a line
        that carries no chord; the chord keys are then not added to the row.

    Returns
    -------
    dict
        A new dict; neither input dict is modified.
    """
    result = immutable_merge(persistent_attributes, line_attributes)

    # Add the chord fields as soon as any of them is provided. The original
    # guard tested measure_number twice and never looked at chord.
    has_chord_data = not (measure_number is None and chord_number is None and chord is None)
    if has_chord_data:
        result[MEASURE_NUMBER] = measure_number
        result[CHORD_NUMBER] = chord_number
        result[CHORD] = chord

    return result

def process_line_metadata(header, line_counter, old_line_attributes, suffix = ""):
    """Build the attribute dict that describes one line of a chord file.

    Parameters
    ----------
    header : str
        Text before the first "|" of the line (or the whole line when it
        contains no "|"). It is split on whitespace: the first token is the
        time, the remaining ones describe the section.
    line_counter : int
        Index of the line within the file.
    old_line_attributes : dict
        Attributes returned for the previous line. They are used to carry the
        section and the instrument over to lines that do not restate them.
    suffix : str
        Text after the last "|" of the line, holding the instrument.

    Returns
    -------
    dict
        Always contains the line number and the time; contains the instrument,
        section type and section structure only when they can be determined.

    Raises
    ------
    IndexError
        If header contains no token at all.
    """
    attributes = {}

    # Instrument. str() turns a missing entry into "None", filtered out below.
    previous_instrument = str(old_line_attributes.get(SUFFIX))
    # Experimental: the previous instrument keeps applying until a value ending
    # in ")" has been seen. Stored values therefore keep that closing bracket.
    carries_over = (
        len(previous_instrument) > 0
        and previous_instrument.lower() not in ["nan", "none"]
        and not previous_instrument.endswith(")")
    )

    if suffix not in ("", "\n"):
        # The line names its own instrument.
        attributes[SUFFIX] = suffix.strip("\n").strip(",").strip()
    elif carries_over:
        attributes[SUFFIX] = previous_instrument.strip("(")

    attributes[LINE_NUMBER] = line_counter

    # Header: "<time> [<structure>,] [<type>,]"
    tokens = header.split()
    attributes[TIME] = tokens[0]
    token_count = len(tokens)

    if token_count == 1:
        # Only a time: the current section continues.
        attributes[TYPE] = old_line_attributes.get(TYPE)
        attributes[STRUCTURE] = old_line_attributes.get(STRUCTURE)
    elif token_count == 2:
        # A section without structure (silence, end, fadeout).
        attributes[TYPE] = tokens[1].strip().strip(",")
    elif token_count == 3:
        # A new section starts: structure label first, then section type.
        attributes[STRUCTURE] = tokens[1].strip().strip(",")
        attributes[TYPE] = tokens[2].strip().strip(",")

    return attributes

In [4]:
def parse_song_to_dict(song_id, path):
    """Parse one chord annotation file into a list of row dicts.

    Parameters
    ----------
    song_id
        Identifier stored on every row of the song.
    path : str
        Path of the text file to read.

    Returns
    -------
    list of dict
        One row per chord for lines that contain "|" separators, and one row
        without chord fields for the other non-attribute lines.

    Notes
    -----
    * Lines starting with "#" are read as "name: value" pairs and added to the
      attributes attached to all rows created afterwards. A "#" line without
      ":" raises ValueError.
    * Blank lines are skipped but still counted, so line ids are the line's
      index in the file.
    * Measure and chord counters run over the whole song.
    """
    rows = []
    persistent_attributes = {SONG_ID: song_id}
    line_attributes = {}
    measure_counter = 0
    chord_counter = 0

    with open(path,"r") as file:
        for line_counter, line in enumerate(file):

            # Skip blank lines. Whitespace-only lines are skipped too: they
            # carry no header token and would crash process_line_metadata.
            if not line.strip():
                continue

            #Attribute lines
            if line.startswith("#"):
                attribute, value = line.strip("#").split(":",1)
                persistent_attributes[attribute.strip(" ")] = value.strip(" ").strip("\n")
                continue

            line_items = line.split("|")

            #Special lines (no measures)
            if len(line_items) <= 1:
                line_attributes = process_line_metadata(line, line_counter, line_attributes)
                rows.append(create_row(persistent_attributes, line_attributes))
                continue

            #Standard lines: header | measure | ... | measure | suffix
            header = line_items[0]
            suffix = line_items[-1]
            measures = line_items[1:-1]

            line_attributes = process_line_metadata(header, line_counter, line_attributes, suffix)

            for measure in measures:
                for chord in measure.split():
                    rows.append(create_row(persistent_attributes, line_attributes,
                                           measure_counter, chord_counter, chord))
                    chord_counter += 1

                measure_counter += 1

    return rows

In [5]:
# Try the parser on a single song. Note that the song_id passed here (0) does
# not match the folder the file is read from (0004).
test = pd.DataFrame(parse_song_to_dict(0,"data/McGill-Billboard/0004/salami_chords.txt"))

In [6]:
def create_whole_collection_df(path = "data/McGill-Billboard/", file_name = "salami_chords.txt", upper_bound = 1300):
    """Parse every available song of the collection into a single DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains one sub-folder per song, named with the song
        id zero-padded to four digits.
    file_name : str
        Name of the chord file inside each song folder (no leading slash).
    upper_bound : int
        Highest song id to look for (inclusive). Ids start at 0.

    Returns
    -------
    pandas.DataFrame
        All rows of all songs that were found, with the measure and chord ids
        stored as nullable integers. Ids whose file does not exist are
        skipped; if no file is found the result is an empty DataFrame.
    """
    whole_collection = []

    for i in range(upper_bound + 1):
        full_path = os.path.join(path, str(i).zfill(4), file_name)

        if os.path.exists(full_path):
            whole_collection += parse_song_to_dict(i, full_path)

    whole_collection_df = pd.DataFrame(whole_collection)

    # Rows without chords leave these ids missing, hence the nullable Int64.
    # Only cast columns that exist: astype raises KeyError on unknown names,
    # which happened whenever no song file was found.
    integer_columns = {column: 'Int64'
                       for column in ('measure_id', 'chord_id')
                       if column in whole_collection_df.columns}

    return whole_collection_df.astype(integer_columns)

In [7]:
# Parse every song file found on disk into one DataFrame.
collection_df = create_whole_collection_df()

In [8]:
# Show 10 randomly chosen rows. No random_state is given, so the selection
# changes from run to run.
collection_df.sample(10)

Unnamed: 0,song_id,title,artist,metre,tonic,line_id,time,section_type,section_structure,measure_id,chord_id,chord,instrument
39361,393,Misunderstanding,Genesis,4/4,C,8,12.261224489,intro,A,4,10,D:min,guitar
63993,636,After the Lovin',Engelbert Humperdinck,6/8,C,33,165.641564625,chorus,D,107,110,Gb:maj7,voice
126678,1273,We Don't Talk Anymore,Cliff Richard,4/4,C,16,83.007165532,chorus,A,38,56,C:maj9,voice
1495,23,And She Was,Talking Heads,4/4,E,25,145.22170068,pre-chorus,C,79,128,F:maj,voice
82547,814,Big Iron,Marty Robbins,4/4,E,13,39.388730158,verse,B,38,38,E:maj,voice
76209,759,Foggy Mountain Breakdown,Flatt & Scruggs,4/4,Ab,9,9.878276643,verse,A,12,12,Eb:7,banjo
4315,53,Over The Hills and Far Away,Led Zeppelin,4/4,G,33,280.844149659,outro,A',116,196,C:maj,guitar)
89470,882,Somebody s Watching Me,Rockwell,4/4,C#,36,227.687437641,fadeout,,121,167,A:maj,voice)
9495,100,Maggie May,Rod Stewart,4/4,D,20,98.360884353,verse,C,48,96,A:maj,voice
101969,1013,Time Will Reveal,Debarge,4/4,C,18,96.007573696,verse,B,27,81,B:min7,voice


In [None]:
#BUGS: Does a dot ('.') stand for a repetition of the same chord? To be clarified, then handled accordingly.