In [85]:
import requests as re
import hdf5_getters as GETTERS
from midi_methods import *
from collections import defaultdict
from sklearn import linear_model, model_selection
import numpy as np
import json
import os 
import h5py
import tables
import pandas as pd
import time 

In [86]:
# NOTE: this cell expects the genre metadata to have been extracted from lmd_matched_h5 already
# (the match_genres function in this notebook writes that JSON file)

# read the saved genre metadata into a dataframe
json_match_fp = 'example_match_genre.json'      # example_match_genre.json only contains 46 files
genre_df = pd.read_json(path_or_buf=json_match_fp)
genre_df.head(n=5)

Unnamed: 0,artist,song,genres,filename
0,Cyndi Lauper,Into The Nightlife,"[rock, electronic, pop rock]",TRAAAGR128F425B14B.h5
1,Matthew Wilder,Break My Stride,"[electronic, synth-pop, euro house]",TRAAAZF12903CCCF6B.h5
2,Tesla,Caught In A Dream,"[rock, electronic, pop]",TRAABVM128F92CA9DC.h5
3,Brian Wilson,Keep An Eye On Summer (Album Version),"[rock, electronic, pop]",TRAABXH128F42955D6.h5
4,Old Man River,Summer,"[rock, electronic, synth-pop]",TRAACQE12903CC706C.h5


In [87]:
# this is an example call, and does not go through the entire lmd_matched dataset
# Built with os.path.join instead of a backslash literal: sequences such as '\l' and '\A' are
# invalid string escapes. The trailing '' keeps the trailing path separator of the original value.
subfolder_path = os.path.join('..', 'lmd_unwrapped_files', 'lmd_matched', 'A', 'A', '')


def h5_name_for_midi(midi_path):
    """
    Return the h5 filename that corresponds to a MIDI file.

    midi_path: path to a .mid file whose parent folder is named after the matching h5 file
    returns: the parent folder name with '.h5' appended (e.g. 'TRAAAGR128F425B14B.h5')
    """
    # os.path handles the platform's separator, unlike splitting on a hard-coded '\\'
    return os.path.basename(os.path.dirname(midi_path)) + '.h5'


for root, dirs, files in os.walk(subfolder_path):
    for file in files:
        filepath = os.path.join(root, file)

        # retrieve the song ID that corresponds to its h5 equivalent. can change this to be more robust depending on what we need
        song_h5_match_id = h5_name_for_midi(filepath)

        # get metadata from dataframe (filter once, then read each column from the matching rows)
        matches = genre_df[genre_df['filename'] == song_h5_match_id]
        song = list(matches['song'])
        artist = list(matches['artist'])
        genres = list(matches['genres'])

        # print out results
        print(f'METADATA FOR FILE AT {filepath}')
        print(f'\tsong h5 match id: {song_h5_match_id}')
        print(f'\tsong: {song}')
        print(f'\tartist: {artist}')
        print(f'\tgenres: {genres}\n')


METADATA FOR FILE AT ..\lmd_unwrapped_files\lmd_matched\A\A\A\TRAAAGR128F425B14B\1d9d16a9da90c090809c153754823c2b.mid
	song h5 match id: TRAAAGR128F425B14B.h5
	song: ['Into The Nightlife']
	artist: ['Cyndi Lauper']
	genres: [['rock', 'electronic', 'pop rock']]

METADATA FOR FILE AT ..\lmd_unwrapped_files\lmd_matched\A\A\A\TRAAAGR128F425B14B\5dd29e99ed7bd3cc0c5177a6e9de22ea.mid
	song h5 match id: TRAAAGR128F425B14B.h5
	song: ['Into The Nightlife']
	artist: ['Cyndi Lauper']
	genres: [['rock', 'electronic', 'pop rock']]

METADATA FOR FILE AT ..\lmd_unwrapped_files\lmd_matched\A\A\A\TRAAAGR128F425B14B\b97c529ab9ef783a849b896816001748.mid
	song h5 match id: TRAAAGR128F425B14B.h5
	song: ['Into The Nightlife']
	artist: ['Cyndi Lauper']
	genres: [['rock', 'electronic', 'pop rock']]

METADATA FOR FILE AT ..\lmd_unwrapped_files\lmd_matched\A\A\A\TRAAAGR128F425B14B\dac3cdd0db6341d8dc14641e44ed0d44.mid
	song h5 match id: TRAAAGR128F425B14B.h5
	song: ['Into The Nightlife']
	artist: ['Cyndi Lauper']

In [None]:
# HTTP request headers: a browser-style User-Agent plus a contact address in 'From'
headers = dict([
    (
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    ),
    ('From', 'placeholder'),  # TODO: replace this with your MB email
])

# possible genres: https://musicbrainz.org/genres 
def get_genre(artist, song, top_genre_count=1, timeout=10):
    """
    get_genre makes a MB (MusicBrainz) query and returns the N most common genres for the song, N=top_genre_count

    artist: string
    song: string
    top_genre_count: the number of genres to return, in descending order from most to least common
    timeout: seconds to wait for the MB server before abandoning the request

    returns: list of up to N tag names ordered by their summed vote counts (highest first), or
             None when the request fails, no release groups are returned, or none of them carry tags
    """
    mb_url = "https://musicbrainz.org/ws/2/release-group"

    # NOTE(review): the query text is kept exactly as it was. The trailing `"?inc=genres` looks
    # unintended (unbalanced quote and a URL-style suffix inside the search string) - confirm
    # against the MB search documentation before changing it, since it affects what is matched.
    params = {
        'query': f'artist:"{artist}" AND recording:"{song}""?inc=genres',
        'fmt': 'json',
    }

    try:
        # `re` is the requests module here (imported as `import requests as re`), not the stdlib regex module.
        # The module-level `headers` dict is sent so the request identifies itself, and the timeout
        # keeps one stalled connection from hanging a long batch run.
        response = re.get(mb_url, params=params, headers=headers, timeout=timeout)    # GET request
    except re.RequestException as e:
        # network-level failure: report it the same way as the other failure modes
        print(f"Error for query <{artist}, {song}>: {e}")
        return None

    if response.status_code != 200:
        print(f"Error for query <{artist}, {song}>: {response.status_code}")
        return None

    # .get() so a payload without the key is handled like an empty result instead of a KeyError
    release_groups = response.json().get('release-groups')

    if not release_groups:  # assert not empty
        print(f"No valid release groups for <{artist}, {song}> found")
        return None

    genres = defaultdict(int)   # key: genre, value: number of votes (# labels applied) for that genre

    for release_group in release_groups:
        # 'tags' may be missing or empty for a release group; both contribute nothing
        for tag in release_group.get('tags') or []:
            genres[tag['name']] += tag['count']

    if not genres:  # assert not empty
        print(f"No valid tags for <{artist}, {song}> found, skipping")
        return None

    sorted_genres = sorted(genres, key=genres.get, reverse=True)[:top_genre_count]  # get N most upvoted genres
    return sorted_genres


# example lookup: prints the 10 most upvoted genre labels for cold weather by glass beach
test_release, test_artist = 'cold weather', 'glass beach'
print(get_genre(artist=test_artist, song=test_release, top_genre_count=10))


In [None]:
# NOTE: scraping rate is handled by pausing `request_delay` seconds before every MusicBrainz lookup
# (default 1 sec, in line with MB's published rate limit). pass request_delay=0 to disable, at own risk

def match_genres(directory, output_fp, backup='backup.json', num_genres=3, request_delay=1.0):
    """
    match_genres matches genres to each h5 file

    directory: root directory containing all nested h5 files
    output_fp: filepath of output json file containing genre matching data
    backup: filepath of backup json file for intermediate writes; after each visited folder the
            matches found in it are appended as one JSON array per line, so the file as a whole is
            a sequence of arrays rather than a single JSON document
    num_genres: number of genres to match to each song
    request_delay: seconds to sleep before each genre lookup (0 disables the pause)

    Files that cannot be opened or parsed are reported and skipped; they never abort the run.
    """
    json_data = []  # list of dicts that will be stored as json strings
    for root, dirs, files in os.walk(directory):   # recurse through directory
        backup_data = []
        for file in files:
            hdf5_path = os.path.join(root, file)
            try:
                # opening happens inside the try so an unreadable/non-h5 file is skipped like any
                # other parse error; the with-block closes the file exactly once on every path
                with tables.open_file(hdf5_path, mode='r') as data:
                    song_cols = data.root.metadata.songs.cols
                    artist = song_cols.artist_name[:][0].decode('utf-8')
                    song = song_cols.title[:][0].decode('utf-8')

                if request_delay:
                    time.sleep(request_delay)   # throttle requests to the MB web service
                genres = get_genre(artist, song, num_genres)
                if genres:
                    match = { 'artist': artist, 'song': song, 'genres': genres, 'filename': file}
                    json_data.append(match)
                    backup_data.append(match)

                print(f"artist: {artist}\tsong: {song}\tgenres: {genres}\tfilename: {file}")
            except Exception as e:  # generic catch-all, stops code from terminating halfway
                print(f"Error parsing info for file {hdf5_path}: {e}")

        # intermediary save - data is uglier, but prevents total loss of progress if runtime error is encountered
        if backup_data:
            with open(backup, 'a') as backup_file:
                json.dump(backup_data, backup_file, indent=4)
                backup_file.write('\n')     # keep successive batches from running into each other

    with open(output_fp, 'w') as output_file:
        json.dump(json_data, output_file, indent=4)

    print('finished')
    

# pass in folder with lmd_matched_h5 data and write the output to match_genre.json
match_genres(
    directory='../lmd_unwrapped_files/lmd_matched_h5/',
    output_fp='match_genre.json',
)

In [None]:
# # scratch cell, ignore
# SCORE_FILE = os.path.join('../lmd_unwrapped_files/', 'match_scores.json')

# with open(SCORE_FILE) as f:
#     scores = json.load(f)
# # Grab a Million Song Dataset ID from the scores dictionary
# msd_id = list(scores.keys())[0]
# print(f'Million Song Dataset ID {msd_id} has {len(scores[msd_id])} MIDI file matches:')
# for midi_md5, score in scores[msd_id].items():
#     print( f' {midi_md5} with confidence score {score}')