In [38]:
import requests as re
import hdf5_getters as GETTERS
from midi_methods import *
from collections import defaultdict
from sklearn import linear_model, model_selection
import numpy as np
import json
import os 
import h5py
import tables
import time 

In [26]:
# Identification headers intended for MusicBrainz (MB) requests.
# The contact address is taken from the MB_CONTACT_EMAIL environment variable so a
# personal email never has to be saved inside the notebook; when the variable is not
# set, the original 'placeholder' value is used.
# NOTE(review): the User-Agent below imitates a desktop browser; MB's API guidelines
# reportedly ask for an application-specific User-Agent with contact info -- confirm
# and adjust before doing a large scrape.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'From': os.environ.get('MB_CONTACT_EMAIL', 'placeholder'),  # TODO: set MB_CONTACT_EMAIL to your MB email
}

# possible genres: https://musicbrainz.org/genres 
def get_genre(artist, song, top_genre_count=1):
    """
    get_genre makes a MB release-group search and returns the N most common tags
    for the song, N=top_genre_count.

    artist: string, artist name placed in the search query
    song: string, song title placed in the search query
    top_genre_count: the number of genres to return, in descending order from most to least common

    Returns a list of at most top_genre_count tag names, ranked by the tag counts
    summed over every release group in the response. Returns None (after printing
    the reason) when the request fails, the status code is not 200, no release
    groups come back, or none of them carry tags.

    Relies on the notebook-level names `re` (the `requests` package, imported
    under that alias -- not the stdlib regex module) and `headers`.
    """
    mb_url = "https://musicbrainz.org/ws/2/release-group"

    def _phrase(text):
        # Escape backslashes and double quotes so the value cannot terminate the
        # quoted phrase it is embedded in.
        return str(text).replace('\\', '\\\\').replace('"', '\\"')

    # The original query ended with a stray '"' plus a URL-style '?inc=genres'
    # fragment, neither of which belongs inside the search expression.
    # NOTE(review): `recording` may not be an indexed field for release-group
    # searches -- confirm against the MB search documentation.
    params = {
        'query': f'artist:"{_phrase(artist)}" AND recording:"{_phrase(song)}"',
        'fmt': 'json',
    }

    try:
        # Send the identification headers (previously built but never used) and
        # bound the wait so one stalled connection cannot hang a long scrape.
        response = re.get(mb_url, params=params, headers=headers, timeout=10)    # GET request
    except re.exceptions.RequestException as err:
        print(f"Error for query <{artist}, {song}>: {err}")
        return None

    if response.status_code != 200:
        print(f"Error for query <{artist}, {song}>: {response.status_code}")
        return None

    data = response.json()
    release_groups = data.get('release-groups')   # a missing key is treated like an empty result

    if not release_groups:
        print(f"No valid release groups for <{artist}, {song}> found")
        return None

    genres = defaultdict(int)   # key: genre, value: number of votes (# labels applied) for that genre
    for group in release_groups:
        # `or []` covers both an absent 'tags' key and an explicit null/empty value
        for tag in group.get('tags') or []:
            genres[tag['name']] += tag['count']

    if not genres:
        print(f"No valid tags for <{artist}, {song}> found, skipping")
        return None

    # get N most upvoted genres; ties keep first-seen order
    return sorted(genres, key=genres.get, reverse=True)[:top_genre_count]


# Smoke test: look up the ten most upvoted genre labels for "cold weather" by glass beach.
test_artist = 'glass beach'
test_release = 'cold weather'
print(get_genre(test_artist, test_release, top_genre_count=10))


['rock', 'electronic', 'future jazz', 'downtempo', 'pop rock', 'alternative rock', 'thrash', 'dubstep', 'deep house', 'house']


In [43]:
# NOTE: MusicBrainz throttles clients that query too quickly (its guidelines reportedly
# allow about one request per second -- confirm), so match_genres pauses `request_delay`
# seconds after every lookup. Pass request_delay=0 to disable the pause at your own risk.

def match_genres(directory, output_fp, backup='backup.json', num_genres=3, request_delay=1.0):
    """
    match_genres matches genres to each h5 file
    directory: root directory containing all nested h5 files
    output_fp: filepath of output json file containing genre matching data
    backup: filepath of the intermediate backup file; after each directory is processed,
            its matches are appended as one JSON object per line so the file stays
            parseable line by line. Pass None (or '') to skip the backup.
    num_genres: number of genres to match to each song
    request_delay: seconds to sleep after every genre lookup (rate limiting)

    Every file found under `directory` is opened with PyTables; files that cannot be
    opened or parsed are reported and skipped. The complete list of matches is written
    to output_fp as a JSON array once the walk finishes. Returns None.
    """
    json_data = []  # list of dicts that will be stored as json strings
    for root, dirs, files in os.walk(directory):   # recurse through directory
        backup_data = []
        for file in files:
            hdf5_path = os.path.join(root, file)
            try:
                # Opening happens inside the try so an unreadable or non-HDF5 file is
                # logged and skipped instead of aborting the whole run.
                data = tables.open_file(hdf5_path, mode='r')
                try:
                    cols = data.root.metadata.songs.cols
                    artist = cols.artist_name[:][0].decode('utf-8')
                    song = cols.title[:][0].decode('utf-8')
                finally:
                    data.close()    # closed exactly once, on success and on failure

                try:
                    genres = get_genre(artist, song, num_genres)
                finally:
                    # Pause even when the lookup raised, so the next request is still spaced out.
                    if request_delay > 0:
                        time.sleep(request_delay)

                if genres:
                    match = { 'artist': artist, 'song': song, 'genres': genres, 'filename': file}
                    json_data.append(match)
                    backup_data.append(match)

                print(f"artist: {artist}\tsong: {song}\tgenres: {genres}\tfilename: {file}")
            except Exception as e:  # generic catch-all, stops code from terminating halfway
                print(f"Error parsing info for file {hdf5_path}: {e}")

        # intermediary save - prevents total loss of progress if a runtime error is encountered.
        # One JSON object per line: appending a whole JSON array per directory (the previous
        # behaviour) produced a file that no JSON parser could read back.
        if backup and backup_data:
            with open(backup, 'a') as backup_file:
                for match in backup_data:
                    backup_file.write(json.dumps(match) + '\n')

    with open(output_fp, 'w') as output_file:
        json.dump(json_data, output_file, indent=4)

    print('finished')
    

# Walk the lmd_matched_h5 folder and write the genre matches to match_genre.json.
match_genres(
    directory='../lmd_unwrapped_files/lmd_matched_h5/',
    output_fp='match_genre.json',
)

artist: Cyndi Lauper	song: Into The Nightlife	genres: ['rock', 'electronic', 'pop rock']	filename: TRAAAGR128F425B14B.h5
artist: Matthew Wilder	song: Break My Stride	genres: ['electronic', 'synth-pop', 'euro house']	filename: TRAAAZF12903CCCF6B.h5
artist: Tesla	song: Caught In A Dream	genres: ['rock', 'electronic', 'pop']	filename: TRAABVM128F92CA9DC.h5
artist: Brian Wilson	song: Keep An Eye On Summer (Album Version)	genres: ['rock', 'electronic', 'hip hop']	filename: TRAABXH128F42955D6.h5
artist: Old Man River	song: Summer	genres: ['rock', 'electronic', 'synth-pop']	filename: TRAACQE12903CC706C.h5
artist: Tracy Chapman	song: Fast Car (LP Version)	genres: ['pop', 'electronic', 'rock']	filename: TRAADKW128E079503A.h5
artist: Chris Rea	song: Driving Home For Christmas	genres: ['pop', 'rock', 'christmas']	filename: TRAAEEH128E0795DFE.h5
artist: Hank Williams Jr.	song: Tuesday's Gone (Remastered Album Version)	genres: ['rock', 'country', 'blues rock']	filename: TRAAEJH128E0785506.h5
artist

In [7]:
# # scratch cell, ignore
# SCORE_FILE = os.path.join('../lmd_unwrapped_files/', 'match_scores.json')

# with open(SCORE_FILE) as f:
#     scores = json.load(f)
# # Grab a Million Song Dataset ID from the scores dictionary
# msd_id = list(scores.keys())[0]
# print(f'Million Song Dataset ID {msd_id} has {len(scores[msd_id])} MIDI file matches:')
# for midi_md5, score in scores[msd_id].items():
#     print( f' {midi_md5} with confidence score {score}')

Million Song Dataset ID TRRNARX128F4264AEB has 1 MIDI file matches:
 cd3b9c8bb118575bcd712cffdba85fce with confidence score 0.7040202098544246
