In [1]:
import json
from collections import defaultdict
from sklearn import linear_model, model_selection
import numpy as np
import random
import gzip
import dateutil.parser
import math
import mido
import pretty_midi
import joblib
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec
import collections
import os
import tarfile
import tables
import hdf5_getters as GETTERS
from midi_methods import *

%store -r

In [None]:
# #### # UNCOMMENT AND RUN THIS BLOCK OF CODE IF YOU HAVE NOT EXTRACTED MIDI FILES
# # FROM GZ FILE

# # Download following file at http://hog.ee.columbia.edu/craffel/lmd/lmd_aligned.tar.gz
# # Put it in the resources folder!!!

# f = tarfile.open("resources/lmd_aligned.tar.gz")
# f.extractall()
# f.close()

# # Download following file at http://hog.ee.columbia.edu/craffel/lmd/lmd_matched_h5.tar.gz
# # Put it in the resources folder!!!

# f = tarfile.open("resources/lmd_matched_h5.tar.gz")
# f.extractall()
# f.close()

In [9]:
# Loading a MIDI file with pretty_midi, following the library's tutorial:
# https://github.com/craffel/pretty-midi/blob/main/Tutorial.ipynb

# Path to one MIDI file from the extracted dataset, used as a worked example
test_song_path = 'lmd_matched/A/A/A/TRAAAGR128F425B14B/1d9d16a9da90c090809c153754823c2b.mid'

# Parse the file and grab its list of instruments
song_pm = pretty_midi.PrettyMIDI(test_song_path)
all_instruments = song_pm.instruments
# Each Instrument exposes its program number as `.program`;
# display the first five instruments as the cell output
all_instruments[:5]

[Instrument(program=33, is_drum=False, name="main bass           "),
 Instrument(program=0, is_drum=True, name="drums + fills       "),
 Instrument(program=6, is_drum=False, name="harpsichord         "),
 Instrument(program=19, is_drum=False, name="verse vibe h        "),
 Instrument(program=19, is_drum=False, name="verse vibe m        ")]

In [10]:
# Take the first instrument listed (named "main bass" in this file). The name
# isn't as important as the program number, since a name can be set to anything.
main_synth = all_instruments[0]
main_synth

Instrument(program=33, is_drum=False, name="main bass           ")

In [38]:
# Get the notes played by this instrument and display the first and last one
notes_main_synth = main_synth.notes
notes_main_synth[0], notes_main_synth[-1] 

(Note(start=0.018229, end=0.184896, pitch=69, velocity=100),
 Note(start=215.851562, end=215.989583, pitch=71, velocity=100))

In [11]:
# HDF5 metadata file of the example song.
# Forward slashes are used on purpose: they work on every OS, whereas the
# backslash form only resolves on Windows and relies on invalid escape
# sequences ("\A", "\T") inside a non-raw string literal.
test_hdf5_path = 'lmd_matched_h5/A/A/A/TRAAAGR128F425B14B.h5'
# The handle is intentionally left open: the following cells read from `h5`
h5 = GETTERS.open_h5_file_read(test_hdf5_path)
# Get song metadata (displayed as the cell output)
h5.root.metadata

/metadata (Group) 'metadata about the song'
  children := ['artist_terms' (EArray), 'artist_terms_freq' (EArray), 'artist_terms_weight' (EArray), 'similar_artists' (EArray), 'songs' (Table)]

In [36]:
# Get the song title from the `songs` table (returned as an array of bytes strings)
h5.root.metadata.songs.cols.title[:]

array([b'Into The Nightlife'], dtype='|S1024')

In [None]:
# Get the album; it is read from the `release` column of the `songs` table

h5.root.metadata.songs.cols.release[:]

In [None]:
# Get the song id. NOTE(review): it is unclear whether this id is local to this
# dataset only or also acts as a foreign key into another dataset -- TODO confirm

h5.root.metadata.songs.cols.song_id[:]

In [None]:
# Get the name of the song's artist from the `songs` table

h5.root.metadata.songs.cols.artist_name[:]

In [18]:
# Get the genre of the song. If it's empty, no genre is assigned in the
# dataset, but the song may still have a genre.
h5.root.metadata.songs.cols.genre[:]

array([b''], dtype='|S1024')

In [10]:
#https://github.com/tbertinmahieux/MSongsDB/blob/master/PythonSrc/hdf5_getters.py

In [2]:
import os

In [3]:
fp = "lmd_matched_h5/"

# Collect the filepath of every song's metadata (.h5) file.
# The paths are gathered in a plain list and turned into an array once at the
# end: np.append inside the loop copies the whole array for every single file,
# which makes the listing quadratic in the number of files.
metadata_paths = []

# Unnest the three alphabetic directories
for l1 in os.listdir(fp):
    for l2 in os.listdir(fp + l1):
        for l3 in os.listdir(fp + l1 + "/" + l2):

            # Directory holding the songs' .h5 files for this level of nesting
            curr_fp = fp + l1 + "/" + l2 + "/" + l3

            # Join with "/" (not os.sep): the cell that builds `data` splits
            # these paths on "/"
            metadata_paths.extend(curr_fp + "/" + md_f for md_f in os.listdir(curr_fp))

# Sorted 1d array of unique metadata filepaths
metadata = np.unique(np.array(metadata_paths))

In [12]:
# Draw a random subset of up to 100 distinct metadata filepaths.
# replace=False is required: np.random.choice samples WITH replacement by
# default, which can put the same song into the subset more than once.
# The size is capped so that sampling without replacement cannot fail when
# fewer than 100 files are available.
metadata_sub = np.random.choice(metadata, size=min(100, len(metadata)), replace=False)

In [13]:
# Load the match scores. The `with` block guarantees the file is closed again,
# even if parsing fails.
# As used further down, `confidence[song_id]` maps midi ids to their score.
with open('resources/match_scores.json', encoding='utf-8') as f:
    confidence = json.load(f)

In [19]:
# One record per song: its id, its title and the path of its best-matching midi file
data = []

# Root directory of our midi files
midi_df_fp = "lmd_aligned/"

# For each song's metadata filepath
for md_fp in metadata:

    # Read song's .h5 file. The handle is always closed again; otherwise the
    # loop leaks one open HDF5 file per song.
    song_metadata = GETTERS.open_h5_file_read(md_fp)
    try:
        # Extract the song's title as a string
        song_name = song_metadata.root.metadata.songs.cols.title[:][0].decode('UTF-8')
    finally:
        song_metadata.close()

    # Extract the song's unique identifier in our dataset
    # (the .h5 file name without its extension)
    song_id = md_fp.split("/")[-1].split(".")[0]

    # Using our confidence data, get the id of the midi file with the highest
    # score, i.e. the best representation of this song
    song_scores = confidence[song_id]
    midi_id = max(song_scores, key=song_scores.get)

    # Construct the filepath for the best choice midi file: swap the metadata
    # root for the midi root, drop the ".h5" extension and append the midi file,
    # e.g. "lmd_matched_h5/A/A/A/<song_id>.h5" -> "lmd_aligned/A/A/A/<song_id>/<midi_id>.mid"
    midi_fp = (midi_df_fp + '/'.join(md_fp.split("/")[1:])).split(".")[0] + "/" + midi_id + ".mid"

    data.append({"song_id": song_id, "title": song_name, "midi_fp": midi_fp})

In [36]:
import pandas as pd

In [38]:
# Count how often each song id occurs in `data`, most frequent first
pd.Series(
    np.array([record['song_id'] for record in data])
).value_counts()

TRTJAGE12903CB2FB9    3
TRTMLGP128F4236493    3
TRKTEIH128F14B103E    2
TRYHQSB128F4234D80    2
TROAPTG12903CBB9A0    2
                     ..
TRPJRHS12903CFB77F    1
TRCOTFH128E078C611    1
TRBXIGD128F425FE59    1
TRFLZID128F427E1BA    1
TRVOIZW128F92E1C8C    1
Name: count, Length: 979, dtype: int64

In [40]:
# Show every record stored for one of the song ids that occurs more than once
list(filter(lambda record: record['song_id'] == "TRTJAGE12903CB2FB9", data))

[{'song_id': 'TRTJAGE12903CB2FB9',
  'title': 'Reflections Of My Life',
  'midi_fp': 'lmd_matched/T/J/A/TRTJAGE12903CB2FB9/891565c67f84e614e58eb87234848104.mid'},
 {'song_id': 'TRTJAGE12903CB2FB9',
  'title': 'Reflections Of My Life',
  'midi_fp': 'lmd_matched/T/J/A/TRTJAGE12903CB2FB9/891565c67f84e614e58eb87234848104.mid'},
 {'song_id': 'TRTJAGE12903CB2FB9',
  'title': 'Reflections Of My Life',
  'midi_fp': 'lmd_matched/T/J/A/TRTJAGE12903CB2FB9/891565c67f84e614e58eb87234848104.mid'}]