# This notebook will load our data
# (DO NOT RUN THIS NOTEBOOK)
# This has already been done in advance, so there is no need to run any of this notebook. The output is stored in data/loaded_data.json

#### 
## Credits: Garvey Li, Kai Lee, Christian DerManuelian

#### The cell below prepares our data (It should take several minutes to run)

In [1]:
# Get our data

""" UNCOMMENT THESE LINES TO DOWNLOAD AND UNCOMPRESS DATA """
# from download_data import download_lmd_data, decompress_lmd_data
# download_lmd_data()
# decompress_lmd_data()

"""
# NOTE THE FUNCTION BELOW IS NOT NECESSARY AS IT'S OUTPUT IS ALREADY PROVIDED
# See load_genre_data.py to see how we pulled genre data for the Lakh dataset
"""
# Get genre data
# from load_genre_data import match_genres
# match_genres('../data/lmd_matched_h5/', '../data/match_genre.json')

In [2]:
import os
import numpy as np
import json
import sys
import pandas as pd
import gc

# Packages from utils/ directory
sys.path.insert(0, '../utils')
import hdf5_getters as GETTERS

# MetaData
#### As of this stage, each song in our data folder has a .h5 file associated with it that contains that song's metadata.
#### The data however has a weird nested structure
#### The below cell creates a numpy array of each song's .h5 filepath


In [3]:
fp = "../data/lmd_matched_h5/"

# Metadata filepath for each song, gathered in a plain Python list first.
# Appending to a list is cheap, whereas np.append copies the entire array on
# every call, which makes a per-file append quadratic in the number of songs.
metadata_paths = []

# Unnest the three alphabetic directories
for l1 in os.listdir(fp):
    for l2 in os.listdir(fp + l1):
        for l3 in os.listdir(fp + l1 + "/" + l2):

            # Directory holding the songs' .h5 files at this level of nesting
            curr_fp = fp + l1 + "/" + l2 + "/" + l3

            # Add every .h5 filepath in this directory to our flat list.
            # Paths are joined with "/" on purpose: later cells split them on "/".
            metadata_paths.extend(
                curr_fp + "/" + md_f for md_f in os.listdir(curr_fp)
            )

# Convert once to a 1d numpy array; np.unique also sorts and de-duplicates it
metadata = np.unique(np.array(metadata_paths))

# Save these filepaths
with open('../data/metadata.npy', 'wb') as f:
    np.save(f, metadata)

In [4]:
# Shortcut: reload the metadata filepaths saved by the cell above,
# so the directory walk does not have to be repeated
metadata = np.load('../data/metadata.npy')

# Genre
#### Load genre data for each song

In [5]:
# Load genre data as a pd dataframe
json_match_fp = '../data/match_genre.json'
genre_df = pd.read_json(json_match_fp)

# Extract song id from metadata filename by stripping the trailing ".h5".
# The dot is escaped, the pattern is anchored to the end of the string and
# regex=True is stated explicitly: the default value of `regex` differs between
# pandas versions, and an unescaped "." would match any character.
genre_df["song_id"] = genre_df['filename'].str.replace(r"\.h5$", "", regex=True)

# Index by song id so a song's row can be looked up with genre_df.loc[song_id]
genre_df = genre_df.set_index("song_id")
genre_df.head()

Unnamed: 0_level_0,artist,song,genres,filename
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TRAAAGR128F425B14B,Cyndi Lauper,Into The Nightlife,"[rock, electronic, pop rock]",TRAAAGR128F425B14B.h5
TRAAAZF12903CCCF6B,Matthew Wilder,Break My Stride,"[electronic, synth-pop, euro house]",TRAAAZF12903CCCF6B.h5
TRAABVM128F92CA9DC,Tesla,Caught In A Dream,"[rock, electronic, pop]",TRAABVM128F92CA9DC.h5
TRAABXH128F42955D6,Brian Wilson,Keep An Eye On Summer (Album Version),"[rock, electronic, hip hop]",TRAABXH128F42955D6.h5
TRAACQE12903CC706C,Old Man River,Summer,"[rock, electronic, synth-pop]",TRAACQE12903CC706C.h5


# Midi files
#### Each song may have multiple midi files in our data
#### We only need 1. Thankfully, match_scores.json from the Lakh dataset tells us which midi file is the best representation for that song. We will use this confidence dictionary in the following cell

In [6]:
# Load the match scores; the context manager closes the file once it is parsed.
# `confidence` is indexed below as confidence[song_id], which yields a dict
# whose keys are midi ids and whose values are compared to pick the best one.
with open('../data/match_scores.json') as f:
    confidence = json.load(f)

# Let's create our data
#### The below cell defines a function to create our data in the format we want
#### Our data is a list of dictionaries, where each dictionary is an observation (song)
#### Each dictionary contains these keys (features): unique identifier, song title, filepath to the song's midi, list of genres, filepath to the song's metadata .h5 file

#### We are going to have to break this into parts, because there is too much data for this loop to run on the current local machine that is running this notebook

In [7]:
# Function to create our dataset, we need to do it multiple times on subsets however
# It takes in a list of metadata files
# It reads each one, and creates a dictionary for that song (representing the feature vector)
# It appends that dictionary to an output file as a json string
def make_dataset(metadata_sub, out_fp, first_iteration=False):
    """Append one JSON object per song in ``metadata_sub`` to ``out_fp``.

    For every metadata filepath the song title is read from the .h5 file, the
    song id is taken from the filename, the best-scoring midi id is looked up
    in the global ``confidence`` dict and the genres in the global
    ``genre_df``. Songs without a row in ``genre_df`` are skipped.

    Args:
        metadata_sub: iterable of "/"-separated filepaths to song .h5 files.
        out_fp: path of the output file; it is opened in append mode for each
            written song, so nothing is created when no song is written.
        first_iteration: when True, the first entry that is written is not
            preceded by a "," separator (use it for the very first entry of
            the output file).

    Raises:
        KeyError: if a song id has no entry in ``confidence``.
    """

    # df filepath for our midi files
    midi_df_fp = "../data/lmd_aligned/"

    # For each song's metadata filepath
    for md_fp in metadata_sub:

        # Read song's .h5 file and extract the song's title as a string.
        # The handle is closed as soon as the title is read; otherwise every
        # processed song leaves an open HDF5 file behind.
        # NOTE(review): assumes open_h5_file_read returns an object with
        # .close() (a PyTables File in the Million Song Dataset getters) - confirm.
        song_metadata = GETTERS.open_h5_file_read(md_fp)
        try:
            song_name = song_metadata.root.metadata.songs.cols.title[:][0].decode('UTF-8')
        finally:
            song_metadata.close()

        # Extract the song's unique identifier in our dataset (filename without extension)
        song_id = md_fp.split("/")[-1].split(".")[0]

        # Using our confidence data, get the id of midi file that is the best representation of this song
        midi_id = max(confidence[song_id], key=confidence[song_id].get)

        # Construct the filepath for the best choice midi file: same nested
        # sub-path as the .h5 file (minus ".h5"), rooted at the midi directory
        midi_fp = (midi_df_fp + "/".join(md_fp.split("/")[3:])).replace(".h5", "") + "/" + midi_id + ".mid"

        # Get the list of this song's matching genres
        try:
            genres = genre_df.loc[song_id]["genres"]
        # If we never got a genre for this song, don't include it.
        # Only a missing index label is expected here; anything else should surface.
        except KeyError:
            continue

        # Construct our feature vector for this observation
        new_entry = {"song_id": song_id, "title": song_name, "midi_fp": midi_fp, "genres": genres, "metadata": md_fp}

        # Append it to our output file; entries after the first are comma-separated
        with open(out_fp, 'a') as fout:
            if not first_iteration:
                fout.write(",")
            first_iteration = False
            fout.write(json.dumps(new_entry, indent=4))

        # No manual `del` / gc.collect() is needed: the locals are rebound on
        # the next iteration and the .h5 file has already been closed above.

## Make metadata subset steps (as indices)
#### Due to local machine memory issues, we did these splits manually for all 30k songs in steps, changing the metadata slice manually each time

In [8]:
# Upper bounds (as indices into `metadata`) of each manually processed chunk
splits = 61
n_k = 30500
step_size = n_k // splits

# One boundary per full chunk, then a final boundary covering the remainder
steps = [step_size * chunk_no for chunk_no in range(1, splits + 1)]
steps.append(len(metadata))
np.array(steps)

array([  500,  1000,  1500,  2000,  2500,  3000,  3500,  4000,  4500,
        5000,  5500,  6000,  6500,  7000,  7500,  8000,  8500,  9000,
        9500, 10000, 10500, 11000, 11500, 12000, 12500, 13000, 13500,
       14000, 14500, 15000, 15500, 16000, 16500, 17000, 17500, 18000,
       18500, 19000, 19500, 20000, 20500, 21000, 21500, 22000, 22500,
       23000, 23500, 24000, 24500, 25000, 25500, 26000, 26500, 27000,
       27500, 28000, 28500, 29000, 29500, 30000, 30500, 31034])

In [9]:
# Path of the JSON output file that is passed to make_dataset in the cells below
out_fp = '../data/loaded_data.json'

In [10]:
# Be careful this resets the .json file

# # Create our empty output .json file
# with open(out_fp, 'w') as fout:
#     fout.write("[")

In [11]:
# Pick the slice of metadata filepaths to process in this run
# (the start index comes from the steps array; here it is the final, partial chunk)
subset_start = 30500
metadata_sub = metadata[subset_start:]

make_dataset(metadata_sub, out_fp, first_iteration=False)

In [12]:
# # Close the JSON array in our output .json file (run once, after all subsets are written)
# with open(out_fp, 'a') as fout:
#     fout.write("]")

# Ignore the cells below
#### They were used to help the manual data loading process

In [16]:
# with open('../data/loaded_data.json', 'r') as fout:
#     jstr = fout.read()
#     jstr = '[' + jstr[1:] + ']'
#     c = json.loads(jstr)

In [17]:
# import math
# x = np.array([d['song_id'] for d in c])
# print(math.ceil(len(x) / 100) * 100)
# len(x), len(np.unique(x))