In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

for dirname, _, filenames in os.walk('EMOPIA_1.0/midis/'):
    print(dirname, len(filenames))

In [None]:
# !pip install mido
# !pip install einops

In [7]:
import mido
from mido import MidiFile
import numpy as np

def write_midi(pr, ticks_per_beat, write_path, tempo=80):
    def pr_to_list(pr):
        # List event = (pitch, velocity, time)
        T, N = pr.shape
        t_last = 0
        pr_tm1 = np.zeros(N)
        list_event = []
        for t in range(T):
            pr_t = pr[t]
            mask = (pr_t != pr_tm1)
            if (mask).any():
                for n in range(N):
                    if mask[n]:
                        pitch = n
                        velocity = int(pr_t[n])
                        # Time is incremented since last event
                        t_event = t - t_last
                        t_last = t
                        list_event.append((pitch, velocity, t_event))
            pr_tm1 = pr_t
        return list_event
    # Tempo
    microseconds_per_beat = mido.bpm2tempo(tempo)
    # Write a pianoroll in a midi file
    mid = MidiFile()
    mid.ticks_per_beat = ticks_per_beat

    # Each instrument is a track
    for instrument_name, matrix in pr.items():
        # Add a new track with the instrument name to the midi file
        track = mid.add_track(instrument_name)
        # transform the matrix in a list of (pitch, velocity, time)
        events = pr_to_list(matrix)
        # Tempo
        track.append(mido.MetaMessage('set_tempo', tempo=microseconds_per_beat))
        # Add the program_change
        try:
            program = program_change_mapping[instrument_name]
        except:
            # Defaul is piano
            # print instrument_name + " not in the program_change mapping"
            # print "Default value is 1 (piano)"
            # print "Check acidano/data_processing/utils/program_change_mapping.py"
            program = 1
        track.append(mido.Message('program_change', program=program))

        # This list is required to shut down
        # notes that are on, intensity modified, then off only 1 time
        # Example :
        # (60,20,0)
        # (60,40,10)
        # (60,0,15)
        notes_on_list = []
        # Write events in the midi file
        for event in events:
            pitch, velocity, time = event
            if velocity == 0:
                # Get the channel
                track.append(mido.Message('note_off', note=pitch, velocity=0, time=time))
                notes_on_list.remove(pitch)
            else:
                if pitch in notes_on_list:
                    track.append(mido.Message('note_off', note=pitch, velocity=0, time=time))
                    notes_on_list.remove(pitch)
                    time = 0
                track.append(mido.Message('note_on', note=pitch, velocity=velocity, time=time))
                notes_on_list.append(pitch)
    mid.save(write_path)
    return

#!/usr/bin/env python
# -*- coding: utf8 -*-

# from mido import MidiFile
from unidecode import unidecode
# import numpy as np

#######
# Pianorolls dims are  :   TIME  *  PITCH


class Read_midi(object):
    def __init__(self, song_path, quantization):
        ## Metadata
        self.__song_path = song_path
        self.__quantization = quantization

        ## Pianoroll
        self.__T_pr = None

        ## Private misc
        self.__num_ticks = None
        self.__T_file = None

    @property
    def quantization(self):
        return self.__quantization

    @property
    def T_pr(self):
        return self.__T_pr

    @property
    def T_file(self):
        return self.__T_file

    def get_total_num_tick(self):
        # Midi length should be written in a meta message at the beginning of the file,
        # but in many cases, lazy motherfuckers didn't write it...

        # Read a midi file and return a dictionnary {track_name : pianoroll}
        mid = MidiFile(self.__song_path)

        # Parse track by track
        num_ticks = 0
        for i, track in enumerate(mid.tracks):
            tick_counter = 0
            for message in track:
                # Note on
                time = float(message.time)
                tick_counter += time
            num_ticks = max(num_ticks, tick_counter)
        self.__num_ticks = num_ticks

    def get_pitch_range(self):
        mid = MidiFile(self.__song_path)
        min_pitch = 200
        max_pitch = 0
        for i, track in enumerate(mid.tracks):
            for message in track:
                if message.type in ['note_on', 'note_off']:
                    pitch = message.note
                    if pitch > max_pitch:
                        max_pitch = pitch
                    if pitch < min_pitch:
                        min_pitch = pitch
        return min_pitch, max_pitch

    def get_time_file(self):
        # Get the time dimension for a pianoroll given a certain quantization
        mid = MidiFile(self.__song_path)
        # Tick per beat
        ticks_per_beat = mid.ticks_per_beat
        # Total number of ticks
        self.get_total_num_tick()
        # Dimensions of the pianoroll for each track
        self.__T_file = int((self.__num_ticks / ticks_per_beat) * self.__quantization)
        return self.__T_file

    def read_file(self):
        # Read the midi file and return a dictionnary {track_name : pianoroll}
        mid = MidiFile(self.__song_path)
        # Tick per beat
        ticks_per_beat = mid.ticks_per_beat

        # Get total time
        self.get_time_file()
        T_pr = self.__T_file
        # Pitch dimension
        N_pr = 128
        pianoroll = {}

        def add_note_to_pr(note_off, notes_on, pr):
            pitch_off, _, time_off = note_off
            # Note off : search for the note in the list of note on,
            # get the start and end time
            # write it in th pr
            match_list = [(ind, item) for (ind, item) in enumerate(notes_on) if item[0] == pitch_off]
            if len(match_list) == 0:
                print("Try to note off a note that has never been turned on")
                # Do nothing
                return

            # Add note to the pr
            pitch, velocity, time_on = match_list[0][1]
            pr[time_on:time_off, pitch] = velocity
            # Remove the note from notes_on
            ind_match = match_list[0][0]
            del notes_on[ind_match]
            return

        # Parse track by track
        counter_unnamed_track = 0
        for i, track in enumerate(mid.tracks):
            # Instanciate the pianoroll
            pr = np.zeros([T_pr, N_pr])
            time_counter = 0
            notes_on = []
            for message in track:

                ##########################################
                ##########################################
                ##########################################
                # TODO : keep track of tempo information
                # import re
                # if re.search("tempo", message.type):
                #     import pdb; pdb.set_trace()
                ##########################################
                ##########################################
                ##########################################


                # print message
                # Time. Must be incremented, whether it is a note on/off or not
                time = float(message.time)
                time_counter += time / ticks_per_beat * self.__quantization
                # Time in pr (mapping)
                time_pr = int(round(time_counter))
                # Note on
                if message.type == 'note_on':
                    # Get pitch
                    pitch = message.note
                    # Get velocity
                    velocity = message.velocity
                    if velocity > 0:
                        notes_on.append((pitch, velocity, time_pr))
                    elif velocity == 0:
                        add_note_to_pr((pitch, velocity, time_pr), notes_on, pr)
                # Note off
                elif message.type == 'note_off':
                    pitch = message.note
                    velocity = message.velocity
                    add_note_to_pr((pitch, velocity, time_pr), notes_on, pr)

            # We deal with discrete values ranged between 0 and 127
            #     -> convert to int
            pr = pr.astype(np.int16)
            if np.sum(np.sum(pr)) > 0:
                name = unidecode(track.name)
                name = name.rstrip('\x00')
                if name == u'':
                    name = 'unnamed' + str(counter_unnamed_track)
                    counter_unnamed_track += 1
                if name in pianoroll.keys():
                    # Take max of the to pianorolls
                    pianoroll[name] = np.maximum(pr, pianoroll[name])
                else:
                    pianoroll[name] = pr
        return pianoroll



def get_pianoroll_time(pianoroll):
    T_pr_list = []
    for k, v in pianoroll.items():
        T_pr_list.append(v.shape[0])
    if not len(set(T_pr_list)) == 1:
        print("Inconsistent dimensions in the new PR")
        return None
    return T_pr_list[0]

def get_pitch_dim(pianoroll):
    N_pr_list = []
    for k, v in pianoroll.items():
        N_pr_list.append(v.shape[1])
    if not len(set(N_pr_list)) == 1:
        print("Inconsistent dimensions in the new PR")
        raise NameError("Pr dimension")
    return N_pr_list[0]

def dict_to_matrix_old(pianoroll):
    T_pr = get_pianoroll_time(pianoroll)
    N_pr = get_pitch_dim(pianoroll)
    rp = np.zeros((T_pr, N_pr), dtype=np.int16)
    for k, v in pianoroll.items():
        if rp.sum() < v.sum():
            rp = v
    return rp

## MULTI CHANNEL
def dict_to_matrix(pianoroll):
    T_pr = get_pianoroll_time(pianoroll)
    N_pr = get_pitch_dim(pianoroll)
    all_m = []
    for k, v in pianoroll.items():
        rp = np.zeros((T_pr, N_pr), dtype=np.int16)
        all_m.append(np.maximum(rp, v))
        
    all_m = sorted(all_m, reverse=True, key=lambda x: x.sum())
    
    return all_m[:2]

In [2]:
import random
import pandas as pd
import imageio
import numpy as np
from argparse import ArgumentParser

from tqdm.auto import tqdm
import matplotlib.pyplot as plt

import einops
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset

print("GPU -->", torch.cuda.is_available())

GPU --> True


In [11]:
ARTIST_DIR = 'LakhMIDI/'

import os

audio_data_arr = []
audio_tags_arr = []
audio_artist_arr = []

songs_to_be_extracted = 1555
genre_to_be_extracted = "blues" # 'ALL' # 'disco' # 'trance' # 'jazz'
disable_genre = False


df = pd.read_csv(r'artists_filtered.csv')

for dirname, _, filenames in os.walk(ARTIST_DIR):
    artist = dirname.split('LakhMIDI/')[1].replace('_', ' ')
    
    entry = df[df['artist_mb'].str.contains(artist) == True]
    
    if len(entry) !=  1:
        continue
    
    print(artist)

    tags = entry.iloc[0]['tags_mb']
    if pd.isna(tags):
        tags = entry.iloc[0]['tags_lastfm']
    if pd.isna(tags):
        continue
        
    tags = [a.strip().replace('-', ' ').replace('&', ' ').lower() for a in tags.split(';')]
    
    if not disable_genre and genre_to_be_extracted not in tags and genre_to_be_extracted != 'ALL':
        continue
    if disable_genre:
        print('Not Checking Genre')
    
    for filename in filenames:
        if len(audio_data_arr) > songs_to_be_extracted:
            break
        try:
            if '.mid' in filename:
                filepath = os.path.join(dirname, filename)
                
                audio_dict = Read_midi(filepath, 4).read_file()
                audio_matrix = dict_to_matrix(audio_dict)
                
                if len(audio_matrix) < 2:
                    continue
                
                audio_data_arr.append(audio_matrix)
                audio_tags_arr.append(tags)
                audio_artist_arr.append(artist)
                
                print('-->',len(audio_data_arr),'<--', len(audio_matrix), end="\r")
#                 print('-->',len(audio_data_arr),'<--', end="\r")
        except Exception as err:
            print("ERROR", err)
            pass

print('')

print(len(audio_data_arr))
print(len(audio_tags_arr))
print(len(audio_artist_arr))
# audio_data_arr1 = audio_data_arr 

Rick Derringer
Jimmy Barnes
Lois Lane
Duran Duran
Ludwig van Beethoven
Gene Pitney
The Happenings
Glenn Frey
Tag Team
Ed Townsend
The Offspring
Helen Reddy
Herbie Hancock
Joe Diffie
En Vogue
Sergio
Gerry Mulligan Quartet
Wang Chung
Hughie Cannon
MFSB
Barclay James Harvest
Joan Osborne
Baltimora
Ekseption
Andrea Bocelli
De Deurzakkers
Mariah Carey
Matia Bazar
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try

Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been tur

Major Harris
Jerome Kern
Alain Bashung
Fleetwood Mac
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been tur

John Lennon
Dr. Alban
Howard Carpendale
Mocedades
Bay City Rollers
Atlantic Starr
Fluitsma
Digno Garcia
The Tony Rich Project
Brian McKnight
Erroll Garner
Steps Ahead
Chris Isaak
Jackie McLean
Huub Hangop
Eddie Floyd
Bush
Blues Traveler
Kenny Loggins
Michael McDonald
Steve Perry
Stiltskin
Belinda Carlisle
Crests
Frank Mills
Electric Light Orchestra
Smash Mouth
Frankie Avalon
Peter Allen
Alannah Myles
Axelle Red
Steve Winwood
Freda Payne
Alpenrebellen
Toni Basil
The Five Satins
Desmond Dekker
Au bonheur des dames
John Denver
Dance 2 Trance
Gompie
Ann Peebles
Diana Ross
Rionegro
Marvin Hamlisch
The Clash
Junior Walker
The Hollies
Gary Glitter
Christopher Cross
Michael Bolton
Sugarloaf
Milli Vanilli
Kalua Beach Boys
Crowded House
Mitch Ryder
Dusty Springfield
The Marshall Tucker Band
The Archies
Rush
Alpentrio Tirol
Tasmin Archer
Clouseau
Jive Bunny
Godsmack
Janet Jackson
Don Backy
Laid Back
Julie Covington
Jann Arden
Feargal Sharkey
Fourplay
Fats Domino
Dalla
Robert Knight
The Rivieras
V

Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been tur

The Partridge Family
Bob Welch
Nakatomi
Vicki Brown
Stevie Nicks
Chuck Berry
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Zhi-Vago<-- 2
Dion
Otis Redding
Battiato<-- 2
Soul II Soul
Oingo Boingo
Lou Bega
Freek de Jonge
Tracey Ullman
Harry Chapin
Quarterflash
Gladys Knight
The Rutles
Twila Paris
Simon an

  entry = df[df['artist_mb'].str.contains(artist) == True]


Norman Blake
George Baker Selection
Allan Sherman
Skee-Lo
The Bluebells
Rick James
Ohio Players
Fiona Apple
Alma Cogan
Luniz
Stephen Bishop
Dave Matthews Band
Vengaboys
King Crimson
Nine Inch Nails
Soundgarden
Eddie Cochran
Johnny Mathis
The Standells
Tom Astor
Reel Big Fish
Jackson 5
Imperial Teen
Whitney Houston
Los Bravos
Chyp-notic
Frankie Goes to Hollywood
Puff Daddy
Telly Savalas
Chris Montez
Isabelle Adjani
Eric Clapton
ERROR data byte must be in range 0..127
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on
Try to note off a note that has never been turned on

Sawyer Brown2
Bonnie Raitt
Kate Bush
Miami Sound Machine
FIXX
Britney Spears
Ronan Keating
Jodeci
Carly Simon
Al Jolson
Bronski Beat
Die Fantastischen Vier
Gary Puckett
Yazoo
Mary Wells
Eric Carmen
Van McCoy
Johnny Bristol
Weather Report
Billy Paul
Restless Heart
Boys Town Gang
UZEB
Stone Temple Pilots
Skid Row
Taylor James
The Shamen
The Ink Spots
Supermax
Eddie Kendricks
Perplexer
Lobo
Classics<-- 2
Die Doofen
Cliff Richard
Korgis
Patrick Bruel
Stephanie Mills
Star Wash
UB40
Gene Chandler
Fine Young Cannibals
Katrina and the Waves
Aaron Neville
Bloodhound Gang
Jimmy Buffett
Johnny Rivers
Astrud Gilberto
Grace Jones
Roots Syndicate
Tim Immers
Jefferson Starship
Herb Alpert
Garth Brooks
Strawberry Alarm Clock
Honey Cone
LL Cool J
Victoria Williams
Ted Nugent
George M. Cohan
702
Allen Toussaint
Eiffel 65
Worlds Apart
Will to Power
Ice Cube
Nick Gilder
Deep Purple
Traveling Wilburys
Brian Hyland
Billy Ocean
Right Said Fred
Toys
Drukwerk
Rembrandts
Chris de Burgh
Scott Joplin
Crystal Gayl

Joe Satriani
Buffalo Springfield
Lloyd James
Edvard Grieg
Kim Appleby
John Waite
Gloria Estefan
Marianne Faithfull
Kadoc
Cheryl Lynn
Ruth Jacott
Bertie Higgins
David Dundas
Sister Hazel
Paola Turci
Zhane
Starship
Caterina Caselli
Paul Desmond
Boyz II Men
Puckett
M People
Chocolate
LeAnn Rimes
Rick Astley
New Radicals
Mungo Jerry
Rammstein
Paul Revere and The Raiders
Giorgio Moroder
Carpenters

516
516
516


In [37]:
all_genres = {}

for gs in audio_tags_arr:
    for g in gs:
        if g in all_genres.keys():
            all_genres[g] = all_genres[g]+1
        else:
            all_genres[g] = 0
            
print(all_genres)

{'rock': 4308, 'pop': 3619, 'blues': 734, 'hard rock': 864, 'blues rock': 735, 'jazz fusion': 69, 'christian rock': 13, 'electric blues': 47, 'blues-rock': 42, 'aor': 223, 'rock & roll': 618, 'soul': 963, 'rhythm and blues': 42, 'dutch': 120, 'seen live': 461, 'cover': 9, 'Hip-Hop': 42, 'hip hop': 143, 'rap': 89, 'female vocalists': 291, 'catchy': 4, 'american': 2073, 'hiphop': 17, 'rnb': 366, 'live gezien': 3, 'pur': 3, 'ned': 3, 'Less than 30000': 3, 'electronic': 676, 'synthpop': 562, 'alternative rock': 890, '70s': 399, '80s': 650, '90s': 627, 'dance': 439, 'british': 1749, 'new wave': 475, 'electro': 73, 'uk': 1435, 'pop rock': 3046, 'new romantic': 65, 'english': 1306, '00s': 299, 'classic pop and rock': 3126, 'male vocalists': 372, '10s': 144, 'new romantics': 41, 'classical': 283, 'german': 150, 'german composer': 20, 'production music': 53, 'opera': 59, 'piano': 83, 'pianist': 43, 'composer': 190, 'romantic': 74, 'european': 489, 'symphony': 19, 'concerto': 14, 'lo spirito': 1

In [27]:
data = {'sig':audio_data_arr,
'tags':audio_tags_arr,
'artist':audio_artist_arr}


np.save('midi_2_channel_'+ str(len(audio_data_arr)) +'__' + genre_to_be_extracted + '__song_tags_artists.npy', data)

In [12]:
print(data['tags'])

[['rock', 'pop', 'blues', 'hard rock', 'blues rock', 'jazz fusion', 'christian rock', 'electric blues', 'blues-rock', 'aor', 'rock & roll'], ['rock', 'hard rock', 'soul', 'rhythm and blues', 'blues-rock'], ['pop', 'dutch', 'seen live', 'cover', 'Hip-Hop', 'hip hop', 'soul', 'rap', 'female vocalists', 'catchy', 'american', 'hiphop', 'rnb', 'live gezien', 'pur', 'ned', 'Less than 30000'], ['pop', 'dutch', 'seen live', 'cover', 'Hip-Hop', 'hip hop', 'soul', 'rap', 'female vocalists', 'catchy', 'american', 'hiphop', 'rnb', 'live gezien', 'pur', 'ned', 'Less than 30000'], ['pop', 'dutch', 'seen live', 'cover', 'Hip-Hop', 'hip hop', 'soul', 'rap', 'female vocalists', 'catchy', 'american', 'hiphop', 'rnb', 'live gezien', 'pur', 'ned', 'Less than 30000'], ['pop', 'dutch', 'seen live', 'cover', 'Hip-Hop', 'hip hop', 'soul', 'rap', 'female vocalists', 'catchy', 'american', 'hiphop', 'rnb', 'live gezien', 'pur', 'ned', 'Less than 30000'], ['rock', 'electronic', 'synthpop', 'pop', 'alternative roc

In [5]:
npy_loc = 'midi_single_channel_7501__ALL__song_tags_artists.npy'
# npy_loc = '/kaggle/input/midi-numpy-array/midi_single_channel_2500__pop__song_tags_artists.npy'

data = np.load(npy_loc,allow_pickle=True)

x_audio_data_arr = data.item().get('sig') 
x_audio_tag_arr = data.item().get('tags')
x_audio_artist_arr = data.item().get('artist')

In [7]:
x_audio_data_arr = x_audio_data_arr[1000:3500]
x_audio_tag_arr = x_audio_tag_arr[1000:3500]
x_audio_artist_arr = x_audio_artist_arr[1000:3500]

In [8]:
data = {'sig':x_audio_data_arr,
'tags':x_audio_tag_arr,
'artist':x_audio_artist_arr}

np.save('midi_single_channel_2501__ALL__song_tags_artists.npy', data)

# EMOPIA PARSER

In [19]:
ARTIST_DIR = 'EMOPIA_1.0/EMOPIA_1.0/midis'

import os

audio_data_arr = []
audio_qs_arr = []

songs_to_be_extracted = 1555
genre_to_be_extracted = "blues" # 'ALL' # 'disco' # 'trance' # 'jazz'
disable_genre = False


df = pd.read_csv(r'artists_filtered.csv')

for dirname, _, filenames in os.walk(ARTIST_DIR):
    for filename in filenames:
        try:
            q = filename.split('_')[0]
            _rest = filename.split('_')[-1]
            name = filename.split('_' + _rest)[0].split(q + "_")[1]
            
            q = int(q[1])

            if '.mid' in filename:
                filepath = os.path.join(dirname, filename)

                audio_dict = Read_midi(filepath, 4).read_file()
                audio_matrix = dict_to_matrix(audio_dict)

#                 if len(audio_matrix) < 2:
#                     continue

                audio_data_arr.append(audio_matrix)
                audio_qs_arr.append(q)

                print('-->',len(audio_data_arr),'<--', len(audio_matrix), end="\r")
        except Exception as err:
            print("ERROR", err)
            pass

print('')

print(len(audio_data_arr))
print(len(audio_qs_arr))
# print(len(audio_artist_arr))
# audio_data_arr1 = audio_data_arr 

--> 1078 <-- 1
1078
1078


In [20]:
data = {
    'sig':audio_data_arr,
    'q':audio_qs_arr
}


np.save('EMOPIA_'+ str(len(audio_data_arr)) + '__song_quads.npy', data)

# -- EMPOIA PARSER

## Metadata Extractor

This is used to assign a genre from the dataset 

In [None]:
import pandas as pd

df = pd.read_csv(r'LakhMIDI/artists.csv')

print(df)

In [None]:
found = 0
notfound = 0

dfCopy = df.copy(deep=True)
dfCopy = dfCopy[0:0]

display(dfCopy)

for dirname, _, filenames in os.walk('LakhMIDI/'):
#     for filename in filenames[:1]:
    if True:
        artist = dirname.split('LakhMIDI/')[1].replace('_', ' ')
        print(artist)
        if len(artist) < 2:
            continue
            
        entry = df[df['artist_mb'].str.contains(artist) == True] 
        IDs = entry.index.values
#         print(IDs)

        if len(entry) > 0 and len(entry) < 5:
            if len(entry) > 1:
                entry_new = df[(df['artist_mb'] == artist) == True]
                if len(entry_new) == 1:
                    entry = entry_new
                else:
                    entry = entry.sort_values(by=['listeners_lastfm', 'scrobbles_lastfm'], ascending=False)[0:1]
                    
            dfCopy = pd.concat([dfCopy,entry])
            display(dfCopy)
            found += 1
        else:
            notfound += 1

print("Found ", found)
print("Not Found ", notfound)
        

In [77]:
dfCopy.to_csv('artists_filtered.csv', index=False)  

In [None]:
display(dfCopy)

In [37]:
print("Found ", found)
print("Not Found ", notfound)

Found  20
Not Found  0


## -- End of Metadata Extractor