# Step 1: Compression and Vectorization of Songs

References: 

https://github.com/shubham3121/music-generation-using-rnn 

https://www.hackerearth.com/blog/developers/jazz-music-using-deep-learning/

https://pyguitarpro.readthedocs.io/en/stable/index.html

## Imports

In [1]:
from glob import glob
import guitarpro
from guitarpro import *
from matplotlib import pyplot as plt
import mgzip
import numpy as np
import os
import pandas as pd
import pickle
import string
from tqdm import tqdm

from keras.utils import np_utils

from _Compressor import compress_track
from _NoteData import NoteData

## Constants / Hyperparameters

In [2]:
# Lookup from a pitch label to its integer value (0-127). The label is whatever
# str() yields for a GuitarString carrying that value.
MIDI = dict(
    (str(GuitarString(number=0, value=midi_value)), midi_value)
    for midi_value in range(128)
)

# Forwarded to compress_track. When True, the leading component of a note label
# is read as a plain integer; when False it is looked up in MIDI instead.
as_fingerings = True

## Choose Songs (convert every song to 4/4 time before running)

In [3]:
# Choose a certain cohort of songs.
#
# os.path.join keeps the pattern valid regardless of the platform's path
# separator, and sorting fixes the processing order, because glob makes no
# ordering guarantee and the per-track note offsets depend on that order.
filenames = sorted(glob(os.path.join('songs', '*.gp5')))
#filenames = sorted(glob(os.path.join('songs', 'pantera*')))

## Compress Each Track

In [4]:
def get_artist_name(filename):
    """Derive a display-ready artist name from a song file path.

    The file name is expected to look like ``<artist>-<title>.<ext>``, with
    underscores standing in for spaces inside the artist part.

    Parameters
    ----------
    filename : str
        Path to the song file. Both ``\\`` and ``/`` are accepted as
        directory separators.

    Returns
    -------
    str
        The text before the first ``-`` of the file name, with underscores
        replaced by spaces and each word capitalized.
    """
    # Normalize separators so the directory part is stripped on every platform.
    basename = filename.replace('\\', '/').split('/')[-1]

    # Everything before the first hyphen is treated as the artist.
    raw_artist = basename.split('-')[0]

    return string.capwords(raw_artist.replace('_', ' '))

In [5]:
# Parse every selected song, compress each usable track, and collect one
# metadata record per kept track (same order as compressed_tracks).
compressed_tracks = []

track_data = []

for filename in tqdm(filenames):
    song = guitarpro.parse(filename)

    for track in song.tracks:
        # Skip percussion tracks and tracks whose instrument number is 113 or
        # higher (treated here as SFX / drum-related sounds).
        if track.isPercussionTrack or track.channel.instrument >= 113:
            continue

        compressed_track = compress_track(track, as_fingerings)

        # A track without a single non-rest beat has nothing to learn from.
        has_notes = any(
            beat[0] != 'rest'
            for measure in compressed_track
            for beat in measure
        )
        if not has_notes:
            continue

        compressed_tracks.append(compressed_track)

        # When the file carries no title, derive one from the file name:
        # the part after the last hyphen, up to the first dot.
        if song.title is None or song.title == '':
            title_part = filename.split('-')[-1].split('.')[0]
            song.title = string.capwords(title_part.replace('_', ' '))

        track_data.append({
            'artist': get_artist_name(filename),
            'song': song.title,
            'name': track.name,
            'instrument': track.channel.instrument,
            'tempo': song.tempo,
            # Value of the last entry in the track's string list.
            'tuning': track.strings[-1].value,
        })


print(f'Number of tracks: {len(compressed_tracks)}')
track_data = pd.DataFrame(track_data)

100%|██████████| 134/134 [01:46<00:00,  1.26it/s]

Number of tracks: 684





## Compile Note Vocabulary

In [6]:
def f(x):
    """Return the pitch component of the vocabulary sort key for a note.

    Parameters
    ----------
    x : str or int
        Either a note label, or an integer code that is first translated
        through ``int_to_note`` (whose entries hold the label at index 0).

    Returns
    -------
    int
        -2 for ``'rest'`` and ``'tied'``, -1 for ``'dead'``. Otherwise the
        first ``_``-separated component of the label, parsed as an integer
        when ``as_fingerings`` is set and looked up in ``MIDI`` when it is not.
    """
    # isinstance (rather than an exact type comparison) keeps str subclasses
    # such as numpy.str_ on the label path instead of treating them as codes.
    if not isinstance(x, str):
        x = int_to_note[x][0]

    if x in ('rest', 'tied'):
        return -2
    if x == 'dead':
        return -1

    # Labels may hold several '_'-joined parts; only the first is ranked.
    first_part = x.split('_')[0]
    if as_fingerings:
        return int(first_part)
    return MIDI[first_part]

def sort(x):
    """Build the ``(pitch, length)`` key used to order the note vocabulary.

    ``x`` is indexed positionally: ``x[0]`` is passed to ``f`` for the pitch
    rank, ``x[1]`` is the note-value denominator and ``x[2]`` the dotted flag.
    The length is the number of 32nd-note units, scaled by 1.5 when dotted
    and truncated to an integer.
    """
    pitch_rank = f(x[0])

    base_units = 32 // x[1]
    dot_factor = 1 + 0.5 * x[2]

    return (pitch_rank, int(base_units * dot_factor))

In [7]:
# Flatten all compressed tracks into a single note sequence and build the
# note <-> integer vocabulary.
notes = []
notes_set = set()

# note_start_indices[k] is the offset in `notes` where track k begins.
note_start_indices = []


for compressed_track in tqdm(compressed_tracks):

    note_start_indices.append(len(notes))

    # Iterate through each measure in each compressed track
    for measure in compressed_track:

        # Every beat joins the vocabulary, including beats of all-rest measures.
        notes_set.update(measure)

        # Measures that are only rests are left out of the note sequence.
        if not all(beat[0] == 'rest' for beat in measure):
            notes.extend(measure)


n_vocab = len(notes_set)

# Order the vocabulary by (pitch, length). Many notes share that key, and a
# stable sort would leave such ties in set-iteration order, which depends on
# per-process string hash randomization. The repr() tie-break makes the
# integer assignment identical across kernel restarts.
note_to_int = {
    note: number
    for number, note in enumerate(
        sorted(notes_set, key=lambda beat: (sort(beat), repr(beat)))
    )
}
int_to_note = {v: k for k, v in note_to_int.items()} # Invert the map

print(f'Number of notes played:\t {len(notes)}')
print(f'Number of unique notes:\t {n_vocab}')

track_data['noteStartIdx'] = note_start_indices

100%|██████████| 684/684 [00:00<00:00, 1937.35it/s]

Number of notes played:	 544773
Number of unique notes:	 1629





## Get some patterns in the data

In [8]:
# Tally how often each encoded note appears in the flat note sequence.
vals, freq = np.unique([note_to_int[note] for note in notes], return_counts=True)

# Decode back to note entries. Packing them into one NumPy array turns every
# field into a string, which is why flags are compared with 'True' below.
vals = np.array([int_to_note[code] for code in vals])
# Convert raw counts into percentages of all notes played.
freq = 100 * freq / freq.sum()


# Indices of the entries from most to least frequent.
sort_idx = np.argsort(freq)[::-1]

print('Top 15 most common values:')
print('[#Semitones, duration, isDotted]\n')
for idx in sort_idx[:15]:
    entry = vals[idx]
    # Fields are positional: label, duration, then two boolean flags
    # (presumably dotted and muted -- confirm against NoteData).
    noteData = NoteData(entry[0], int(entry[1]), entry[2] == 'True', entry[3] == 'True')
    print(f'{freq[idx] :.1f}% \t {str(noteData)}')

Top 15 most common values:
[#Semitones, duration, isDotted]

5.3% 	 0,	 1/8, 
3.9% 	 0,	 1/16, muted
3.2% 	 0,	 1/16, 
3.1% 	 0,	 1/8, muted
2.3% 	 rest,	 1/8, 
1.7% 	 tied,	 1/8, 
1.6% 	 3,	 1/8, 
1.5% 	 2,	 1/16, 
1.4% 	 0_5,	 1/8, 
1.4% 	 1,	 1/8, 
1.4% 	 2,	 1/8, 
1.3% 	 12,	 1/8, 
1.3% 	 3,	 1/16, 
1.3% 	 7,	 1/8, 
1.3% 	 5,	 1/8, 


## Data Compression and Storage

In [9]:
# Ensure the output directory exists. exist_ok=True makes this a single call
# that is safe to re-run, with no separate check-then-create step.
os.makedirs('data', exist_ok=True)

In [10]:
# Write the note sequence and vocabulary as four consecutive pickles in one
# compressed file; a reader must call pickle.load four times in this order.
# os.path.join is used so the files land inside data/ on every platform
# instead of relying on a hard-coded backslash.
with mgzip.open(os.path.join('data', 'notes_data.pickle.gz'), 'wb') as notes_file:
    pickle.dump(notes, notes_file)
    pickle.dump(note_to_int, notes_file)
    pickle.dump(int_to_note, notes_file)
    pickle.dump(n_vocab, notes_file)

# The per-track metadata DataFrame goes into its own compressed pickle.
with mgzip.open(os.path.join('data', 'track_data.pickle.gz'), 'wb') as tracks_file:
    pickle.dump(track_data, tracks_file)

# How to read the DataFrame back (mirrors the write above):
# with mgzip.open(os.path.join('data', 'track_data.pickle.gz'), 'rb') as tracks_file:
#     track_data = pickle.load(tracks_file)