# Step 1: Compression and Vectorization of Songs

References: 

https://github.com/shubham3121/music-generation-using-rnn 

https://www.hackerearth.com/blog/developers/jazz-music-using-deep-learning/

https://pyguitarpro.readthedocs.io/en/stable/index.html

## Imports

In [3]:
from glob import glob
import guitarpro
from guitarpro import *
from matplotlib import pyplot as plt
import numpy as np
import os
import pickle

from keras.utils import np_utils

from _Compressor import compress_track

## Choose Songs (make sure every song has been converted to 4/4 time before running)

In [4]:
# Select the Guitar Pro files to process. Patterns are matched relative to the
# current working directory.
#
# glob() returns matches in arbitrary, filesystem-dependent order, so the
# result is sorted to keep the order of the extracted notes (and of the
# training sequences built from them) reproducible across runs and machines.

# Choose all songs
# songs = glob('*.gp*')

# Choose a certain cohort of songs

'''
filenames = glob('pantera-*.gp*') + \
            glob('dream_theater-*.gp*')
'''
#filenames = ['korn-freak_on_a_leash.gp5']
#filenames = ['metallica-master_of_puppets.gp5']

#filenames = glob('metallica-*.gp*')
filenames = sorted(glob('korn-*.gp*'))

In [6]:
# Gather every non-percussion track from all of the selected song files.
tracks = []

for filename in filenames:
    song = guitarpro.parse(filename)

    # Percussion tracks are left out; every other track is kept.
    for track in song.tracks:
        if not track.isPercussionTrack:
            tracks.append(track)

print(f'Number of tracks: {len(tracks)}')

#[print(track.name) for track in tracks]

Number of tracks: 10


## Song Compression

In [8]:
def _is_rest_only(measure):
    """Return True when the first element of every beat in ``measure`` is 'rest'."""
    return all(beat[0] == 'rest' for beat in measure)


notes = []
num_rest_measures = 0

for track in tracks:
    compressed_track = compress_track(track)

    # A track consisting solely of rest measures is dropped entirely; its
    # measures are not added to the removed-measure counter.
    if all(_is_rest_only(measure) for measure in compressed_track):
        continue

    for measure in compressed_track:
        # Rest-only measures are counted and skipped; every beat of any other
        # measure is appended to the flat notes list.
        # TODO: Consider only removing rest measures that occur at least twice in a row.
        if _is_rest_only(measure):
            num_rest_measures += 1
        else:
            notes.extend(measure)

# Store the flat list of beats for later stages.
with open('notes', 'wb') as notes_file:
    pickle.dump(notes, notes_file)

print(f'Number of notes:\t {len(notes)}')
print(f'Number of unique notes:\t {len(set(notes))}')
print(f'Number of removed rest-only measures: {num_rest_measures}')

Number of notes:	 9157
Number of unique notes:	 221
Number of removed rest-only measures: 2630


## Song Vectorization and NN Input/Output Storage

#### prepare_sequences(notes, n_vocab)

In [5]:
# Number of consecutive notes fed to the network to predict the next one.
SEQUENCE_LENGTH = 100

# The function originally returned (network_input, network_output).

n_vocab = len(set(notes))
# ^ originally a parameter of the function

# def prepare_sequences(notes, n_vocab):

# At least SEQUENCE_LENGTH + 1 notes are needed to form one (input, target)
# pair. Fail here with a clear message instead of producing empty arrays.
if len(notes) <= SEQUENCE_LENGTH:
    raise ValueError(
        f'Need more than {SEQUENCE_LENGTH} notes to build a training sequence, '
        f'got {len(notes)}.'
    )

# Create a dictionary to map notes to integers (sorted so the mapping is
# deterministic), plus its inverse for decoding.
note_to_int = {note: number for number, note in enumerate(sorted(set(notes)))}
int_to_note = {number: note for note, number in note_to_int.items()}

# Encode every note once up front rather than once per sliding window.
encoded_notes = [note_to_int[note] for note in notes]

network_input = []
network_output = []

# Create input sequences and the corresponding outputs: each window of
# SEQUENCE_LENGTH notes is paired with the single note that follows it.
for i in range(len(notes) - SEQUENCE_LENGTH):
    network_input.append(encoded_notes[i:i + SEQUENCE_LENGTH])
    network_output.append(encoded_notes[i + SEQUENCE_LENGTH])

n_patterns = len(network_input)

# Reshape the input into a format compatible with LSTM layers:
# (n_patterns, SEQUENCE_LENGTH, 1).
network_input = np.reshape(network_input, (n_patterns, SEQUENCE_LENGTH, 1))

# Normalize input by the vocabulary size.
network_input = network_input / float(n_vocab)

# One-hot encode the output vectors. num_classes is passed explicitly because
# to_categorical otherwise infers the width from the largest label present in
# network_output, which is smaller than n_vocab whenever the highest-index
# note never appears as a target (e.g. it only occurs in the first window).
network_output = np_utils.to_categorical(network_output, num_classes=n_vocab)

print('Input and output processed.')

Input and output processed.


In [6]:
# Persist the training data. The three objects are pickled back-to-back into
# one file, in the order: network_input, network_output, n_vocab.
with open('network_data', 'wb') as data_file:
    for payload in (network_input, network_output, n_vocab):
        pickle.dump(payload, data_file)

print('Network data successfully pickled.')

Network data successfully pickled.


In [7]:
# Persist both lookup tables. They are pickled back-to-back into one file, in
# the order: note_to_int, int_to_note.
with open('note_int_conversions', 'wb') as mapping_file:
    for mapping in (note_to_int, int_to_note):
        pickle.dump(mapping, mapping_file)

print('Note-to-int dict successfully pickled.')

Note-to-int dict successfully pickled.


## Visualize some patterns in the data

In [8]:
# Count how often each integer-encoded note occurs in the data set.
encoded_notes = [note_to_int[note] for note in notes]
vals, freq = np.unique(encoded_notes, return_counts=True)

# Decode the integer codes back into notes and turn counts into percentages.
vals = np.array([int_to_note[code] for code in vals])
freq = 100 * freq / freq.sum()

# Indices of the entries ordered from most to least frequent.
sort_idx = freq.argsort()[::-1]

print('Top 15 most common values:')
print('[#Semitones, duration, isDotted]\n')
top_idx = sort_idx[:15]
for note_value, percentage in zip(vals[top_idx], freq[top_idx]):
    print(f'{note_value} \t {percentage :.1f}%')

Top 15 most common values:
[#Semitones, duration, isDotted]

['0' '8' 'False'] 	 11.5%
['0' '16' 'False'] 	 5.8%
['10' '8' 'False'] 	 3.6%
['5' '8' 'False'] 	 3.5%
['3' '8' 'False'] 	 3.3%
['7' '8' 'False'] 	 2.9%
['2' '8' 'False'] 	 2.7%
['12' '8' 'False'] 	 2.4%
['0_5' '8' 'False'] 	 2.3%
['tied' '8' 'False'] 	 1.8%
['0' '32' 'False'] 	 1.8%
['rest' '8' 'False'] 	 1.6%
['8' '8' 'False'] 	 1.6%
['24' '8' 'False'] 	 1.6%
['17' '8' 'False'] 	 1.5%


In [9]:
# Disabled plotting cell: the statements below are wrapped in a string literal,
# so they are never executed, and the trailing semicolon keeps the notebook
# from echoing the string as cell output. If re-enabled, the code would plot
# the integer-encoded notes for the first and for the last tenth of `notes`.
'''
cutoff = len(notes) // 10
plt.figure()
plt.plot([note_to_int[x] for x in notes[:cutoff]])
plt.figure()
plt.plot([note_to_int[x] for x in notes[-cutoff:]])
''';