In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

In [2]:
# load the cleaned chord progressions
# Only the progression column is kept (renamed to 'chords'); every progression
# gets an '<EOS>' marker appended to its end.
chord_progressions = (
    pd.read_pickle('df_clean.pickle')
    .rename(columns={'chord_progression_C': 'chords'})
    .loc[:, ['chords']]
    .assign(chords=lambda frame: frame['chords'].map(lambda chords: chords + ['<EOS>']))
)

# drop progressions that are too short
# The length tested here already counts the trailing '<EOS>', so the strict '>'
# keeps progressions with at least `min_progression_length` chords.
min_progression_length = 2
chord_progressions = (
    chord_progressions
    .loc[lambda frame: frame['chords'].str.len() > min_progression_length]
    .reset_index(drop=True)
)

In [3]:
# how many distinct progressions are there compared with the total row count?
# Lists are not hashable, so each progression is compared by its string form.
n_unique_progressions = chord_progressions['chords'].astype('str').nunique()
n_total_progressions = len(chord_progressions)
f"{n_unique_progressions} unique chord progressions out of {n_total_progressions} total chord progressions"

'4630 unique chord progressions out of 17302 total chord progressions'

In [4]:
# the 20 most frequent progressions (counted on their string form)
progression_counts = chord_progressions['chords'].astype('str').value_counts()
progression_counts.head(20)

chords
['Am', 'F', 'C', 'G', '<EOS>']    433
['C', 'G', 'Am', 'F', '<EOS>']    426
['F', 'C', 'G', '<EOS>']          318
['F', 'C', 'G', 'Am', '<EOS>']    244
['C', 'F', '<EOS>']               224
['G', 'F', 'C', '<EOS>']          205
['F', 'G', 'C', '<EOS>']          202
['C', 'G', 'F', '<EOS>']          196
['C', 'G', '<EOS>']               170
['G', 'Am', 'F', 'C', '<EOS>']    170
['C', 'F', 'G', '<EOS>']          162
['F', 'G', 'Am', '<EOS>']         159
['Am', 'G', 'F', '<EOS>']         157
['Am', 'F', 'C', '<EOS>']         150
['Am', 'F', 'G', '<EOS>']         148
['C', 'Am', 'F', 'G', '<EOS>']    139
['C', 'Am', 'F', '<EOS>']         128
['F', 'Am', 'G', '<EOS>']         117
['G', 'C', 'F', '<EOS>']          109
['Dm', 'F', 'C', 'G', '<EOS>']     91
Name: count, dtype: int64

In [10]:
# what are all of the chords (tokens)?
# The vocabulary is every distinct symbol found in the progressions (chords and
# '<EOS>') plus a single space.
vocabulary = {' '}  # include the space between chords as a token
for progression in chord_progressions['chords']:
    vocabulary.update(progression)
tokens = [0] + sorted(vocabulary)  # include zero for zero padding; listed first so it gets id 0

# encoder: token -> integer id (the token's position in `tokens`);
# `tokens` itself serves as the reverse id -> token lookup
encoder = {token: token_id for token_id, token in enumerate(tokens)}

# save to file
with open('tokens.pickle', 'wb') as f:
    pickle.dump(tokens, f)
with open('encoder.pickle', 'wb') as f:
    pickle.dump(encoder, f)

# examine tokens, eight per printed row
for position, token in enumerate(tokens, start=1):
    print(f'{token:8}', end='\t')
    if position % 8 == 0:
        print()
print('\n')
print('Number of Tokens: ', len(tokens))

       0	        	<EOS>   	Am      	Am/E    	Am/F    	Am11    	Am6     	
Am7     	Am7/C   	Am7/G   	Am7add11	Am9     	Amadd9  	Amaj7   	Amaj7sus2	
Amaj9   	Ammaj7  	Bm      	Bm11    	Bm6     	Bm7     	Bm7b5   	Bm9     	
Bmaj7   	Bmmaj7  	C       	C/E     	C/G     	C2      	C4      	C5      	
C6      	C6add11 	C7      	C7add11 	C7sus2  	C7sus4  	C9      	Cadd#11 	
Cadd11  	Cadd2   	Cadd4   	Cadd9   	Caug    	Cflat5  	Csus    	Csus2   	
Csus4   	Ddim    	Dm      	Dm11    	Dm13    	Dm6     	Dm6/F   	Dm7     	
Dm7/G   	Dm7add11	Dm7b5   	Dm9     	Dmadd9  	Dmaj    	Dmaj7   	Dmaj9   	
E       	E5      	E6      	E7      	E7b13   	E7b9    	E7sus   	E7sus4  	
E9sus4  	Edim    	Em      	Em6     	Em7     	Em9     	Emadd9  	Emaj7   	
Esus2   	Esus4   	F       	F/G     	F2      	F5      	F6      	F6add9  	
F6sus2  	F7      	F7sus4  	F9      	F9b5    	Fadd#11 	Fadd2   	Fadd4   	
Fadd9   	Fflat5  	Fsus    	Fsus2   	Fsus4   	G       	G#m     	G#m7    	
G#maj7  	G/C     	G11     	G13     	G2      	G4   

In [17]:
# training / validation split
# A fixed seed makes the shuffle (and therefore train.bin / val.bin) identical
# on every Restart & Run All.
val_size = 0.1
split_seed = 42
train, val = train_test_split(chord_progressions, test_size=val_size, random_state=split_seed)

# longest progression -> sequence_length
# Measured on the in-memory frame rather than the raw pickle so that the
# appended '<EOS>' token is counted; zero padding to this length then gives
# every progression the same number of tokens.
sequence_length = chord_progressions['chords'].str.len().max()

# add to list, zero pad if required
def get_progression_list(progressions_df, zero_pad=False, pad_length=None, token_encoder=None):
    """Flatten every progression in a frame into one token list and one id list.

    Parameters
    ----------
    progressions_df : pandas.DataFrame
        Frame whose first column holds one list of chords per row.
    zero_pad : bool, default False
        When True, each progression is right-padded with the integer 0 up to
        `pad_length` before being added to the output.
    pad_length : int, optional
        Target length used when `zero_pad` is True. Defaults to the
        notebook-level `sequence_length`.
    token_encoder : dict, optional
        Mapping from token to integer id. Defaults to the notebook-level
        `encoder`.

    Returns
    -------
    (list, list)
        The concatenated tokens of all progressions, and the same sequence
        translated to ids through `token_encoder`.

    Raises
    ------
    KeyError
        If a token is missing from `token_encoder`.

    Notes
    -----
    A progression longer than `pad_length` is neither padded nor truncated
    (a negative repeat count yields an empty list), so it keeps its own length.
    """
    if token_encoder is None:
        token_encoder = encoder
    if zero_pad and pad_length is None:
        pad_length = sequence_length

    prog_list, prog_list_enc = [], []
    for chords in progressions_df.iloc[:, 0]:
        chords = list(chords)  # work on a copy so the frame's lists are never modified
        if zero_pad:  # zero pad
            chords += [0] * (pad_length - len(chords))
        prog_list.extend(chords)
        prog_list_enc.extend(token_encoder[chord] for chord in chords)
    return prog_list, prog_list_enc
    
# flatten and encode each split (no padding)
train_list, train_list_enc = get_progression_list(train)
val_list, val_list_enc = get_progression_list(val)

# convert the id sequences to uint16 arrays and write them as raw binary files
train_enc = np.asarray(train_list_enc, dtype=np.uint16)
val_enc = np.asarray(val_list_enc, dtype=np.uint16)
train_enc.tofile('train.bin')
val_enc.tofile('val.bin')