In [82]:
import pandas as pd
import numpy as np
import pretty_midi
import music21
import re
from random import sample
from keras.utils import to_categorical

In [83]:
import os
os.chdir("../data")

### Read text data

In [180]:
with open("Jigs.txt") as my_file:
    abc_text = my_file.read()

# Cut out unnecessary backslashes
abc_text = re.sub('\\\\+\n', '\n', abc_text)

# Find starting index of the data we care about
start_ind = abc_text.find("X:")
abc_text = abc_text[start_ind:]

# Split data into training and testing sets
training_ind = int(len(abc_text)*0.7)
abc_train = abc_text[:training_ind]
abc_test = abc_text[training_ind:]

# Write files
train_file = open("jig_train.txt","w")
train_file.write(abc_train)
train_file.close()

test_file = open("jig_test.txt", "w")
test_file.write(abc_test)
test_file.close()

In [179]:
print(abc_test)

ingham Music Database
S:Pauline Wilson, via Phil Rowe
M:6/8
K:G
"G"d2d d^cd|e2B BAG|"C"E2E "Cm"_E2E|"G"D6|"Am"c2c "D7"cBA|
"G"B2A G2B|"A7"A2G F2G|"D7"A6|"G"d2d d^cd|e2B BAG|
"C"c2B c2d|e3 g2e|"G"d2d B2d|"C"e2d "G"B2G|"Am"E2G "D7"F2A|"G"G6||
"C"c3 e3|g2g g2e|"G"d2e d2B|d6|"Am"A2^G A2B|"D7"c2d e2c|"G"B2^A B2c|"G7"d3 G3|
"C"c3 e3|g2g g2e|"G"d2e d2B|"Em"e6|"Am"A2B "D7"c2e|"G"d2B "Em"g2e|
"Am"d2B "D7"cBA|"G"G6||


X: 240
T:The Pleated Plaidie
% Nottingham Music Database
S:Dennis Salter, via Phil Rowe
M:6/8
K:A
e2d |"A"c2e "E7"B2e|"A"A2A GFE|"D"F2A d2f|"E7"eBc dcB|"A"c2e "E7"B2e|
"A"Ace a3|"B7"bfb agf|"E7"efe gfe|"A"c2e "E7"B2e|"A"A2A GFE|"D"F2A d2f|
"E7"g3 efg|"A"a2a "D"agf|"A"efe ece|"Bm"f2f "E7"gfg|"A"a3 "A7"ag=g||
"D"f2f def|"A"e2e c2c|"Bm"dcd "E7"fed|"A"c3 -c3|"Bm"Bcd "E7"fed|
"A"c2e A2c|"B7"B^da gBf|"E7"e3 e2=f|"D"f2f def|"A"e2e cde|
"Bm"d2c "B7"BcA|"E7"GA^A B2=c|"A"^c2e "E7"B2e|"A"Ace a2a|"B7"b2b "E7"gfe|
"A"a3 ||


X: 241
T:Pop Goes the Wheezle
% Nottingham Music Database
S:Trad, via

In [165]:
test = re.sub('\\\\+\n', '\n', '|\\\\\\\n"A"ecc c2f|')
test

'|\n"A"ecc c2f|'

In [71]:
# Create vocab dictionaries
def create_dictionaries(text_data):
    char_list = list(text_data)
    vocab = list(set(char_list))
    
    # Dictionary with character to integer
    vocab_dict = {i: j for i,j in enumerate(char_list)}
    
    # Dictionary with integer to character
    vocab_dict_rev = {j: i for i,j in enumerate(char_list)}
        
    return vocab_dict, vocab_dict_rev

In [72]:
num_to_char, char_to_num = create_dictionaries(abc_text)

### Encoder & Decoder functions

In [73]:
def encoder(text_data, dictionary):
    character_nums = list(text_data)
    
    for i in range(len(character_nums)):
        character_nums[i] = dictionary[character_nums[i]]
        
    return character_nums

In [74]:
text_nums = encoder(abc_text, char_to_num)

In [75]:
def decoder(numeric_data, dictionary):
    text_list = []
    
    for i in range(len(numeric_data)):
        text_list.append(dictionary[numeric_data[i]])
    
    return "".join(text_list)
        

In [76]:
decoded_text = decoder(text_nums, num_to_char)

# Check to make sure decoder works:
abc_text == decoded_text

True

In [77]:
def create_training(char_nums, num_samples, str_length):
    # Get starting indices of the random samples for your training batch
    start_indices = sample(char_nums[0:(len(char_nums)-str_length-1)], num_samples)
    
    # The x_values begin at the starting indices and are str_length characters long
    # The y_values begin one character into the x_values and end one character longer than x_values
    x_data = np.array(char_nums[0:str_length])
    y_data = np.array(char_nums[1:str_length+1])
    for i in range(1,num_samples):
        x_data = np.vstack((x_data, np.array(char_nums[i:i+str_length])))
        y_data = np.vstack((y_data, np.array(char_nums[i+1:i+str_length+1])))
    
    #return x_data, y_data
    return x_data, y_data

In [78]:
x, y = create_training(text_nums, 10, 20)
x.shape, y.shape

((10, 20), (10, 20))

In [79]:
x

array([[   168, 135549, 135274, 135256, 135515, 130943, 135549, 135387,
        135242, 135260, 135278, 135549, 135387, 135242, 135260, 135278,
        135497, 130962, 135517, 130943],
       [135549, 135274, 135256, 135515, 130943, 135549, 135387, 135242,
        135260, 135278, 135549, 135387, 135242, 135260, 135278, 135497,
        130962, 135517, 130943, 135532],
       [135274, 135256, 135515, 130943, 135549, 135387, 135242, 135260,
        135278, 135549, 135387, 135242, 135260, 135278, 135497, 130962,
        135517, 130943, 135532, 132744],
       [135256, 135515, 130943, 135549, 135387, 135242, 135260, 135278,
        135549, 135387, 135242, 135260, 135278, 135497, 130962, 135517,
        130943, 135532, 132744, 135532],
       [135515, 130943, 135549, 135387, 135242, 135260, 135278, 135549,
        135387, 135242, 135260, 135278, 135497, 130962, 135517, 130943,
        135532, 132744, 135532, 135549],
       [130943, 135549, 135387, 135242, 135260, 135278, 135549, 135387,
   

In [67]:
y

array([[135549, 135274, 135256, 135515, 130943, 135549, 135387, 135242,
        135260, 135278, 135549, 135387, 135242, 135260, 135278, 135497,
        130962, 135517, 130943, 135532],
       [135274, 135256, 135515, 130943, 135549, 135387, 135242, 135260,
        135278, 135549, 135387, 135242, 135260, 135278, 135497, 130962,
        135517, 130943, 135532, 132744],
       [135256, 135515, 130943, 135549, 135387, 135242, 135260, 135278,
        135549, 135387, 135242, 135260, 135278, 135497, 130962, 135517,
        130943, 135532, 132744, 135532],
       [135515, 130943, 135549, 135387, 135242, 135260, 135278, 135549,
        135387, 135242, 135260, 135278, 135497, 130962, 135517, 130943,
        135532, 132744, 135532, 135549],
       [130943, 135549, 135387, 135242, 135260, 135278, 135549, 135387,
        135242, 135260, 135278, 135497, 130962, 135517, 130943, 135532,
        132744, 135532, 135549, 135497],
       [135549, 135387, 135242, 135260, 135278, 135549, 135387, 135242,
   