In [82]:
import pandas as pd
import numpy as np
import pretty_midi
import music21
import re
from random import sample
from keras.utils import to_categorical

In [83]:
import os
os.chdir("../data")

### Read text data

In [84]:
with open("Jigs.txt") as my_file:
    abc_text = my_file.read()
    
print(abc_text)

{\rtf1\ansi\ansicpg1252\cocoartf2512
\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fmodern\fcharset0 Courier;}
{\colortbl;\red255\green255\blue255;\red0\green0\blue0;}
{\*\expandedcolortbl;;\cssrgb\c0\c0\c0;}
\margl1440\margr1440\vieww10800\viewh8400\viewkind0
\deftab720
\pard\pardeftab720\partightenfactor0

\f0\fs24 \cf2 \expnd0\expndtw0\kerning0
\outl0\strokewidth0 \strokec2 X: 1\
T:A and D\
% Nottingham Music Database\
S:EF\
M:4/4\
K:A\
M:6/8\
P:A\
f|"A"ecc c2f|"A"ecc c2f|"A"ecc c2f|"Bm"BcB "E7"B2f|\
"A"ecc c2f|"A"ecc c2c/2d/2|"D"efe "E7"dcB| [1"A"Ace a2:|\
 [2"A"Ace ag=g||\\\
K:D\
P:B\
"D"f2f Fdd|"D"AFA f2e/2f/2|"G"g2g ecd|"Em"efd "A7"cBA|\
"D"f^ef dcd|"D"AFA f=ef|"G"gfg "A7"ABc |1"D"d3 d2e:|2"D"d3 d2||\
\
\
\
X: 2\
T:Abacus\
% Nottingham Music Database\
S:By Hugh Barwell, via Phil Rowe\
M:6/8\
K:G\
"G"g2g B^AB|d2d G3|"Em"GAB "Am"A2A|"D7"ABc "G"BAG|\
"G"g2g B^AB|d2d G2G|"Em"GAB "Am"A2G|"D7"FGA "G"G3::\
"D7"A^GA DFA|"G"B^AB G3|"A7"^c=c^c Ace|"D7"fef def|\
"G"g2g de=f|"E7"e2e Bcd|"Am

In [125]:
test = re.findall(r'\+n', r'|\\\n"A"ecc c2f|')
test

[]

In [71]:
# Create vocab dictionaries
def create_dictionaries(text_data):
    char_list = list(text_data)
    vocab = list(set(char_list))
    
    # Dictionary with character to integer
    vocab_dict = {i: j for i,j in enumerate(char_list)}
    
    # Dictionary with integer to character
    vocab_dict_rev = {j: i for i,j in enumerate(char_list)}
        
    return vocab_dict, vocab_dict_rev

In [72]:
num_to_char, char_to_num = create_dictionaries(abc_text)

### Encoder & Decoder functions

In [73]:
def encoder(text_data, dictionary):
    character_nums = list(text_data)
    
    for i in range(len(character_nums)):
        character_nums[i] = dictionary[character_nums[i]]
        
    return character_nums

In [74]:
text_nums = encoder(abc_text, char_to_num)

In [75]:
def decoder(numeric_data, dictionary):
    text_list = []
    
    for i in range(len(numeric_data)):
        text_list.append(dictionary[numeric_data[i]])
    
    return "".join(text_list)
        

In [76]:
decoded_text = decoder(text_nums, num_to_char)

# Check to make sure decoder works:
abc_text == decoded_text

True

In [77]:
def create_training(char_nums, num_samples, str_length):
    # Get starting indices of the random samples for your training batch
    start_indices = sample(char_nums[0:(len(char_nums)-str_length-1)], num_samples)
    
    # The x_values begin at the starting indices and are str_length characters long
    # The y_values begin one character into the x_values and end one character longer than x_values
    x_data = np.array(char_nums[0:str_length])
    y_data = np.array(char_nums[1:str_length+1])
    for i in range(1,num_samples):
        x_data = np.vstack((x_data, np.array(char_nums[i:i+str_length])))
        y_data = np.vstack((y_data, np.array(char_nums[i+1:i+str_length+1])))
    
    #return x_data, y_data
    return x_data, y_data

In [78]:
x, y = create_training(text_nums, 10, 20)
x.shape, y.shape

((10, 20), (10, 20))

In [79]:
x

array([[   168, 135549, 135274, 135256, 135515, 130943, 135549, 135387,
        135242, 135260, 135278, 135549, 135387, 135242, 135260, 135278,
        135497, 130962, 135517, 130943],
       [135549, 135274, 135256, 135515, 130943, 135549, 135387, 135242,
        135260, 135278, 135549, 135387, 135242, 135260, 135278, 135497,
        130962, 135517, 130943, 135532],
       [135274, 135256, 135515, 130943, 135549, 135387, 135242, 135260,
        135278, 135549, 135387, 135242, 135260, 135278, 135497, 130962,
        135517, 130943, 135532, 132744],
       [135256, 135515, 130943, 135549, 135387, 135242, 135260, 135278,
        135549, 135387, 135242, 135260, 135278, 135497, 130962, 135517,
        130943, 135532, 132744, 135532],
       [135515, 130943, 135549, 135387, 135242, 135260, 135278, 135549,
        135387, 135242, 135260, 135278, 135497, 130962, 135517, 130943,
        135532, 132744, 135532, 135549],
       [130943, 135549, 135387, 135242, 135260, 135278, 135549, 135387,
   

In [67]:
y

array([[135549, 135274, 135256, 135515, 130943, 135549, 135387, 135242,
        135260, 135278, 135549, 135387, 135242, 135260, 135278, 135497,
        130962, 135517, 130943, 135532],
       [135274, 135256, 135515, 130943, 135549, 135387, 135242, 135260,
        135278, 135549, 135387, 135242, 135260, 135278, 135497, 130962,
        135517, 130943, 135532, 132744],
       [135256, 135515, 130943, 135549, 135387, 135242, 135260, 135278,
        135549, 135387, 135242, 135260, 135278, 135497, 130962, 135517,
        130943, 135532, 132744, 135532],
       [135515, 130943, 135549, 135387, 135242, 135260, 135278, 135549,
        135387, 135242, 135260, 135278, 135497, 130962, 135517, 130943,
        135532, 132744, 135532, 135549],
       [130943, 135549, 135387, 135242, 135260, 135278, 135549, 135387,
        135242, 135260, 135278, 135497, 130962, 135517, 130943, 135532,
        132744, 135532, 135549, 135497],
       [135549, 135387, 135242, 135260, 135278, 135549, 135387, 135242,
   