In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging
import os
import re
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import music21
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from joblib import Parallel, delayed
from tqdm import tqdm

from double_jig_gen.data import (
    ABCDataset,
    fix_encoding_errors,
    get_oneills_dataloaders,
    get_folkrnn_dataloaders,
    remove_quoted_strings,
)
from double_jig_gen.tokenizers import Tokenizer, ABCTune

# Attach the default handler to the root logger, then create this notebook's
# own logger and let it emit everything from DEBUG upwards.
logging.basicConfig()
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.DEBUG)

In [None]:
# NOTE(review): DEVICE_ID is not used by any cell visible here; presumably a
# GPU index carried over from the training notebook — confirm or remove.
DEVICE_ID = 7

# Root directory holding the datasets (relative to the working directory).
# DATA_HOME = "/disk/scratch_fast/s0816700/data"
DATA_HOME = "data"

# Location of the folk-rnn `data_v1` file underneath DATA_HOME.
DATA_PATH = "/".join((DATA_HOME, "folk-rnn", "data_v1"))

In [None]:
# Read the whole file at DATA_PATH into memory as a single string.
# NOTE(review): no encoding is specified, so the text is decoded with the
# platform's default encoding; the byte-level comparisons further down assume
# UTF-8, so consider pinning the encoding once the file's encoding is confirmed.
raw_folkrnn_data = Path(DATA_PATH).read_text()

In [None]:
# Split the raw text on blank lines: one list entry per blank-line-separated
# chunk (nominally one tune each). The indices used in the rest of this
# notebook are positions in this list.
abc_data_list = raw_folkrnn_data.split("\n\n")

In [None]:
from io import StringIO 
import sys

class CapturingStderr(list):
    """Context manager that collects text written to ``sys.stderr`` as lines.

    The instance is itself a list. While the ``with`` body runs, ``sys.stderr``
    is replaced by an in-memory buffer; on exit the buffered text is split into
    lines and appended to the instance, and the previous ``sys.stderr`` is put
    back. Only writes that go through the Python-level ``sys.stderr`` object
    are captured.

    Example::

        with CapturingStderr() as lines:
            print("oops", file=sys.stderr)
        # lines == ["oops"]
    """

    def __enter__(self):
        """Swap ``sys.stderr`` for a fresh ``StringIO`` and return ``self``."""
        self._stderr = sys.stderr
        sys.stderr = self._stringio = StringIO()
        return self

    def __exit__(self, *args):
        """Store the captured lines and restore the original ``sys.stderr``.

        Returns ``None``, so an exception raised inside the ``with`` body
        still propagates to the caller.
        """
        try:
            self.extend(self._stringio.getvalue().splitlines())
        finally:
            # Always undo the redirection, even if collecting the output
            # fails, so the rest of the session keeps a working stderr.
            del self._stringio    # free up some memory
            sys.stderr = self._stderr

In [None]:
# Indices into abc_data_list of tunes flagged as problematic; the list was copied from the train.ipynb notebook.
issue_idx = [32, 82, 86, 303, 380, 447, 500, 728, 856, 883, 894, 1157, 1160, 1164, 1402, 1412, 1452, 1523, 1549, 1569, 1986, 1987, 2315, 2586, 2592, 2843, 3175, 3179, 3185, 3189, 3248, 3485, 3486, 3498, 3542, 3646, 3662, 3827, 3901, 3940, 3941, 4399, 4977, 5021, 5071, 5075, 5292, 5369, 5436, 5489, 5717, 5845, 5872, 5883, 6146, 6149, 6153, 6391, 6401, 6441, 6512, 6538, 6558, 6975, 6976, 7304, 7575, 7581, 7832, 8164, 8168, 8174, 8178, 8237, 8474, 8475, 8487, 8531, 8635, 8651, 8816, 8890, 8929, 8930, 9388, 9966, 10043, 10071, 10185, 10241, 10330, 10410, 10412, 10420, 10542, 10598, 10604, 10738, 10747, 10749, 10780, 10827, 10842, 10853, 10858, 10864, 10894, 10902, 10903, 10904, 10905, 10913, 10914, 10915, 10950, 10960, 10961, 11145, 11195, 11293, 11316, 11341, 11359, 11367, 11426, 11441, 11462, 11556, 11770, 11844, 11859, 11877, 11886, 11893, 11951, 11974, 12019, 12115, 12167, 12215, 12243, 12338, 12355, 12381, 12413, 12439, 12462, 12559, 12581, 12614, 12627, 12638, 12744, 12794, 12807, 12880, 12883, 12969, 13108, 13184, 13188, 13319, 13506, 13558, 13710, 13749, 13901, 13987, 13990, 14001, 14002, 14003, 14065, 14139, 14190, 14228, 14261, 14308, 14310, 14359, 14481, 14509, 14579, 14627, 14652, 14752, 14757, 14790, 14817, 14819, 14893, 14914, 14915, 15003, 15008, 15036, 15109, 15187, 15207, 15228, 15355, 15428, 15453, 15518, 15582, 15681, 15916, 15919, 16005, 16026, 16033, 16034, 16060, 16133, 16217, 16231, 16339, 16387, 16433, 16479, 16673, 16737, 16868, 16880, 16987, 17019, 17064, 17130, 17142, 17188, 17261, 17475, 17615, 17785, 17896, 17992, 17997, 18002, 18016, 18099, 18250, 18348, 18353, 18389, 18397, 18485, 18491, 18499, 18569, 18594, 18595, 18652, 18700, 18710, 18719, 18783, 18820, 18837, 18838, 18839, 18893, 18902, 18913, 19044, 19051, 19065, 19081, 19257, 19277, 19286, 19378, 19379, 19591, 19592, 19926, 19967, 19971, 19995, 20004, 20075, 20199, 20243, 20307, 20312, 20388, 20403, 20462, 20472, 20540, 20595, 20596, 20599, 20644, 20675, 20740, 20742, 20809, 20842, 
20984, 21051, 21087, 21134, 21138, 21145, 21148, 21166, 21179, 21184, 21187, 21201, 21274, 21331, 21366, 21367, 21368, 21369, 21370, 21385, 21388, 21389, 21446, 21447, 21502, 21508, 21521, 21522, 21625, 21629, 21633, 21634, 21639, 21649, 21660, 21710, 21711, 21896, 21919, 21955, 21974, 22004, 22014, 22024, 22025, 22033, 22073, 22086, 22103, 22104, 22121, 22214, 22353, 22397, 22408, 22469, 22565, 22582, 22584, 22589, 22593, 22594, 22720, 22756, 22795, 22796, 22797, 22832, 22847, 22908, 23062, 23120, 23219, 23236, 23240, 23241, 23266, 23294, 23425, 23455, 23510, 23647, 23648, 23649, 23656, 23669, 23675, 23755, 23835, 23883, 23889, 23891, 23921, 23933, 23940, 23947, 23949, 23955, 23956, 23957, 23958, 23959, 24054, 24057, 24059, 24074, 24084, 24162, 24177, 24304, 24422, 24507, 24651, 24773, 24816, 24910, 24911, 24912, 24960, 24984, 24985, 25022, 25023, 25024, 25025, 25046, 25061, 25084, 25085, 25091, 25124, 25174, 25230, 25236, 25279, 25287, 25335, 25367, 25368, 25369, 25370, 25371, 25372, 25373, 25383, 25466, 25489, 25509, 25551, 25553, 25554, 25604, 25619, 25630, 25744, 25761, 25766, 25841, 25842, 25881, 25954, 25988, 26041, 26042, 26047, 26056, 26071, 26176, 26411, 26456, 26511, 26550, 26600, 26674, 26788, 26794, 26907, 27034, 27039, 27043, 27097, 27098, 27166, 27194, 27221, 27252, 27264, 27352, 27465, 27483, 27507, 27583, 27646, 27795, 27832, 27834, 27961, 28048, 28186, 28189, 28254, 28255, 28302, 28326, 28379, 28410, 28411, 28419, 28429, 28477, 28538, 28616, 28628, 28637, 28656, 28681, 28704, 28728, 28742, 28983, 29087, 29115, 29229, 29285, 29374, 29454, 29456, 29464, 29586, 29642, 29648, 29782, 29791, 29793, 29824, 29871, 29886, 29897, 29902, 29908, 29938, 29946, 29947, 29948, 29949, 29957, 29958, 29959, 29994, 30004, 30005, 30189, 30239, 30337, 30360, 30385, 30403, 30411, 30470, 30485, 30506, 30600, 30814, 30888, 30903, 30921, 30930, 30937, 30995, 31018, 31063, 31159, 31211, 31259, 31287, 31382, 31399, 31425, 31457, 31483, 31506, 31603, 31625, 31658, 31671, 
31682, 31788, 31838, 31851, 31924, 31927, 32013, 32152, 32228, 32232, 32363, 32550, 32602, 32754, 32793, 32945, 33031, 33034, 33045, 33046, 33047, 33109, 33183, 33234, 33272, 33305, 33352, 33354, 33403, 33525, 33553, 33623, 33671, 33696, 33796, 33801, 33834, 33861, 33863, 33937, 33958, 33959, 34047, 34052, 34080, 34153, 34231, 34251, 34272, 34399, 34472, 34497, 34562, 34626, 34725, 34960, 34963, 35049, 35070, 35077, 35078, 35104, 35177, 35261, 35275, 35383, 35431, 35477, 35523, 35717, 35781, 35912, 35924, 36031, 36063, 36108, 36174, 36186, 36232, 36305, 36519, 36659, 36829, 36940, 37036, 37041, 37046, 37060, 37143, 37294, 37392, 37397, 37433, 37441, 37529, 37535, 37543, 37613, 37638, 37639, 37696, 37744, 37754, 37763, 37827, 37864, 37881, 37882, 37883, 37937, 37946, 37957, 38088, 38095, 38109, 38125, 38301, 38321, 38330, 38422, 38423, 38635, 38636, 38970, 39011, 39015, 39039, 39048, 39119, 39243, 39287, 39351, 39356, 39432, 39447, 39506, 39516, 39584, 39639, 39640, 39643, 39688, 39719, 39784, 39786, 39853, 39886, 40028, 40095, 40131, 40178, 40182, 40189, 40192, 40210, 40223, 40228, 40231, 40245, 40318, 40375, 40410, 40411, 40412, 40413, 40414, 40429, 40432, 40433, 40490, 40491, 40546, 40552, 40565, 40566, 40669, 40673, 40677, 40678, 40683, 40693, 40704, 40754, 40755, 40940, 40963, 40999, 41018, 41048, 41058, 41068, 41069, 41077, 41117, 41130, 41147, 41148, 41165, 41258, 41397, 41441, 41452, 41513, 41609, 41626, 41628, 41633, 41637, 41638, 41764, 41800, 41839, 41840, 41841, 41876, 41891, 41952, 42106, 42164, 42263, 42280, 42284, 42285, 42310, 42338, 42469, 42499, 42554, 42691, 42692, 42693, 42700, 42713, 42719, 42799, 42879, 42927, 42933, 42935, 42965, 42977, 42984, 42991, 42993, 42999, 43000, 43001, 43002, 43003, 43098, 43101, 43103, 43118, 43128, 43206, 43221, 43348, 43466, 43551, 43695, 43817, 43860, 43954, 43955, 43956, 44004, 44028, 44029, 44066, 44067, 44068, 44069, 44090, 44105, 44128, 44129, 44135, 44168, 44218, 44274, 44280, 44323, 44331, 44379, 44411, 
44412, 44413, 44414, 44415, 44416, 44417, 44427, 44510, 44533, 44553, 44595, 44597, 44598, 44648, 44663, 44674, 44788, 44805, 44810, 44885, 44886, 44925, 44998, 45032, 45085, 45086, 45091, 45100, 45115, 45220, 45455, 45500, 45555, 45594, 45644, 45718, 45832, 45838, 45951, 46078, 46083, 46087, 46141, 46142, 46210, 46238, 46265, 46296, 46308, 46396, 46509, 46527, 46551, 46627, 46690, 46839, 46876, 46878, 47005, 47092, 47230, 47233, 47298, 47299, 47346, 47370, 47423, 47454, 47455, 47463, 47473, 47521, 47582, 47660, 47672, 47681, 47700, 47725, 47748, 47772, 47786, 48027]

In [None]:
# TODO: see top of train.ipynb for cleaning and failures

# Some issues found

In [None]:
# Hand-written notes on individual problem tunes, keyed by the tune's index in
# abc_data_list. Iteration order (used by the loop that re-imports these tunes)
# is the order written here.
issues = {
    32: "tune is not valid ABC - it's just written out chords",
    82: "unquoted text included at start of tune",
    86: "is only unquoted text - next tune at idx 87 contains the data with metadata repeated",
    447: "quote data is encoded incorrectly in the file, can fix with a find/replace. Probably supposed to be unicode 'RIGHT SINGLE QUOTATION MARK'",  # bytes(abc_data_list[issue_idx[5]], "utf-8").replace("â\x80\x99".encode("utf-8"), "'".encode("utf-8"))
    303: "missing final repeat ], but also music21 can't handle [1-2 ",
    380: "I think these are simply a typo 'J' since removing it gives the right number of notes",
    894: "Like 447, quote data is encoded incorrectly in the file, can fix with a find/replace. Probably supposed to be unicode 'RIGHT SINGLE QUOTATION MARK'",  # bytes(abc_data_list[issue_idx[10]], "utf-8").replace("Â\xa0".encode("utf8"), "'".encode("utf-8"))
    1412: "single line, should be part of previous tune",
    1986: "single line, should be part of previous tune",
    1987: "single line, should be part of previous tune",
    2592: "typo: '[Ee[' should be '[Ee]'",
    # NOTE(review): 18398 does not appear in issue_idx (18397 does) — confirm
    # whether this key is intentional or an off-by-one.
    18398: "Outdated E: field - now handled",
    29648: "Possibly invalid chord specifiers http://abcnotation.com/wiki/abc:standard:v2.1#annotations",
}

In [None]:
# Re-import every annotated problem tune and log what happens: a clean import,
# an import that wrote messages to stderr, or an exception.
for idx, explanation in issues.items():
    LOGGER.warning(f"tune idx {idx} has issue: {explanation}")
    try:
        # The captured lines get their own name: binding them to `warnings`
        # would overwrite the `warnings` module imported at the top of the
        # notebook for every later cell.
        with CapturingStderr() as stderr_lines:
            abc_tune = ABCTune(
                abc_data_list[idx],
                pianoroll_divisions_per_quarternote=2,
                min_pitch=None,
                min_time=None,
                transpose_to_pitchclass="C",
            )
        if stderr_lines:
            # Something was written to stderr during the import: show it next
            # to the source text and the parsed result for comparison.
            LOGGER.warning(f"Imported with warnings (emitted to stderr from music21): {stderr_lines}")
            LOGGER.info(f"Original abc_data:\n{abc_data_list[idx]}")
            LOGGER.info(f"Imported ABCTune:\n{[tok for tok in abc_tune.abc_music21.flat]}\n\n")
        else:
            LOGGER.info(f"Imported fine... ABCTune:\n{abc_tune}\n\n")
    except Exception as e:
        # Deliberately broad: any import failure is logged together with the
        # offending source, and the loop moves on to the next tune.
        LOGGER.error(e)
        LOGGER.info(f"Source ABC:\n{abc_data_list[idx]}\n\n")

In [None]:
# Re-run the import for one flagged tune (issue_idx[4]) and log the outcome:
# a clean import, an import that wrote messages to stderr, or an exception.
idx = issue_idx[4]
try:
    print(idx)
    # The captured lines get their own name: binding them to `warnings` would
    # overwrite the `warnings` module imported at the top of the notebook.
    with CapturingStderr() as stderr_lines:
        abc_tune = ABCTune(
            abc_data_list[idx],
            pianoroll_divisions_per_quarternote=2,
            min_pitch=None,
            min_time=None,
            transpose_to_pitchclass="C",
        )
    if stderr_lines:
        # Something was written to stderr during the import: show it next to
        # the source text and the parsed result for comparison.
        LOGGER.warning(f"Imported with warnings (emitted to stderr from music21): {stderr_lines}")
        LOGGER.info(f"Original abc_data:\n{abc_data_list[idx]}")
        LOGGER.info(f"Imported ABCTune:\n{[tok for tok in abc_tune.abc_music21.flat]}\n\n")
    else:
        LOGGER.info(f"Imported fine... ABCTune:\n{abc_tune}\n\n")
except Exception as e:
    # Deliberately broad: any import failure is logged with the offending source.
    LOGGER.error(e)
    LOGGER.info(f"Source ABC:\n{abc_data_list[idx]}\n\n")

# Remove quoted sections

In [None]:
# Print the tune at issue_idx[4] as stored, then a blank line, then the same
# tune after passing it through remove_quoted_strings, for a before/after view.
original_tune = abc_data_list[issue_idx[4]]
print(original_tune)
print()
print(remove_quoted_strings(original_tune))

# How to fix the mis-encoded apostrophes

In [None]:
# Show the raw text of the tune at issue_idx[5] (index 447 in abc_data_list).
abc_data_list[issue_idx[5]]

In [None]:
# The same tune encoded to UTF-8 bytes, so the individual bytes of the garbled
# characters can be read off.
abc_data_list[issue_idx[5]].encode("utf-8")

In [None]:
"â\x80\x99".encode("utf-8")

In [None]:
# Byte-level find/replace: swap the garbled sequence for a plain apostrophe.
# The displayed result is a bytes object, not a str.
bytes(abc_data_list[issue_idx[5]], "utf-8").replace("â\x80\x99".encode("utf-8"), "'".encode("utf-8"))

In [None]:
# Run the library helper on the same tune.
# NOTE(review): presumably it applies the replacement explored above — confirm
# against its definition in double_jig_gen.data.
fix_encoding_errors(abc_data_list[issue_idx[5]])

In [None]:
# Show the raw text of the tune at issue_idx[10] (index 894 in abc_data_list).
abc_data_list[issue_idx[10]]

In [None]:
# The same tune encoded to UTF-8 bytes, so the individual bytes of the garbled
# characters can be read off.
abc_data_list[issue_idx[10]].encode("utf-8")

In [None]:
"Â\xa0".encode("utf-8")

In [None]:
# Display which tune index issue_idx[10] refers to.
issue_idx[10]

In [None]:
# Byte-level find/replace: swap the garbled sequence for a plain apostrophe.
# The displayed result is a bytes object, not a str.
bytes(abc_data_list[issue_idx[10]], "utf-8").replace("Â\xa0".encode("utf8"), "'".encode("utf-8"))

In [None]:
# Run the library helper on the same tune.
# NOTE(review): presumably it applies the replacement explored above — confirm
# against its definition in double_jig_gen.data.
fix_encoding_errors(abc_data_list[issue_idx[10]])

# Some other fixes

In [None]:
abc_str = """T: Hallowtree
M: 3/4
L: 1/8
K: Amaj
:"A"E2AB AG|"D"F2Bc BA|"E"G2A2B2|"A"cB "G#,"AG "F"F=F|"A"E2AB AG|"D"F2Bc BA|"E"GA Bc BG|"A"A6:|
K:Amin
"Am"e2 ce dc|"G"d2 Bd cB|"Ff"c2 Ac BA|"G"B6|"C"e2 ce dc|"Dm"d2 Bd cB|"E7"cB Ac B^G|"Am"A6|]
"C"ed ce dc|"G"dc Bd cB|"Am"cB Ac BA|"Em"B6|"C"cE ce dc|"Dm"dF df ed|"E7"cB Ac B^G|"Am"A6|]
K:A
|:"A"e2 ec Ae|"D"f2fd Af|"A"e2 ec Ae|"E"dc Bc df|"A"e2 ec Ae|"D"f2fd Af|"A"ec Ad "E7"BG|A6:|
"""
print(abc_str)
abc = music21.converter.parse(abc_str, format="abc")
abc.show()

In [None]:
abc_str = """T: Wind That Shakes The Barley, The
M: 4/4
L: 1/8
K: Gmaj
d2 Bd dG B/A/G | eGcG e2 ge | dG B/A/G dG B/A/G | afgf ecge |
d2 Bd dG B/A/G | e2 ee e2 ge | d2 Bd d2 ga | B/d'/b ag edga ||
b2 ab c'2 ac' | b2 gb  a/c'/a ga | bgbg d'2 d'e' | d'bag edga |
b2 ab c'2 ac' | b2 gb agfd | gabc' d'2 d'e' | d'bag ecge |]
D2 B,D DG B/A/G | eGcG E2 GE | dG B/A/G dG B/A/G | afgf ec[Gg][Ee] |
D2 B,D DG B/A/G | E2 EE E2 GE | D2 B,D D2 GA | [B/b/][d/d'/][Bb] [Aa][Gg] ed[Gg][Aa] ||
B2 AB c2 Ac | B2 GB  A/c/A GA | BGBG d2 de | dBAG edGA |
B2 AB c2 Ac | B2 GB AGFD | GABc d2 de | dBAG ecGE |]
[D2d2] [B,B][Dd] [Dd]G B/A/G | eGcG [E2e2] [Gg][Ee] |
dG B/A/G dG B/A/G | afgf ec[Gg][Ee] |
[D2d2] [B,B][Dd] [Dd]G B/A/G | [E2e2] [Ee][Ee] [E2e2] [Gg][Ee] |
[D2d2] [B,B][Dd] [D2d2] [Gg][Aa] | [B/b/][d/d'/][Bb] [Aa][Gg] ed[Gg][Aa] ||
[B2b2] [Aa][Bb] [c2c'2] [Aa][cc'] | [B2b2] [Gg][Bb]  [A/a/][c/c'/][Aa] [Gg][Aa] |
[Bb][Gg][Bb][Gg] [d2d'2] [dd'][ee'] | [dd'][Bb][Aa][Gg] ed[Gg][Aa] |
[B2b2] [Aa][Bb] [c2c'2] [Aa][cc'] | [B2b2] [Gg][Bb] [Aa][Gg][Ff][Dd] |
[Gg][Aa][Bb][cc'] [d2d'2] [dd'][ee'] | [dd'][Bb][Aa][Gg] ec[Gg][Ee] |]
"""
print(abc_str)
abc = music21.converter.parse(abc_str, format="abc")
abc.show()

In [None]:
abc_str = """T: Diane's Happiness
M: 4/4
L: 1/8
K: Gmaj
GA|:BGAG E2DB,|DEGA BAAG|BGAG E2DB,|DEGA BGGA|
BGAG E2DB,|DEGA BABd|eged BGAG|[1 E2DE G2GA:|[2 E2DE G2Bc||
|:dBeB d2Bc|dBeB dBAB|dBeB dBAG|[1-2 E2DE G2Bc:|[3 E2DE G2z2||"""
print(abc_str)
try:
    music21.converter.parse(abc_str, format="abc")
except ValueError as e:
    print(f"{e=}")

In [None]:
# This *should* work: http://abcnotation.com/wiki/abc:standard:v2.1#variant_endings
# but it's not implemented correctly in music21
# Compared with the previous attempt, the only change to the tune text is a
# `]` appended after the closing `||` of each part; the `[1-2` / `[3` endings
# are left as they were.
abc_str = """T: Diane's Happiness
M: 4/4
L: 1/8
K: Gmaj
GA|:BGAG E2DB,|DEGA BAAG|BGAG E2DB,|DEGA BGGA|
BGAG E2DB,|DEGA BABd|eged BGAG|[1 E2DE G2GA:|[2 E2DE G2Bc||]
|:dBeB d2Bc|dBeB dBAB|dBeB dBAG|[1-2 E2DE G2Bc:|[3 E2DE G2z2||]""" 
print(abc_str)
abc_m21 = music21.converter.parse(abc_str, format="abc")
abc_m21.show()

In [None]:
# To hack to work, need to adapt the end...
# Workaround: in the last line of the tune the `[1-2` / `[3` endings are
# rewritten as plain `[1` / `[2` endings (with spaces around the bar lines);
# everything else is the same text as the previous attempt.
abc_str = """T: Diane's Happiness
M: 4/4
L: 1/8
K: Gmaj
GA|:BGAG E2DB,|DEGA BAAG|BGAG E2DB,|DEGA BGGA|
BGAG E2DB,|DEGA BABd|eged BGAG|[1 E2DE G2GA:|[2 E2DE G2Bc||]
|:dBeB d2Bc|dBeB dBAB|dBeB dBAG| [1 E2DE G2Bc :| [2 E2DE G2z2 ||]""" 
print(abc_str)
abc_m21 = music21.converter.parse(abc_str, format="abc")
abc_m21.show()