# Exploring the data

In [None]:
import random
import re
from pathlib import Path
from typing import Mapping, Sequence

from IPython.display import display
import matplotlib.pyplot as plt
import music21
from midi_set_class.notation import (
    ABC_PATTERN,
    abc_to_midipitch,
    midipitch_to_abc,
)
import numpy as np
import pandas as pd
import seaborn as sns

from plotting import countplot_with_numbers, barplot_with_numbers
from double_jig_gen.data import ABCDataset

In [None]:
filename = "DoubleJig0001-0365.abc"
data_path = Path("..", "data")
file_path = Path(data_path, filename)

In [None]:
# Fail early, with download instructions, if the dataset is not in place.
# The first fragment must be an f-string, otherwise the {filename} and
# {data_path} placeholders are shown literally instead of being filled in.
assert file_path.exists(), (
    f"Download {filename} and put it in {data_path}. You can "
    "execute the provided script: dj-gen-get-data"
)

# Raw data

The file is just a text file containing the data and metadata.

In [None]:
! head -58 {str(file_path)}
! echo ...

## Music21

The package [music21](http://web.mit.edu/music21/doc/) can read the whole lot into a container class.

In [None]:
# This is a class containing all the separate jigs
oneills_opus = music21.converter.parse(str(file_path.resolve()))

In [None]:
# contains 365 scores as expected
len(list(oneills_opus.scores))

In [None]:
# All scores have a single part
sum(len(list(score.parts.stream())) for score in oneills_opus.scores)

## A random jig

In [None]:
random.seed(1337)
score = random.choice(oneills_opus.scores)

In [None]:
score.show()

## The Piper's Picnic

In [None]:
# You can select by title
score = oneills_opus.getScoreByTitle("The piper's picnic")

In [None]:
score.show()

Internally, music21 is storing the data as a stream of information:
* Parts contain
    * Measures which contain
        * Notes
        * and other stuff

In [None]:
score.show('text')

## Extracting metadata

Here's what The Piper's Picnic looks like in the raw dataset

```
X:2
T:The piper's picnic
O:Ireland
B:Francis O'Neill: "The Dance Music of Ireland" (1907) no. 2
R:Double jig
Z:Transcribed by Frank Nordberg - http://www.musicaviva.com
F:http://www.musicaviva.com/abc/tunes/ireland/oneill-1001/oneill-1001-0002.abc
M:6/8
L:1/8
K:A
E|AcA BGE|cde dBG|\
AcA BGE|{F}EDE GAB|AcA BGE|cde dBG|{d}cBA BGE|EFG A2:|
(A/B/)|ceA ceA|cee edc|(B/c/d)G (B/c/d)G|Bdd dcB|\
cde dcB|{d}cBA Ggf|ecA GED|EFGA2:|
```

We can get (most of) that data using music21

In [None]:
meta = score.metadata
# there is only one part to each jig so index 0 works
first_measure = score.parts[0].measure(0)
second_measure = score.parts[0].measure(1)

In [None]:
# for attr_name in meta.searchAttributes:
#     print(f"{attr_name}: {getattr(meta, attr_name)}")

In [None]:
(
    meta.number,  # X:2
    meta.title,  # T:The piper's picnic
    meta.localeOfComposition,  # O:Ireland
    None,  # B:Francis O'Neill: "The Dance Music of Ireland" (1907) no. 2
    None,  # R:Double jig
    None,  # Z:Transcribed by Frank Nordberg - http://www.musicaviva.com
    None,  # F:http://www.musicaviva.com/abc/tunes/ireland/oneill-1001/oneill-1001-0002.abc
    str(first_measure.timeSignature),  # M:6/8
    second_measure.offset,  # L:1/8
    str(first_measure.keySignature), # K:A
    score.plot(),  # E|AcA BGE|cde dBG|\
           # AcA BGE|{F}EDE GAB|AcA BGE|cde dBG|{d}cBA BGE|EFG A2:|
           # (A/B/)|ceA ceA|cee edc|(B/c/d)G (B/c/d)G|Bdd dcB|\
           # cde dcB|{d}cBA Ggf|ecA GED|EFGA2:|
)

# DQA

In [None]:
# Load the raw text, keeping every line that is not a "%" comment line.
with open(file_path, 'r') as fh:
    data_lines = list(filter(lambda text: not text.startswith("%"), fh))
# Recombine the kept lines and trim leading and trailing whitespace.
data = "".join(data_lines).strip()

In [None]:
abc_strings = data.split("\n\n\n\n")

In [None]:
len(abc_strings)  # this should be 365

In [None]:
# Maps each single-letter ABC information-field prefix to a readable name.
# parse_abc uses the keys to recognise header lines and the values as the keys
# of the dictionary it returns.
# Taken from: http://abcnotation.com/wiki/abc:standard:v2.1
ABC_FIELDS = {
    'A': 'area',
    'B': 'book',
    'C': 'composer',
    'D': 'discography',
    'F': 'file url',
    'G': 'group',
    'H': 'history',
    'I': 'instruction',
    'K': 'key',
    'L': 'unit note length',
    'M': 'meter',
    'm': 'macro',
    'N': 'notes',
    'O': 'origin',
    'P': 'parts',
    'Q': 'tempo',
    'R': 'rhythm',
    'r': 'remark',
    'S': 'source',
    's': 'symbol line',
    'T': 'tune title',
    'U': 'user defined',
    'V': 'voice',
    # 'W' and 'w' deliberately share one name, so both end up under "words".
    'W': 'words',
    'w': 'words',
    'X': 'reference number',
    'Z': 'transcription'
}


def merge_continuation_lines(lines: Sequence[str]) -> Sequence[str]:
    r"""Merge every line that ends with a backslash with the line after it.

    The trailing backslash is removed and the two lines are joined with a single
    space. Chains of continuation lines collapse into one line. The input
    sequence is left untouched; a new list is returned.

    Args:
        lines: the strings to check and merge.

    Returns:
        A new list containing the merged lines.

    Raises:
        IndexError: if the last line ends with a backslash, i.e. there is no
            following line to merge it with.
    """
    merged = []
    pending = None  # text of an unfinished continuation, backslash removed
    for line in lines:
        if pending is not None:
            line = f"{pending} {line}"
            pending = None
        if line.endswith("\\"):
            # Hold the line back; the next line is appended to it.
            pending = line[:-1]
        else:
            merged.append(line)
    if pending is not None:
        raise IndexError(
            "Last line ends with a continuation backslash but no line follows: "
            f"{pending!r}"
        )
    return merged


def parse_abc(abc_str: str) -> Mapping[str, str]:
    """Get the required information from each tune.

    Extracts the metadata as defined in [1] from the string, plus the transcription
    which is assumed not to have a prefix. Assumes all metadata is stated before the
    transcription is started. Then adds everything remaining to the transcription.

    The header is the leading run of lines that start with a known field letter
    followed by a colon. Header values are stored under the names in ABC_FIELDS.
    Repeated "tune title" and "words" fields are joined with " --- "; any other
    repeated header field is an error. After the header, lines starting with
    "W:" or "w:" are collected under "words" (whole line, joined with " --- ")
    and all other lines are joined with spaces under "tune".

    Args:
        abc_str: the string containing an abc file to parse.

    Returns:
        out_dict: a dictionary containing all the extracted values.

    Raises:
        ValueError: if a header field other than "tune title" or "words" occurs
            more than once.

    See also:
    * [1] http://abcnotation.com/wiki/abc:standard:v2.1
    """
    lines = [line.strip() for line in abc_str.split("\n")]

    # if a line ends with a backslash then merge with subsequent line
    lines = merge_continuation_lines(lines)

    valid_field = "|".join(ABC_FIELDS)
    metadata_line_regex = re.compile(f"^({valid_field}):")

    # Index of the first line that is not a header line. When every line is a
    # header line there is no tune body, so the header spans all the lines.
    metadata_end_idx = next(
        (
            idx
            for idx, line in enumerate(lines)
            if not metadata_line_regex.match(line)
        ),
        len(lines),
    )

    out_dict = {}
    concat_fields = ("tune title", "words")
    for line in lines[:metadata_end_idx]:
        meta_key, value = line.split(":", 1)
        field_name = ABC_FIELDS[meta_key]
        if field_name not in out_dict:
            out_dict[field_name] = value
        elif field_name in concat_fields:
            # Append to the earlier value rather than overwriting it.
            out_dict[field_name] = f"{out_dict[field_name]} --- {value}"
        else:
            msg = (
                f"Tried to add {(field_name, value)} to {out_dict}."
                f"\nLines: {lines}"
            )
            raise ValueError(msg)

    transcription_label = "tune"
    for line in lines[metadata_end_idx:]:
        if line.upper().startswith("W:"):
            if "words" not in out_dict:
                out_dict["words"] = line
            else:
                out_dict["words"] = f"{out_dict['words']} --- {line}"
        elif transcription_label not in out_dict:
            out_dict[transcription_label] = line
        else:
            out_dict[transcription_label] = f"{out_dict[transcription_label]} {line}"
    return out_dict

In [None]:
r"\\"

In [None]:
parsed_abc = [parse_abc(abc) for abc in abc_strings]

In [None]:
oneills_df = pd.DataFrame(parsed_abc)

### How much of the data is missing?

In [None]:
missing_counts = oneills_df.isna().sum().rename("count_of_missing_values")
missing_counts

In [None]:
# Reshape to long format: one row per (reference number, field) pair, with the
# field's name in "field_name" and its content in "value".
id_vars = ["reference number"]
value_vars = [col for col in oneills_df.columns if col not in id_vars]
melt_df = oneills_df.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name="field_name",
)
# NOTE: this binds a second name to the same DataFrame (no copy), so the next
# line also replaces melt_df["value"] with the is-missing flags.
missing_df = melt_df
missing_df["value"] = missing_df["value"].isna()

In [None]:
# Per field, count the tunes with (True) and without (False) a missing value.
countplot_with_numbers(
    var_name="field_name", data=missing_df, hue="value", kind="count"
)
plt.title("field value is missing")
plt.xticks(rotation=90);

In [None]:
def ordered_countplot(series, **kwargs):
    """Bar-plot how often each value occurs in `series`.

    The frequencies come from `series.value_counts()`; all keyword arguments
    are forwarded unchanged to `barplot_with_numbers`.
    """
    barplot_with_numbers(series.value_counts(), **kwargs)

### Value counts

In [None]:
# Columns skipped when plotting value counts (presumably because they are
# unique per tune or free text — confirm).
exclude_cols = [
    "reference number",
    "tune title",
    "file url",
    "book",
    "tune",
    "words",
    "transcription",
]
# One figure per remaining column, showing the frequency of each value.
for col in [col for col in oneills_df.columns if col not in exclude_cols]:
    plt.figure()
    ordered_countplot(oneills_df[col], kind="count")
    plt.xticks(rotation=90)
    plt.title(col, fontsize=20)
    plt.ylabel("count")
    plt.show()

In [None]:
oneills_df["book"].str.slice(
    stop=len("Francis O'Neill: 'The Dance Music of Ireland' (1907)")
).value_counts()

In [None]:
for idx in oneills_df.index[~oneills_df["words"].isna()]:
    ref, title, words = oneills_df.loc[idx, ["reference number", "tune title", "words"]]
    comment = "\n".join(words.split(" --- "))
    print(f"Tune {ref} \"{title}\", has comments:\n{comment}\n")

## Tune analysis

In [None]:
tunes = [abc["tune"] for abc in parsed_abc]

In [None]:
# Character-level vocabulary over all tune bodies.
tokens_set = set(''.join(tunes))
# Sort so that the index -> token mapping is reproducible; iterating a set of
# strings can yield a different order on each interpreter run.
idx2token = sorted(tokens_set)
vocab_size = len(idx2token)
print(f"vocabulary size: {vocab_size}")
print(f"vocabulary (each token separated by a space): \n{' '.join(idx2token)}")

### Number of tokens in a piece (not true length as repeats are notated |: :|)

In [None]:
tune_len = [len(tune.replace(' ', '')) for tune in tunes]

In [None]:
# Summary statistics of the per-tune counts of non-whitespace characters.
max_len = max(tune_len)
med_len = np.median(tune_len)
mean_len = np.mean(tune_len)
min_len = min(tune_len)
# NOTE(review): `distplot` is believed to be deprecated in recent seaborn
# releases — confirm against the installed version before upgrading.
sns.distplot(tune_len)
plt.xlabel("Number of non-whitespace chars in tunes")
# Mean and median are truncated to integers for display only.
plt.title(
    f"Mean: {int(mean_len)}, median: {int(med_len)}, range: [{min_len}, {max_len}]"
);

### How many tunes have ornamentation etc. in them?

Summary:

* K - key change mid tune
* L - a `>` accent on the subsequent note
* M - a lower mordent <img src="https://www.8notes.com/school/lessons/all/lowermordent1.1.gif" alt="Lower Mordent" width=200>
* S, and O - segno (sign), and coda <img src="https://www.liveabout.com/thmb/1Qfs7xts_x04z1HXOWLPeibyivA=/735x0/GL_segno-coda-music-56a72d345f9b58b7d0e7996b.png" alt="segno" width="100"/>
* T - `tr` trill
* ~ - Irish roll
* z - rest
* { } - grace note
* "..." - text annotations

For more on these, see http://abcnotation.com/wiki/abc:standard:v2.1

Exceptions found:
* The one tune containing 's' and 't' has a text annotation saying 1st and 2nd

I found http://www.clivew.com/abc.php useful for viewing ornaments (music21 doesn't display them).

In [None]:
# Markers to search for in the tune bodies, mapped to what they mean.
ornaments = {
    "K": "key change mid tune",
    "L": "a `>` accent on the subsequent note",
    "M": "a lower mordent",
    "S": "segno (sign)",
    "O": "coda",
    "T": "`tr' trill",
    "~": "irish roll",
    "z": "rest",
    "{": "grace note",
    "\"": "text annotations",
}
query_strings = ornaments.keys()
# For each marker, the (position, tune) pairs of the tunes that contain it.
weird_tunes = {}
for query in query_strings:
    weird_tunes[query] = list(
        filter(lambda indexed_tune: query in indexed_tune[1], enumerate(tunes))
    )

In [None]:
{k: len(v) for k, v in weird_tunes.items()}

In [None]:
# At most this many example tunes are shown in full for each marker.
max_print = 2
for query, exceptions in weird_tunes.items():
    count = len(exceptions)
    print(f"\n\n{10*'='} Exceptions containing '{query}' ({count}) {10*'='}")
    print(f"'{query}' = {ornaments[query]} (http://abcnotation.com/wiki/abc:standard:v2.1)\n")
    for (idx, _) in exceptions[:max_print]:
        # Show the parsed fields, the music21 tokenization and the rendered score.
        abc = parsed_abc[idx]
        for k, v in abc.items():
            print(f"{k}: {v}")
        abch = music21.abcFormat.ABCHandler()
        abch.tokenize(abc["tune"])
        print("tokenized tune: " + " ".join([tok.src for tok in abch.tokens]))
        print("\n")
        oneills_opus.scores[idx].show()
        print("\n")
    if count > max_print:
        # Signal that further matching tunes were left out.
        print("\n ... \n")

# Preprocessing ================================

Preprocessing to do:
* [x] Check L: is standard for all tunes
  * It's 1/8 for all tunes
* [x] Only keep M: and K:
* [x] Transpose all tunes to be centred at C
  * Used FolkRNN-parser to tokenize, then wrote my own transposition code that works on the tokens
  * Tried music21 first, but it cannot export to ABC, only to LilyPond. Then tried ly2abc.py, but this messed up the bars
* [x] Remove all ornaments and gracenotes
  * appears to be done by `python FolkRNN-parser.py -f ../double-jig-gen/data/oneills_tunes -o oneills_double_jigs.abc`
* [x] Add `=` (indicating the note is natural i.e. not flat nor sharp) onto all notes which are natural
* [x] Separate into tokens
    * Hopefully can just use FolkRNN-parser.py
    * [x] Check that brackets are handled correctly, 
        * seems like you can open a bracket and not close it if it's finished by a bar

Example output:
```
M:9/8
K:maj
=G =E =E =E 2 =D =E =D =C | =G =E =E =E =F =G =A =B =c | =G =E =E =E 2 =D =E =D =C | =A =D =D =G =E =C =D 2 =A | =G =E =E =E 2 =D =E =D =C | =G =E =E =E =F =G =A =B =c | =G =E =E =E 2 =D =E =D =C | =A =D =D =G =E =C =D 2 =D | =E =D =E =c 2 =A =B =A =G | =E =D =E =A /2 =B /2 =c =A =B 2 =D | =E =D =E =c 2 =A =B =A =G | =A =D =D =D =E =G =A 2 =D | =E =D =E =c 2 =A =B =A =G | =E =D =E =A /2 =B /2 =c =A =B 2 =B | =G =A =B =c =B =A =B =A =G | =A =D =D =D =E =G =A =B =c |

```

## Remove the lydian tune

In [None]:
oneills_df = oneills_df.loc[oneills_df.key != 'Glyd', :]

## Transpose

In [None]:
oneills_df.key.value_counts()

### Get nr_semitones to transpose

In [None]:
# Semitones above C of each tonic name.
num_letter = {
    0: "C",
    2: "D",
    4: "E",
    5: "F",
    7: "G",
    9: "A",
    10: "Bb",
    11: "B",
}
letter_num = {v: k for k, v in num_letter.items()}
# Semitone offset from a mode's tonic to the tonic of the major scale that has
# the same notes, e.g. A dorian shares its notes with G major, 2 semitones down.
mode_num = {
    'dor': -2,
    'mix': -7,
    'm': -9,
}


def transpose_key(key_str):
    """Return the semitone shift that moves a key's relative major to C.

    `key_str` is a tonic letter (A-G), optionally followed by one accidental
    ("b" or "#"), optionally followed by a mode suffix from `mode_num`; no suffix
    means major. Examples: "G", "Ador", "Dmix", "Em", "Bb", "F#m".

    The shift is the smallest one in size: the result lies in [-5, 6].

    Args:
        key_str: the key, as written in the ABC "K:" field.

    Returns:
        The number of semitones to add to every pitch of the tune.

    Raises:
        KeyError: if the tonic letter or the mode suffix is not known.
        IndexError: if `key_str` is empty.
    """
    # An accidental directly after the letter belongs to the tonic, not the mode.
    accidental = key_str[1:2]
    if accidental in ("b", "#"):
        mode = key_str[2:]
    else:
        accidental, mode = "", key_str[1:]
    num = letter_num[key_str[0]] + {"b": -1, "": 0, "#": 1}[accidental]
    if mode:
        num += mode_num[mode]
    # Pitch class of the relative major's tonic, as semitones above C.
    transpose_semitones = num % 12
    if transpose_semitones > 6:
        # Going down to C would be further than going up to the next C.
        transpose_semitones -= 12
    return -transpose_semitones

In [None]:
oneills_df['transpose_semitones'] = oneills_df.key.apply(transpose_key)

In [None]:
oneills_df.groupby(['key', 'transpose_semitones']).size()

### Can we use `music21` (& `ly2abc`) to write new abc?
Spoiler: no. The bar lines get messed up somewhere.
This code just checks the transpose semitones are correct.

In [None]:
abch = music21.abcFormat.ABCHandler()
print(oneills_df.loc[0, "tune"])
abch.tokenize(oneills_df.loc[0, "tune"])
" ".join([tok.src for tok in abch.tokens])

In [None]:
oneills_df.loc[1]

In [None]:
oneills_df.loc[oneills_df.key.str.contains('mix')]

In [None]:
oneills_df.loc[14]

In [None]:
score = oneills_opus.scores[14]

In [None]:
score.show()

In [None]:
s2 = score.transpose(oneills_df.loc[14, 'transpose_semitones'])
s2.show()

In [None]:
oneills_df.loc[oneills_df.key.str.endswith('m')]

In [None]:
oneills_df.loc[31]

In [None]:
score = oneills_opus.scores[31]
score.show()

In [None]:
s2 = score.transpose(oneills_df.loc[31, 'transpose_semitones'])
s2.show()

In [None]:
def make_transposed_music21(row):
    """Return the music21 score for `row`, transposed by its semitone shift.

    The score is taken from `oneills_opus.scores` at position
    `reference number - 1` and transposed by the row's `transpose_semitones`.
    """
    score_position = int(row["reference number"]) - 1
    return oneills_opus.scores[score_position].transpose(row["transpose_semitones"])

In [None]:
oneills_df["transposed_music21"] = oneills_df.apply(make_transposed_music21, axis=1)

In [None]:
# Write each transposed score to its own LilyPond file, named after the tune's
# zero-padded reference number.
output_dir = Path(data_path, "oneills_lilypond")
output_dir.mkdir(parents=True, exist_ok=True)
for idx, row in oneills_df.iterrows():
    tune_ref = row['reference number']
    # NOTE(review): LilyPond sources usually carry the ".ly" extension — confirm
    # whether ".lp" is intentional.
    outpath = Path(output_dir, f"tune_{int(tune_ref):03d}.lp")
    row["transposed_music21"].write(fmt='lilypond', fp=str(outpath))

In [None]:
output_dir = Path(data_path, "oneills_transposed")
output_dir.mkdir(parents=True, exist_ok=True)

```bash
for filepath in ../double-jig-gen/data/oneills_lilypond/*; do
    filename="${filepath##*/}"
    filename="${filename%.*}"
    outpath="../double-jig-gen/data/oneills_transposed/${filename}.abc"
    echo "Processing $filename"
    pipenv run python ly2abc.py $filepath > $outpath
done
```

In [None]:
tok_data_path = Path(data_path, "folkrnn_parser_output", "oneills_transposed.abc")
with open(str(tok_data_path), "r") as fh:
    tok_data = fh.read().strip()
tok_tunes = [
    tune_str.split('\n')[3][len("[M:6/8] [L:1/8] "):]
    for tune_str in tok_data.split('\n\n')
]
tok_tunes[1]

Unfortunately the bars are all messed up in this output, so music21 + `ly2abc` cannot be used to write the transposed ABC.

In [None]:
len(tok_tunes)

## Tokenize and clean with `FolkRNN-parser`

### Write individual files out

In [None]:
oneills_df.dtypes

In [None]:
# Write one minimal ABC file (meter, key and tune body) per tune.
output_dir = Path(data_path, "oneills_tunes")
# Create the directory first, as the other export cells do: open() does not
# create missing parent directories.
output_dir.mkdir(parents=True, exist_ok=True)
for idx, row in oneills_df.iterrows():
    tune_path = Path(output_dir, f"tune_{int(row['reference number']):03d}.abc")
    with open(str(tune_path), "w") as fh:
        tune_str = f"M:{row.meter}\nK:{row.key}\n{row.tune}"
        fh.write(tune_str)


NB: I edited the FolkRNN-parser code to sort the input files, so that the tunes stay in order.

```bash
python ../../polska/FolkRNN-parser.py -f ../data/oneills_tunes -o ../data/folkrnn_parser_output/oneills_transposed.abc 
```

### Read back

In [None]:
# Read the tokenized tunes back in.
# NOTE(review): the FolkRNN-parser command in the preceding markdown cell writes
# "oneills_transposed.abc", whereas this reads "oneills_double_jigs.abc" —
# confirm which file is intended.
tok_data_path = Path(data_path, "folkrnn_parser_output", "oneills_double_jigs.abc")
with open(str(tok_data_path), "r") as fh:
    tok_data = fh.read().strip()
# Records are separated by a blank line; keep only the last line of each record
# and split it on whitespace into a list of tokens.
tok_tunes = [tune_str.split('\n')[-1].split() for tune_str in tok_data.split('\n\n')]

In [None]:
len(tok_tunes)

In [None]:
oneills_df["tune_str"] = oneills_df["tune"]

In [None]:
oneills_df["tune"] = tok_tunes

In [None]:
oneills_df

## Manual transpose

In [None]:
def transpose_tune(token_list, nr_semitones):
    """Shift every note token in `token_list` by `nr_semitones`.

    Tokens matched by `ABC_PATTERN` are converted to a MIDI pitch, shifted and
    converted back to ABC; all other tokens are passed through unchanged.

    Args:
        token_list: the tokens of one tune.
        nr_semitones: the number of semitones to add to each pitch.

    Returns:
        A new list of tokens of the same length.
    """

    def shift(tok):
        if not ABC_PATTERN.match(tok):
            return tok
        return midipitch_to_abc(abc_to_midipitch(tok) + nr_semitones)

    return [shift(tok) for tok in token_list]

In [None]:
idx = 1
tune = oneills_df.loc[idx, "tune"]
nr_semitones = oneills_df.loc[idx, "transpose_semitones"]
oneills_df.loc[idx]

In [None]:
transposed_tune = transpose_tune(tune, nr_semitones)

In [None]:
' '.join(tune)

In [None]:
' '.join(transposed_tune)

In [None]:
def trans_row(row):
    """Transpose one row's tokenized tune by that row's own semitone shift.

    Args:
        row: a row of `oneills_df` with a "tune" token list and a
            "transpose_semitones" value.

    Returns:
        The transposed token list.
    """
    # Read the shift from the row itself: relying on a global `nr_semitones`
    # would transpose every tune by whatever amount an earlier cell left there.
    return transpose_tune(row["tune"], row["transpose_semitones"])

In [None]:
oneills_df["trans_tune"] = oneills_df.apply(trans_row, axis=1)

In [None]:
oneills_df.head(2)

## Add `=` to naturals

In [None]:
def add_equals_to_naturals(token_list):
    """Prefix `=` to every token whose first character is a note letter.

    Tokens starting with A-G or a-g get the prefix; every other token is
    returned unchanged.
    """
    note_letter = re.compile(r"^[A-Ga-g]$")
    marked = []
    for tok in token_list:
        first_char = tok[0]
        marked.append(f"={tok}" if note_letter.match(first_char) else tok)
    return marked

In [None]:
add_equals_to_naturals(["c", "^G"])

In [None]:
idx = 0
tune = oneills_df.loc[idx, "trans_tune"]

In [None]:
' '.join(tune)

In [None]:
' '.join(add_equals_to_naturals(tune))

In [None]:
oneills_df["clean_tune"] = oneills_df["trans_tune"].apply(add_equals_to_naturals)

In [None]:
oneills_df.head(2)

In [None]:
clean_tunes = oneills_df["clean_tune"].tolist()

In [None]:
dataset = ABCDataset(tunes=clean_tunes)

In [None]:
print(dataset)

There are a few tokens still to handle, e.g. keys within bars. For now, remove the tunes that contain them.

In [None]:
issues = ["[K:AMaj]", "[K:DMaj]", "[K:GMaj]"]

In [None]:
has_issue = [any(issue in tune for issue in issues) for tune in clean_tunes]

In [None]:
np.array(has_issue).sum()

In [None]:
oneills_df = oneills_df.loc[~ np.array(has_issue), :]

In [None]:
clean_tunes = oneills_df["clean_tune"].tolist()

In [None]:
dataset = ABCDataset(tunes=clean_tunes)

In [None]:
print(dataset)

In [None]:
sns.distplot([len(tune) for tune in clean_tunes])

# Write the cleaned result

In [None]:
oneills_df.key.value_counts()

In [None]:
# Write every cleaned tune as a three-line record (meter, mode, tokens) followed
# by a blank line.
filepath = Path(data_path, "oneills_reformat.abc")
with open(str(filepath), "w") as fh:
    for idx, row in oneills_df.iterrows():
        # The meter is written as a constant; the row's own meter is not used.
        fh.write("M:6/8\n")
        key = row["key"]
        # Only the mode is written, not the tonic.
        if len(key) == 1:
            # A bare letter, e.g. "G", is a major key.
            key_str = 'maj'
        elif len(key) == 2:
            # Assumes every two-character key is "<letter>m" (minor); a key such
            # as "Bb" would also land here — TODO confirm none occur.
            key_str = 'min'
        else:
            # Longer keys carry their mode suffix, e.g. "Ador" -> "dor".
            key_str = key[1:]
        fh.write(f"K:{key_str}\n")
        tune = row["clean_tune"]
        fh.write(f"{' '.join(tune)}\n\n")

# Read dataset

In [None]:
dataset = ABCDataset(filepath)

In [None]:
print(dataset)

# Extra: music21 built-in plots

In [None]:
score.plot('histogram', 'octave', xHideUnused=False)

In [None]:
oneills_opus.plot('histogram', 'octave')

In [None]:
score.plot('histogram', 'pitchClass')

In [None]:
oneills_opus.plot('histogram', 'pitchClass')

In [None]:
oneills_opus.plot('scatterweighted', 'pitch', 'quarterLength')

In [None]:
p = music21.graph.plot.WindowedKey(score)
p.run()