In [73]:
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

In [74]:
path = Path("data", "derived", "thesession.org", "all_tunes.abc")

In [75]:
tune_numbers = [
    int(abc_file.stem)
    for abc_file in [
        filepath for filepath in path.iterdir() if filepath.suffix == ".abc"
    ]
]

In [76]:
len(tune_numbers)

19037

In [77]:
max(tune_numbers)

20903

In [78]:
missing_tunes = set(range(max(tune_numbers))).difference(tune_numbers)
len(missing_tunes)

1867

In [79]:
abc_tunes = {}
issue_tunes = {}
for abc_file in tqdm([filepath for filepath in path.iterdir() if filepath.suffix == ".abc"]):
    tune_number = int(abc_file.stem)
    with open(abc_file, "r") as fh:
        file_content = fh.read().strip()
        if file_content.startswith("<"):
            issue_tunes[tune_number] = file_content
        else:
            tunes = {
                (tune_number, setting_number + 1): tune.strip()
                for setting_number, tune
                in enumerate(file_content.split("\n\n"))
            }
    abc_tunes.update(tunes)

  0%|          | 0/19037 [00:00<?, ?it/s]

In [80]:
len(issue_tunes)

0

In [82]:
tunes_df = pd.DataFrame(
    abc_tunes.values(),
    index=pd.MultiIndex.from_tuples(
        abc_tunes.keys(), names=["tune_number", "setting_number"]
    ),
    columns=["tune_str"],
)

In [83]:
for _ in range(10):
    sample_tune = tunes_df.sample()
    display(sample_tune)
    print(sample_tune["tune_str"].squeeze())

Unnamed: 0_level_0,Unnamed: 1_level_0,tune_str
tune_number,setting_number,Unnamed: 2_level_1
7623,3,X: 3\nT: Gone Fishing\nZ: Valbu\nS: https://th...


X: 3
T: Gone Fishing
Z: Valbu
S: https://thesession.org/tunes/7623#setting37178
R: reel
M: 4/4
L: 1/8
K: Emin
|:"Em"E2BA GAB^d|edBA GAFG|EB (3BBB ABfg-|"Am"gf^dB ABGF|
"Em"E2(3BBB ABF2|GFGB AEF2|"C"zBAF GFGB|1 "B"A/2B/2A/2z/2^DE FGAB:|2 A/2B/2A/2z/2^DE FGAB-||
|:"Em"~B2AB cBAc|~B2AB GAF2|zBAB cBAB|"B"geBG (3EFG F2|
"Em"~B2AB cAB2|zGF2 G2A2|"A"BE (3EEE BAGF-|"B"FGAB G/2A/2G/2z/2 E2:|


Unnamed: 0_level_0,Unnamed: 1_level_0,tune_str
tune_number,setting_number,Unnamed: 2_level_1
2772,5,"X: 5\nT: Ballyhoura Mountains, The\nZ: ceolach..."


X: 5
T: Ballyhoura Mountains, The
Z: ceolachan
S: https://thesession.org/tunes/2772#setting16400
R: polka
M: 2/4
L: 1/8
K: Gmaj
|: B2 AG | EG D2 | B2 AG | A/B/c d2 |
B2 AG | EG D2 | B2 dB | AG G2 :|
|: ge ge | dB AG | ge ge | A/B/c d2 |
ge ge | dB AG | B2 dB | AG G2 :|


Unnamed: 0_level_0,Unnamed: 1_level_0,tune_str
tune_number,setting_number,Unnamed: 2_level_1
14673,1,X: 1\nT: Lastringe Storpolska\nZ: Javi V\nS: h...


X: 1
T: Lastringe Storpolska
Z: Javi V
S: https://thesession.org/tunes/14673#setting27077
R: mazurka
M: 3/4
L: 1/8
K: Dmaj
A2F2 DFAF DFAF| DGBG DGBG DGBG|
A2F2 DFAF DFAF| F2EE E2E2 E3E |
A2F2 DFAF DFAF| DGBG DGBG DGBG|
ABcd e2f2 g2e2| dcBc e2d2 d3d:|
A2d2 f2a2 bafa| g2b2 b2g2 ageg|
f2a2 a2fa f2ed| cdef g2eg f2df|
egec d2AG F2A2|
A2d2 f2a2 bafa| g2b2 b2g2 ageg|
f2a2 a2fa f2ed| cdef g2eg f2df|
|1 egec e2dc d3d :|2 egec e2dc d4 |]
P:Voice 2
F2D2 A,DFD A,DFD| B,DGD B,DGD B,DGD|
F2D2 A,DFD A,DFD| F2EE E2C2 A,4|
F2D2 A,DFD A,DFD| B,DGD B,DGD B,DGD|
EFGA B2A2 c2A2| GFEF G2F2 F3F:|
F2A2 d2d2 fdAd| B2g2 g2d2 edBG|
F2A2 d2Ad d2AF| ABcd e2ce d2Ad|
cecA F2DA, D2F2|
F2A2 d2d2 fdAd| B2g2 g2d2 edBG|
F2A2 d2Ad d2AF| ABcd e2ce d2BA |
|1 cecA G2FE [F3A3][FA] :|2 cecA G2FE [F4A4] |]


Unnamed: 0_level_0,Unnamed: 1_level_0,tune_str
tune_number,setting_number,Unnamed: 2_level_1
2240,2,X: 2\nT: Touch Of Gaelic\nZ: Nigel Gatherer\nS...


X: 2
T: Touch Of Gaelic
Z: Nigel Gatherer
S: https://thesession.org/tunes/2240#setting21609
R: waltz
M: 3/4
L: 1/8
K: Gmaj
B2 c2 | d6 | B4 A2 | A2 G4 | A4 B2 | D4 D2 | E3 D E2 |G6- | G2
B2 c2 | d6 | B4 A2 | A2 G4 | A4 B2 | D4 D2 | E3 D E2 |G6- | G4 ||
D2 | G4 G2 | c4 d2 | e6 | d4 de | d6 | B2 A2 G2 | c6 |d4 e2 |
B2 d4 | B4 A2 | A2 G4 | A4 B2 |D4 D2 | E3 D E2 | G6- | G2 |]


Unnamed: 0_level_0,Unnamed: 1_level_0,tune_str
tune_number,setting_number,Unnamed: 2_level_1
2223,5,X: 5\nT: Da Auld Resting Chair\nZ: Richard D C...


X: 5
T: Da Auld Resting Chair
Z: Richard D Cook
S: https://thesession.org/tunes/2223#setting37557
R: barndance
M: 4/4
L: 1/8
K: Dmaj
|: A2 | f3 g e3 f | d2 A2 f2 d2| g2 e2 f2 d2 | GB ed c2 A2 |
f3 g e3 f | d2 A2 f2 d2| G2 e2 A2 c2 | d6 :||
a2| gfed d3 A | B2 d2 A2 f2 | g3 a f2 d2 | B2 ed c2 A2 |
gfed d3 A | B2 d2 A2 f2 | G2 AB A2 c2 | d6 a2 |
gfed d3 A | B2 d2 A2 f2 | g3 a f2 d2 | GB ed c2 A2 |
f3 g e3 f | d2 A2 f2 d2| G2 e2 A2 c2 | d8 ||


Unnamed: 0_level_0,Unnamed: 1_level_0,tune_str
tune_number,setting_number,Unnamed: 2_level_1
9300,1,X: 1\nT: Highland President\nZ: AngusF\nS: htt...


X: 1
T: Highland President
Z: AngusF
S: https://thesession.org/tunes/9300#setting9300
R: strathspey
M: 4/4
L: 1/8
K: Gmin
F|D/G3/2G3/2F/ G3/2A/ Bc/B/|A/F3/2c/F3/2 d/F3/2 d/c/B/A/|
G/D3/2D3/2C/ B,3/2A,/ G,d/c/|c/B/A/G/ B/A/G/^F/ G2 G,3/2=F/|
D/G3/2G3/2F/ G3/2A/ Bc/B/|A/F3/2c/F3/2 d/F3/2 d/c/B/A/|
G/A/B/G/ A/B/c/A/ B/c/d/B/ c/d/e/g/|^f/d/c/d/ B/c/ A/B/ {F}~G2 G,||
B|G/g3/2g3/2d/ B3/2d/ Gg/a/|b/g/d/g/ B/d/G/B/ D/G/B,/D/ G,/B,/D/G/|
F/f3/2f3/2c/ A3/2c/ Ff/g/|a/f/c/f/ A/c/F/A/ C/F/ A,/C/ F/A/c/f/|
G/g3/2g3/2d/4c/4 B/d3/2G3/2A/4G/4|F/f3/2f3/2c/4B/4 A/c3/2F3/2B/4A/4|
G/A/B/G/ A/B/c/A/ B/c/d/B/ c/d/e/g/|^f/d/c/d/ B/c/ A/B/ {F}~G2 G,||


Unnamed: 0_level_0,Unnamed: 1_level_0,tune_str
tune_number,setting_number,Unnamed: 2_level_1
112,10,"X: 10\nT: Trip To Pakistan, The\nZ: Damien Rog..."


X: 10
T: Trip To Pakistan, The
Z: Damien Rogeau
S: https://thesession.org/tunes/112#setting30705
R: reel
M: 4/4
L: 1/8
K: Bmin
|: BdfB d3 f | e2ed cdec | BdfB d3 f | edcd B2BA:|
|: BdfB g3 e | ~f3 a fefd|BdfB g3 e |fedf ~e3d :|
|: cdfc dfcd | BcdB cdcB | AceA ceAc | edcd ~B3A :|


Unnamed: 0_level_0,Unnamed: 1_level_0,tune_str
tune_number,setting_number,Unnamed: 2_level_1
18426,13,X: 13\nT: O'Sullivan's March\nR: jig\nM: 6/8\n...


X: 13
T: O'Sullivan's March
R: jig
M: 6/8
L: 1/8
K: Gmaj
"G"DBA ABd | edB A2D | GBA B2G | "D"AGE GFE |
"G"DBA ABd | edB A2D | GBA B2G |1 "D"AGF G3:|2 "D"AGF Gdg ||
|: "C"e3 edg | e3 edB | "G"d3 dBd | edB BAG |
"C"c3 dcd | edB A2D | "D"GBA B2G |1 AGF Gdg :|2 "D"AGF "G"G3 ||


Unnamed: 0_level_0,Unnamed: 1_level_0,tune_str
tune_number,setting_number,Unnamed: 2_level_1
11934,1,"X: 1\nT: Burnt Cabbage, The\nZ: slainte\nS: ht..."


X: 1
T: Burnt Cabbage, The
Z: slainte
S: https://thesession.org/tunes/11934#setting11934
R: reel
M: 4/4
L: 1/8
K: Edor
|:BE~E2 BAFA|BFAF DEFA|BE~E2 BAFA|1 Bcdf e2dc:|2 Bcdf e3d||
Bdef gfeg|fdad bdad|Bdef gfeg|fedf e3d|
Bdef gfeg|fddc dfaf|g2bg f2ef|dBAF EFGA||


Unnamed: 0_level_0,Unnamed: 1_level_0,tune_str
tune_number,setting_number,Unnamed: 2_level_1
13104,1,X: 1\nT: Upon A Sunday Morning When Spring Was...


X: 1
T: Upon A Sunday Morning When Spring Was In Its Prime
Z: manxygirl
S: https://thesession.org/tunes/13104#setting22557
R: jig
M: 6/8
L: 1/8
K: Gmaj
d | G2G e2c | d2c A2F | D2G G2F | G3G2d |
d2B d2f | g2g f2d | e2e f2d | B3B2d |
d2B d2f | g2g f2d | e2e f2d | B3B2d |
G2G e2c | d2c A2F | D2G G2F | G3G2 ||


In [86]:
print(tunes_df.loc[19114, "tune_str"])

setting_number
1    X: 1\nT: Hundred Pipers\nR: jig\nM: 6/8\nL: 1/...
2    X: 2\nT: Hundred Pipers\nR: jig\nM: 6/8\nL: 1/...
3    X: 3\nT: Hundred Pipers\nR: jig\nM: 6/8\nL: 1/...
4    X: 4\nT: Hundred Pipers\nR: jig\nM: 6/8\nL: 1/...
5    X: 5\nT: Hundred Pipers\nR: jig\nM: 6/8\nL: 1/...
6    X: 6\nT: Hundred Pipers\nR: jig\nM: 6/8\nL: 1/...
Name: tune_str, dtype: object


In [87]:
print(tunes_df.loc[(19114, 1), "tune_str"])

X: 1
T: Hundred Pipers
R: jig
M: 6/8
L: 1/8
K: Amaj
|:c2E EFE|F2A A2f|e2c cBA|BcB BAB|
c2E EFE|F2A A2f|e2c BcB|1 A3 A2B:|2 A3 c2d||
|:e2e ece|f2a agf|e2c cBA|BcB Bcd|
e2e ece|f2a agf|e2c BcB|A3 A2B:||


In [None]:
tunes_df.Path

In [89]:
#!/usr/bin/env python
"""Script to tokenise abc data."""
import logging
import os
from functools import partial
from pathlib import Path
from typing import Optional

import fire
import pandas as pd
from tqdm import tqdm as std_tqdm

from double_jig_gen.data import (
    TOKEN_SEPARATOR,
    clean_and_standardise_token,
    fix_encoding_errors,
    remove_quoted_strings,
)
from double_jig_gen.tokenizers import ABCTune, ABCTuneError

logging.basicConfig()
LOGGER = logging.getLogger(__name__)
# https://github.com/tqdm/tqdm/issues/370
tqdm = partial(std_tqdm, dynamic_ncols=True)


def clean_tune_str(tune_str):
    return fix_encoding_errors(remove_quoted_strings(tune_str))


def get_abc_tune(abc_data):
    try:
        abc_tune = ABCTune(
            abc_data,
            # pianoroll_divisions_per_quarternote=12,
            # min_pitch=0,
            # min_time=0,
            # transpose_to_pitchclass="C",
        )
    except ABCTuneError as e:
        msg = (
            f"Not including the following tune:\n{abc_data}\n"
            f"It raised an error when parsing with ABCTune(): {e}."
        )
        LOGGER.warning(msg)
        abc_tune = msg
    except Exception as e:
        msg = (
            f"Not including the following tune:\n{abc_data}\n"
            f"It raised an unhandled error: {e}."
        )
        LOGGER.warning(msg)
        abc_tune = msg
    return abc_tune


DATA_PATH = "data/raw/folk-rnn/data_v1"
OUTPUT_PATH = "data/working/folk-rnn/clean-folk-rnn.txt"


def main(
    data_path: str = DATA_PATH,
    output_path: str = OUTPUT_PATH,
    token_separator: str = TOKEN_SEPARATOR,
    nr_tunes: Optional[int] = None,
    # TODO: reduce this number to zero and clean duff tokens manually (it's currently )
    # partially used to get count = 1 tokens which are just crap (i.e. music21 parse
    # errors) but there are many legitimate count = 1 tokens e.g. chords, accents, etc.
    minimum_token_frequency: Optional[int] = 2,
    log_level: Optional[str] = None,
):
    if log_level is not None:
        LOGGER.setLevel(log_level)
    LOGGER.info(
        f"Tokenizing data from {data_path} and writing to {output_path} using token "
        f"separator {token_separator}"
    )
    output_dir = Path(output_path).parent
    if not output_dir.exists():
        LOGGER.warning(f"Creating output directory {output_dir}")
        output_dir.mkdir(parents=True, exist_ok=False)
    with open(data_path, "r") as fh:
        raw_folkrnn_data = fh.read()
    abc_data_list = [tune_str.strip() for tune_str in raw_folkrnn_data.split("\n\n")]
    orig_nr_tunes = len(abc_data_list)
    LOGGER.info(f"Read {orig_nr_tunes} tunes from {data_path}")

    if nr_tunes is not None:
        LOGGER.info(f"Restricting read to first {nr_tunes} tunes...")
        abc_data_list = abc_data_list[:nr_tunes]
    else:
        nr_tunes = len(abc_data_list)

    token_separator_in_tune = [
        token_separator in "".join(tune) for tune in abc_data_list
    ]
    if any(token_separator_in_tune):
        LOGGER.error(
            f"Can't use token separator {token_separator} because it is contained "
            f"within {sum(token_separator_in_tune)} tunes."
        )
        tunes_with_token_separator = [
            tune for ii, tune in enumerate(abc_data_list) if token_separator_in_tune[ii]
        ]
        tunes_with_token_separator = "\n\n".join(tunes_with_token_separator)
        raise ValueError(
            f"The token separator {token_separator} is contained within the following "
            f"tunes:\n\n{tunes_with_token_separator}"
        )

    LOGGER.info("Checking for duplicate tunes")
    seen_tunes = set()
    abc_data_with_source_idx = []
    for source_idx, tune in enumerate(abc_data_list):
        if tune not in seen_tunes:
            abc_data_with_source_idx.append((source_idx, tune))
            seen_tunes.add(tune)
    dedup_nr_tunes = len(abc_data_with_source_idx)
    nr_duplicate_tunes = nr_tunes - dedup_nr_tunes
    LOGGER.info(
        f"Removed {nr_duplicate_tunes} identical tunes, we now have {dedup_nr_tunes}"
    )

    LOGGER.info(
        "Cleaning the text strings representing the tunes e.g. fixing encoding errors "
        "and removing double quoted text comments."
    )
    clean_abc_data = [
        (source_idx, clean_tune_str(tune_str))
        for source_idx, tune_str in tqdm(
            abc_data_with_source_idx, desc="cleaning tunes"
        )
    ]

    # TODO: profile this
    LOGGER.info("Splitting into tokens using music21 (drop tunes which cant be parsed)")
    tunes = [
        (source_idx, get_abc_tune(abc_data))
        for source_idx, abc_data in tqdm(clean_abc_data, desc="parsing with music21")
    ]
    clean_tunes = [
        (source_idx, tune) for source_idx, tune in tunes if not isinstance(tune, str)
    ]

    LOGGER.info("Cleaning individual tokens")
    tunes_as_token_lists = [
        (
            source_idx,
            [clean_and_standardise_token(tok.src) for tok in tune._abc_handler.tokens],
        )
        for source_idx, tune in tqdm(clean_tunes, desc="splitting into tokens")
    ]

    LOGGER.info(
        "Dropping tunes with rare tokens i.e. tokens which appear less than "
        f"{minimum_token_frequency} times in the whole corpus"
    )
    nr_tunes_before = len(tunes_as_token_lists)
    token_counts = pd.Series(
        [tok for _, tune in tunes_as_token_lists for tok in tune]
    ).value_counts()
    is_removed = token_counts < minimum_token_frequency
    removed_tokens = token_counts[is_removed].index
    nr_removed_tokens = sum(is_removed)
    tunes_as_token_lists = [
        (source_idx, tune)
        for source_idx, tune in tqdm(
            tunes_as_token_lists, desc="removing tunes with rare tokens"
        )
        if all(token not in removed_tokens for token in tune)
    ]
    nr_removed_tunes = nr_tunes_before - len(tunes_as_token_lists)
    LOGGER.info(
        f"Removed {nr_removed_tokens} tokens and the {nr_removed_tunes} tunes which "
        "used them."
    )

    LOGGER.info(f"Writing tokenized data to {output_path}")
    source_idx, tunes_as_token_strings = zip(
        *[
            (str(source_idx), token_separator.join(tune))
            for source_idx, tune in tunes_as_token_lists
        ]
    )
    with open(output_path, "w") as fh:
        fh.write("\n".join(tunes_as_token_strings))
    source_idx_path = f"{os.path.splitext(output_path)[0]}_source_idx.txt"
    LOGGER.info(
        "Writing indices of source tunes (i.e. tune index number for tunes in "
        f"{data_path}) for tokenized tunes {output_path} to {source_idx_path}"
    )
    with open(source_idx_path, "w") as fh:
        fh.write("\n".join(source_idx))

In [None]:
main(
    data_path=,
    output_path: str = OUTPUT_PATH,
    token_separator: str = TOKEN_SEPARATOR,
    nr_tunes: Optional[int] = None,
    # TODO: reduce this number to zero and clean duff tokens manually (it's currently )
    # partially used to get count = 1 tokens which are just crap (i.e. music21 parse
    # errors) but there are many legitimate count = 1 tokens e.g. chords, accents, etc.
    minimum_token_frequency: Optional[int] = 2,
    log_level: Optional[str] = None,
)