In [1]:
import time
from pathlib import Path
from tqdm.notebook import tqdm

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
basepath = Path("data", "raw", "thesession.org")
basepath.mkdir(exist_ok=True, parents=True)

In [4]:
max_tunes = 100_000
max_requests_per_second = 20
wait_time = 1/max_requests_per_second
print(f"will take {(max_tunes * wait_time)/60/60} hours")

will take 1.3888888888888888 hours


In [5]:
for tune_nr in tqdm(range(1, max_tunes+1)):
    path = basepath / Path(f"{tune_nr:06}.abc")
    # this format *normally* returns a page which contains all tunes, but doesn't always
#     url = f"https://thesession.org/tunes/{tune_nr}/abc/1000"
    url = f"https://thesession.org/tunes/{tune_nr}"
    page = requests.get(url, allow_redirects=True)
    soup = BeautifulSoup(page.content, "html.parser")
    level1_headings = soup.h1
    if level1_headings is None:
        level1_headings_contents = []
    else:
        level1_headings_contents = level1_headings.contents
    if not any(code in level1_headings_contents for code in ['404', '410']):
        with open(path, "w") as file_handle:
#             file_handle.write(page.text)  # if using */abc/1000 url, this worked
            tune_tags = soup.find_all("div", {"class": "notes"})
            text = "\n\n".join(
                "".join(
                    tune_string.replace("\r", "\n") for tune_string in tag.strings
                ).replace("\n\n", "\n").strip()
                for tag in tune_tags
            )
            file_handle.write(text)
        time.sleep(wait_time)

  0%|          | 0/100000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [6]:
!ls {basepath} | head

000001.abc
000002.abc
000003.abc
000004.abc
000005.abc
000006.abc
000007.abc
000008.abc
000009.abc
000010.abc


In [7]:
path = Path("data", "raw", "thesession.org")

In [8]:
tune_numbers = [
    int(abc_file.stem)
    for abc_file in [
        filepath for filepath in path.iterdir() if filepath.suffix == ".abc"
    ]
]

In [9]:
len(tune_numbers)

1313

In [10]:
max(tune_numbers)

1387

In [11]:
missing_tunes = list(
    set(range(1, max(tune_numbers)+1)).difference(tune_numbers)
)
len(missing_tunes)

74

In [13]:
for tune_nr in missing_tunes[:10]:
    url = f"https://thesession.org/tunes/{tune_nr}"
    print(url)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")

https://thesession.org/tunes/519
https://thesession.org/tunes/525
https://thesession.org/tunes/1041
https://thesession.org/tunes/530
https://thesession.org/tunes/1046
https://thesession.org/tunes/25
https://thesession.org/tunes/1071
https://thesession.org/tunes/567
https://thesession.org/tunes/588
https://thesession.org/tunes/599


In [14]:
issue_tunes = {}
for abc_file in tqdm([filepath for filepath in path.iterdir() if filepath.suffix == ".abc"]):
    tune_number = int(abc_file.stem)
    with open(abc_file, "r") as fh:
        file_content = fh.read().strip()
        if file_content.startswith("<"):
            issue_tunes[tune_number] = file_content

  0%|          | 0/1313 [00:00<?, ?it/s]

In [15]:
len(issue_tunes)

0

In [16]:
for tune_nr, html in issue_tunes.items():
    print(f"https://thesession.org/tunes/{tune_nr}")
    print(html)
    with open(f"test_{tune_nr}.html", "w") as fh:
        fh.write(html)
    break

In [18]:
abc_tunes = {}
issue_tunes = {}
for abc_file in tqdm([filepath for filepath in basepath.iterdir() if filepath.suffix == ".abc"]):
    tune_number = int(abc_file.stem)
    with open(abc_file, "r") as fh:
        file_content = fh.read().strip()
        if file_content.startswith("<"):
            issue_tunes[tune_number] = file_content
        else:
            tunes = {
                (tune_number, setting_number + 1): tune.strip()
                for setting_number, tune
                in enumerate(file_content.split("\n\n"))
            }
    abc_tunes.update(tunes)

  0%|          | 0/1313 [00:00<?, ?it/s]

In [21]:
tunes_df = pd.DataFrame(
    abc_tunes.values(),
    index=pd.MultiIndex.from_tuples(
        abc_tunes.keys(), names=["tune_number", "setting_number"]
    ),
    columns=["tune_str"],
).sort_index()
tunes_df

Unnamed: 0_level_0,Unnamed: 1_level_0,tune_str
tune_number,setting_number,Unnamed: 2_level_1
1,1,X: 1\nT: Cooley's\nR: reel\nM: 4/4\nL: 1/8\nK:...
1,2,X: 2\nT: Cooley's\nR: reel\nM: 4/4\nL: 1/8\nK:...
1,3,X: 3\nT: Cooley's\nR: reel\nM: 4/4\nL: 1/8\nK:...
1,4,X: 4\nT: Cooley's\nR: reel\nM: 4/4\nL: 1/8\nK:...
1,5,X: 5\nT: Cooley's\nR: reel\nM: 4/4\nL: 1/8\nK:...
...,...,...
1387,6,X: 6\nT: Monymusk\nR: reel\nM: 4/4\nL: 1/8\nK:...
1387,7,X: 7\nT: Monymusk\nR: reel\nM: 4/4\nL: 1/8\nK:...
1387,8,X: 8\nT: Monymusk\nR: reel\nM: 4/4\nL: 1/8\nK:...
1387,9,X: 9\nT: Monymusk\nR: reel\nM: 4/4\nL: 1/8\nK:...


In [22]:
outdir = Path("data", "working", "thesession.org")
outdir.mkdir(exist_ok=True, parents=True)
tunes_df.to_csv(outdir / Path("all_tunes.csv"))

In [23]:
with open(outdir / Path("all_tunes.abc"), "w") as fh:
    fh.write("\n\n".join(tunes_df["tune_str"].squeeze()))

# Appendix

In [167]:
tune_nr = 218
url = f"http://thesession.org/tunes/{tune_nr}"
print(url)
page = requests.get(url, allow_redirects=True)

http://thesession.org/tunes/218


In [168]:
page.text
soup = BeautifulSoup(page.content, "html.parser")
tune_tags = soup.find_all("div", {"class": "notes"})
tune_strings = "\n\n".join(
    "".join([s.replace("\r", "\n") for s in tag.strings]).replace("\n\n", "\n").strip()
    for tag in tune_tags
)
print(tune_strings)

X: 1
T: Mairtin O'Connor's Flying Clog
R: reel
M: 4/4
L: 1/8
K: Gmaj
|:G2 (3GGG dGBd|eGce dBAB|G2 (3GGG BGAB|cE (3EEE GEDE|
|G2 (3GGG dBGd|eGce dBAB|cE (3EEE GECE|1 DFAc BcAB:|2 DFAc BGGA||
|:Bd (3ddd edgd|edgd egdc|Bd (3ddd edgd|eaag fedc|
|BcdB ~c3 E|(3EEE cE DGBG|cE (3EEE GECE|1 DFAc BGGA:|2 DFAc BcAB||

X: 2
T: Mairtin O'Connor's Flying Clog
R: reel
M: 4/4
L: 1/8
K: Fmaj
|:F2 (3FFF cFAc|dFBd cAGA|F2 (3FFF AFGA|BD (3DDD FDCD|
|F2 (3FFF cFAc|dFBd cAGA|BD (3DDD FDB,D|1 CEGB ABGA:|2 CEGB AFFG||
|:Ac (3ccc dcfc|dcfc dfcB|Ac (3ccc dcfc|dggf edcB|
|ABcA ~B3 D|(3DDD BD CFAF|BD (3DDD FDB,D|1 CEGB AFFG:|2 CEGB ABGA||

X: 3
T: Mairtin O'Connor's Flying Clog
R: reel
M: 4/4
L: 1/8
K: Gmaj
|: G,2 G,/G,/G, DG,B,D | EG,CE DB,A,B, | G,2 G,/G,/G, B,G,A,B, | "D" C "U" E, "D" E,/E,/E, G,E,D,E, |
G,2 G,/G,/G, DG,B,D | EG,CE DB,A,B, | "U" C "D" E, "D" E,/E,/E, G,E,D,E, |1 D,F,A,D B,DA,B, :|
|: B,D D/D/D EDGD | ED GD EG DC | B,D D/D/D EDGD | EAAG FEDC | 
B,CDB, B,CCE, | E,/E,/E, CE, D,D,B,D, | CE, E,/E,/