In [3]:
!pip install beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 4.5 MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.2.1-py3-none-any.whl (33 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.3 soupsieve-2.2.1


In [169]:
import time
from pathlib import Path
from tqdm.notebook import tqdm

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [45]:
basepath = Path("data", "raw", "thesession.org")
basepath.mkdir(exist_ok=True, parents=True)

In [52]:
max_tunes = 100_000
max_requests_per_second = 20
wait_time = 1/max_requests_per_second
print(f"will take {(max_tunes * wait_time)/60/60} hours")
url_format = "https://thesession.org/tunes/{tune_nr}/abc/1000"

will take 1.3888888888888888 hours


In [58]:
for tune_nr in tqdm(range(1, max_tunes+1)):
    path = basepath / Path(f"{tune_nr:06}.abc")
    url = url_format.format(tune_nr=tune_nr)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    level1_headings = soup.h1
    if level1_headings is None:
        level1_headings_contents = []
    else:
        level1_headings_contents = level1_headings.contents
    if not any(code in level1_headings_contents for code in ['404', '410']):
        with open(path, "w") as file_handle:
            file_handle.write(page.text)
        time.sleep(wait_time)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [61]:
!ls {basepath} | head

000001.abc
000002.abc
000003.abc
000004.abc
000005.abc
000006.abc
000007.abc
000008.abc
000009.abc
000010.abc
ls: write error: Broken pipe


In [62]:
path = Path("data", "raw", "thesession.org")

In [63]:
tune_numbers = [
    int(abc_file.stem)
    for abc_file in [
        filepath for filepath in path.iterdir() if filepath.suffix == ".abc"
    ]
]

In [64]:
len(tune_numbers)

19037

In [65]:
max(tune_numbers)

20903

In [70]:
missing_tunes = list(
    set(range(1, max(tune_numbers)+1)).difference(tune_numbers)
)
len(missing_tunes)

1866

In [73]:
for tune_nr in missing_tunes[:10]:
    url = url_format.format(tune_nr=tune_nr)
    print(url)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")

https://thesession.org/tunes/8198/abc/1000
https://thesession.org/tunes/8202/abc/1000
https://thesession.org/tunes/8207/abc/1000
https://thesession.org/tunes/8208/abc/1000
https://thesession.org/tunes/8210/abc/1000
https://thesession.org/tunes/8212/abc/1000
https://thesession.org/tunes/8216/abc/1000
https://thesession.org/tunes/25/abc/1000
https://thesession.org/tunes/16409/abc/1000
https://thesession.org/tunes/8221/abc/1000


In [74]:
issue_tunes = {}
for abc_file in tqdm([filepath for filepath in path.iterdir() if filepath.suffix == ".abc"]):
    tune_number = int(abc_file.stem)
    with open(abc_file, "r") as fh:
        file_content = fh.read().strip()
        if file_content.startswith("<"):
            issue_tunes[tune_number] = file_content

  0%|          | 0/19037 [00:00<?, ?it/s]

In [75]:
len(issue_tunes)

584

In [None]:
for tune_nr, html in issue_tunes.items():
    print(f"https://thesession.org/tunes/{tune_nr}")
    print(html)
    with open(f"test_{tune_nr}.html", "w") as fh:
        fh.write(html)
    break

In [166]:
for tune_nr in tqdm(issue_tunes):
    path = basepath / Path(f"{tune_nr:06}.abc")
    url = f"https://thesession.org/tunes/{tune_nr}"
    page = requests.get(url, allow_redirects=True)
    soup = BeautifulSoup(page.content, "html.parser")
    level1_headings = soup.h1
    if level1_headings is None:
        level1_headings_contents = []
    else:
        level1_headings_contents = level1_headings.contents
    if not any(code in level1_headings_contents for code in ['404', '410']):
        with open(path, "w") as file_handle:
            tune_tags = soup.find_all("div", {"class": "notes"})
            text = "\n\n".join(
                "".join(
                    tune_string.replace("\r", "\n") for tune_string in tag.strings
                ).replace("\n\n", "\n").strip()
                for tag in tune_tags
            )
            file_handle.write(text)
        time.sleep(wait_time)

  0%|          | 0/584 [00:00<?, ?it/s]

In [171]:
abc_tunes = {}
issue_tunes = {}
for abc_file in tqdm([filepath for filepath in basepath.iterdir() if filepath.suffix == ".abc"]):
    tune_number = int(abc_file.stem)
    with open(abc_file, "r") as fh:
        file_content = fh.read().strip()
        if file_content.startswith("<"):
            issue_tunes[tune_number] = file_content
        else:
            tunes = {
                (tune_number, setting_number + 1): tune.strip()
                for setting_number, tune
                in enumerate(file_content.split("\n\n"))
            }
    abc_tunes.update(tunes)

  0%|          | 0/19037 [00:00<?, ?it/s]

In [172]:
tunes_df = pd.DataFrame(
    abc_tunes.values(),
    index=pd.MultiIndex.from_tuples(
        abc_tunes.keys(), names=["tune_number", "setting_number"]
    ),
    columns=["tune_str"],
)

In [173]:
outdir = Path("data", "derived", "thesession.org")
outdir.mkdir(exist_ok=True, parents=True)
tunes_df.to_csv(outdir / Path("all_tunes.csv"))

In [178]:
with open(outdir / Path("all_tunes.abc"), "w") as fh:
    fh.write("\n\n".join(tunes_df["tune_str"].squeeze()))

# Appendix

In [167]:
tune_nr = 218
url = f"http://thesession.org/tunes/{tune_nr}"
print(url)
page = requests.get(url, allow_redirects=True)

http://thesession.org/tunes/218


In [168]:
page.text
soup = BeautifulSoup(page.content, "html.parser")
tune_tags = soup.find_all("div", {"class": "notes"})
tune_strings = "\n\n".join(
    "".join([s.replace("\r", "\n") for s in tag.strings]).replace("\n\n", "\n").strip()
    for tag in tune_tags
)
print(tune_strings)

X: 1
T: Mairtin O'Connor's Flying Clog
R: reel
M: 4/4
L: 1/8
K: Gmaj
|:G2 (3GGG dGBd|eGce dBAB|G2 (3GGG BGAB|cE (3EEE GEDE|
|G2 (3GGG dBGd|eGce dBAB|cE (3EEE GECE|1 DFAc BcAB:|2 DFAc BGGA||
|:Bd (3ddd edgd|edgd egdc|Bd (3ddd edgd|eaag fedc|
|BcdB ~c3 E|(3EEE cE DGBG|cE (3EEE GECE|1 DFAc BGGA:|2 DFAc BcAB||

X: 2
T: Mairtin O'Connor's Flying Clog
R: reel
M: 4/4
L: 1/8
K: Fmaj
|:F2 (3FFF cFAc|dFBd cAGA|F2 (3FFF AFGA|BD (3DDD FDCD|
|F2 (3FFF cFAc|dFBd cAGA|BD (3DDD FDB,D|1 CEGB ABGA:|2 CEGB AFFG||
|:Ac (3ccc dcfc|dcfc dfcB|Ac (3ccc dcfc|dggf edcB|
|ABcA ~B3 D|(3DDD BD CFAF|BD (3DDD FDB,D|1 CEGB AFFG:|2 CEGB ABGA||

X: 3
T: Mairtin O'Connor's Flying Clog
R: reel
M: 4/4
L: 1/8
K: Gmaj
|: G,2 G,/G,/G, DG,B,D | EG,CE DB,A,B, | G,2 G,/G,/G, B,G,A,B, | "D" C "U" E, "D" E,/E,/E, G,E,D,E, |
G,2 G,/G,/G, DG,B,D | EG,CE DB,A,B, | "U" C "D" E, "D" E,/E,/E, G,E,D,E, |1 D,F,A,D B,DA,B, :|
|: B,D D/D/D EDGD | ED GD EG DC | B,D D/D/D EDGD | EAAG FEDC | 
B,CDB, B,CCE, | E,/E,/E, CE, D,D,B,D, | CE, E,/E,/