In [None]:
from glob import glob
import requests
from bs4 import BeautifulSoup
from os.path import join, basename
import re
import pandas as pd
from typing import List, Optional
from tqdm import tqdm

In [None]:
# Download the Wikipedia list of Bach's compositions and parse every HTML
# table on the page into a DataFrame (one list entry per table).
bach_works_url = "https://en.wikipedia.org/wiki/List_of_compositions_by_Johann_Sebastian_Bach"
html_dfs = pd.read_html(bach_works_url, flavor="bs4")

In [None]:
# Work on a private copy of the sixth parsed table (index 5), which is used
# here as the list of works.
bach_works_df = html_dfs[5].copy()

# remove unreferenced
bach_works_df = bach_works_df[~bach_works_df["BWV"].isna()]
# remove variants
bach_works_df = bach_works_df[~bach_works_df["BWV"].str.contains("/", regex=False)]
# remove html table separations
# (.copy() makes the filtered result an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning)
bach_works_df = bach_works_df[bach_works_df["BD"].fillna("").str.match(r"^\d+$")].copy()

# Leading digits of the BWV identifier, kept as a string (NaN when the
# identifier does not start with a digit).
bach_works_df["BWV_without_version"] = bach_works_df["BWV"].str.extract(r"^(\d+)", expand=False)

# Digits after the first dot ("<number>.<version>"); identifiers without such
# a suffix count as version 1. Raw string: "\d" is not a valid escape sequence
# in a normal string literal. to_numeric avoids mixing str and int values
# before the integer cast.
bwv_version = bach_works_df["BWV"].str.extract(r"^\d+\.(\d+)", expand=False)
bach_works_df["version"] = pd.to_numeric(bwv_version).fillna(1).astype(int)

# Keep one row per BWV number: the one with the highest version.
bach_works_df = bach_works_df.loc[bach_works_df.groupby("BWV_without_version")["version"].idxmax()]

In [None]:
# Root folder of the local FLAC collection.
bach_flac_path = "/home/samuel/Téléchargements/Bach - Complete Works"

# glob() is called without recursive=True, so each "**" behaves like "*":
# only .flac files exactly two directory levels below the root are listed.
flac_pattern = join(bach_flac_path, "**", "**", "*.flac")
musics = glob(flac_pattern)

In [None]:
regex_music, regex_volume = (
    re.compile(pattern)
    for pattern in (
        # File name containing " BWV <digits>[optional lowercase letter]" and
        # ending in ".flac"; group 1 is the BWV identifier.
        r"^.* BWV (\d+[a-z]?).*\.flac$",
        # Path containing "/Vol <I, II, III, ...> "; group 1 is the run of "I"s.
        r".*/Vol (I+) .*",
    )
)

In [None]:
# Map each FLAC path to the BWV identifier found in its file name.
# Files whose name does not match regex_music are left out.
music_bwv = {}
for music in musics:
    bwv_match = regex_music.match(basename(music))
    if bwv_match is None:
        continue
    music_bwv[music] = bwv_match.group(1)

In [None]:
# BWV numbers (as strings) that survived the cleaning of the works table.
bwv_set = set(bach_works_df["BWV_without_version"])

# Keep only the audio files whose BWV identifier is one of those numbers.
matched_rows = []
for wav_path, bwv in music_bwv.items():
    if bwv in bwv_set:
        matched_rows.append([wav_path, bwv])

music_path_df = pd.DataFrame(matched_rows, columns=["wav_path", "bwv"])

In [None]:
# Build a lookup from scoring abbreviation ("code") to its description
# ("scoring") out of the seventh parsed table (index 6). Every row label used
# below is hard-coded, so this cell depends on the table's layout at download
# time.
scoring_df = html_dfs[6].copy()

# First section: rows labelled 0 and 1 (.loc label slices include the end).
# After transposing, each original column becomes a row; "level_1" is the
# column produced by reset_index() for the second level of the former column
# labels (presumably a two-level header -- TODO confirm), and column 0 holds
# the values of former row 0.
voices_df = (
    scoring_df
    .loc[:1]
    .transpose()
    .reset_index()[["level_1", 0]]
    .rename({"level_1": "code", 0: "scoring"}, axis=1)
)

# Second section: rows labelled 2 and 3. Row 2 is promoted to column header,
# then dropped as a data row; the remaining row is transposed so that the
# former header values end up in the column renamed to "code" and the
# remaining row's values in "scoring".
winds_battery_df = scoring_df.loc[2:3]
winds_battery_df.columns = winds_battery_df.iloc[0]
winds_battery_df = (
    winds_battery_df
    .reset_index()
    .drop("index", axis=1)
    .drop(index=0)
    .transpose()
    .reset_index()
    .rename({2: "code", 1: "scoring"}, axis=1)
)

# Third section: rows labelled 5 and 6, reshaped the same way with row 5 as
# header. The extra .drop(index=0) removes the first transposed entry
# (presumably a non-instrument cell -- TODO confirm against the table).
strings_keyboard_df = scoring_df.loc[5:6]
strings_keyboard_df.columns = strings_keyboard_df.iloc[0]
strings_keyboard_df = (
    strings_keyboard_df
    .reset_index()
    .drop("index", axis=1)
    .drop(index=0)
    .transpose()
    .reset_index()
    .drop(index=0)
    .rename({5: "code", 1: "scoring"}, axis=1)
)

# Stack the three sections and renumber the rows from 0.
scoring_legend_df = pd.concat([voices_df, winds_battery_df, strings_keyboard_df], axis=0).reset_index().drop("index", axis=1)

# code -> description; if a code appears more than once, the last row wins.
scoring_legend_dict = {row["code"]: row["scoring"] for _, row in scoring_legend_df.iterrows()}

In [None]:
# Empty placeholder pattern; _parse_scoring does not use it.
regex_scoring_both = re.compile(r"")

# Leading voice token of a scoring string (voice letters, digits, "?", "(",
# ")" or "v.1"/"V.2"-style parts), optionally followed by a space and the rest.
# Group 1 is the voice token.
regex_voices = re.compile(r"^((?:[satbSATBvV\d?()]|(?:[vV]\.[12]))+)(?: .+)?$")
# Individual voice symbols inside a voice token.
regex_voices_2 = re.compile(r"[satbSATBvV]|(?:[vV]\.[12])")

# Scoring string with an optional leading voice token stripped; group 1 is
# the remainder (the whole string when no voice prefix is present).
regex_scoring = re.compile(r"^(?:(?:[satbSATBvV?\d()]|[vV]\.[12])+ )?(.+)$")
# First run of digits in a token (e.g. the "2" of "2Ob").
regex_scoring_2 = re.compile(r"(\d+)")
# First run of ASCII letters in a token (the instrument abbreviation).
# [A-Za-z] rather than [A-z]: the range A-z also covers "[", "\", "]", "^",
# "_" and "`", which are not part of an abbreviation.
regex_scoring_3 = re.compile(r"([A-Za-z]+)")
# Tokens made of optional non-zero digits followed by ASCII letters.
regex_scoring_4 = re.compile(r"([1-9]*[A-Za-z]+)")

# Letter runs that are dropped from the parsed scoring.
remove_scoring = {"SBBB", "Nho", "colla", "parte", "instr", "or", "mezzo"}
# Alternative spellings mapped onto a single abbreviation.
rename_scoring_dict = {
    "Harpsichord": "Hc",
    "Vla": "Va",
    "Vlp": "Vl",
    "Tne": "Tbn",
    "Keyboard": "Kb",
    "Fag": "Bas",
    "Cdc": "Hn",
    "Organ": "Org",
    "Gam": "Vdg",
}

def _parse_scoring(s: str) -> List[str]:
    """Split a raw scoring string into a flat list of voice/instrument codes.

    The leading voice token (if any) contributes one entry per voice symbol.
    Each remaining space-separated token contributes its first run of letters,
    unless that run is listed in ``remove_scoring``; names found in
    ``rename_scoring_dict`` are replaced by their canonical abbreviation.
    Numeric prefixes such as the "2" in "2Ob" are ignored, so an instrument is
    listed once per token regardless of its count.

    Args:
        s: Raw scoring text; an empty string yields an empty list.

    Returns:
        The parsed codes, voices first, in order of appearance.
    """
    scoring = []

    matched_voices = regex_voices.match(s)
    if matched_voices:
        scoring.extend(regex_voices_2.findall(matched_voices.group(1)))
    has_matched_voices = bool(scoring)

    matched_scoring = regex_scoring.match(s)
    if not matched_scoring:
        return scoring

    # A string made of a single token that was already read as voices has no
    # instrument part; do not parse it a second time.
    if has_matched_voices and len(regex_scoring_4.findall(s)) == 1:
        return scoring

    for grp in matched_scoring.group(1).split(" "):
        found_sco = regex_scoring_3.search(grp)
        if not found_sco:
            continue
        name = found_sco.group(1)
        if name in remove_scoring:
            continue
        scoring.append(rename_scoring_dict.get(name, name))

    return scoring

# Parse every "Scoring" cell into a list of codes; missing cells are treated
# as empty strings.
raw_scoring = bach_works_df["Scoring"].fillna("")
bach_works_df["formatted_scoring"] = raw_scoring.apply(_parse_scoring)

In [None]:
# Base address of a work page; the zero-padded "BD" number and a language
# query string are appended to it.
metadata_url = "https://www.bachdigital.de/receive/BachDigitalWork_work_"

# Keep only the columns needed downstream, under shorter names.
metadata_columns = ["Name", "Key", "BWV_without_version", "formatted_scoring", "BD"]
bach_metadata_df = bach_works_df[metadata_columns].rename(
    columns={"BWV_without_version": "bwv", "formatted_scoring": "scoring", "BD": "url"}
)

# Left-pad the BD number with zeros to 9 characters and turn it into the URL.
padded_work_ids = bach_metadata_df["url"].str.pad(9, fillchar="0", side="left")
bach_metadata_df["url"] = metadata_url + padded_work_ids + "?lang=en"

def _get_genre(url: str, timeout: float = 30.0) -> Optional[str]:
    """Fetch a work page and return the text of its "Genre" entry.

    The page is searched for a ``<dl id="generalData">`` list; the ``<dd>``
    following the ``<dt>`` whose text (newlines removed) is exactly "Genre"
    provides the result.

    Args:
        url: Address of the work page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The genre text, or ``None`` when the page has no ``generalData`` list
        or no "Genre" entry.

    Raises:
        requests.exceptions.RequestException: if the download fails or times
            out.
    """
    response = requests.get(url, timeout=timeout)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    page = BeautifulSoup(response.content, "html.parser")

    dl = page.find("dl", {"id": "generalData"})
    if dl is None:
        return None

    for dt in dl.find_all("dt"):
        if dt.get_text().replace("\n", "") == "Genre":
            dd = dt.find_next("dd")
            # A "Genre" label without a following <dd> is treated as missing.
            return dd.get_text() if dd is not None else None
    return None

# Register progress_apply on pandas objects so the scraping shows a progress bar.
tqdm.pandas()

# One HTTP request per work; this is the slow step of the notebook.
genre_by_work = bach_metadata_df["url"].progress_apply(_get_genre)
bach_metadata_df["genre"] = genre_by_work

# Drop the works for which no genre was found.
has_genre = bach_metadata_df["genre"].notna()
bach_metadata_df = bach_metadata_df[has_genre]

# Replace non-breaking spaces in the key names with regular spaces.
bach_metadata_df["Key"] = bach_metadata_df["Key"].str.replace("\xa0", " ")

In [None]:
# Attach the metadata of each work to every audio file with the same BWV
# number (inner join: unmatched files and works are dropped), then lower-case
# the two remaining capitalised column names.
final_df = (
    music_path_df
    .merge(bach_metadata_df, on=["bwv"], how="inner")
    .rename(columns={"Name": "name", "Key": "key"})
)

In [None]:
# Row counts: merged table, works with metadata, audio files with a known BWV.
tuple(len(frame) for frame in (final_df, bach_metadata_df, music_path_df))

In [None]:
# Write the merged table next to the audio files as a ";"-separated CSV,
# without the index column.
output_csv_path = join(bach_flac_path, "metadata.csv")
final_df.to_csv(path_or_buf=output_csv_path, sep=";", index=False)

In [None]:
# Reload the exported file to check that it can be read back.
# NOTE: the list-valued "scoring" cells were written as text, so they come
# back as strings rather than lists.
final_df = pd.read_csv(filepath_or_buffer=output_csv_path, sep=";")

In [None]:
# Display the table reloaded from the CSV file.
final_df