In [None]:
import re
import string

import pandas as pd
from pathlib import Path

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Ask for the album's Wikipedia URL; strip whitespace that tends to come along
# when the address is pasted.
url = input("URL: ").strip()
dirs = sorted(d for d in Path.cwd().iterdir() if d.is_dir())

# Choose the sub-directory of the current working directory whose name is most
# similar to the last segment of the URL. token_pattern='.' makes every single
# character a token, so the vectorizer counts n-grams of 1 to 5 characters.
# `.name` is used instead of `.stem` because `.stem` drops everything after the
# last dot, which would truncate names such as "Mr._Tambourine_Man".
count_vec = CountVectorizer(token_pattern='.', ngram_range=(1, 5))
X_dirs = count_vec.fit_transform([d.name for d in dirs])
X_link = count_vec.transform([Path(url).name])
idx = cosine_similarity(X_link, X_dirs).argmax()
path = dirs[idx]

# Read the first table with class "infobox" and turn it into a Series indexed
# by its first column; the name of that column is used as the album title.
info_df = pd.read_html(url, attrs={"class": "infobox"})[0]
info_df = info_df.set_index(info_df.columns[0]).iloc[:, 0]
# The year is taken as the last space/comma separated token of the release date.
year = info_df.loc['Released'].split(", ")[-1].split(' ')[-1]
# Assumes the second infobox row reads "<album type> by <artist>" - TODO confirm.
# Split only once so an artist name that itself contains "by " (for example
# "Toby Keith") is kept whole.
artist = info_df.iloc[1].split('by ', 1)[-1]
title = info_df.index.name

# Displayed so the parsed metadata and the matched folder can be checked by eye
# before anything is renamed.
year, artist, title, path.name

In [None]:
def clean_track(text):
    """Turn a track-list title cell into a filename-friendly track name.

    The first double-quoted section at the start of ``text`` is taken as the
    title; an optional parenthesised note after it is ignored. When ``text``
    does not start with a quoted title, the whole (stripped) text is used
    instead of failing. ``:``, ``/`` and ``\\`` are replaced by ``_`` and each
    word is capitalised.

    Parameters
    ----------
    text : str
        Raw content of a "Title" cell, e.g. ``'"Something" (George Harrison)'``.

    Returns
    -------
    str
        The cleaned, title-cased track name.
    """
    pattern = r'(".+?")( \(.+\))?'
    match = re.match(pattern, text)
    # re.match returns None for cells without a leading quoted title; fall back
    # to the raw text rather than raising AttributeError on `.group`.
    raw = match.group(1) if match else text.strip()
    name = (raw.strip('"')
               .replace(':', '_')
               .replace('/', '_')
               .replace('\\', '_'))
    # str.title() treats an apostrophe as a word boundary ("Don't" -> "Don'T").
    # Capitalising each run of letters, optionally followed by an apostrophe and
    # more letters, gives the same result otherwise while keeping contractions
    # and possessives intact.
    return re.sub(r"[^\W\d_]+(?:['’][^\W\d_]+)?",
                  lambda word: word.group(0).capitalize(),
                  name)

# Collect every table with class "tracklist" from the page and normalise each one.
raw_tables = pd.read_html(url, attrs={"class": "tracklist"})

dfs = []
for table in raw_tables:
    # Bonus-track tables come with multi-index columns; keep the innermost level.
    table.columns = table.columns.get_level_values(-1)

    # Some tables end with a total-time row, recognisable because its 'No.' and
    # 'Title' cells hold the same value; drop that row when present.
    footer = table.iloc[-1]
    has_total_row = footer['No.'] == footer['Title']
    dfs.append(table[:-1] if has_total_row else table)

# Stack all tables, keep only the titles and clean them for use in file names.
all_rows = pd.concat(dfs, axis='rows', sort=False).reset_index(drop=True)
tracks_df = all_rows.loc[:, 'Title'].map(clean_track)
# Number the tracks from 1 across all tables.
tracks_df.index = tracks_df.index + 1
tracks_df

In [None]:
# Audio files inside the matched album folder. The suffix is lower-cased before
# the membership test so files such as "01.MP3" or "track.FLAC" are not skipped.
extensions = ['.mp3', '.m4a', '.flac', '.wav', '.mp4', '.ogg']
files = sorted(file for file in path.iterdir() if file.suffix.lower() in extensions)
names = [file.stem for file in files]

# Single-character tokens combined into n-grams of up to 10 characters.
count_vec = CountVectorizer(token_pattern='.', ngram_range=(1, 10))

# The vocabulary is learned from the cleaned track names; the names found on
# disk are projected onto that same vocabulary.
X_wiki = count_vec.fit_transform(tracks_df)
X_disk = count_vec.transform(names)

# The similarity matrix has one row per track and one column per file, so
# argmax over axis 0 gives, for every file, the position of its closest track.
idxs = cosine_similarity(X_wiki, X_disk).argmax(axis=0)

# Displayed for a manual check of which files were picked up.
[(i, f.name) for i, f in enumerate(files, start=1)]

In [None]:
# Sanity checks before touching the disk: there must be at least as many tracks
# as files, and no two files may have been matched to the same track.
# `len(set(idxs))` replaces `np.unique`, because numpy is never imported here.
assert len(files) <= len(tracks_df)
assert len(files) == len(set(idxs))

# Track names already have ':', '/' and '\' replaced; do the same for the artist
# and album title so that a name like "AC/DC" cannot introduce an extra path
# component and make the rename fail.
unsafe_chars = str.maketrans({':': '_', '/': '_', '\\': '_'})
safe_artist = artist.translate(unsafe_chars)

# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (track number, track name) pairs.
for file, (track_num, track_name) in zip(files, tracks_df.iloc[idxs].items()):
    new_filename = f"{track_num:0>2} - {safe_artist} - {track_name}{file.suffix}"
    new_file = file.parent / new_filename
    file.rename(new_file)

# str.title() would capitalise the letter after an apostrophe ("Pepper'S");
# capitalise each run of letters, optionally followed by an apostrophe and more
# letters, instead.
album_title = re.sub(r"[^\W\d_]+(?:['’][^\W\d_]+)?",
                     lambda word: word.group(0).capitalize(),
                     title).translate(unsafe_chars)
new_dirname = f"[{year}] {safe_artist} - {album_title}"
new_path = path.parent / new_dirname
path.rename(new_path)