In [1]:
import httpx
import pandas as pd

from pathlib import Path
from bs4 import BeautifulSoup


In [2]:
# Input: text file listing the album URLs; default argument of
# extract_urls_from_list_cds. Relative path, resolved against the working directory.
MUSIC_URL_FILE = "../data/list_cds.md"
# Output: parquet file that write_cd_info_to_parquet writes the album table to.
CD_INFO_FILE = "../data/cd_info.parquet"

In [3]:
def extract_urls_from_list_cds(music_file=MUSIC_URL_FILE):
    """Read the album list file and return its entries without the first line.

    Every line is stripped of surrounding whitespace; blank and
    whitespace-only lines are discarded. The first remaining line is then
    dropped unconditionally (presumably the heading of the Markdown list --
    confirm against the data file) and the rest is returned in file order.

    Args:
        music_file: Path of the text file to read. Defaults to MUSIC_URL_FILE.

    Returns:
        list[str]: The stripped, non-empty lines that follow the first one.
        Empty when the file holds at most one non-blank line.
    """
    with open(music_file, "r") as f:
        # Filter on the *stripped* text: comparing the raw line to "\n" lets
        # lines such as "   \n" through as empty strings, which would later be
        # fetched as URLs (or be mistaken for the heading line).
        entries = [text for text in (line.strip() for line in f) if text]
    return entries[1:]

In [4]:
def _meta_content(soup, attrs, music_url):
    """Return the ``content`` of the first <meta> tag matching ``attrs``.

    Raises:
        ValueError: If no such tag exists or it has no ``content`` attribute;
            the message names the URL so the offending album can be found.
    """
    tag = soup.find("meta", attrs)
    if tag is None or tag.get("content") is None:
        raise ValueError(f"No <meta> tag matching {attrs} with content at {music_url}")
    return tag["content"]


def extract_music_info(music_url: str):
    """Fetch an album page and extract its title and artist from <meta> tags.

    The title is read from ``<meta name="apple:title">``. The artist is
    derived from ``<meta property="og:title">`` by removing the
    ``"<title> by "`` text from it.

    Args:
        music_url: URL of the album page to download.

    Returns:
        tuple[str, str]: ``(title, artist)``.

    Raises:
        httpx.HTTPError: If the request fails or the final response has an
            error status.
        ValueError: If either expected <meta> tag is missing from the page.
    """
    # httpx does not follow redirects unless asked to; without this a moved
    # page yields an empty body and the tag lookups below fail confusingly.
    r = httpx.get(music_url, timeout=60, follow_redirects=True)
    # Fail here on 4xx/5xx instead of trying to parse an error page.
    r.raise_for_status()
    # Name the parser explicitly so every environment parses the same way,
    # whether or not an optional third-party parser is installed.
    soup = BeautifulSoup(r.text, "html.parser")
    title = _meta_content(soup, {"name": "apple:title"}, music_url)
    title_artist = _meta_content(soup, {"property": "og:title"}, music_url)
    artist = title_artist.replace(f"{title} by ", "")
    return title, artist

In [5]:
def create_info_df_for_every_album(music_urls):
    """Build a table of album title, artist and URL for the given album URLs.

    Calls extract_music_info once per URL, in order, and collects the results
    into a DataFrame.

    Args:
        music_urls: Iterable of album page URLs.

    Returns:
        pandas.DataFrame: One row per URL with the columns "Album Title",
        "Artist" and "Apple Music URL". For empty input the frame has these
        three columns and no rows.
    """
    cd_info = []
    for url in music_urls:
        title, artist = extract_music_info(url)
        cd_info.append([title, artist, url])
    # Name the columns in the constructor: assigning ``.columns`` afterwards
    # raises a length-mismatch ValueError when there are no rows, because an
    # empty record list produces a frame with zero columns.
    return pd.DataFrame(cd_info, columns=["Album Title", "Artist", "Apple Music URL"])
        

In [6]:
# Load the album URLs listed in MUSIC_URL_FILE.
music_urls = extract_urls_from_list_cds(MUSIC_URL_FILE)

In [None]:
# TODO: read the existing parquet file (CD_INFO_FILE) and skip albums already recorded there, so only new URLs are fetched.

In [7]:
# Issues one HTTP request per URL (via extract_music_info), so this cell needs network access.
cd_info_df = create_info_df_for_every_album(music_urls)

In [8]:
# Display the collected table of album title, artist and URL.
cd_info_df

Unnamed: 0,Album Title,Artist,Apple Music URL
0,Speechless,Nicky Chiswell,https://music.apple.com/au/album/speechless/16...
1,"Greatest Hits, Volume I & Volume II",Billy Joel,https://music.apple.com/au/album/greatest-hits...
2,Bleecker Street: Greenwich Village in the 60's,Various Artists,https://music.apple.com/au/album/bleecker-stre...
3,More Power to Ya (30th Anniversary Edition),Petra,https://music.apple.com/au/album/more-power-to...
4,"Tchaikovsky: 1812 Overture, Op. 49, TH 49; Cap...",Cincinnati Symphony Orchestra & Erich Kunzel,https://music.apple.com/au/album/tchaikovsky-1...
...,...,...,...
90,...But Seriously (Deluxe Edition) [Remastered],Phil Collins,https://music.apple.com/au/album/but-seriously...
91,A Kernel of Wheat,Selah,https://music.apple.com/au/album/a-kernel-of-w...
92,Arriving,Chris Tomlin,https://music.apple.com/au/album/arriving/1440...
93,Fields of Gold - The Best of Sting (1984-1994)...,Sting,https://music.apple.com/au/album/fields-of-gol...


In [9]:
def write_cd_info_to_parquet(cd_info_df):
    """Write the album table to CD_INFO_FILE, keeping one backup of the old file.

    If CD_INFO_FILE already exists it is first moved to the same path with the
    suffix ".parquet.BAK", overwriting any backup left by an earlier run. The
    frame is then written as parquet and the number of rows is printed.

    Args:
        cd_info_df: DataFrame to persist.

    Returns:
        None
    """
    output_file = Path(CD_INFO_FILE)
    if output_file.exists():
        # Path.replace overwrites an existing backup on every platform;
        # Path.rename raises FileExistsError on Windows when the target
        # exists, which broke the second and later runs there.
        output_file.replace(output_file.with_suffix(".parquet.BAK"))
    cd_info_df.to_parquet(output_file)
    print(f"{len(cd_info_df)} albums written")

In [10]:
# Persist the table to CD_INFO_FILE; an existing file is first moved to a .BAK backup.
write_cd_info_to_parquet(cd_info_df)

95 albums written
