In [2]:
import pandas as pd
from utils import RAW_DATA_DIR
import ast

In [3]:
def load(filepath):
    """Read a tracks CSV with a two-level column header and fix up its dtypes.

    The file is read with its first column as the index and its first two
    rows as the two levels of the column header. The following conversions
    are then applied, in this order:

    * tag and genre list columns are parsed from their literal text with
      ``ast.literal_eval``;
    * creation / recording / release / active-year columns go through
      ``pd.to_datetime``;
    * ``("set", "subset")`` becomes an ordered categorical with the levels
      ``small < medium < large``;
    * genre_top, license, album type, album information and artist bio are
      cast to the ``category`` dtype.

    Parameters
    ----------
    filepath
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The converted table.
    """
    tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

    # Columns whose cells hold the text of a Python literal (e.g. "[21, 12]").
    literal_keys = (
        ("track", "tags"),
        ("album", "tags"),
        ("artist", "tags"),
        ("track", "genres"),
        ("track", "genres_all"),
    )
    for key in literal_keys:
        tracks[key] = tracks[key].map(ast.literal_eval)

    # Columns holding timestamps.
    datetime_keys = (
        ("track", "date_created"),
        ("track", "date_recorded"),
        ("album", "date_created"),
        ("album", "date_released"),
        ("artist", "date_created"),
        ("artist", "active_year_begin"),
        ("artist", "active_year_end"),
    )
    for key in datetime_keys:
        tracks[key] = pd.to_datetime(tracks[key])

    subset_key = ("set", "subset")
    subset_levels = ("small", "medium", "large")
    try:
        # Old pandas spelling: categories/ordered passed straight to astype.
        tracks[subset_key] = tracks[subset_key].astype("category", categories=subset_levels, ordered=True)
    except (ValueError, TypeError):
        # Those keyword arguments were removed in pandas 0.25; build the
        # ordered dtype explicitly instead.
        ordered_dtype = pd.CategoricalDtype(categories=subset_levels, ordered=True)
        tracks[subset_key] = tracks[subset_key].astype(ordered_dtype)

    # Columns stored as unordered categoricals.
    category_keys = (
        ("track", "genre_top"),
        ("track", "license"),
        ("album", "type"),
        ("album", "information"),
        ("artist", "bio"),
    )
    for key in category_keys:
        tracks[key] = tracks[key].astype("category")

    return tracks

In [34]:
# Load the track metadata table from fma_metadata/tracks.csv under RAW_DATA_DIR.
df = load(RAW_DATA_DIR / "fma_metadata" / "tracks.csv")

In [15]:
# Inspect the two-level (group, field) column labels of the loaded frame.
df.columns

MultiIndex([( 'album',          'comments'),
            ( 'album',      'date_created'),
            ( 'album',     'date_released'),
            ( 'album',          'engineer'),
            ( 'album',         'favorites'),
            ( 'album',                'id'),
            ( 'album',       'information'),
            ( 'album',           'listens'),
            ( 'album',          'producer'),
            ( 'album',              'tags'),
            ( 'album',             'title'),
            ( 'album',            'tracks'),
            ( 'album',              'type'),
            ('artist', 'active_year_begin'),
            ('artist',   'active_year_end'),
            ('artist', 'associated_labels'),
            ('artist',               'bio'),
            ('artist',          'comments'),
            ('artist',      'date_created'),
            ('artist',         'favorites'),
            ('artist',                'id'),
            ('artist',          'latitude'),
          

In [46]:
# Show only the rows whose album tag list is non-empty.
df[df["album"]["tags"].map(len).ne(0)]

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
992,0,2008-11-26 02:29:10,NaT,,0,252,,4404,,[baltimore],...,,1654,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,917,,9,,[baltimore],Beautiful Song w/ kick drum solo
4119,0,2008-12-04 09:29:00,1995-12-24,,1,1460,"<p>""Christmas With Quintron - Mr. Quintron, Mi...",4507,Stork,[new orleans],...,,798,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,381,,6,,[new orleans],Organ Music II
5477,0,2009-01-30 07:24:01,2004-01-25,,8,1889,"<p>All content unwritten, unplayed, unsung and...",7851,,"[children, jim reeves, youth performance, coun...",...,,3476,en,Attribution-NonCommercial 3.0 International,1855,,5,,"[children, jim reeves, youth performance, coun...","But You Love Me, Daddy (with Jim Reeves)"
5510,0,2009-01-30 07:24:01,2004-01-25,,8,1889,"<p>All content unwritten, unplayed, unsung and...",7851,,"[children, jim reeves, youth performance, coun...",...,,1822,en,Attribution-NonCommercial 3.0 International,402,,4,,"[children, jim reeves, youth performance, coun...",Technical Difficulty
5570,0,2009-01-30 07:24:01,2004-01-25,,8,1889,"<p>All content unwritten, unplayed, unsung and...",7851,,"[children, jim reeves, youth performance, coun...",...,,1248,en,Attribution-NonCommercial 3.0 International,514,,12,,"[children, jim reeves, youth performance, coun...",You And Me
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154897,0,2017-03-19 06:16:11,2017-03-19,,0,22861,<p>Deep 3D turntable spinning like a phat bitc...,2529,,"[electronic, italian, edm, spettro records, el...",...,,1241,,Creative Commons Attribution-NonCommercial-NoD...,1037,,3,,"[electronic, italian, edm, spettro records, el...",Brainworm
155000,1,2017-03-21 22:56:31,2017-03-22,,0,22880,<p>Some upbeat ukulele lounge loafering.</p>\n...,1208,Andre William owen,"[beatnik, hip, turtleneck, swagger, foxtrot, b...",...,,1491,,Attribution-Noncommercial-Share Alike 3.0 Unit...,1210,,1,,"[beatnik, hip, turtleneck, swagger, foxtrot, b...",turtleneck foxtrot
155063,0,2017-03-24 19:40:34,2017-03-24,JBlanked,1,22899,"<p>Hip Hop, Old School type beat 2017, Trap Mu...",4524,JBlanked,"[old school beats, 2017 free instrumentals, fr...",...,,1283,,Attribution,1050,,4,,"[old school beats, 2017 free instrumentals, fr...",Been On
155064,0,2017-03-24 19:40:34,2017-03-24,JBlanked,1,22899,"<p>Hip Hop, Old School type beat 2017, Trap Mu...",4524,JBlanked,"[old school beats, 2017 free instrumentals, fr...",...,,1077,,Attribution,858,,2,,"[old school beats, 2017 free instrumentals, fr...",Send Me


In [37]:
# Narrow the working frame to the rows labelled "medium" in ("set", "subset"),
# then count the albums whose information is just an empty paragraph tag.
df = df.loc[df["set"]["subset"].eq("medium")]
df["album"]["information"].eq("<p></p>").sum()

np.int64(27)

In [29]:
# Preview the album columns of the first ten "medium" rows that have a
# non-missing album information value.
#
# The result is bound to new names instead of being assigned back to `df`:
# rebinding `df` to this album-only, 10-row slice would drop the "album" /
# "track" / "set" column groups that the following cells still index, so the
# notebook would fail when run top to bottom.
medium_tracks = df[df["set"]["subset"] == "medium"]
album_info_preview = medium_tracks[~medium_tracks["album"]["information"].isna()]["album"].head(10)
album_info_preview

In [54]:
from bs4 import BeautifulSoup
import re

In [56]:
def sanitize_text(html_string):
    """Reduce an HTML snippet to plain text without URLs.

    The markup is parsed with BeautifulSoup's ``html.parser`` and flattened
    to its text, with each piece stripped and the pieces joined by single
    spaces. Note that the text inside tags (links included) is kept; what is
    removed afterwards is any substring that looks like a URL, i.e. starts
    with ``http://``, ``https://`` or ``www.`` (case-insensitive) and runs up
    to the next whitespace. Finally every run of whitespace is collapsed to
    one space and the ends are trimmed.

    Returns the cleaned string.
    """
    text = BeautifulSoup(html_string, "html.parser").get_text(separator=" ", strip=True)
    # Drop URL-looking tokens from the extracted text.
    text = re.sub(r"(https?://\S+|www\.\S+)", "", text, flags=re.IGNORECASE)
    # Removing URLs can leave doubled spaces behind; normalise them.
    return re.sub(r"\s+", " ", text).strip()

In [58]:
# Run sanitize_text over every album information value and count how many
# results are missing (NaN) afterwards.
df["album"]["information"].apply(sanitize_text).isna().sum()


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(html_string, "html.parser")


np.int64(3218)

In [55]:
# Experiment on a single value: take the album information of the last row,
# strip URL-looking substrings *before* parsing, then extract the text.
html_string = df["album"].iloc[-1]["information"]
# Matches http://, https:// or www. followed by non-whitespace, ignoring case.
url_pattern = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
html_string = url_pattern.sub("", html_string)
soup = BeautifulSoup(html_string, "html.parser")
# strip=True trims each text piece; no separator is inserted between pieces here.
soup.get_text(strip=True)

'A live performance at Monty Hall on Feb 17, 2017 on a bill with Screaming Females. For the full video of their performance, please visit'

In [59]:
# Display the columns grouped under the "track" top-level label.
df["track"]

Unnamed: 0_level_0,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,genres_all,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3,256000,0,,2008-11-26 01:48:14,2008-11-26,237,1,Hip-Hop,[21],[21],,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
134,256000,0,,2008-11-26 01:43:19,2008-11-26,207,3,Hip-Hop,[21],[21],,1126,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,943,,5,,[],Street Music
136,256000,1,,2008-11-26 01:43:35,2008-11-26,509,0,Rock,"[45, 58]","[58, 12, 45]",,1948,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1498,,0,,[],Peel Back The Mountain Sky
139,128000,0,,2008-11-26 01:44:05,2008-11-26,296,3,Folk,[17],[17],,702,en,Attribution-Noncommercial-No Derivative Works ...,582,,2,,[],CandyAss
181,256000,0,,2008-11-26 01:46:31,2006-01-01,171,8,Rock,[27],"[27, 12]",,1736,en,Attribution-Noncommercial-No Derivative Works ...,1339,,1,,[],Gopacapulco
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155297,320000,0,,2017-03-30 09:52:59,NaT,252,3,Instrumental,"[18, 107, 1235]","[107, 18, 1235]",,1463,,Attribution-NonCommercial,1049,,9,,[],Nebula Reborn
155298,320000,0,,2017-03-30 10:47:51,NaT,151,0,Folk,"[17, 103]","[17, 103]",,706,,Attribution,590,,2,,[],An Idiot Abroad
155306,320000,0,,2017-03-30 10:48:02,NaT,162,1,Folk,"[17, 103]","[17, 103]",,497,,Attribution,435,,1,,[],Tiny Man
155307,320000,0,,2017-03-30 12:53:18,NaT,271,0,Experimental,[1],"[1, 38]",,630,,Creative Commons Attribution-NonCommercial-NoD...,571,,1,,[],Kolka


In [12]:
# List the frame's column labels again.
df.columns

MultiIndex([( 'album',          'comments'),
            ( 'album',      'date_created'),
            ( 'album',     'date_released'),
            ( 'album',          'engineer'),
            ( 'album',         'favorites'),
            ( 'album',                'id'),
            ( 'album',       'information'),
            ( 'album',           'listens'),
            ( 'album',          'producer'),
            ( 'album',              'tags'),
            ( 'album',             'title'),
            ( 'album',            'tracks'),
            ( 'album',              'type'),
            ('artist', 'active_year_begin'),
            ('artist',   'active_year_end'),
            ('artist', 'associated_labels'),
            ('artist',               'bio'),
            ('artist',          'comments'),
            ('artist',      'date_created'),
            ('artist',         'favorites'),
            ('artist',                'id'),
            ('artist',          'latitude'),
          

In [None]:
import os


def get_audio_path(audio_dir, track_id):
    """
    Build the path of a track's mp3 file inside ``audio_dir``.

    The track ID is zero-padded to six digits. The first three characters of
    that padded ID name the sub-directory, and the full padded ID followed by
    ``.mp3`` names the file.

    Parameters
    ----------
    audio_dir
        Directory under which the audio is stored.
    track_id
        Integer track ID.

    Examples
    --------
    >>> import utils
    >>> AUDIO_DIR = os.environ.get('AUDIO_DIR')
    >>> utils.get_audio_path(AUDIO_DIR, 2)
    '../data/fma_small/000/000002.mp3'

    """
    padded_id = format(track_id, "06d")
    subdir = padded_id[:3]
    filename = padded_id + ".mp3"
    return os.path.join(audio_dir, subdir, filename)