# Building a Representative Corpus of Jazz Piano Trio Recordings

## Import dependencies, set constants etc.

In [44]:
import functools
import hashlib
import logging
import os
import string
import time
from typing import Iterator

import dotenv
import pandas as pd
import requests
from requests.exceptions import ConnectionError
from thefuzz import process
from tqdm.notebook import tqdm

import src.utils.analyse_utils as autils

In [2]:
# Cosmetic settings only: show every dataframe row, and hide log messages below ERROR level
pd.options.display.max_rows = None
logging.getLogger().setLevel(level=logging.ERROR)

In [3]:
# Notebook-wide constants; adjust these to change what is scraped

# The number of LastFM pages to scrape artist information from
N_PAGES = 500
# The number of artists per LastFM page
N_ARTISTS = 500
# The API root directory that all LastFM calls are made to
ROOT = "https://ws.audioscrobbler.com/2.0"
# The LastFM genre to scrape for artist names
GENRE = 'jazz'
# Only consider artists that have this tag in their name
SEARCH_TAG = 'trio'
# Consider track names below this similarity threshold to be unique
NAME_SIMILARITY_THRESH = 2 / 3

## Get API information

To access the LastFM API, we need an API key and shared secret. These must be applied for manually, the procedure for which is out of scope for this notebook. Follow the advice [given in the LastFM API documentation](https://www.last.fm/api/authentication).

Once you have an API key and a shared secret, these should be stored as environment variables within a file called `.env` that is saved in the root directory of this project folder. These will then be loaded in automatically using the `dotenv` package within the following few lines.

In [4]:
# Load in the environment variables we require for authentication.
# The path is assembled with os.path.join so that it resolves on any operating system,
# not only on those where the backslash is the path separator.
dotenv.load_dotenv(os.path.join(str(autils.get_project_root()), '.env'))
LASTFM_API_KEY = os.getenv('LASTFM_API_KEY')

In [5]:
# These authentication variables aren't currently used, but are created now in case we do end up using them later
SHARED_SECRET = os.getenv('LASTFM_SHARED_SECRET')
# Decode the response as JSON: text received over the network must never be passed to eval()
TOKEN = requests.get(f"{ROOT}/?method=auth.gettoken&api_key={LASTFM_API_KEY}&format=json").json()['token']
# NOTE(review): this is an md5 hash object, not a string; call .hexdigest() on it if a hex signature is required
API_SIGNATURE = hashlib.md5(f"api_key{LASTFM_API_KEY}methodauth.getSessiontoken{TOKEN}{SHARED_SECRET}".encode('utf-8'))

## Scrape data from LastFM using API

The following cells scrape data from the LastFM API using the information and keys created in the cells above.

In [6]:
def get_total_streams(mbid: str) -> dict:
    """Looks up an artist on LastFM by their ID and returns the play count reported for them

    The result is a one-item dictionary of the form `{'playcount': <value from the API>}`.
    """
    # Query the `artist.getinfo` method for this artist
    url = f"{ROOT}/?method=artist.getinfo&mbid={mbid}&api_key={LASTFM_API_KEY}&format=json"
    artist_info = requests.get(url).json()
    # The play count sits inside the artist's `stats` entry
    stats = artist_info['artist']['stats']
    return dict(playcount=stats['playcount'])

def get_top_n_track_streams(mbid: str, num_tracks: int = 3) -> dict:
    """Gets the name and play count of the top n tracks by an artist, by their ID

    Arguments:
        mbid: the artist ID passed to the LastFM `artist.gettoptracks` method
        num_tracks: the maximum number of tracks to include in the result

    Returns:
        A dictionary with keys `track_1_name`, `track_1_plays`, `track_2_name`, ... in the order the API lists
        the tracks. If the artist has fewer than `num_tracks` tracks, only the available tracks are included.

    Raises:
        KeyError: if the response does not contain the expected `toptracks`/`track` structure
    """
    # Make the API request
    request = requests.get(f"{ROOT}/?method=artist.gettoptracks&mbid={mbid}&api_key={LASTFM_API_KEY}&format=json").json()
    res_ = {}
    # Slice rather than index, so an artist with fewer tracks than requested gives a shorter result
    # instead of raising an IndexError
    for tr, track in enumerate(request['toptracks']['track'][:num_tracks], start=1):
        # Get the name and play count from the given track
        res_[f'track_{tr}_name'] = track['name']
        res_[f'track_{tr}_plays'] = track['playcount']
    return res_

In [7]:
def process_artist(artist: dict) -> dict:
    """Builds the full record for one artist scraped from LastFM, or returns None if they are not of interest

    An artist is only processed when their name contains `SEARCH_TAG`; the returned dictionary then combines
    the scraped fields with the artist's total play count and their top tracks.
    """
    # Compare in upper case so that the match is not case-sensitive
    if SEARCH_TAG.upper() not in str(artist['name']).upper():
        return None
    artist_id = artist['mbid']
    total_streams = get_total_streams(mbid=artist_id)
    top_tracks = get_top_n_track_streams(mbid=artist_id)
    # Later entries take precedence if any keys are shared
    return {**artist, **total_streams, **top_tracks}

In [8]:
# Accumulates one entry per processed artist as the LastFM pages are scraped
trios = list()

In [12]:
# Iterate through each page of artists on LastFM; the upper bound is N_PAGES + 1 so that
# exactly N_PAGES pages (1 to N_PAGES inclusive) are requested
for i in tqdm(range(1, N_PAGES + 1)):
    # Make the API request and decode the body
    try:
        tag = requests.get(
            f"{ROOT}/?method=tag.gettopartists&tag={GENRE}&api_key={LASTFM_API_KEY}&page={i}&perpage={N_ARTISTS}&format=json"
        )
        payload = tag.json()
    # If we time out, or the body is not valid JSON, log and then continue to the next page
    except (ConnectionError, ValueError) as e:
        print(e)
        continue
    # Nothing to iterate over if the decoded body is not a dictionary
    if not isinstance(payload, dict):
        continue
    # The artist records are held in lists nested two levels down in the response. Any value at either
    # level that is not of the expected type is skipped, rather than being iterated over as if it were artists
    for v in payload.values():
        if not isinstance(v, dict):
            continue
        for v_ in v.values():
            if not isinstance(v_, list):
                continue
            for lastfm_artist in v_:
                # Try and process the data for one artist
                try:
                    processed = process_artist(lastfm_artist)
                # This is a deliberate best-effort scrape: an artist whose record is irregular, whose
                # follow-up request fails, or who has too few tracks is skipped so the run can continue
                except (TypeError, KeyError, IndexError, ValueError, ConnectionError):
                    continue
                # Artists that do not match the search tag are returned as None and need not be stored
                if processed is not None:
                    trios.append(processed)

  0%|          | 0/499 [00:00<?, ?it/s]

('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))


In [16]:
# Build a dataframe from the artists that were processed (skipping any empty entries), keep a single
# row per artist ID, and discard the LastFM fields that are not needed
df = pd.DataFrame([entry for entry in trios if entry is not None])
df = df.drop_duplicates(subset=['mbid'])
df = df.drop(columns=['image', 'streamable', '@attr'])

## Subset data to only include those in Levine/Gioia

In order to ensure that the artists we include in the corpus played a non-trivial role in jazz history, we cross-reference the list of artists scraped from LastFM with two reference discographies, defined as:

- Gioia, T. (2011). The History of Jazz (2nd ed.). New York: Oxford University Press. (`gioia`)
- Levine, M. (2011). The Jazz Theory Book. Sebastopol: Sher Music Company. (`levine`)

If an artist can be found in the LastFM scraping dataset AND in at least one of the two discographies above AND it passes a final manual check against the inclusion criteria, then they can be included in the corpus.

In [17]:
# Define the reader function to load in the required text file
def reader(f):
    """Returns the lines of `references/corpus_construction/<f>.txt` as a list of strings, without line endings"""
    # Assemble the path with os.path.join so that it resolves on any operating system
    path = os.path.join(str(autils.get_project_root()), 'references', 'corpus_construction', f'{f}.txt')
    # The context manager ensures the file handle is closed once the contents have been read
    with open(path, "r") as file:
        return file.read().splitlines()

In [18]:
# Read each reference list from its text file: the two discographies, then the list of appropriate artists
levine, gioia, appropriate = [reader(source) for source in ('levine', 'gioia', 'appropriate')]

In [19]:
# Subset the dataframe to include artists in one of the two discographies & that are acceptable to the inclusion criteria
df_cut = df[(df['name'].isin(levine)) | (df['name'].isin(gioia))]
# Filter the already-subset frame (not the full `df`), otherwise the discography filter above is discarded
df_cut = df_cut[(df_cut['name'].isin(appropriate))].reset_index(drop=True)

## Get metadata for selected artists from LastFM

Once we have our list of acceptable artists, we can then make an additional call to the LastFM database to get some additional information that would have taken too long to gather during our initial scraping. This includes the total number of unique tracks by that artist.

We define a 'unique track' as one that does not exceed a given similarity threshold with other tracks performed by that artist (i.e. multiple takes of one track are excluded), and which has received over and above a set threshold of plays. This excludes tracks with incorrect or misspelled filenames added by LastFM users.

In [20]:
# These are the substrings we will remove from our track titles.
# A longer variant is listed before the shorter word it starts with ('remastered' before 'remaster'):
# the words are removed one at a time in list order, so removing the shorter word first would leave
# a stray suffix behind.
REMOVERS = [
    'alternate',
    'take',
    'original',
    'mix',
    'live',
    'digital',
    'remastered',
    'remaster',
    'instrumental',
    # Deliberate misspelling, kept so that titles containing this typo are also cleaned
    'intrumental',
    'album',
    'version',
    'rec',
    'bonus',
    'track',
]

In [21]:
def get_unique_tracks(
        mbid: str, n_pages: int = 10, track_limit: int = 50, listener_limit: int = 100
) -> list:
    """Returns a list of unique track names for a particular artist

    Arguments:
        mbid: the artist ID passed to the LastFM `artist.gettoptracks` method
        n_pages: the number of pages of tracks to request (pages 1 to `n_pages` inclusive)
        track_limit: the number of tracks requested per page
        listener_limit: tracks with fewer listeners than this are ignored

    Returns:
        The original (uncleaned) names of the tracks judged to be unique, in the order they were encountered.
        A track is unique when its cleaned name scores below `NAME_SIMILARITY_THRESH` against the cleaned
        name of every track already accepted.
    """
    def is_english(s: str) -> bool:
        """Returns whether a string contains only English/ascii characters"""
        try:
            s.encode(encoding='utf-8').decode('ascii')
        except UnicodeDecodeError:
            return False
        else:
            return True

    def cleaner(s) -> str:
        """Runs cleaning procedure on a string"""
        # Remove all punctuation and numerical characters from the string, then convert to uppercase
        cleaned = ''.join(
            [c for c in s.translate(str.maketrans('', '', string.punctuation)) if not c.isdigit()]
        ).strip().upper()
        # Remove all requested words from the cleaned string
        for word in REMOVERS:
            cleaned = cleaned.replace(word.upper(), '')
        # Collapse the whitespace left behind by the removals, so that a title consisting only of
        # removed words comes back as an empty string
        return ' '.join(cleaned.split())

    seen = []
    non_fmt = []
    # Iterate through each page of tracks, numbering the pages from 1
    for page_num in range(1, n_pages + 1):
        # Make the API request
        try:
            tracks = requests.get(
                f"{ROOT}/?method=artist.gettoptracks&mbid={mbid}&api_key={LASTFM_API_KEY}&format=json&limit={track_limit}&page={page_num}"
            ).json()
        # If we time out, or the body is not valid JSON, log and then continue to the next page
        except (ConnectionError, ValueError) as e_:
            print(e_)
            continue
        # A response without the expected structure (e.g. an error payload) has no tracks to process
        toptracks = tracks.get("toptracks") if isinstance(tracks, dict) else None
        page_tracks = toptracks.get("track") if isinstance(toptracks, dict) else None
        if not isinstance(page_tracks, list):
            continue
        # Iterate through all the tracks on the page (the slice copes with a page that ends early)
        for match in page_tracks[:track_limit]:
            # Skip any track whose record is malformed
            try:
                listeners = int(match["listeners"])
                raw_name = match['name']
            except (KeyError, TypeError, ValueError):
                continue
            # Skip tracks that do not have enough listeners
            if listeners < listener_limit:
                continue
            # Clean the track name to remove punctuations, numbers, etc
            name = cleaner(raw_name)
            # Skip tracks whose cleaned name is empty or is not english
            if not name or not is_english(name):
                continue
            # Get the similarity of the current track to the previously seen tracks. The cleaned name is
            # used on both sides of the comparison, so that like is compared with like
            sims = [item[1] / 100 for item in process.extract(name, seen)]
            # If the maximum similarity to our seen track names is below the threshold, it's unique
            if not sims or max(sims) < NAME_SIMILARITY_THRESH:
                seen.append(name)
                non_fmt.append(raw_name)
    return non_fmt

In [22]:
# Collect the unique tracks (and how many there are) for every artist that passed the filtering
unique_tracks = []
for _, v in tqdm(df_cut.iterrows(), total=df_cut.shape[0]):
    # Query LastFM once per artist and reuse the result; calling twice would double the number of requests
    artist_tracks = get_unique_tracks(v['mbid'])
    unique_tracks.append({
        'name': v['name'],
        'n_unique_tracks': len(artist_tracks),
        'unique_tracks': artist_tracks
    })

  0%|          | 0/34 [00:00<?, ?it/s]

In [28]:
# Store the number of unique tracks found for each artist, in the same order as the dataframe rows
track_counts = [int(entry['n_unique_tracks']) for entry in unique_tracks]
df_cut['unique_tracks'] = track_counts

1232


## Scrape artist metadata from Discogs

Once we have our list of artists from LastFM, we can now turn to scraping [Discogs](https://discogs.com) to gain extra metadata from their releases. This will include building up the networks of musicians they played with, which can (in turn) be used to estimate the number of unique combinations of musicians under the banner of a single artist (for example, the Bill Evans Trio contains 4 unique groups of musicians, all led by Bill Evans).

This requires us to access the Discogs REST API, the documentation for which is provided [at the following page](https://www.discogs.com/developers). As with LastFM, to access Discogs you will need to create an account and generate an API key, a tutorial for which is again out of scope for this notebook.

In [40]:
# Read the Discogs token from the environment; this is None if the variable has not been set
DISCOGS_API_TOKEN = os.environ.get('DISCOGS_API_TOKEN')

In [41]:
# Shared settings for every Discogs request: the API root, plus the token expressed both as an
# authorization header and as a query parameter
discogs_endpoint = "https://api.discogs.com"
headers = {"Authorization": "Discogs token={}".format(DISCOGS_API_TOKEN)}
params = dict(token=DISCOGS_API_TOKEN)

In [120]:
def backoff(func):
    """Simple decorator that forces a one-second wait before each call to the wrapped function, to prevent rate limiting

    The decorated function keeps its own name and docstring, and its return value is passed through unchanged.
    """
    # functools.wraps copies the metadata of `func` onto the wrapper, so decorated functions stay identifiable
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Discogs API rate-limits authenticated users to 60 API calls per minute
        # TODO: implement some sort of error catching here
        time.sleep(1)
        return func(*args, **kwargs)
    return wrapper

@backoff
def get_discogs_artist_id(artist: str) -> int:
    """Searches Discogs for an artist by name (as taken from LastFM) and returns the ID of the first result"""
    # Build the search term: spaces become dashes and everything is lower case
    query = artist.replace(' ', '-').lower()
    search_url = f"{discogs_endpoint}/database/search?q={query}&type=artist"
    response = requests.get(search_url, headers=headers, params=params)
    # The first returned result is (usually) the artist we are looking for
    first_hit = response.json()['results'][0]
    return first_hit['id']

@backoff
def get_discogs_artist_releases(discogs_artist_id: int, page_num: int = '') -> dict:
    """Fetches one page of an artist's releases from Discogs, as the decoded JSON response

    Releases are requested 100 per page, sorted by year in ascending order. When `page_num` is left at its
    default (an empty string) no page is specified in the request.
    """
    # Only add a page argument to the query when a page has been asked for
    page_query = f'&page={page_num}' if page_num != '' else ''
    url = f"{discogs_endpoint}/artists/{discogs_artist_id}/releases?sort=year&sort_order=asc&per_page=100{page_query}"
    response = requests.get(url, headers=headers, params=params)
    return response.json()

@backoff
def parse_discogs_release_for_artists(discogs_release: dict, df_name: str) -> Iterator[dict]:
    """Parses a single release from Discogs to get the names and roles of the individuals who contributed towards it

    Arguments:
        discogs_release: one release entry, which must provide `resource_url` and `artist`
        df_name: the artist name to record against every contributor

    Yields:
        One dictionary per contributor listed under the release's `extraartists`, giving the release details
        alongside the contributor's name and role. Nothing is yielded if the release has no `extraartists`.

    Note:
        This is a generator, so the request for the release is only made once iteration begins.
    """
    # Get the information from a single release
    rel = requests.get(
        discogs_release['resource_url'],
        headers=headers,
        params=params
    ).json()
    # Iterate through all the contributors; a release without any listed simply yields nothing
    for art in rel.get('extraartists', []):
        yield {
            'name': df_name,
            'release_artist': discogs_release['artist'],
            'release_title': rel['title'],
            'release_year': rel['year'],
            'release_tracks': len(rel['tracklist']),
            'artist_name': art['name'],
            'artist_role': art['role']
        }

In [None]:
res = []
# Iterate through each of the artists in our dataframe returned from LastFM
for _, v in tqdm(df_cut.iterrows(), total=df_cut.shape[0], desc='Total artists'):
    # Get the discogs ID for the artist
    artist_id = get_discogs_artist_id(v['name'])
    # Request the releases without specifying a page, to find out how many pages there are for this artist
    first_page = get_discogs_artist_releases(artist_id)
    pages = first_page['pagination']['pages']
    # Iterate through each page
    for page in range(1, pages + 1):
        # Reuse the response we already hold for page 1 rather than requesting it a second time. The page
        # must be passed as `page_num`, which is the name of the parameter on get_discogs_artist_releases
        if page == 1:
            artist_releases = first_page
        else:
            artist_releases = get_discogs_artist_releases(artist_id, page_num=page)
        # Iterate through the releases on the page and get the contributor names and roles
        for release in tqdm(artist_releases['releases'], desc=f'{v["name"]}, page {page}/{pages}'):
            res.extend(parse_discogs_release_for_artists(release, v['name']))

Total artists:   0%|          | 0/34 [00:00<?, ?it/s]

Bill Evans Trio, page 1/4:   0%|          | 0/100 [00:00<?, ?it/s]

Bill Evans Trio, page 2/4:   0%|          | 0/100 [00:00<?, ?it/s]

Bill Evans Trio, page 3/4:   0%|          | 0/100 [00:00<?, ?it/s]

Bill Evans Trio, page 4/4:   0%|          | 0/72 [00:00<?, ?it/s]

Oscar Peterson Trio, page 1/8:   0%|          | 0/100 [00:00<?, ?it/s]

Oscar Peterson Trio, page 2/8:   0%|          | 0/100 [00:00<?, ?it/s]

Oscar Peterson Trio, page 3/8:   0%|          | 0/100 [00:00<?, ?it/s]

Oscar Peterson Trio, page 4/8:   0%|          | 0/100 [00:00<?, ?it/s]

Oscar Peterson Trio, page 5/8:   0%|          | 0/100 [00:00<?, ?it/s]

Oscar Peterson Trio, page 6/8:   0%|          | 0/100 [00:00<?, ?it/s]

Oscar Peterson Trio, page 7/8:   0%|          | 0/100 [00:00<?, ?it/s]

Oscar Peterson Trio, page 8/8:   0%|          | 0/96 [00:00<?, ?it/s]

Ahmad Jamal Trio, page 1/2:   0%|          | 0/100 [00:00<?, ?it/s]

Ahmad Jamal Trio, page 2/2:   0%|          | 0/6 [00:00<?, ?it/s]

## Output results

In [None]:
# Write the filtered artist table to the corpus construction folder. The path is assembled with
# os.path.join so that it resolves on any operating system, not only where the backslash is the separator
df_cut.to_csv(
    os.path.join(
        str(autils.get_project_root()), 'references', 'corpus_construction', 'lastfm_piano_trio_search.csv'
    )
)