Next we need to identify the unique artists in our set and match them with MusicBrainz IDs. We'll then use those IDs to sample users from Last.fm's comments section to identify early adopters and their listening habits.

In [1]:
import os
import numpy as np
import pandas as pd
from glob import glob
import re
import time
# Import API importers
import musicbrainzngs
# NGram
from similarity.ngram import NGram

# Shared NGram(2) instance; twogram_distance() further down calls its
# .distance() method to compare artist names and track titles.
twogram = NGram(2)

# Set MusicBrainz client settings (these are not credentials: MusicBrainz only
# asks for an identifying user agent).
# Rate limit: presumably `new_requests` requests per `limit_or_interval`
# seconds, i.e. one request per second -- confirm against the musicbrainzngs docs.
musicbrainzngs.set_rate_limit(limit_or_interval=1.0, new_requests=1)
# User agent sent with every request: application name, version, contact.
musicbrainzngs.set_useragent('concerts', '0.0.1', 'me@sebastian-engels.com')
# Ask the web service for XML responses.
musicbrainzngs.set_format(fmt='xml')

In [2]:
# Make ../data/interim the working directory, unless the kernel is already
# running from a path that ends in /data/interim.
destination_dir = '/data/interim'
current_dir = os.getcwd()
if current_dir.endswith(destination_dir):
    print('already in correct directory: ',current_dir)
else:
    os.chdir('..' + destination_dir)

In [3]:
# Build a DataFrame with one row per unique artist: the first row seen for each
# artist is kept, and only the 'artist' and 'title' columns are retained.
# (No MusicBrainz id is involved yet -- that lookup happens further down.)
hot100_processed = pd.read_csv('../processed/hot100_processed.csv', sep='\t')
hot100_artists_df = (
    hot100_processed
    .drop_duplicates(subset=['artist'], keep='first')
    .loc[:, ['artist', 'title']]
)
print(hot100_artists_df.shape[0])
hot100_artists_df.head()

9482


Unnamed: 0,artist,title
0,The Association,Windy
1,The Music Explosion,Little Bit O' Soul
2,Frankie Valli,Can't Take My Eyes Off You
3,Scott McKenzie,San Francisco (Be Sure To Wear Flowers In Your...
4,Petula Clark,Don't Sleep In The Subway


In [None]:
# Resume from previously saved progress when the interim file exists; otherwise
# start a fresh lookup table indexed by artist with every row marked unchecked.
#
# Only "there is no usable cache yet" is treated as a reason to start over.
# Any other failure (corrupt file, missing 'artist' column, interrupt, ...) is
# allowed to surface, because silently starting from scratch would let the
# matching loop overwrite the saved progress on its next save.
try:
    mbid_list = pd.read_csv('../interim/hot100_artists.csv',sep='\t',encoding='utf-8',index_col='artist')
except (FileNotFoundError, pd.errors.EmptyDataError):
    mbid_list = pd.DataFrame(index=hot100_artists_df.artist,columns=['mbid','checked'])
    mbid_list.loc[:,'checked'] = False

mbid_list.head()

Unnamed: 0_level_0,mbid,checked
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
The Association,e1fae923-bf20-4d7b-89fb-38ecc0a8236b,True
The Music Explosion,033c88a1-d6d3-4654-8a40-e67fc469608e,True
Frankie Valli,,True
Scott McKenzie,120d7ba0-73e1-4e96-98e3-e75f60f232e8,True
Petula Clark,40e69149-45a7-4dab-8128-c5a4d9654eeb,True


In [None]:
def save_progress(df,output_path):
    """Write `df` to `output_path` as a tab-separated, UTF-8 encoded CSV.

    The index is converted into a regular column first (via reset_index), so
    it is stored as an ordinary named column and no separate index column is
    written.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    output_path : str or path-like
        Destination file. This argument is what decides where the file goes;
        the function no longer depends on module-level `output_dir` / `target`
        variables being defined.
    """
    df.reset_index().to_csv(output_path, sep='\t', index=False, encoding='utf-8')
    

def twogram_distance(row):
    """Return the distance between the first two items of `row`.

    `row` is indexed as row[0] and row[1]; both values are passed unchanged to
    the module-level `twogram` object's `distance` method and its result is
    returned. No type check is done here, so callers are expected to pass
    strings.
    """
    left, right = row[0], row[1]
    return twogram.distance(left, right)

In [None]:
def search_artist_mb(row):
    """Look up the MusicBrainz artist id for an (artist, title) pair.

    Searches MusicBrainz recordings using the track title as the query (up to
    20 results) and scores every artist credited on every returned recording
    with ``twogram_distance(artist) + twogram_distance(title)``.

    Parameters
    ----------
    row : pandas.Series or sequence
        Artist name in the first position and track title in the second.

    Returns
    -------
    str or float
        The id of the credited artist with the lowest combined distance,
        provided that distance is below 0.5. An exact match (combined distance
        of 0.0) is returned immediately. ``np.nan`` when nothing qualifies.
    """
    # Read the two fields by position. A label-indexed Series (as produced by
    # DataFrame.iterrows) must go through .iloc: plain row[0] relies on a
    # positional fallback that pandas has deprecated.
    if hasattr(row, 'iloc'):
        artist_name, track_title = row.iloc[0], row.iloc[1]
    else:
        artist_name, track_title = row[0], row[1]

    res_tracks = musicbrainzngs.search_recordings(query=track_title,limit=20)

    max_dist = 0.5          # a candidate must score strictly below this
    best_match = None
    best_dist = max_dist
    for track in res_tracks.get('recording-list', []):
        track_dist = twogram_distance((track_title, track['title']))
        for artist_credit in track.get('artist-credit', []):
            try:
                mbid_artist = artist_credit['artist']['id']
                mb_artist = artist_credit['artist']['name']
                artist_dist = twogram_distance((artist_name, mb_artist))
            except (TypeError, KeyError):
                # Not a usable artist dict: the credit list also contains join
                # phrases (e.g. "&", "feat.") as plain strings. Skip those.
                continue
            # Score each credited artist on its own. Scoring after the inner
            # loop would only ever look at the last credit of a recording and
            # could pick up an id left over from an earlier recording.
            cumul_dist = artist_dist + track_dist
            if cumul_dist == 0.0:
                return mbid_artist
            if cumul_dist < best_dist:
                best_match = mbid_artist
                best_dist = cumul_dist
    if best_match is not None:
        return best_match
    return np.nan

# Save Progress Settings
output_dir = '../interim/'
target = 'hot100_artists.csv'

# Total Artists
total_artists = len(hot100_artists_df)
print("Total Artists to be matched: ",total_artists)

# Look up every artist that has not been checked yet. Progress is written to
# disk every 10th row, and once more in the `finally` clause, so an interrupt
# or API error part-way through does not lose the lookups made since the last
# periodic save.
try:
    for i, (_, row_values) in enumerate(hot100_artists_df.iterrows()):
        artist = row_values['artist']
        if mbid_list.loc[artist,'checked']:
            # Already handled in an earlier run.
            continue
        mbid_list.loc[artist,'mbid'] = search_artist_mb(row_values)
        mbid_list.loc[artist,'checked'] = True
        if (i % 10) == 0:
            save_progress(mbid_list,output_dir+target)
            # `i` counts skipped rows too, so this is the position in the list.
            print("Progress: {}%".format(round(i/total_artists*100,2)))
        # Small extra pause between lookups, on top of the client's rate limit.
        time.sleep(.2)
finally:
    # Always persist whatever has been collected, on success or failure.
    save_progress(mbid_list,output_dir+target)

# Only reached when the loop ran through every artist without an error.
print("Success: {} of {}".format(total_artists,total_artists))

Total Artists to be matched:  9482
Progress: 62.43%
Progress: 62.54%
Progress: 62.65%
Progress: 62.75%
Progress: 62.86%
Progress: 62.96%
Progress: 63.07%
Progress: 63.17%
Progress: 63.28%
Progress: 63.38%
Progress: 63.49%
Progress: 63.59%
Progress: 63.7%
Progress: 63.81%
Progress: 63.91%
Progress: 64.02%
Progress: 64.12%
Progress: 64.23%
Progress: 64.33%
Progress: 64.44%
Progress: 64.54%
Progress: 64.65%
Progress: 64.75%
Progress: 64.86%
Progress: 64.97%
Progress: 65.07%
Progress: 65.18%
Progress: 65.28%
Progress: 65.39%
Progress: 65.49%
Progress: 65.6%
Progress: 65.7%
Progress: 65.81%
Progress: 65.91%
Progress: 66.02%
Progress: 66.13%
Progress: 66.23%
Progress: 66.34%
Progress: 66.44%
Progress: 66.55%
Progress: 66.65%
Progress: 66.76%
Progress: 66.86%
Progress: 66.97%
Progress: 67.07%
Progress: 67.18%
Progress: 67.29%
Progress: 67.39%
Progress: 67.5%
Progress: 67.6%
Progress: 67.71%
Progress: 67.81%
Progress: 67.92%
Progress: 68.02%
Progress: 68.13%
Progress: 68.23%
Progress: 68.34%
P