Next, we need to identify the unique songs in our set and match them with MusicBrainz IDs. We'll then use those IDs to crawl tags for genre identification.

In [1]:
import os
import numpy as np
import pandas as pd
from glob import glob
import re
import time
# Import API importers
import musicbrainzngs
# NGram
from similarity.ngram import NGram

# Shared NGram(2) instance; twogram_distance() uses it to compare two strings
twogram = NGram(2)

# Set musicbrainz credentials and client behaviour
# Rate limiting: new_requests=1 per limit_or_interval=1.0 (presumably seconds - confirm in the musicbrainzngs docs)
musicbrainzngs.set_rate_limit(limit_or_interval=1.0, new_requests=1)
# User agent sent with requests: application name, version, contact address
musicbrainzngs.set_useragent('concerts', '0.0.1', 'me@sebastian-engels.com')
# Request responses in XML format
musicbrainzngs.set_format(fmt='xml')

In [2]:
# Make ../data/interim the working directory.
# The suffix check makes the cell safe to re-run: once we are inside
# .../data/interim we only report it instead of changing directory again.
destination_dir = '/data/interim'
current_dir = os.getcwd()
if current_dir.endswith(destination_dir):
    print('already in correct directory: ',current_dir)
else:
    os.chdir('..' + destination_dir)

In [3]:
# Build a DataFrame holding every distinct (artist, title) pair of the processed Hot 100 data.
# Only those two columns are kept; the first occurrence of each pair wins, so the
# original row labels of the first occurrences are preserved.
hot100_processed = pd.read_csv('../processed/hot100_processed.csv', sep='\t')
hot100_uniq_df = hot100_processed[['artist', 'title']].drop_duplicates(keep='first')

print(hot100_uniq_df.shape[0])
hot100_uniq_df.head()

28083


Unnamed: 0,artist,title
0,The Association,Windy
1,The Music Explosion,Little Bit O' Soul
2,Frankie Valli,Can't Take My Eyes Off You
3,Scott McKenzie,San Francisco (Be Sure To Wear Flowers In Your...
4,Petula Clark,Don't Sleep In The Subway


In [4]:
# Resume from an earlier run if a progress file exists, otherwise start a fresh table.
# The progress file is the one the matching loop writes ('../interim/hot100_uniq_mbid.csv').
# save_progress() flattens the index into 'title' and 'artist' columns, so both are
# restored as a two-level index here; the loop looks rows up by (title, artist).
try:
    mbid_list = pd.read_csv('../interim/hot100_uniq_mbid.csv',sep='\t',encoding='utf-8',index_col=['title','artist'])
except FileNotFoundError:
    # No earlier run: one row per unique (title, artist) pair, nothing checked yet
    mbid_list = pd.DataFrame(index=[hot100_uniq_df.title,hot100_uniq_df.artist],columns=['mbid','checked'])
    mbid_list.loc[:,'checked'] = False

mbid_list.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mbid,checked
title,artist,Unnamed: 2_level_1,Unnamed: 3_level_1
Windy,The Association,,False
Little Bit O' Soul,The Music Explosion,,False
Can't Take My Eyes Off You,Frankie Valli,,False
San Francisco (Be Sure To Wear Flowers In Your Hair),Scott McKenzie,,False
Don't Sleep In The Subway,Petula Clark,,False


In [5]:
def save_progress(df,output_path):
    """Write ``df`` to ``output_path`` as a tab-separated, UTF-8 encoded file.

    The index is first turned into ordinary columns (``reset_index``), so the
    index levels end up as the leading columns of the file and no separate
    index column is written.
    """
    flattened = df.reset_index()
    flattened.to_csv(output_path, index=False, encoding='utf-8', sep='\t')
    
def twogram_distance(row):
    """Return ``twogram.distance`` of the first two elements of ``row``.

    ``row`` is indexed with ``row[0]`` and ``row[1]``; any further elements
    are ignored. No type checking is done here, so both elements are expected
    to be strings already.
    """
    first, second = row[0], row[1]
    return twogram.distance(first, second)

In [6]:
def search_artist_mb(row):
    """Find the MusicBrainz recording id that best matches an (artist, title) pair.

    The title is sent to ``musicbrainzngs.search_recordings`` (up to 20 hits).
    Each hit is scored as ``title distance + artist distance`` using
    ``twogram_distance``. The artist distance of a hit is the smallest
    distance between the wanted artist and any artist in its artist credit,
    so a recording credited to several artists still matches.

    Parameters
    ----------
    row : sequence or pandas.Series
        The artist name comes first and the track title second. The values
        are unpacked by position.

    Returns
    -------
    str or float
        The id of the first hit with a combined distance of exactly 0,
        otherwise the id of the hit with the smallest combined distance below
        0.5, otherwise ``np.nan`` when nothing is close enough.
    """
    # Unpack by position: works for tuples and for label-indexed Series alike
    artist_name, track_title = tuple(row)[:2]
    res_tracks = musicbrainzngs.search_recordings(query=track_title,limit=20)
    max_dist = 0.5
    best_match = None
    best_dist = max_dist
    for track in res_tracks.get('recording-list', []):
        mbid_title = track['id']
        track_dist = twogram_distance((track_title, track['title']))
        # Default of 1 also covers recordings without any usable artist credit
        artist_dist = 1
        for artist_credit in track.get('artist-credit', []):
            try:
                mb_artist = artist_credit['artist']['name']
                credit_dist = twogram_distance((artist_name, mb_artist))
            except (TypeError, KeyError):
                # Not a valid artist dict but a join phrase (e.g. &, feat. etc.),
                # or a name that cannot be compared: ignore this entry
                continue
            artist_dist = min(artist_dist, credit_dist)
        cumul_dist = artist_dist + track_dist
        if cumul_dist == 0.0:
            # Perfect match on both title and artist: no need to look further
            return mbid_title
        if cumul_dist < best_dist:
            best_match = mbid_title
            best_dist = cumul_dist
    if best_match is None:
        return np.nan
    return best_match

# Save Progress Settings 
output_dir = '../interim/'
target = 'hot100_uniq_mbid.csv'

# Total Titles
total_titles = len(hot100_uniq_df)
print("Total Titles to be matched: ",total_titles)
try:
    for i, (_, row_values) in enumerate(hot100_uniq_df.iterrows()):
        # Address rows by the full (title, artist) key: a title alone would also
        # hit different songs by other artists that share the same title
        song_key = (row_values['title'],row_values['artist'])
        if mbid_list.loc[song_key,'checked']:
            # Already looked up in an earlier (partial) run
            continue
        mbid_list.loc[song_key,'mbid'] = search_artist_mb(row_values)
        mbid_list.loc[song_key,'checked'] = True
        if (i % 10) == 0:
            # Periodic checkpoint so a crash loses little work
            save_progress(mbid_list,output_dir+target)
            print("Progress: {}%".format(round(i/total_titles*100,2)))
        time.sleep(.2)
    print("Success: {} of {}".format(total_titles,total_titles))
finally:
    # Always persist what has been matched so far, including after an interrupt or
    # a failed request; the results gathered since the last checkpoint are kept
    save_progress(mbid_list,output_dir+target)

Total Titles to be matched:  28083
Progress: 0.0%
Progress: 0.04%
Progress: 0.07%
Progress: 0.11%
Progress: 0.14%
Progress: 0.18%
Progress: 0.21%
Progress: 0.25%
Progress: 0.28%
Progress: 0.32%
Progress: 0.36%
Progress: 0.39%
Progress: 0.43%
Progress: 0.46%
Progress: 0.5%
Progress: 0.53%
Progress: 0.57%
Progress: 0.61%
Progress: 0.64%
Progress: 0.68%
Progress: 0.71%
Progress: 0.75%
Progress: 0.78%
Progress: 0.82%
Progress: 0.85%
Progress: 0.89%
Progress: 0.93%
Progress: 0.96%
Progress: 1.0%
Progress: 1.03%
Progress: 1.07%
Progress: 1.1%
Progress: 1.14%
Progress: 1.18%
Progress: 1.21%
Progress: 1.25%
Progress: 1.28%
Progress: 1.32%
Progress: 1.35%
Progress: 1.39%
Progress: 1.42%
Progress: 1.46%
Progress: 1.5%
Progress: 1.53%
Progress: 1.57%
Progress: 1.6%
Progress: 1.64%
Progress: 1.67%
Progress: 1.71%
Progress: 1.74%
Progress: 1.78%
Progress: 1.82%
Progress: 1.85%
Progress: 1.89%
Progress: 1.92%
Progress: 1.96%
Progress: 1.99%
Progress: 2.03%
Progress: 2.07%
Progress: 2.1%
Progress: 2.

KeyboardInterrupt: 