In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline  
import urllib.request
import urllib.parse
import urllib.error
import json
import base64
import configparser
from bs4 import BeautifulSoup
import re
import pymongo
from datetime import datetime
import time
import collections
import editdistance

In [2]:
# Open a connection to the Mongo server
client = pymongo.MongoClient('mongodb://localhost:27017/')

In [40]:
# Create a database and a collections within it.
songs_db = client.songs
albums = songs_db.albums
tracks = songs_db.tracks
genius_tracks = songs_db.gtracks
discog_tracks = songs_db.discog_tracks

In [4]:
config = configparser.ConfigParser()
config.read('secrets.ini')
[k for k in config['discogs']]

['app_name', 'consumer_key', 'consumer_secret', 'token']

In [5]:
def get_audio_features(track_ids, auth_type, auth_token):
    url = 'https://api.spotify.com/v1/audio-features?ids={ids}'.format(ids=','.join(track_ids))
    headers = {'Authorization': auth_type + ' ' + auth_token}
    request = urllib.request.Request(url, headers=headers, method='GET')
    
    for _ in range(10):
        try:
            with urllib.request.urlopen(request) as f:
                response = json.loads(f.read().decode('utf-8'))
                for track in response['audio_features']:
                    tracks.update_one({'_id': track['id']}, {'$set': track})
                break
        except urllib.error.HTTPError as e:
            print("Rate limited. Pausing for", e.info()['Retry-After'])
            time.sleep(int(e.info()['Retry-After']) + 0.5)
            continue     

In [None]:
/database/search?q={query}&{?type,title,release_title,credit,artist,anv,label,genre,style,country,year,format,catno,barcode,track,submitter,contributor}

In [7]:
def get_artists(artist_name):
    query = urllib.parse.urlencode({'q': artist_name, 'type': 'artist'})
    # query = urllib.parse.urlencode({'q': artist_name})
    url = 'https://api.discogs.com/database/search?{}'.format(query)
    headers = {'Authorization': 'Discogs token=' + config['discogs']['token']}
    artists = []
    while url:
        request = urllib.request.Request(url, headers=headers, method='GET')
        with urllib.request.urlopen(request) as f:
            response = json.loads(f.read().decode('utf-8'))
            artists += response['results']
            if 'next' in response['pagination']['urls']:
                url = response['pagination']['urls']['next']
            else:
                url = None
    return artists

In [148]:
def discogs_search(search_params, show_progress=False):
    query = urllib.parse.urlencode(search_params)
    url = 'https://api.discogs.com/database/search?{}'.format(query)
    headers = {'Authorization': 'Discogs token=' + config['discogs']['token']}
    results = []
    while url:
        try:
            request = urllib.request.Request(url, headers=headers, method='GET')
            with urllib.request.urlopen(request) as f:
                response = json.loads(f.read().decode('utf-8'))
                if show_progress:
                    print('Retrieved page', response['pagination']['page'], 'of', response['pagination']['pages'],
                         ';', f.info()['X-Discogs-Ratelimit-Remaining'], 'requests before rate limit')
                results += response['results']
                if 'next' in response['pagination']['urls']:
                    url = response['pagination']['urls']['next']
                else:
                    url = None
                if int(f.info()['X-Discogs-Ratelimit-Remaining']) <= 1:
                    if show_progress:
                        print('Rate limit: sleeping')
                    time.sleep(3)
        except urllib.error.HTTPError as e:
#                 print("Rate limited. Pausing for", e.info()['Retry-After'])
#                 time.sleep(int(e.info()['Retry-After']) + 0.5)
            print('Rate limited. Sleeping for 62 seconds')
            time.sleep(62)
            continue
    return results

In [128]:
def get_artist_albums(artist_name, show_progress=False):
    return discogs_search({'artist': artist_name, 
                           'format': 'album'},
                         show_progress=show_progress)   

In [8]:
nivs = get_artists('Nirvana')
len(nivs)

211

In [9]:
[n for n in nivs if 'Nirvana' in n['title']]

[{'id': 125246,
  'resource_url': 'https://api.discogs.com/artists/125246',
  'thumb': 'https://img.discogs.com/_W4bvRx11IqhvAG8eqjofA5MqhM=/150x150/smart/filters:strip_icc():format(jpeg):mode_rgb():quality(40)/discogs-images/A-125246-1501285200-3058.jpeg.jpg',
  'title': 'Nirvana',
  'type': 'artist',
  'uri': '/artist/125246-Nirvana'},
 {'id': 307513,
  'resource_url': 'https://api.discogs.com/artists/307513',
  'thumb': 'https://img.discogs.com/ib710cDa2aK62NZIOqgb3AyGqd8=/150x150/smart/filters:strip_icc():format(jpeg):mode_rgb():quality(40)/discogs-images/A-307513-1270236942.jpeg.jpg',
  'title': 'Nirvana (2)',
  'type': 'artist',
  'uri': '/artist/307513-Nirvana-2'},
 {'id': 1087206,
  'resource_url': 'https://api.discogs.com/artists/1087206',
  'thumb': 'https://img.discogs.com/rRzxkphLYtHr8bD9v8UPucd1zos=/150x150/smart/filters:strip_icc():format(jpeg):mode_rgb():quality(40)/discogs-images/A-1087206-1266869411.jpeg.jpg',
  'title': 'Nirvana 2002',
  'type': 'artist',
  'uri': '/a

In [10]:
tbs = get_artists('The Beatles')
len(tbs)

701

In [11]:
atbs = [a for a in tbs if re.match('^The Beatles(\s\(\d+\))?$', a['title'])]
len(atbs)

3

In [12]:
atbs

[{'id': 82730,
  'resource_url': 'https://api.discogs.com/artists/82730',
  'thumb': 'https://img.discogs.com/z5jHbqQZ7JJ-g0OKPfpX-3luLpo=/150x150/smart/filters:strip_icc():format(jpeg):mode_rgb():quality(40)/discogs-images/A-82730-1319532675.jpeg.jpg',
  'title': 'The Beatles',
  'type': 'artist',
  'uri': '/artist/82730-The-Beatles'},
 {'id': 2517607,
  'resource_url': 'https://api.discogs.com/artists/2517607',
  'thumb': '',
  'title': 'The Beatles (2)',
  'type': 'artist',
  'uri': '/artist/2517607-The-Beatles-2'},
 {'id': 4290435,
  'resource_url': 'https://api.discogs.com/artists/4290435',
  'thumb': '',
  'title': 'The Beatles (3)',
  'type': 'artist',
  'uri': '/artist/4290435-The-Beatles-3'}]

In [13]:
beatles_keys = [a for a in atbs if a['id'] == 82730][0]
beatles_keys

{'id': 82730,
 'resource_url': 'https://api.discogs.com/artists/82730',
 'thumb': 'https://img.discogs.com/z5jHbqQZ7JJ-g0OKPfpX-3luLpo=/150x150/smart/filters:strip_icc():format(jpeg):mode_rgb():quality(40)/discogs-images/A-82730-1319532675.jpeg.jpg',
 'title': 'The Beatles',
 'type': 'artist',
 'uri': '/artist/82730-The-Beatles'}

In [20]:
def get_artist(artist_keys):
    url = artist_keys['resource_url']
    headers = {'Authorization': 'Discogs token=' + config['discogs']['token']}
    request = urllib.request.Request(url, headers=headers, method='GET')
    with urllib.request.urlopen(request) as f:
        response = json.loads(f.read().decode('utf-8'))
        return response     

In [94]:
def get_by_url(url):
    headers = {'Authorization': 'Discogs token=' + config['discogs']['token']}
    request = urllib.request.Request(url, headers=headers, method='GET')
    with urllib.request.urlopen(request) as f:
        response = json.loads(f.read().decode('utf-8'))
        return response     

In [22]:
beatles_data = get_artist(beatles_keys)
beatles_data.keys()

dict_keys(['urls', 'members', 'images', 'name', 'profile', 'uri', 'resource_url', 'namevariations', 'releases_url', 'aliases', 'data_quality', 'id'])

In [26]:
beatles_data['data_quality']

'Needs Vote'

In [37]:
def get_releases(releases_url, show_progress=False):
    url = releases_url
    headers = {'Authorization': 'Discogs token=' + config['discogs']['token']}
    releases = []
    while url:
        request = urllib.request.Request(url, headers=headers, method='GET')
        try:
            with urllib.request.urlopen(request) as f:
                response = json.loads(f.read().decode('utf-8'))
                if show_progress:
                    print('Retrieved page', response['pagination']['page'], 'of', response['pagination']['pages'])
                releases += response['releases']
                if 'next' in response['pagination']['urls']:
                    url = response['pagination']['urls']['next']
                else:
                    url = None
        except urllib.error.HTTPError as e:
#                 print("Rate limited. Pausing for", e.info()['Retry-After'])
#                 time.sleep(int(e.info()['Retry-After']) + 0.5)
                print('Rate limited. Sleeping for 62 seconds')
                time.sleep(62)
                continue

    return releases

In [32]:
beatles_releases = get_releases(beatles_data['releases_url'])
beatles_releases.keys()

KeyError: 'results'

In [35]:
url = beatles_data['releases_url']
headers = {'Authorization': 'Discogs token=' + config['discogs']['token']}
releases = []
while url:
    request = urllib.request.Request(url, headers=headers, method='GET')
    try:
        with urllib.request.urlopen(request) as f:
            response = json.loads(f.read().decode('utf-8'))
            print(response.keys())
            releases += response['releases']
            if 'next' in response['pagination']['urls']:
                url = response['pagination']['urls']['next']
            else:
                url = None
    except urllib.error.HTTPError as e:
#                 print("Rate limited. Pausing for", e.info()['Retry-After'])
#                 time.sleep(int(e.info()['Retry-After']) + 0.5)
            print('Rate limited. Sleeping for 62 seconds')
            time.sleep(62)
            continue

dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['pagination', 'releases'])
dict_keys(['

In [57]:
response.info()

AttributeError: 'dict' object has no attribute 'info'

In [38]:
len(releases)

5488

In [39]:
releases[0]

{'artist': 'The Beatles',
 'id': 846231,
 'main_release': 6849682,
 'resource_url': 'https://api.discogs.com/masters/846231',
 'role': 'Main',
 'thumb': 'https://img.discogs.com/uGrpdgGIbyvuYjkM4ndGsZ_ri8c=/fit-in/150x150/filters:strip_icc():format(jpeg):mode_rgb():quality(40)/discogs-images/R-6849682-1475807872-1981.jpeg.jpg',
 'title': 'The Beatles In Hamburg',
 'type': 'master',
 'year': 1961}

In [41]:
set(r['role'] for r in releases)

{'Appearance', 'Main', 'TrackAppearance', 'UnofficialRelease'}

In [42]:
collections.Counter(r['role'] for r in releases)

Counter({'Appearance': 278,
         'Main': 1234,
         'TrackAppearance': 813,
         'UnofficialRelease': 3163})

In [48]:
[r for r in releases if r.get('format') == 'Album'][:5]

[]

In [50]:
set(r['format'] for r in releases if 'format' in r)

{'Cass, Club',
 '2xLP, Album, Unofficial',
 '2xLP, Unofficial, Red',
 '4xCD, Album, Comp, Mono',
 'CD, EP',
 'Lathe, 7", Num, Unofficial',
 '2x12", Comp, Unofficial, W/Lbl',
 '5xCD, Album, Unofficial + DVD, Unofficial, NTSC, C',
 '4xCD, Comp, Ltd, RM, Unofficial, Gol',
 'CD, Promo, 14 ',
 'Flexi, 7", S/Sided, Unofficial',
 'Cass, Comp, Ltd, Num',
 'DVD, Unofficial, NTSC, NTS + CD, Album, Unofficial',
 'CD, Comp, Rem',
 '2xCD, Comp, Promo, Gat',
 '5xCD, Unofficial, Ltd, RM',
 'Acetate, 7", Unofficial',
 '2xLP, Comp, Ltd, Num, RM',
 '9xLP, Comp + Box',
 '7", EP, Mono, Unofficial',
 'Flexi, 7"',
 'DVD-V, Comp',
 '8-Trk, Unofficial',
 'DVD-V, Comp, Unofficial',
 'Box + 13xCD, Album, RE, RM + 7xCD, Comp, RM + 5xDV',
 '2xCD, Comp, Promo, Dig',
 'CD, Album, RM, Unofficial + CD',
 '6", S/Sided, Unofficial, Pos',
 'CD, Mixed, Unofficial, Comp',
 '7", Ltd, Promo',
 '2xLP, Comp, Ltd',
 'LP, Ltd, Num, Unofficial, Blu',
 'CDr, Comp, Promo, Unofficial',
 'LP, Unofficial, Blu',
 'CD, mp3',
 '7", EP, 

In [51]:
[r for r in releases 
 if 'format' in r
 if 'CD' in r['format'] or 'LP' in r['format'] or 'Album' in r['format']][:5]

[{'artist': 'Los Beatles*',
  'format': 'LP, Comp, Mono',
  'id': 3709625,
  'label': 'Odeon',
  'resource_url': 'https://api.discogs.com/releases/3709625',
  'role': 'Main',
  'status': 'Accepted',
  'thumb': 'https://img.discogs.com/Q9TRW_NGja6wOMeHAlOP9_9fj0M=/fit-in/150x150/filters:strip_icc():format(jpeg):mode_rgb():quality(40)/discogs-images/R-3709625-1341255067-5666.jpeg.jpg',
  'title': 'Surfin Con Los Beatles',
  'type': 'release',
  'year': 1964},
 {'artist': 'The Beatles',
  'format': 'LP, Comp, Mono',
  'id': 3709649,
  'label': 'Odeon',
  'resource_url': 'https://api.discogs.com/releases/3709649',
  'role': 'Main',
  'status': 'Accepted',
  'thumb': 'https://img.discogs.com/iPFUHDU7ce_qyj3j04gFJfQ1F-4=/fit-in/150x150/filters:strip_icc():format(jpeg):mode_rgb():quality(40)/discogs-images/R-3709649-1341255732-4215.jpeg.jpg',
  'title': 'The Beatles Hits',
  'type': 'release',
  'year': 1964},
 {'artist': 'The Beatles',
  'format': 'LP, Album, Mono, Comp',
  'id': 3709693,
  

In [129]:
beatles_albums = get_artist_albums('The Beatles', show_progress=True)
len(beatles_albums)

Retrieved page 1 of 142 ; 59 requests before rate limit
Retrieved page 2 of 142 ; 58 requests before rate limit
Retrieved page 3 of 142 ; 57 requests before rate limit
Retrieved page 4 of 142 ; 56 requests before rate limit
Retrieved page 5 of 142 ; 55 requests before rate limit
Retrieved page 6 of 142 ; 54 requests before rate limit
Retrieved page 7 of 142 ; 53 requests before rate limit
Retrieved page 8 of 142 ; 52 requests before rate limit
Retrieved page 9 of 142 ; 51 requests before rate limit
Retrieved page 10 of 142 ; 50 requests before rate limit
Retrieved page 11 of 142 ; 49 requests before rate limit
Retrieved page 12 of 142 ; 48 requests before rate limit
Retrieved page 13 of 142 ; 47 requests before rate limit
Retrieved page 14 of 142 ; 46 requests before rate limit
Retrieved page 15 of 142 ; 45 requests before rate limit
Retrieved page 16 of 142 ; 44 requests before rate limit
Retrieved page 17 of 142 ; 43 requests before rate limit
Retrieved page 18 of 142 ; 42 requests b

7081

In [130]:
beatles_albums[0]

{'barcode': ['7464-48544-2', 'DIDP-076121 2', '[DADC logo]', 'AAD'],
 'catno': 'AK 48544',
 'community': {'have': 46, 'want': 6},
 'country': 'US',
 'format': ['CD', 'Album', 'Remastered'],
 'genre': ['Rock'],
 'id': 1180927,
 'label': ['Sony Music Special Products', 'Star-Club Hamburg', 'DADC'],
 'resource_url': 'https://api.discogs.com/masters/1180927',
 'style': ['Rock & Roll'],
 'thumb': 'https://img.discogs.com/eStw6Cr2L3xHUD8Wwmw2n-nuu5k=/fit-in/150x150/filters:strip_icc():format(jpeg):mode_rgb():quality(40)/discogs-images/R-1436372-1355136570-4485.jpeg.jpg',
 'title': 'The Beatles - Live! At The Star-Club In Hamburg, Germany; 1962 (Vol. 1)',
 'type': 'master',
 'uri': '/The-Beatles-Live-At-The-Star-Club-In-Hamburg-Germany-1962-Vol-1/master/1180927',
 'year': '1991'}

In [133]:
beatles_df = pd.DataFrame([{'id': a['id'], 'year': a.get('year'), 
                            'title': a['title'][14:],
                            'resource_url': a['resource_url']}
              for a in beatles_albums
              if 'year' in a
              if a['country'] == 'UK'
              if a['title'].startswith('The Beatles - ')
              if a['type'] == 'master'])
beatles_df

Unnamed: 0,id,resource_url,title,year
0,24047,https://api.discogs.com/masters/24047,Abbey Road,1969
1,1060866,https://api.discogs.com/masters/1060866,"Live! At The Star Club, Hamburg, Germany 1962 ...",1985
2,45895,https://api.discogs.com/masters/45895,Help!,1965
3,45362,https://api.discogs.com/masters/45362,Please Please Me,1963
4,54585,https://api.discogs.com/masters/54585,Rock 'N' Roll Music,1976
5,24003,https://api.discogs.com/masters/24003,A Hard Day's Night,1964
6,45729,https://api.discogs.com/masters/45729,With The Beatles,1963
7,347141,https://api.discogs.com/masters/347141,Sgt. Pepper's Lonely Hearts Club Band,1967
8,54829,https://api.discogs.com/masters/54829,The Beatles At The Hollywood Bowl,1977
9,45799,https://api.discogs.com/masters/45799,Beatles For Sale,1964


In [134]:
collections.Counter(beatles_df['title']).most_common(5)

[("Sgt. Pepper's Lonely Hearts Club Band", 4),
 ('The Beatles At The Hollywood Bowl', 2),
 ('Beatles For Sale', 2),
 ('1962-1966', 1),
 ("Rock 'N' Roll Music", 1)]

In [135]:
abbey_road = get_by_url('https://api.discogs.com/masters/24047')
abbey_road.keys()

dict_keys(['year', 'images', 'videos', 'artists', 'num_for_sale', 'uri', 'title', 'resource_url', 'genres', 'versions_url', 'id', 'main_release_url', 'main_release', 'tracklist', 'data_quality', 'lowest_price', 'notes', 'styles'])

In [136]:
[(t['title'], t['position']) for t in abbey_road['tracklist']]

[('Come Together', 'A1'),
 ('Something', 'A2'),
 ("Maxwell's Silver Hammer", 'A3'),
 ('Oh! Darling', 'A4'),
 ("Octopus's Garden", 'A5'),
 ("I Want You (She's So Heavy)", 'A6'),
 ('Here Comes The Sun', 'B1'),
 ('Because', 'B2'),
 ('You Never Give Me Your Money', 'B3'),
 ('Sun King', 'B4'),
 ('Mean Mr. Mustard', 'B5'),
 ('Polythene Pam', 'B6'),
 ('She Came In Through The Bathroom Window', 'B7'),
 ('Golden Slumbers', 'B8'),
 ('Carry That Weight', 'B9'),
 ('The End', 'B10'),
 ('Her Majesty', 'B11')]

In [137]:
abbey_road['main_release_url']

'https://api.discogs.com/releases/2498807'

In [139]:
come_together_search = discogs_search({'artist': 'The Beatles', 'title': 'Come Together'}, show_progress=True)
come_together_search

Retrieved page 1 of 2 ; 59 requests before rate limit
Retrieved page 2 of 2 ; 58 requests before rate limit


[{'barcode': [],
  'catno': '3C 006 04266',
  'community': {'have': 10, 'want': 24},
  'country': 'Italy',
  'format': ['Vinyl', '7"', 'Reissue'],
  'genre': ['Rock', 'Pop'],
  'id': 1833079,
  'label': ['Apple Records', 'The Greatest Story'],
  'resource_url': 'https://api.discogs.com/releases/1833079',
  'style': ['Pop Rock'],
  'thumb': 'https://img.discogs.com/9CQznpENauUYZkwi4dp9-ZG2oR4=/fit-in/150x150/filters:strip_icc():format(jpeg):mode_rgb():quality(40)/discogs-images/R-1833079-1475199701-3047.jpeg.jpg',
  'title': 'The Beatles - Come Together',
  'type': 'release',
  'uri': '/Beatles-Come-Together/release/1833079',
  'year': '1976'},
 {'barcode': [],
  'catno': '8E 006 40 031 M',
  'community': {'have': 22, 'want': 28},
  'country': 'Portugal',
  'format': ['Vinyl', '7"', 'Single'],
  'genre': ['Rock'],
  'id': 2753401,
  'label': ['Parlophone'],
  'resource_url': 'https://api.discogs.com/releases/2753401',
  'style': ['Pop Rock'],
  'thumb': 'https://img.discogs.com/ilMWcd90

In [141]:
[t for t in come_together_search
if t['country'] == 'UK'
if t['type'] == 'master']

[{'barcode': ['7YCE.21369', '7YCE.21370', '7YCE 21369-1U', '7YCE 21370-1U'],
  'catno': 'R 5814',
  'community': {'have': 3970, 'want': 1789},
  'country': 'UK',
  'format': ['Vinyl', '7"', 'Single'],
  'genre': ['Rock'],
  'id': 54558,
  'label': ['Apple Records', 'Northern Songs', 'Harrisongs'],
  'resource_url': 'https://api.discogs.com/masters/54558',
  'style': ['Pop Rock'],
  'thumb': 'https://img.discogs.com/lNA75bhNCWcBY42-g-Mf0zoAYPs=/fit-in/150x150/filters:strip_icc():format(jpeg):mode_rgb():quality(40)/discogs-images/R-513427-1278590203.jpeg.jpg',
  'title': 'The Beatles - Something / Come Together',
  'type': 'master',
  'uri': '/The-Beatles-Something-Come-Together/master/54558',
  'year': '1969'}]

In [142]:
url = 'https://api.discogs.com/releases/2498807'
headers = {'Authorization': 'Discogs token=' + config['discogs']['token']}
releases = []
request = urllib.request.Request(url, headers=headers, method='GET')
response = urllib.request.urlopen(request)
response.info()

<http.client.HTTPMessage at 0x7f690f1cc1d0>

In [147]:
type(int(response.info()['X-Discogs-Ratelimit-Remaining']))

int