In [None]:
import csv
import json
import pickle
import time

import pandas as pd
import requests
from rauth import OAuth1Service

import Keys

In [None]:
def get_musix_genres(key):
    """Fetch the genre list from the Musixmatch ``music.genres.get`` endpoint.

    Parameters
    ----------
    key : str
        Musixmatch API key, sent as the ``apikey`` query parameter.

    Returns
    -------
    list of dict
        One ``{'genre_id': ..., 'genre_name': ...}`` dict per entry of the
        response's ``music_genre_list``; ``genre_name`` is taken from the
        entry's ``music_genre_vanity`` field.
    """
    endpoint = 'https://api.musixmatch.com/ws/1.1/' + 'music.genres.get'
    response = requests.get(endpoint, params = {'apikey': key})
    genre_entries = response.json()['message']['body']['music_genre_list']

    # Each entry wraps the actual genre record under the 'music_genre' key.
    return [
        {'genre_id': entry['music_genre']['music_genre_id'],
         'genre_name': entry['music_genre']['music_genre_vanity']}
        for entry in genre_entries
    ]

In [None]:
def _search_musix_tracks(params):
    """Call Musixmatch ``track.search`` once and return the decoded ``message`` object."""
    url = 'https://api.musixmatch.com/ws/1.1/'
    sub_url = 'track.search'
    return requests.get(url + sub_url, params = params).json()['message']


def _append_track_rows(out_path, tracks, genre):
    """Append one CSV row (track_id, track_name, genre_id, genre_name) per track."""
    # csv.writer quotes names that contain commas, quotes or newlines, so such
    # titles no longer split into extra columns. lineterminator='\n' keeps rows
    # for plain names byte-identical to the previously hand-joined format.
    with open(out_path, 'a', newline = '', encoding = 'utf-8') as f:
        writer = csv.writer(f, lineterminator = '\n')
        writer.writerows([track['track']['track_id'],
                          track['track']['track_name'],
                          genre['genre_id'],
                          genre['genre_name']] for track in tracks)


def get_musix_track_info_by_genre(genres, key, id_limit = 5000,
                                  out_path = '../data/raw/musix_match_track_data.csv'):
    """Download track ids and names per genre and append them to a CSV file.

    For every genre, ``track.search`` is queried for English tracks that have
    lyrics, 100 results per page, until ``min(available, id_limit)`` tracks
    have been written or the API returns an empty page.

    Parameters
    ----------
    genres : iterable of dict
        Dicts with ``'genre_id'`` and ``'genre_name'`` keys.
    key : str
        Musixmatch API key.
    id_limit : int, default 5000
        Maximum number of tracks to store per genre.
    out_path : str, default '../data/raw/musix_match_track_data.csv'
        CSV file to append to. Rows are appended without a header, so running
        the function twice adds the rows twice.

    Returns
    -------
    None
        Results are written to ``out_path``; progress is printed.
    """
    page_size = 100

    for genre in genres:
        params = {
            'apikey': key,
            'q_track': '*',
            'f_music_genre_id': genre['genre_id'],
            'f_has_lyrics': 1,
            'f_lyrics_language': 'en',
            'page_size': page_size
        }

        # The first response carries both the total count and page 1 of the
        # results, so it is fetched once and reused.
        first_page = _search_musix_tracks(params)
        num_tracks = first_page['header']['available']
        page_limit = max(0, min(num_tracks, id_limit))
        # Ceiling division: a partially filled last page still has to be fetched.
        page_count = -(-page_limit // page_size)

        print('{}: Retrieving {} ids in {} pages'.format(genre['genre_name'], page_limit, page_count))

        retrieved = 0
        for page in range(1, page_count + 1):
            print('Retrieving page {} of {}'.format(page, page_count))

            if page == 1:
                tracks = first_page['body']['track_list']
            else:
                time.sleep(1)  # pause between consecutive API requests
                params['page'] = page
                tracks = _search_musix_tracks(params)['body']['track_list']

            if not tracks:
                # Nothing more to read; later pages would be empty as well.
                break

            # Never store more than page_limit rows, even when id_limit is not
            # a multiple of the page size.
            tracks = tracks[:page_limit - retrieved]
            # The file is reopened for each page so completed pages are on disk
            # even if a later request fails.
            _append_track_rows(out_path, tracks, genre)
            retrieved += len(tracks)

        print('Retrieved {} ids for genre {}'.format(retrieved, genre['genre_name']))

    return

In [None]:
# Base URL of the Musixmatch REST API.
# NOTE(review): the functions defined earlier in this notebook hardcode the same
# URL instead of reading this variable.
musixmatch_url = 'https://api.musixmatch.com/ws/1.1/'
# The API key comes from the project-local `Keys` module rather than being written here.
musixmatch_key = Keys.musixmatch_key

In [None]:
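# One-off step, kept commented out: fetch the genre list from the API and cache
# it as a pickle. Uncomment to (re)build ../data/raw/genres_dict.pickle.
# musix_genres = get_musix_genres(musixmatch_key)

# with open('../data/raw/genres_dict.pickle', 'wb') as f:
#     pickle.dump(musix_genres, f)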

In [None]:
# Load the genre list from the pickle file at ../data/raw/genres_dict.pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load a file you
# produced yourself.
with open('../data/raw/genres_dict.pickle', 'rb') as f:
    musix_genres = pickle.load(f)

In [None]:
# Display all loaded genres ordered by genre_id (the sorted copy is not stored).
sorted(musix_genres, key = lambda x: x['genre_id'])

In [None]:
# Genre ids that are excluded from extraction even though they pass the id cut-off.
genre_ids_to_drop = [0, 3, 4, 9, 10, 16, 25, 27, 28, 29, 30, 34, 50, 51, 52, 53]

# Keep genres whose id is below 54, ordered by id.
# NOTE(review): 54 is presumably the boundary between top-level and sub-genres — confirm.
top_level_genres = [genre for genre in musix_genres if genre['genre_id'] < 54]
top_level_genres.sort(key = lambda genre: genre['genre_id'])

# Final selection: top-level genres minus the ids listed above.
genres_to_extract = [
    genre
    for genre in top_level_genres
    if genre['genre_id'] not in genre_ids_to_drop
]

In [None]:
# Display the genres selected for track extraction.
genres_to_extract

In [None]:
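# Commented out: uncomment to run the extraction (one API request per page, per genre).
# The function appends to its CSV file, so every run adds rows rather than replacing them.
# get_musix_track_info_by_genre(genres_to_extract, musixmatch_key, id_limit = 10_000)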