In [1]:
import requests
import numpy as np
import pandas as pd
import json
import Keys
import time
import pickle
import re
from lyricsgenius import Genius

In [None]:
def get_musix_genres(key):
    """Fetch the list of music genres known to Musixmatch.

    Sends a single GET request to the `music.genres.get` endpoint and
    flattens the nested response into a simple list of records.

    Parameters
    ----------
    key
        Musixmatch API key, sent as the `apikey` query parameter.

    Returns
    -------
    list of dict
        One dict per genre with the keys 'genre_id' (taken from
        `music_genre_id`) and 'genre_name' (taken from `music_genre_vanity`).
    """
    endpoint = 'https://api.musixmatch.com/ws/1.1/' + 'music.genres.get'
    response = requests.get(endpoint, params = {'apikey': key})
    genre_entries = response.json()['message']['body']['music_genre_list']

    # Each entry wraps the actual genre record under the 'music_genre' key.
    return [{'genre_id': entry['music_genre']['music_genre_id'],
             'genre_name': entry['music_genre']['music_genre_vanity']}
            for entry in genre_entries]

In [None]:
def _musix_track_rows(tracks, genre):
    """Format Musixmatch track entries as tab-separated lines tagged with one genre."""
    rows = []
    for entry in tracks:
        track = entry['track']
        rows.append("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(track['track_id'],
                                                              track['track_name'],
                                                              genre['genre_id'],
                                                              genre['genre_name'],
                                                              track['album_id'],
                                                              track['album_name'],
                                                              track['artist_id'],
                                                              track['artist_name']))
    return rows


def get_musix_track_info_by_genre(genres, key, file, id_limit = 5000):
    """Download track metadata per genre from Musixmatch and append it to a TSV file.

    For every genre the `track.search` endpoint is queried page by page
    (100 tracks per page, English tracks that have lyrics) and one
    tab-separated line per track is appended to `file` with the fields:
    track id, track name, genre id, genre name, album id, album name,
    artist id, artist name.

    Parameters
    ----------
    genres : iterable of dict
        Genre records with the keys 'genre_id' and 'genre_name'.
    key
        Musixmatch API key, sent as the `apikey` query parameter.
    file
        Path of the output file; it is opened in append mode.
    id_limit : int, default 5000
        Maximum number of tracks to store per genre.

    Returns
    -------
    None
    """
    endpoint = 'https://api.musixmatch.com/ws/1.1/' + 'track.search'
    page_size = 100

    for genre in genres:
        params = {
            'apikey': key,
            'q_track': '*',
            'f_music_genre_id': genre['genre_id'],
            'f_has_lyrics': 1,
            'f_lyrics_language': 'en',
            'page_size': page_size,
            'page': 1
        }

        # The first response carries both the total count and the first page
        # of tracks, so it is fetched once and reused.
        first_message = requests.get(endpoint, params = params).json()['message']
        num_tracks = first_message['header']['available']
        page_limit = min(num_tracks, id_limit)
        # Ceiling division: a partially filled last page still has to be fetched.
        total_pages = -(-page_limit // page_size)
        remaining = page_limit

        print('{}: Retrieving {} ids in {} pages'.format(genre['genre_name'], page_limit, total_pages))

        for page in range(1, total_pages + 1):
            print('Retrieving page {} of {}'.format(page, total_pages))

            if page == 1:
                tracks = first_message['body']['track_list']
            else:
                # Pause between consecutive requests to the API.
                time.sleep(1.0)
                params['page'] = page
                tracks = requests.get(endpoint, params = params).json()['message']['body']['track_list']

            # Never store more than `page_limit` tracks for this genre.
            tracks = tracks[:remaining]
            remaining -= len(tracks)

            with open(file, 'a', encoding = 'utf-8') as f:
                f.writelines(_musix_track_rows(tracks, genre))

        print('Retrieved {} ids for genre {}'.format(page_limit - remaining, genre['genre_name']))

    return

In [None]:
def scrape_song_lyrics(df, key, file, log_file = '../data/raw/exception-log.txt'):
    """Look up lyrics on Genius for every song in `df` and append them to a TSV file.

    For each row the song is searched by title and artist. Lyrics are split
    into lines; empty lines and lines containing '(' or '[' are discarded,
    the remaining lines are cleaned with a regular expression and joined
    with '|'. One line `lyrics<TAB>title<TAB>artist` is appended to `file`
    per song found.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'title_name' and 'artist_name'. Rows are
        processed in order regardless of the frame's index labels.
    key
        Access token passed to `Genius`.
    file
        Path of the output file; opened in append mode.
    log_file
        Path of the file to which failed searches are appended as
        `exception type, title, artist`.

    Returns
    -------
    None
    """
    genius = Genius(key)
    # Keeps the leading word and drops everything from the next capital
    # letter to the end of the line.
    artefact_pattern = re.compile(r'([A-Z]?[a-z]+)[A-Z][\w\W]+')

    # Iterate over the values directly: the frame may have been filtered, in
    # which case its index labels are not valid positions.
    for title, artist in zip(df['title_name'], df['artist_name']):
        try:
            song = genius.search_song(title, artist)
        except Exception as error:
            # Best effort: record the failure and move on to the next song.
            with open(log_file, 'a', encoding = 'utf-8') as f:
                f.write('{}, {}, {}\n'.format(type(error).__name__, title, artist))
            continue

        if song is None:
            continue

        kept_lines = [line for line in song.lyrics.split('\n')
                      if line != '' and '(' not in line and '[' not in line]
        cleaned_lines = [artefact_pattern.sub(r'\g<1>', line) for line in kept_lines]
        lyrics = '|'.join(cleaned_lines)

        with open(file, 'a', encoding = 'utf-8') as f:
            f.write('{}\t{}\t{}\n'.format(lyrics, title, artist))

    return

In [None]:
def get_genre_subframe(frame, genre):
    """Return the rows of `frame` whose 'genre_name' equals `genre`.

    The result is renumbered with a fresh 0..n-1 index. The old index is
    discarded directly rather than being turned into a column and dropped by
    name, so frames with a named index or an 'index' column are handled too.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide a 'genre_name' column.
    genre
        Genre name to select.

    Returns
    -------
    pandas.DataFrame
    """
    return frame.loc[frame['genre_name'] == genre].reset_index(drop = True)

In [2]:
def extract_songs_no_lyrics(files, target):
    """Return the rows of `target` for which no scraped lyrics exist.

    Every file in `files` is read as tab-separated text with the fields
    lyrics, title and artist; lines that do not have exactly three fields
    are ignored. The collected records are left-merged onto `target` on
    ('title_name', 'artist_name') and only rows without lyrics are returned.

    Parameters
    ----------
    files : iterable
        Paths of the lyrics files. May be empty, in which case every row of
        `target` is returned.
    target : pandas.DataFrame
        Must provide the columns 'title_name' and 'artist_name'.

    Returns
    -------
    pandas.DataFrame
        The merged rows whose 'lyrics' value is missing; the index keeps the
        positions of the merged frame.
    """
    lyric_columns = ['lyrics', 'title_name', 'artist_name']
    records = []

    for file in files:
        with open(file, 'r', encoding = 'utf-8') as f:
            for line in f:
                # Strip the line terminator so it does not end up in the artist field.
                fields = line.rstrip('\n').split('\t')
                if len(fields) == 3:
                    records.append(fields)

    # Build the frame once instead of concatenating inside the loop; passing
    # the column names keeps the merge keys available when nothing was read.
    scraped = pd.DataFrame(records, columns = lyric_columns)
    merged_df = target.merge(scraped, how = 'left', on = ['title_name', 'artist_name'])

    return merged_df.loc[merged_df.lyrics.isna()]

In [3]:
def clean_titles(original_df):
    """Return a copy of `original_df` with simplified song titles.

    The 'title_name' column is cleaned by a fixed sequence of regular
    expression substitutions (each match becomes a single space), then
    stripped of surrounding dashes and spaces. Rows whose title ends up
    empty are removed. The input frame is not modified.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Must provide a string column 'title_name'.

    Returns
    -------
    pandas.DataFrame
    """
    # Applied in order; every match is replaced by one space.
    patterns = [
        # Expressions in parentheses or brackets
        r'\(.+\)',
        r'\[.+\]',
        # The words Remix, Remaster, Live, Radio Edit, Extended and Bonus.
        # Word boundaries keep titles such as "Alive" intact.
        r'\b[Rr]emix(ed)?\b',
        r'\b[Rr]emaster(ed)?\b',
        r'\b[Ll]ive\b',
        r'\b[Rr]adio [Ee]dit\b',
        r'\b[Ee]xtended\b',
        r'\b[Bb]onus( [Tt]rack)?\b',
        # Any character that is neither ASCII nor one of the listed accented
        # Latin letters. The pattern must not be wrapped in '/' delimiters:
        # Python treats them as literal slashes.
        r"[^\x00-\x7FâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇÖöǾǿØ̈ø̈\d']+",
        # A dash followed by a space, together with everything after it
        r'- [\w\W]+',
        # Runs of whitespace collapse to a single space
        r'\s+',
    ]

    df = original_df.copy()
    titles = df['title_name']
    for pattern in patterns:
        titles = titles.str.replace(pattern, ' ', regex = True)

    # Strip leading and trailing dashes and spaces
    df['title_name'] = titles.str.strip('- ')

    # Drop rows whose titles are now empty
    return df.loc[df['title_name'] != '']

In [None]:
# Base URL of the Musixmatch web service (version 1.1).
musixmatch_url = 'https://api.musixmatch.com/ws/1.1/'
# Credentials are read from the local `Keys` module instead of being written
# into the notebook.
musixmatch_key = Keys.musixmatch_key
genius_access_token = Keys.genius_access_token

In [None]:
# with open('../data/raw/genres_dict.pickle', 'rb') as f:
#     musix_genres = pickle.load(f)
    
# genre_ids_to_keep = [2, 6, 11, 14, 15, 17, 18, 20, 22, 24, 1149, 1152]
# genres_to_extract = [x for x in musix_genres if x['genre_id'] in genre_ids_to_keep]

In [None]:
# file = '../data/raw/musix_match_track_data_with_artists_and_albums-02.tsv'
# get_musix_track_info_by_genre(genres_to_extract, musixmatch_key, file, id_limit = 10_000)

In [4]:
# Column layout of the headerless track files, in the order the fields were written.
columns = [
    'title_id', 'title_name',
    'genre_id', 'genre_name',
    'album_id', 'album_name',
    'artist_id', 'artist_name',
]

# NOTE(review): the files are read with pandas' default quoting rules; if a
# field can start with a double quote the rows may be parsed differently from
# how they were written — confirm against the data.
first_half_genres = pd.read_csv(
    '../data/raw/musix_match_track_data_with_artists_and_albums.tsv',
    sep = '\t', header = None,
).set_axis(columns, axis = 1)

second_half_genres = pd.read_csv(
    '../data/raw/musix_match_track_data_with_artists_and_albums-02.tsv',
    sep = '\t', header = None,
).set_axis(columns, axis = 1)

In [None]:
# Genres taken from the first track file.
(rock_death_black,
 rock_hard_rock,
 blues,
 jazz,
 country) = [get_genre_subframe(first_half_genres, genre_name)
             for genre_name in ('Rock-Death-Metal-Black-Metal',
                                'Rock-Hard-Rock',
                                'Blues',
                                'Jazz',
                                'Country')]

# Genres taken from the second track file.
(reggae,
 christian_gospel,
 rb_soul,
 dance,
 pop,
 hip_hop_rap,
 alternative) = [get_genre_subframe(second_half_genres, genre_name)
                 for genre_name in ('Reggae',
                                    'Christian-Gospel',
                                    'R-B-Soul',
                                    'Dance',
                                    'Pop',
                                    'Hip-Hop-Rap',
                                    'Alternative')]

In [5]:
# Lyrics files produced by the per-genre scraping runs.
# NOTE(review): the commented-out scraping cell for this genre writes to
# 'rock_death_black_song_lyrics.tsv', while the list below reads
# 'black_death_metal_song_lyrics.tsv' — confirm the file name on disk.
lyrics_files = [
    '../data/raw/alternative_song_lyrics.tsv',
    '../data/raw/black_death_metal_song_lyrics.tsv',
    '../data/raw/blues_song_lyrics.tsv',
    '../data/raw/christian_gospel_song_lyrics.tsv',
    '../data/raw/country_song_lyrics.tsv',
    '../data/raw/dance_song_lyrics.tsv',
    '../data/raw/hip_hop_rap_song_lyrics.tsv',
    '../data/raw/jazz_song_lyrics.tsv',
    '../data/raw/pop_song_lyrics.tsv',
    '../data/raw/rb_soul_song_lyrics.tsv',
    '../data/raw/reggae_song_lyrics.tsv',
    '../data/raw/rock_hard_rock_song_lyrics.tsv',
]

# Stack both track tables row-wise (the default concat axis).
all_genres = pd.concat([first_half_genres, second_half_genres])

# Tracks without an entry in any lyrics file, then the same tracks with cleaned titles.
missed_songs = extract_songs_no_lyrics(lyrics_files, all_genres)
missed_songs_titles_cleaned = clean_titles(missed_songs)

In [None]:
# file = '../data/raw/rock_death_black_song_lyrics.tsv'
# scrape_song_lyrics(rock_death_black, genius_access_token, file)

In [None]:
# file = '../data/raw/rock_hard_rock_song_lyrics.tsv'
# scrape_song_lyrics(rock_hard_rock, genius_access_token, file)

In [None]:
# file = '../data/raw/blues_song_lyrics.tsv'
# scrape_song_lyrics(blues, genius_access_token, file)

In [None]:
# file = '../data/raw/jazz_song_lyrics.tsv'
# scrape_song_lyrics(jazz, genius_access_token, file)

In [None]:
# file = '../data/raw/reggae_song_lyrics.tsv'
# scrape_song_lyrics(reggae, genius_access_token, file)

In [None]:
# file = '../data/raw/christian_gospel_song_lyrics.tsv'
# scrape_song_lyrics(christian_gospel, genius_access_token, file)

In [None]:
# file = '../data/raw/dance_song_lyrics.tsv'
# scrape_song_lyrics(dance, genius_access_token, file)

In [None]:
# file = '../data/raw/pop_song_lyrics.tsv'
# scrape_song_lyrics(pop, genius_access_token, file)

In [None]:
# file = '../data/raw/hip_hop_rap_song_lyrics.tsv'
# scrape_song_lyrics(hip_hop_rap, genius_access_token, file)

In [None]:
# file = '../data/raw/alternative_song_lyrics.tsv'
# scrape_song_lyrics(alternative, genius_access_token, file)

In [None]:
# Second scraping pass: search Genius again for the songs that have no lyrics
# yet, this time using the cleaned titles. Results are appended to a separate
# file.
file = '../data/raw/missed_songs_titles_cleaned_song_lyrics.tsv'
scrape_song_lyrics(missed_songs_titles_cleaned, genius_access_token, file)