In [3]:
from lyricsgenius.api import Genius
from config import genius_client_access_token, s3_bucket
import yaml
import json
import boto3
import time
from utils.s3_manager import S3Manager
from tqdm.notebook import tqdm

In [4]:
# S3 helper bound to the configured bucket; used below to load the previous
# song snapshot and to save the updated one.
s3_manager = S3Manager(s3_bucket)

In [5]:
# Genius API client authenticated with the token from config.
# NOTE(review): sleep_time=0 presumably removes the pause lyricsgenius inserts
# between requests — confirm against the library docs; this may trigger rate limiting.
genius_api = Genius(genius_client_access_token, verbose=True, sleep_time=0)

In [6]:
# Upper bound on the number of songs requested per artist (passed as
# `max_songs` to the Genius search).
MAX_SONGS = 40

# Artist names to scrape, deduplicated into a set.
# The file is decoded as UTF-8 explicitly so names with non-ASCII characters
# parse the same way regardless of the platform's locale encoding.
# NOTE(review): yaml.load with FullLoader is kept as-is; if the file is only a
# plain list of names, yaml.safe_load would be sufficient and stricter.
with open("artist_list.yaml", 'r', encoding="utf-8") as f:
    # An empty YAML document parses to None; `or []` turns that into an empty
    # set instead of a TypeError from set(None).
    artists_names = set(yaml.load(f, Loader=yaml.FullLoader) or [])

In [7]:
# Resume from the most recent song snapshot in S3, if one exists.
songs = s3_manager.get_latest_song_data()

# No snapshot yet: start from an empty collection.
if songs is None:
    songs = []

# Artists already present in the snapshot; these are skipped when scraping.
found_artists = {entry["artist_name"] for entry in songs}

print(f"# songs : {len(songs)}")
print(f"# artists : {len(found_artists)}")

# songs : 4271
# artists : 109


In [None]:
def _fetch_artist_songs(artist_name):
    """Return one record (dict) per song Genius lists for ``artist_name``.

    At most ``MAX_SONGS`` songs are requested. The full list is built before
    anything is returned, so a failure part-way through yields no records.

    Raises:
        LookupError: if the search returns no artist.
        Exception: anything raised by the Genius client while fetching.
    """
    artist = genius_api.search_artist(artist_name, max_songs=MAX_SONGS)
    if artist is None:
        # Previously this surfaced only as an AttributeError on `artist.songs`.
        raise LookupError(f"no artist returned for {artist_name!r}")

    return [
        {
            "artist_name": artist_name,  # name as written in the artist list
            "artist": artist.name,       # name as returned by Genius
            "song": song.title,
            "lyrics": song.lyrics,
            # Keep only the first four characters of the year field
            # (presumably the YYYY part of a date string — TODO confirm).
            "year": song.year[:4] if song.year is not None else None,
            "featured_artists": song.featured_artists,
            "url": song._url
        }
        for song in artist.songs
    ]


# Scrape only the artists that are not already in the loaded snapshot.
for artist_name in tqdm(artists_names - found_artists):
    print(f"\nSearching {artist_name}")

    try:
        artist_songs = _fetch_artist_songs(artist_name)
    except Exception as exc:
        # Best-effort: report why this artist failed and move on.
        # KeyboardInterrupt is not an Exception subclass, so it still
        # propagates and stops the loop.
        print(f"Could not fetch results for artist : {artist_name}. Continuing...")
        print(f"  reason: {exc!r}")
        continue

    # Commit an artist's songs only once all of them were fetched; a partial
    # list would mark the artist as "found" on the next resume and the
    # missing songs would never be retried.
    songs.extend(artist_songs)

In [None]:
# Total number of song records collected so far (loaded snapshot + newly scraped).
len(songs)

In [None]:
# Hand the full song list to the S3 manager for saving
# (presumably written as a new snapshot in the bucket — verify in utils.s3_manager).
s3_manager.save_song_data(songs)