## Imports

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import requests
import json
import musicbrainzngs
import sqlalchemy
from sqlalchemy import text
from dotenv import load_dotenv
import os
from numpy import NaN
import time
import logging
logging.basicConfig(filename='info.log', level=logging.INFO)
from datetime import datetime
from ratelimit import limits,sleep_and_retry

## Initial setup
- load env variables
- create connection to the sql database
- create a spotify API client

In [None]:
# Length of the rate-limit window in seconds.
Thirty_SECONDS = 30

@sleep_and_retry
@limits(calls=60, period=Thirty_SECONDS)
def call_api():
    '''
    Rate-limit gate that callers in this notebook invoke right before an API request.

    The function body does no work; the throttling comes from the decorators,
    which are configured for 60 calls per Thirty_SECONDS window.
    NOTE(review): presumably sleep_and_retry makes surplus calls wait for the
    next window instead of raising - confirm against the ratelimit docs.

    Return
    ------
    result: int
        Always 0 (placeholder value)
    '''
    return 0

In [None]:
# Register the application name and version used as the user agent for MusicBrainz requests.
musicbrainzngs.set_useragent('application-project', '0.0.1')

In [None]:
# Load the variables defined in the .env file so they can be read via os.environ.
load_dotenv()

# Credentials for the Spotify and Last FM APIs (None if a variable is not set).
SPOTIFY_CLIENT_ID = os.environ.get("SPOTIFY_CLIENT_ID")
SPOTIFY_CLIENT_SECRET = os.environ.get("SPOTIFY_CLIENT_SECRET")
# The Last FM key is read from the variable LAST_FM_API (no _KEY suffix).
LAST_FM_API_KEY = os.environ.get("LAST_FM_API")

# Connection settings for the SQL database.
SQL_USERNAME = os.environ.get('SQL_USERNAME')
SQL_PASSWORD = os.environ.get('SQL_PASSWORD')
SQL_SCHEMA=os.environ.get('SQL_SCHEMA')
SQL_TABLE=os.environ.get('SQL_TABLE')
SQL_DIALECT = os.environ.get('SQL_DIALECT')
# The name SQL_DIRVER (sic) is referenced where the engine URL is built, so it is kept as is.
SQL_DIRVER = os.environ.get('SQL_DRIVER')
SQL_HOST = os.environ.get('SQL_HOST')
SQL_PORT = os.environ.get('SQL_PORT')

In [None]:
# Build the engine from the .env settings; the URL contains no database name.
# NOTE(review): username and password are interpolated verbatim - a password
# containing characters such as '@' or '/' presumably needs URL-escaping; confirm.
engine = sqlalchemy.create_engine(f'{SQL_DIALECT}+{SQL_DIRVER}://{SQL_USERNAME}:{SQL_PASSWORD}@{SQL_HOST}:{SQL_PORT}')
with engine.connect() as connection:
    # Create the `eighties` database if it does not exist yet.
    connection.execute(text('Create database if not exists eighties'))

In [None]:
# Spotify client authenticated with the client id / secret read from the environment.
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID, client_secret=SPOTIFY_CLIENT_SECRET)
# `sp` is the client object used for all Spotify requests in this notebook.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
# Load the chart-power scores from the spreadsheet.
chart_power_df = pd.read_excel('chart-power-scores_80s.xlsx')
# Lower-case every string cell (other values are left untouched);
# filter_track_features lower-cases the names it compares against this table.
chart_power_df = chart_power_df.applymap(lambda s: s.lower() if type(s) == str else s)

In [None]:
# Add up the points of all rows that belong to the same (Song, Artist) pair
# and turn the group keys back into regular columns.
chart_power_df = (
    chart_power_df[['Song', 'Artist', 'Points']]
    .groupby(['Song', 'Artist'])
    .sum()
    .reset_index()
)

In [None]:
# Display the (rows, columns) of the aggregated chart-power table.
chart_power_df.shape

## Functions

In [None]:
def get_genres(isrc, api_key=LAST_FM_API_KEY, timeout=10):
    '''
    Retrieves the genres (top tags) of a song via the Last FM API.

    First the ISRC number is used to look up the MusicBrainz id of the
    recording via the MusicBrainz API. That id is then used to request the
    track info from the Last FM API; the names of its top tags are returned.
    If several recordings share the ISRC, the first one is used.

    Parameter
    ---------
    isrc: string
        ISRC number of the song

    api_key: string; default=LAST_FM_API_KEY (.env)
        API key for the Last FM API

    timeout: int or float; default=10
        Seconds to wait for the Last FM API before giving up

    Return
    ------
    genre_list: string
        Comma-separated tag names (e.g. "pop,rock"). An empty string is
        returned if the recording or its tags could not be retrieved.
    '''
    genre_list = ''
    try:
        # Use musicbrainz to get the musicbrainz id (https://musicbrainz.org/doc/MusicBrainz_API)
        recordings = musicbrainzngs.get_recordings_by_isrc(isrc)
        recording_list = recordings.get('isrc', {}).get('recording-list', [])
    except musicbrainzngs.ResponseError as e:
        cause = str(e.cause)
        # A 404 is not reported; every other cause is printed.
        if '404' not in cause:
            print(cause)
        return genre_list
    except Exception as e:
        print(e)
        return genre_list

    if not recording_list:
        return genre_list
    if len(recording_list) > 1:
        print(f'Multiple recordings with isrc: {isrc}')

    mbid = recording_list[0].get('id')
    if not mbid:
        return genre_list

    # Use Last FM to get the genres (https://www.last.fm/api/show/track.getInfo)
    query = {
        'method': 'track.getInfo',
        'api_key': api_key,
        'mbid': mbid,
        'format': 'json',
    }
    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get('http://ws.audioscrobbler.com/2.0/', params=query, timeout=timeout)
        data = json.loads(response.text)
    except Exception as e:
        print('Failed to retrieve genres from the Last FM API')
        print(e)
        return genre_list

    # Walk data['track']['toptags']['tag'], tolerating a missing or malformed level.
    track = data.get('track') if isinstance(data, dict) else None
    toptags = track.get('toptags') if isinstance(track, dict) else None
    tags = toptags.get('tag') if isinstance(toptags, dict) else None
    if isinstance(tags, dict):
        # NOTE(review): defensive - treats a single tag object like a one-element list.
        tags = [tags]
    if not isinstance(tags, list):
        return genre_list

    return ','.join(tag['name'] for tag in tags if isinstance(tag, dict) and 'name' in tag)
    
    

In [None]:
def filter_track_features(track, genre, features):
    '''
    Filters the relevant features of a track and returns them in a flat dict.

    Every value that is not available is set to NaN, so the result can be
    turned into a DataFrame row directly.

    Parameter
    ---------
    track: dict
        Track returned by the Spotify API

    genre: string
        Genre the track was queried with; stored unchanged under 'genres'

    features: dict or None
        Audio features of the track. None (or any non-dict value) is treated
        like an empty feature set, i.e. all audio features become NaN.

    Return
    ------
    relevant_features: dict
        Dict that contains the relevant features. 'chart_power' holds the
        points of the matching (song, artist) row of chart_power_df or NaN.
    '''
    if not isinstance(features, dict):
        features = {}

    external_ids = track.get('external_ids')
    if not isinstance(external_ids, dict):
        external_ids = {}
    isrc = external_ids.get('isrc', NaN)

    artists = track.get('artists')
    if not isinstance(artists, list):
        artists = []
    artist_names = ','.join(artist['name'] for artist in artists if 'name' in artist)

    album_info = track.get('album')
    if not isinstance(album_info, dict):
        album_info = {}

    track_name = track.get('name', NaN)

    # NaN never compares equal to anything (not even to itself), so a
    # `track_name != NaN` test would always be true; check for a string instead.
    points = NaN
    if isinstance(track_name, str):
        same_song = chart_power_df.Song == track_name.lower()
        same_artist = chart_power_df.Artist == artist_names.lower()
        matches = chart_power_df.loc[same_song & same_artist, 'Points']
        if not matches.empty:
            # Take the scalar explicitly instead of calling int() on a Series.
            points = int(matches.iloc[0])

    relevant_features = {
        'name': track_name,
        'artists': artist_names,
        'album': album_info.get('name', NaN),
        'release_date': album_info.get('release_date', NaN),
        'release_date_precision': album_info.get('release_date_precision', NaN),
        'spotify_id': track.get('id', NaN),
        'chart_power': points,
        'uri': track.get('uri', NaN),
        'popularity': track.get('popularity', NaN),
        'genres': genre,
    }

    audio_feature_keys = (
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'time_signature',
    )
    for feature_key in audio_feature_keys:
        relevant_features[feature_key] = features.get(feature_key, NaN)

    relevant_features['isrc'] = isrc
    return relevant_features

In [None]:
def save_df_to_sql(df: pd.DataFrame, table_name=SQL_TABLE, schema=SQL_SCHEMA, if_exists='replace'):
    '''
    Writes a DataFrame to a table of the SQL database behind `engine`.

    Errors are not raised to the caller; they are printed instead.

    Parameter
    ---------
    df: pd.DataFrame
        DataFrame to write

    table_name: string; default=SQL_TABLE (.env)
        Name of the target table

    schema: string; default=SQL_SCHEMA (.env)
        Schema the table belongs to

    if_exists: string; default="replace"
        What to do if the table already exists: "replace", "fail" or "append".
    '''
    write_options = {'schema': schema, 'if_exists': if_exists}
    try:
        df.to_sql(table_name, engine, **write_options)
    except Exception as error:
        print(error)

In [None]:
def read_df_from_sql(table_name=SQL_TABLE, schema=SQL_SCHEMA):
    '''
    Loads a complete SQL table into a DataFrame.

    Parameter
    ---------
    table_name: string; default=SQL_TABLE (.env)
        Name of the table to read

    schema: string; default=SQL_SCHEMA (.env)
        Schema the table belongs to

    Return
    ------
    df: pd.DataFrame or None
        Content of the table. If reading fails, the error is printed and
        None is returned.
    '''
    try:
        with engine.connect() as connection:
            table = pd.read_sql_table(table_name, con=connection, schema=schema)
    except Exception as error:
        print(error)
        return None
    return table

In [None]:
# Probe request that only asks for the total number of results
def get_number_of_tracks(release_year, start_letters, genre):
    '''
    Asks the Spotify API how many tracks match a year / title prefix / genre query.

    Parameter
    ---------
    release_year: int
        Year the tracks were released

    start_letters: string
        Letters the track names start with

    genre: string
        Genre the tracks are filtered by

    Return
    ------
    num: int
        Total reported by Spotify for the query (market "DE"). 0 is returned
        if the total is missing or the request fails; the error is printed.
    '''
    try:
        call_api()
        response = sp.search(
            q=f'year:{release_year} track:{start_letters}* genre:{genre}',
            type='track',
            limit=1,
            offset=0,
            market='DE',
        )
        if 'tracks' not in response:
            return 0
        track_page = response['tracks']
        if 'total' not in track_page:
            return 0
        return track_page['total']
    except Exception as error:
        print(error)
        return 0

In [None]:
def req_query_tracks(release_year, genres, start_letters = '', limit=50):
    '''
    Recursively queries all tracks Spotify returns for a year / genre combination.

    For every genre and every character of the alphabet (a-z, 0-9) the title
    prefix `start_letters + character` is probed. If the query reports fewer
    than 1000 results, all result pages are fetched, combined with their audio
    features and appended to the global DataFrame `df`. Otherwise the function
    calls itself with the longer prefix to narrow the query down.

    A page that fails (request error, malformed response) is printed and
    skipped; it is not retried, so a persistent error cannot stall the run.

    Parameter
    ---------
    release_year: int
        Year the tracks were released

    genres: string or list of strings
        Genre(s) to query

    start_letters: string, default=''
        Letters the songs start with

    limit: int; default=50
        Number of tracks that should be queried at once. Max number is 50
    '''
    global df
    alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789'
    if isinstance(genres, str):
        genres = [genres]

    for genre in genres:
        for letter in alphabet:
            letters = start_letters + letter
            total_results = get_number_of_tracks(release_year, letters, genre)

            if total_results >= 1000:
                # Too many hits for one prefix: narrow it down with one more character.
                req_query_tracks(release_year, genre, letters, limit)
                continue

            logging.info(f'{release_year}-{letters}-{genre}-{total_results}')
            query = f'year:{release_year} track:{letters}* genre:{genre}'

            # range() advances the offset on every iteration, also after an error.
            for offset in range(0, total_results, limit):
                try:
                    call_api()
                    result = sp.search(q=query, type='track', limit=limit, offset=offset)
                    items = [item for item in result.get('tracks', {}).get('items', []) if item]
                    if not items:
                        continue

                    call_api()
                    audio_features = sp.audio_features([item['id'] for item in items]) or []
                    # Entries can be None when no features are available; index the rest by track id.
                    features_by_id = {entry['id']: entry for entry in audio_features if entry}

                    track_features = [
                        filter_track_features(item, genre, features_by_id[item['id']])
                        for item in items
                        if item['id'] in features_by_id
                    ]
                    if track_features:
                        df = pd.concat([df, pd.DataFrame(track_features)], ignore_index=True)
                except Exception as e:
                    print(e)

In [None]:
# Exploratory cell: print the genres Spotify stores for the artists of the
# tracks returned by a plain `year:1981` search.
call_api()
result = sp.search(q='year:1981')
# Fall back to '' so the membership test below is simply False if 'tracks' is missing.
tracks = result['tracks'] if 'tracks' in result else ''
if 'items' in tracks:
    for track in tracks['items']:
        for artist in track['artists']:
            # NOTE(review): one sp.artist request per artist, not preceded by call_api().
            print(sp.artist(artist['id'])['genres'])

## Code

In [None]:
# Column layout of the result table; the names match the keys returned by filter_track_features.
columns = ['name', 'artists', 'album', 'release_date', 'release_date_precision', 'chart_power', 'spotify_id', 'uri', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'isrc', 'genres']
# Empty global DataFrame that the query code appends its rows to.
df = pd.DataFrame(columns=columns)

In [None]:
# Exploratory cell: fetch a handful of 1980 tracks one by one and append them to `df`.
# Set up query parameters
query = 'year:1980'
limit = 1   # one track per request
offset = 0

# Make initial request to get total number of results
# NOTE(review): this request is not preceded by call_api(), and total_results
# is not used by the loop below.
result = sp.search(q=query, type='track', limit=1, offset=0, market='DE')
total_results = result['tracks']['total']

# Loop through results and retrieve tracks (only offsets 0-9 are requested)
while offset < 10:
    call_api()
    result = sp.search(q=query, type='track', limit=limit, offset=offset, market='DE')
    track_features = []
    # Split the returned tracks into chunks of at most 50 ids per audio-features request
    tracks_50_chunks = []
    for i in range(0, len(result['tracks']['items']), 50):
        x = i
        tracks_50_chunks.append(result['tracks']['items'][x:x+50])
    # Despite the `_100` names, each chunk holds at most 50 tracks.
    for tracks_100 in tracks_50_chunks:
        ids = []
        for track in tracks_100:
            ids.append(track['id'])
        call_api()
        features_100 = sp.audio_features(ids)
        # Pair every track with the feature entry that has the same id.
        for track in tracks_100:
            for feature in features_100:
                if(feature['id'] == track['id']):
                    # The genre label is hard-coded to 'rock' in this cell.
                    features = filter_track_features(track, 'rock', feature)
                    track_features.append(features)
    offset += limit
    df = pd.concat([df, pd.DataFrame(track_features)], ignore_index=True)
df

In [None]:
# Disabled cell: the loop is wrapped in a string literal, so it is only
# evaluated as a string and none of the calls inside it are executed.
"""
call_api()
genres = sp.recommendation_genre_seeds()['genres']
for year in range(1980, 1986):
    req_query_tracks(year, genres, '')
"""

In [None]:
# Check whether any track occurs more than once (a track may have more than
# one genre). These rows should be joined together later (e.g. genres: pop,country).
# duplicated() has to be called to do anything; keep=False marks every
# occurrence of a repeated ISRC. Rows without an ISRC are excluded, because
# missing values would otherwise be reported as duplicates of each other.
duplicated_isrc = df['isrc'].notna() & df['isrc'].duplicated(keep=False)
print(f'Rows sharing an ISRC with another row: {int(duplicated_isrc.sum())}')
df

In [None]:
# Fetch the list of genre seeds that is passed to the track query below.
call_api()
genres = sp.recommendation_genre_seeds()['genres']

from concurrent.futures import ThreadPoolExecutor, as_completed


# Sequential variant (disabled), kept for comparing the run time:
#start_time = datetime.now() 
#for year in range(1980, 1985):
#    req_query_tracks(year, genres[0:3], '')
#end_time = datetime.now()         
#print('Duration: {}'.format(end_time - start_time))

# Threaded variant: one task per year
start_time = datetime.now() 

# `threads` collects the futures returned by executor.submit.
threads= []
with ThreadPoolExecutor(max_workers=10) as executor:
    # range(1981, 1982) contains only 1981, so a single task is submitted.
    for year in range(1981, 1982):
        threads.append(executor.submit(req_query_tracks, year, genres))
        
    # req_query_tracks has no return statement, so result() is None;
    # calling it still re-raises an exception from the task, if any.
    for task in as_completed(threads):
        print(task.result()) 

end_time = datetime.now()         
print('Duration: {}'.format(end_time - start_time))

In [None]:
# Write the collected tracks to track_1981.csv.
# NOTE(review): to_csv returns None when a path is given, so `track_1980`
# (which also names a different year than the file) presumably ends up as None - confirm.
track_1980 = df.to_csv("track_1981.csv")