## Imports

In [172]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import requests
import json
import musicbrainzngs
import sqlalchemy
from sqlalchemy import text
from dotenv import load_dotenv
import os
import time

## Inital setup
- load env variables
- create connection to the sql database
- create a spotify API client

In [173]:
musicbrainzngs.set_useragent('application-project', '0.0.1')

In [174]:
load_dotenv()
# token = os.environ.get("ENV_VARIABLE")

SPOTIFY_CLIENT_ID = os.environ.get("SPOTIFY_CLIENT_ID")
SPOTIFY_CLIENT_SECRET = os.environ.get("SPOTIFY_CLIENT_SECRET")
LAST_FM_API_KEY = os.environ.get("LAST_FM_API")

SQL_USERNAME = os.environ.get('SQL_USERNAME')
SQL_PASSWORD = os.environ.get('SQL_PASSWORD')
SQL_SCHEMA=os.environ.get('SQL_SCHEMA')
SQL_TABLE=os.environ.get('SQL_TABLE')
SQL_DIALECT = os.environ.get('SQL_DIALECT')
SQL_DIRVER = os.environ.get('SQL_DRIVER')               # pip install mysqlclient
SQL_HOST = os.environ.get('SQL_HOST')
SQL_PORT = os.environ.get('SQL_PORT')

In [175]:
engine = sqlalchemy.create_engine(f'{SQL_DIALECT}+{SQL_DIRVER}://{SQL_USERNAME}:{SQL_PASSWORD}@{SQL_HOST}:{SQL_PORT}')
with engine.connect() as connection:
    connection.execute(text('Create database if not exists eighties'))

In [176]:
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID, client_secret=SPOTIFY_CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Functions

In [177]:
def get_genres(isrc, api_key=LAST_FM_API_KEY):
    '''
    Retrives the genres of a song via the Last FM API.

    First the isrc number is used to retrieve the musicbrainz id via the musicbraniz API. 
    This is used to retrive the genres later on from the Last FM API.

    Parameter
    ---------
    isrc: string
        ISRC number of the song

    api_key: string
        API key for the Last FM API

    Return
    ------
    genre_list: string
        string that contains the genres in a list
    '''
    # print(f'Get genre for isrc: {isrc}')
    genre_list = ''
    try:
        # Use musicbrainz to get the musicbraniz id (https://musicbrainz.org/doc/MusicBrainz_API)
        recordings = musicbrainzngs.get_recordings_by_isrc(isrc)
        isrcs = recordings['isrc'] if 'isrc' in recordings else ''
        recording_list = isrcs['recording-list'] if 'recording-list' in isrcs else ''
        # recording_list = musicbrainzngs.get_recordings_by_isrc(isrc)['isrc']['recording-list']
    except Exception as e:
        print('Failed to to retrieve recordings from the musicbraniz API')
        print(e)
        return genre_list
    
    # time.sleep(1)

    if len(recording_list) == 0:
        return genre_list
    elif len(recording_list) > 1:
        print(f'Multiple recordings with isrc: {isrc}')

    if 'id' in recording_list[0]:
        mbid = recording_list[0]['id']
    else:
        return genre_list

    # Use Last FM to get the genres (https://www.last.fm/api/show/track.getInfo)
    url = f"http://ws.audioscrobbler.com/2.0/?method=track.getInfo&api_key={api_key}&mbid={mbid}&format=json"
    try:
        response = requests.get(url)
        data = json.loads(response.text)
    except Exception as e:
        print('Failed to retrieve genres from the Last FM API')
        print(e)
        return genre_list
    
    track = data['track'] if 'track' in data else ''
    toptags = track['toptags'] if 'toptags' in track else ''
    if 'tag' in toptags:
        genres = toptags['tag']
        for genre in genres:
            genre_list = f'{genre_list}{genre["name"]},'
        return genre_list[:len(genre_list)-1]
    else:    
        return genre_list
    
    

In [178]:
def filter_track_features(track,genre):
    '''
    Filters the relevant features of a track in returns them in JSON object.

    Parameter
    ---------
    track: Object
        Track returend by the spotify API

    Return
    ------
    relevant_features: Object
        JSON Object that contains the relevant featues
    '''
    
    features = sp.audio_features(track['id'])[0]
    external_ids = track['external_ids'] if 'external_ids' in track else {}
    isrc = external_ids['isrc'] if 'isrc' in external_ids else ''
 
    artists = []
    for artist in track['album']['artists']:
        artists.append(artist['name'])
    return {
        'name': track['name'] if 'name' in track else '',
        'artist': ','.join(artists),
        'popularity': track['popularity'] if 'popularity' in track else '',
        'genres': genre,
        'danceability': features['danceability'] if 'danceability' in features else '',
        'energy': features['energy'] if 'energy' in features else '',
        'key': features['key'] if 'key' in features else '',
        'loudness': features['loudness'] if 'loudness' in features else '',
        'mode': features['mode'] if 'mode' in features else '',
        'speechiness': features['speechiness'] if 'speechiness' in features else '',
        'acousticness': features['acousticness'] if 'acousticness' in features else '',
        'instrumentalness': features['instrumentalness'] if 'instrumentalness' in features else '',
        'liveness': features['liveness'] if 'liveness' in features else '',
        'valence': features['valence'] if 'valence' in features else '',
        'tempo': features['tempo'] if 'tempo' in features else '',
        'duration_ms': features['duration_ms'] if 'duration_ms' in features else '',
        'time_signature': features['time_signature'] if 'time_signature' in features else '',
        'isrc': isrc,
    }

In [179]:
def save_df_to_sql(df: pd.DataFrame, table_name=SQL_TABLE, schema=SQL_SCHEMA, if_exists='replace'):
    '''
    Saves the DataFrame in the SQL Database

    Parameter
    ---------
    df: pd.DataFrame
        DataFrame that should be saved

    table_name: string; default=SQL_TABLE (.env)
        Table name the DataFrame should be saved in.

    schema: string; default=SQL_SCHMEA (.env)
        Schema that should be used for the database

    if_exists: string; default="replace"
        Action that should be performed if the specified table already exists. Possible values are "replace", "fail", "append".
    '''
    try:
        df.to_sql(table_name, engine, schema=schema, if_exists=if_exists)
    except Exception as e:
        print(e)

In [180]:
def read_df_from_sql(table_name=SQL_TABLE, schema=SQL_SCHEMA):
    '''
    Reads a SQL table and saves it into a DataFrame.

    Parameter
    ---------
    table_name: string; default=SQL_TABLE (.env)
        Name of the table in the database

    schema: string; default=SQL_SCHEMA (.env)
        Name of the SQL schmea

    Return
    ------
    df: pd.DataFrame
        SQL table in a DatFrame
    '''
    try:
        with engine.connect() as connection:
            return pd.read_sql_table(table_name, con=connection, schema=schema)
    except Exception as e:
        print(e)

In [181]:
# Make initial request to get total number of results
def get_number_of_tracks(release_year, start_letters, genre):
    '''
    Retrieves the number of tracks the spotfiy API returns for a specific query.

    Parameter
    ---------
    release_year: int
        Year the tracks were released

    start_letters: string
        Letters the songs start with

    Return
    ------
    num: int
        Number of tracks that spotify has data for. The max number is 1000. If 1000 is returned, it is possible that the number is higher.
    '''
    try:
        result = sp.search(q=f'year:{release_year} track:{start_letters}* genre:{genre}', type='track', limit=1, offset=0)
        tracks = result['tracks'] if 'tracks' in result else ''
        return tracks['total'] if 'total' in tracks else 0
    except Exception as e:
        print(e)
    return 0

In [182]:
def req_query_tracks(release_year, genres,start_letters = '',limit=50):
    '''
    Recursivley queries all tracks spotify returns for a specific query.

    Parameter
    ---------
    release_year: int
        Year the tracks were released

    start_letters: string, default=''
        Letters the songs start with
    
    limit: int; default=50
        Number of tracks that should be queried at once. Max number is 50
    '''
    global df
    alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    for genre in genres:
        for letter in alphabet:
            letters = start_letters + letter
            total_results = get_number_of_tracks(release_year, letters,genre)
            if total_results < 1000:
                print(release_year, letters, genre)
                # Loop through results and retrieve tracks
                offset = 0

                while offset < total_results:
                    try:
                        track_features = []
                        result = sp.search(q=f'year:{release_year} track:{letters}* genre:{genre}', type='track', limit=limit, offset=offset)
                        tracks = result['tracks'] if 'tracks' in result else ''
                        if 'items' in tracks:
                            for track in tracks['items']:
                                features = filter_track_features(track,genre)
                                track_features.append(features)
                            offset += limit
                            df = pd.concat([df, pd.DataFrame(track_features)], ignore_index=True)
                        else:
                            continue
                    except Exception as e:
                        print(e)
            else:
                req_query_tracks(release_year, [genre], letters)

## Code

In [183]:
columns = ['name', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'isrc', 'genres']
df = pd.DataFrame(columns=columns)

In [184]:
# Set up query parameters
query = 'year:1980'
limit = 1
offset = 0

# Make initial request to get total number of results
result = sp.search(q=query, type='track', limit=1, offset=0)
total_results = result['tracks']['total']
print(result)

# Loop through results and retrieve tracks
while offset < 10:
    result = sp.search(q=query, type='track', limit=limit, offset=offset)
    track_features = []
    for track in result['tracks']['items']:
        features = filter_track_features(track,'pop')
        track_features.append(features)
    offset += limit
    df = pd.concat([df, pd.DataFrame(track_features)], ignore_index=True)

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=year%3A1980&type=track&offset=0&limit=1', 'items': [{'album': {'album_group': 'album', 'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/711MCceyCBcFnzjGY4Q7Un'}, 'href': 'https://api.spotify.com/v1/artists/711MCceyCBcFnzjGY4Q7Un', 'id': '711MCceyCBcFnzjGY4Q7Un', 'name': 'AC/DC', 'type': 'artist', 'uri': 'spotify:artist:711MCceyCBcFnzjGY4Q7Un'}], 'available_markets': ['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT', 'AU', 'AZ', 'BA', 'BB', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'BJ', 'BN', 'BO', 'BR', 'BS', 'BT', 'BW', 'BY', 'BZ', 'CA', 'CD', 'CG', 'CH', 'CI', 'CL', 'CM', 'CO', 'CR', 'CV', 'CW', 'CY', 'CZ', 'DE', 'DJ', 'DK', 'DM', 'DO', 'DZ', 'EC', 'EE', 'EG', 'ES', 'ET', 'FI', 'FJ', 'FM', 'FR', 'GA', 'GB', 'GD', 'GE', 'GH', 'GM', 'GN', 'GQ', 'GR', 'GT', 'GW', 'GY', 'HK', 'HN', 'HR', 'HT', 'HU', 'ID', 'IE', 'IL', 'IN', 'IQ', 'IS', 'IT', 'JM', 'JO', 'JP', 'KE', 'KG', 'KH', 'KI', 'KM'

In [185]:
genres = sp.recommendation_genre_seeds()['genres']

for year in range(1980, 1981):
    req_query_tracks(year,genres,'')



1980 a acoustic
1980 b acoustic
1980 c acoustic
1980 d acoustic
1980 e acoustic
1980 f acoustic
1980 g acoustic
1980 h acoustic
1980 i acoustic
1980 j acoustic
1980 k acoustic
1980 l acoustic
1980 m acoustic
1980 n acoustic
1980 o acoustic
1980 p acoustic
1980 q acoustic
1980 r acoustic
1980 s acoustic
1980 t acoustic
1980 u acoustic
1980 v acoustic
1980 w acoustic
1980 x acoustic
1980 y acoustic
1980 z acoustic
1980 a afrobeat
1980 b afrobeat
1980 c afrobeat
1980 d afrobeat
1980 e afrobeat
1980 f afrobeat
1980 g afrobeat
1980 h afrobeat
1980 i afrobeat
1980 j afrobeat
1980 k afrobeat
1980 l afrobeat
1980 m afrobeat
1980 n afrobeat
1980 o afrobeat
1980 p afrobeat
1980 q afrobeat
1980 r afrobeat
1980 s afrobeat
1980 t afrobeat
1980 u afrobeat
1980 v afrobeat
1980 w afrobeat
1980 x afrobeat
1980 y afrobeat
1980 z afrobeat
1980 a alt-rock
1980 b alt-rock
1980 c alt-rock
1980 d alt-rock
1980 e alt-rock
1980 f alt-rock
1980 g alt-rock
1980 h alt-rock
1980 i alt-rock
1980 j alt-rock
1980 k a

In [None]:
# check if there is any duplicated track( a track may have more than 1 genre)
# these should be join together later (e.g.genres:pop,country)
df['isrc'].duplicated
df

Unnamed: 0,name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,isrc,genres,artist
0,You Shook Me All Night Long,80,0.532,0.767,7,-5.509,1,0.0574,0.00287,0.000513,0.3900,0.755,127.361,210173,4,AUAP08000047,pop,AC/DC
1,Back In Black,82,0.310,0.700,9,-5.678,1,0.0470,0.01100,0.009650,0.0828,0.763,188.386,255493,4,AUAP08000046,pop,AC/DC
2,You Make My Dreams (Come True),77,0.751,0.501,5,-12.151,1,0.0551,0.23400,0.112000,0.0467,0.902,167.057,190627,4,USRC10301828,pop,Daryl Hall & John Oates
3,Another One Bites The Dust - Remastered 2011,72,0.933,0.528,5,-6.472,0,0.1620,0.11200,0.329000,0.1630,0.756,109.975,214653,4,GBUM71029605,pop,Queen
4,Could You Be Loved,79,0.916,0.720,0,-8.548,1,0.1000,0.36000,0.000160,0.0958,0.760,103.312,237000,4,USIR28000016,pop,Bob Marley & The Wailers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,"The Tempest, Act V Scene 1: ""Where The Bee Suc...",0,0.525,0.063,6,-18.975,1,0.0466,0.98300,0.000236,0.1860,0.540,103.027,51600,4,USX762290303,acoustic,Various Artists
192,Henry's Rockin' Boogie Woogie,0,0.674,0.615,5,-5.359,1,0.0351,0.68500,0.813000,0.1420,0.905,127.291,117200,4,FIFMF8000112,acoustic,Champion Jack Dupree
193,Round And Round,5,0.887,0.344,1,-11.985,1,0.0481,0.83200,0.028000,0.3370,0.620,129.867,171445,4,DER387900019,acoustic,J.B. Lenoir
194,Will Me Your Gold Watch And Chain,0,0.506,0.218,3,-14.893,0,0.0375,0.98400,0.717000,0.1180,0.301,94.895,223360,4,US2AH0401024,acoustic,Mississippi Fred McDowell


In [None]:
df['isrc'].duplicated

<bound method Series.duplicated of 0      AUAP08000047
1      AUAP08000046
2      USRC10301828
3      GBUM71029605
4      USIR28000016
           ...     
191    USX762290303
192    FIFMF8000112
193    DER387900019
194    US2AH0401024
195    US2AH0401027
Name: isrc, Length: 196, dtype: object>