# Scrape Genius.com and Spotify for Lyrics and Song Metadata

The following uses the `lyricsgenius` wrapper for the Genius API and the Spotify API (via `spotipy`) to gather data for analysis. Further analysis is conducted in R. Author: Marc Petta

## Get Lyrics from Genius.com

In [None]:
import lyricsgenius as genius
import pandas as pd
from datetime import datetime
import re

# Collect lyrics and metadata from Genius for every song listed in
# Aesop_AllSongs.csv. The result is the DataFrame `all_song_data`, with one
# row per input song: the input columns (Year, Rank, Song Title, Artist) plus
# the Genius-derived columns listed in GENIUS_COLUMNS.
api = genius.Genius("Access Token Here", sleep_time=0.01, verbose=False)

# Columns filled from the Genius search result.
GENIUS_COLUMNS = ["Album", "Album URL", "Featured Artists", "Lyrics",
                  "Media", "Song URL", "Writers", "Release Date"]
# Placeholder stored in every Genius column when a lookup fails.
MISSING_VALUE = "null"

all_songs = pd.read_csv("Aesop_AllSongs.csv")
total_songs = len(all_songs)

# Rows are accumulated in a plain list and turned into a DataFrame once at
# the end: DataFrame.append copied the whole frame on every call and no
# longer exists in pandas 2.x.
collected_rows = []
failed_lookups = 0

start_time = datetime.now()
print("Started at {}".format(start_time))
for i in range(total_songs):
    record = all_songs.iloc[i]
    rolling_pct = int((i / total_songs) * 100)
    # Trailing spaces plus end="\r" overwrite the previous progress line.
    print("{}% complete. Collecting Record {} of {}. Year {}. "
          "Currently collecting {} by {}{}".format(
              rolling_pct, i, total_songs, record['Year'],
              record['Song Title'], record['Artist'], " " * 50),
          end="\r")

    # The search is done with "&" in place of the word "and".
    song_title = re.sub(" and ", " & ", record['Song Title'])
    artist_name = re.sub(" and ", " & ", record['Artist'])

    try:
        song = api.search_song(song_title, artist=artist_name)
        genius_data = {
            "Album": song.album,
            "Album URL": song.album_url,
            "Featured Artists": song.featured_artists,
            # Flatten the lyrics onto a single line.
            "Lyrics": re.sub("\n", " ", song.lyrics),
            "Media": song.media,
            "Song URL": song.url,
            "Writers": song.writer_artists,
            "Release Date": song.year,
        }
    except Exception:
        # Best effort: any failure in the search or while reading the result
        # falls back to placeholders so one bad song does not abort the run.
        # Unlike a bare `except:`, this still lets KeyboardInterrupt and
        # SystemExit through, so a long scrape can be stopped.
        failed_lookups += 1
        genius_data = dict.fromkeys(GENIUS_COLUMNS, MISSING_VALUE)

    row = {
        "Year": record['Year'],
        "Rank": record['Rank'],
        "Song Title": record['Song Title'],
        "Artist": record['Artist'],
    }
    row.update(genius_data)
    collected_rows.append(row)

all_song_data = pd.DataFrame(collected_rows)
end_time = datetime.now()
# Report the actual finish time (the original printed start_time here).
print("\nCompleted at {}".format(end_time))
print("Total time to collect: {}".format(end_time - start_time))
print("Lookups that failed: {} of {}".format(failed_lookups, total_songs))

In [None]:
# Write all the results from the API to csv
# The DataFrame index is written as the first column; the cleaning section
# reads this file back with index_col=0.
all_song_data.to_csv("all_songs_data.csv")

## Get data from Spotify

In [9]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials #To access authorised Spotify data

# Set up access token for API and identify artist
# Placeholder credentials: replace both strings with real values before running.
client_id = "Your ID Here"
client_secret = "Your Token Here"

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)

sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) #spotify object to access API

name = "Aesop Rock" #chosen artist
# search() is called with only the query string and the result is indexed
# under 'tracks', so the expression below shows the artists credited on the
# first track returned for the query.
# NOTE(review): presumably the first hit is a track by the chosen artist - confirm
# by checking the displayed name before relying on it.
result = sp.search(name) #search query
result['tracks']['items'][0]['artists']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/2fSaE6BXtQy0x7R7v9IOmZ'},
  'href': 'https://api.spotify.com/v1/artists/2fSaE6BXtQy0x7R7v9IOmZ',
  'id': '2fSaE6BXtQy0x7R7v9IOmZ',
  'name': 'Aesop Rock',
  'type': 'artist',
  'uri': 'spotify:artist:2fSaE6BXtQy0x7R7v9IOmZ'}]

In [10]:
# Build two parallel lists describing the artist's albums, for later lookup.

# URI of the first artist credited on the first track of the search result
first_track = result['tracks']['items'][0]
artist_uri = first_track['artists'][0]['uri']

# Request the artist's releases of type 'album'
sp_albums = sp.artist_albums(artist_uri, album_type='album')
# NOTE(review): only the items of this single response are used - confirm the
# artist's albums all fit in one page of results.

# Position k in album_names and album_uris refers to the same release; the
# shared order is what distinguishes releases that have identical names.
album_names = [entry['name'] for entry in sp_albums['items']]
album_uris = [entry['uri'] for entry in sp_albums['items']]

album_names
album_uris

['spotify:album:6Cun8XaplQbI9DtM1zPvFr',
 'spotify:album:5QKnNnOC78CJRkl9Udrf31',
 'spotify:album:6VzGx5oL2WlX06kpjjJ3jp',
 'spotify:album:1An1m0S3ZdQy9Uuo476D12',
 'spotify:album:5LZS2qqiqBRiEtN6kFstrJ',
 'spotify:album:2pfG39PMUa3su3zM6lgOIU',
 'spotify:album:7iUK0YF0sRXqnPhY0cSKnw',
 'spotify:album:2qEWs070O7BysPeVOWJZ1a',
 'spotify:album:1ospGKjZezat2STJw0AusM',
 'spotify:album:1kZj0bKp9tzT5fSGItgzSB',
 'spotify:album:2m2ilugjopzt2Tq6zEfqkg',
 'spotify:album:7mB5yhTfOUdtLdjRAsPY4t',
 'spotify:album:3syXRVRYkudG7o5ZQXb9Tt',
 'spotify:album:1uRuLJQtC0iP0H8uix3TKB',
 'spotify:album:7b9hzKJvQyAM2h62iY2GrC',
 'spotify:album:6a5f8Ox6TV6wLjUfbIMauo',
 'spotify:album:1EozNWj9YemnvgrpvqHpca',
 'spotify:album:4uC2NmWBxzizUR5Q2y2IAg']

In [15]:
# Create function to grab songs from each album

def albumSongs(uri):
    """Fetch the tracks of one album and store them in ``spotify_albums``.

    Creates ``spotify_albums[uri]`` as a dict of five parallel lists
    (``'album'``, ``'track_number'``, ``'id'``, ``'name'``, ``'uri'``) holding
    one entry per track of the album.

    Relies on module-level state rather than arguments: ``sp`` (the Spotify
    client), ``spotify_albums`` (the dict being filled) and
    ``album_names[album_count]`` (the album's display name). The caller must
    therefore keep ``album_count`` pointing at the position of ``uri``.

    Args:
        uri: Spotify URI of the album to fetch.

    Returns:
        None. The result is written into ``spotify_albums``.
    """
    # Fresh entry for this album; any earlier entry for the same URI is replaced.
    album_tracks = {
        'album': [],
        'track_number': [],
        'id': [],
        'name': [],
        'uri': [],
    }
    spotify_albums[uri] = album_tracks

    page = sp.album_tracks(uri)  # first page of the album's tracks
    while page:
        for track in page['items']:
            # Album name is looked up through the caller-maintained album_count.
            album_tracks['album'].append(album_names[album_count])
            album_tracks['track_number'].append(track['track_number'])
            album_tracks['id'].append(track['id'])
            album_tracks['name'].append(track['name'])
            album_tracks['uri'].append(track['uri'])
        # The track listing is paged; follow the 'next' link so tracks beyond
        # the first page are not silently dropped.
        page = sp.next(page) if page.get('next') else None
        

In [16]:
# Create empty dict to hold songs and loop thru albums with the function
# spotify_albums is keyed by album URI and filled in by albumSongs.
spotify_albums = {}
# album_count is read inside albumSongs as the index into album_names, so it
# must match the position of the URI currently being processed; it is only
# advanced after albumSongs has returned.
album_count = 0
for i in album_uris: #each album
    albumSongs(i)
    print("Album " + str(album_names[album_count]) + " songs has been added to spotify_albums dictionary")
    album_count+=1 #Updates album count once all tracks have been added

Album Malibu Ken songs has been added to spotify_albums dictionary
Album Malibu Ken songs has been added to spotify_albums dictionary
Album Bushwick (Original Motion Picture Soundtrack) songs has been added to spotify_albums dictionary
Album The Impossible Kid songs has been added to spotify_albums dictionary
Album The Impossible Kid songs has been added to spotify_albums dictionary
Album The Impossible Kid (Instrumental Version) songs has been added to spotify_albums dictionary
Album Skelethon [Deluxe Version] songs has been added to spotify_albums dictionary
Album Skelethon songs has been added to spotify_albums dictionary
Album Skelethon (Instrumental Version) songs has been added to spotify_albums dictionary
Album None Shall Pass - Instrumentals And Accapellas songs has been added to spotify_albums dictionary
Album None Shall Pass (Bonus Edition) songs has been added to spotify_albums dictionary
Album None Shall Pass songs has been added to spotify_albums dictionary
Album None Shal

In [17]:
# Create function to add additional key-values to store the audio features of each 
# album track and append the data into lists representing all the music tracks for that album

def audio_features(album):
    """Add audio features and popularity for every track of one album.

    For the album stored under ``spotify_albums[album]``, creates one list
    per audio feature (acousticness, danceability, energy, instrumentalness,
    liveness, loudness, speechiness, tempo, valence) plus ``'popularity'``,
    and appends one value per track URI in ``spotify_albums[album]['uri']``,
    in the same order.

    Relies on the module-level ``sp`` client and ``spotify_albums`` dict.
    Two API requests are made per track (audio features and track details).

    Args:
        album: Key of the album in ``spotify_albums`` (the album URI).

    Returns:
        None. The lists are written into ``spotify_albums[album]``.
    """
    feature_names = (
        'acousticness',
        'danceability',
        'energy',
        'instrumentalness',
        'liveness',
        'loudness',
        'speechiness',
        'tempo',
        'valence',
    )
    album_data = spotify_albums[album]

    # Start from empty lists so re-running the function does not duplicate data.
    for feature_name in feature_names:
        album_data[feature_name] = []
    album_data['popularity'] = []

    for track in album_data['uri']:
        # Audio features come back as a list with one entry for the track.
        response = sp.audio_features(track)
        features = response[0] if response else None

        # A track without a feature record gets None in every feature column
        # instead of raising, which keeps all lists the same length as 'uri'.
        for feature_name in feature_names:
            value = features[feature_name] if features is not None else None
            album_data[feature_name].append(value)

        # Popularity is not part of the audio features; it is read from the
        # track details.
        album_data['popularity'].append(sp.track(track)['popularity'])

In [18]:
# Loop through albums extracting the audio features

import time
import numpy as np
# Bounds, in seconds, of the random pause taken after every fifth album
sleep_min = 2
sleep_max = 5
start_time = time.time()
request_count = 0
for album_key in spotify_albums:
    audio_features(album_key)
    request_count += 1
    # Progress report and pause only on every fifth album
    if request_count % 5 != 0:
        continue
    print("{} playlists completed".format(request_count))
    pause_seconds = np.random.uniform(sleep_min, sleep_max)
    time.sleep(pause_seconds)
    print('Loop #: {}'.format(request_count))
    elapsed_seconds = time.time() - start_time
    print('Elapsed Time: {} seconds'.format(elapsed_seconds))

5 playlists completed
Loop #: 5
Elapsed Time: 39.08202600479126 seconds
10 playlists completed
Loop #: 10
Elapsed Time: 95.86552214622498 seconds
15 playlists completed
Loop #: 15
Elapsed Time: 132.58294224739075 seconds


In [19]:
# Add data to a new dataframe and dict

# Flatten the per-album dictionaries into a single dict of column lists,
# one list per track attribute, concatenated across all albums.
column_keys = [
    'album',
    'track_number',
    'id',
    'name',
    'uri',
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
    'popularity',
]
dic_df = {key: [] for key in column_keys}

for album_data in spotify_albums.values():
    for feature, values in album_data.items():
        dic_df[feature].extend(values)

# Total number of tracks collected
len(dic_df['album'])

265

In [33]:
list(aesop_spot_full.columns)

['album',
 'track_number',
 'id',
 'name',
 'uri',
 'acousticness',
 'danceability',
 'energy',
 'instrumentalness',
 'liveness',
 'loudness',
 'speechiness',
 'tempo',
 'valence',
 'popularity']

In [35]:
import pandas as pd

# Create a data frame with the Spotify data
aesop_spot_full = pd.DataFrame.from_dict(dic_df)

# Rename the track-name column to 'Song Title' so it matches the column used
# to join with the Genius data. rename() returns a new frame, so the result
# has to be assigned back; renaming by label also avoids re-listing every
# column by position.
aesop_spot_full = aesop_spot_full.rename(columns={'name': 'Song Title'})

# Remove duplicates: sorting by popularity first means the most popular
# version of each title is the one kept; sort_index() then restores the
# original row order.
print(len(aesop_spot_full))
aesop_spot_full = (
    aesop_spot_full
    .sort_values('popularity', ascending=False)
    .drop_duplicates('Song Title')
    .sort_index()
)
print(len(aesop_spot_full))
aesop_spot_full.head()


265
196


Unnamed: 0,album,track_number,id,Song Title,uri,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity
0,Malibu Ken,1,2YSdttfME0OmOj9mPwHFIK,Corn Maze,spotify:track:2YSdttfME0OmOj9mPwHFIK,0.363,0.709,0.624,3.1e-05,0.109,-6.065,0.289,91.079,0.864,45
1,Malibu Ken,2,3q3QvTkJ8Ofu07eqB1eG6m,Tuesday,spotify:track:3q3QvTkJ8Ofu07eqB1eG6m,0.11,0.521,0.731,0.0,0.136,-6.997,0.446,168.111,0.723,44
2,Malibu Ken,3,3PAlzO0osm3cmLxVQtLLVM,Save Our Ship,spotify:track:3PAlzO0osm3cmLxVQtLLVM,0.151,0.7,0.699,1.9e-05,0.315,-6.539,0.285,97.838,0.538,40
3,Malibu Ken,4,2JSzGK853TkdFkbeABrH41,Sword Box,spotify:track:2JSzGK853TkdFkbeABrH41,0.0986,0.496,0.559,0.000152,0.0734,-8.154,0.367,75.231,0.366,40
4,Malibu Ken,5,6sJcFtSg42N9Hfpvdfh9MN,Dog Years,spotify:track:6sJcFtSg42N9Hfpvdfh9MN,0.0896,0.806,0.458,2.4e-05,0.0773,-8.14,0.226,135.89,0.819,40


In [36]:
# Write to csv 
aesop_spot_full.to_csv("aesop_spot_full.csv")
aesop_spot_full.shape

(196, 15)

# Data Preparation and Cleaning

In [38]:
# Load the csv for cleaning
import pandas as pd 
df = pd.read_csv("all_songs_data.csv",index_col=0)
df = df[['Song Title', 'Album', 'Release Date', 'Lyrics' ]]
df.head()

Unnamed: 0,Song Title,Album,Release Date,Lyrics
0,None Shall Pass,None Shall Pass,2007-08-28,[Verse 1] Flash that buttery gold Jittery zeit...
1,Daylight,Labor Days,2001-09-18,"[Intro: Sample] Yes, yes, y'all, and you don't..."
2,Zero Dark Thirty,Skelethon,2012-07-10,[Intro] They did not know how long they had be...
3,Rings,The Impossible Kid,2016-02-16,[Verse 1] Used to draw Hard to admit that I us...
4,Coffee,None Shall Pass,,[Chorus: Aesop Rock] We don't need no walkie-t...


In [39]:
# Drop any songs that do not have lyrics
# Failed Genius lookups were saved with the placeholder string "null"; pandas'
# read_csv treats that string as a missing value by default, so those rows
# show up as NaN in 'Lyrics' and are removed here.
df = df.dropna(subset=['Lyrics'])
df.shape

(177, 4)

In [None]:
# Write csv
df.to_csv("aesop_full.csv")

In [40]:
# Check rows based on condition 
df.loc[df['Album'] == "Ghosts of the Barbary Coast"]

Unnamed: 0,Song Title,Album,Release Date,Lyrics
121,Tomorrow Morning,Ghosts of the Barbary Coast,,{Intro] In the district called the Barbary Coa...


In [41]:
import warnings
# NOTE(review): this silences every warning for the rest of the session. It is
# kept so later cells behave as before, but the code below no longer needs it.
warnings.filterwarnings('ignore')

# Solo EP and album releases to keep, mapped to the release year used for
# every track on that release.
# NOTE(review): 'Float' is set to 2009 here but is commonly listed as a 2000
# release - confirm the intended year.
release_years = {
    'None Shall Pass': 2007,
    'Labor Days': 2001,
    'Skelethon': 2012,
    'The Impossible Kid': 2016,
    'Daylight': 2002,
    'Fast Cars, Danger, Fire, and Knives': 2005,
    'Float': 2009,
    'Cat Food EP': 2015,
    'Bazooka Tooth': 2003,
    'Music for Earthworms': 1997,
    'Appleseed': 1999,
    'My Belly': 2016,
    'Ghosts of the Barbary Coast': 2008,
}
filter_list = list(release_years)

# Filter by only solo EP and album releases. The .copy() makes df_1 an
# independent frame, so the assignments and in-place drops that follow modify
# df_1 itself rather than a slice of df.
df_1 = df[df['Album'].isin(filter_list)].copy()

# Truncate dates and deal with missing values: every row's release date
# (a full date or missing) is replaced by the year recorded for its album.
# Every remaining album is a key of release_years, so no row is left empty.
df_1['Release Date'] = df_1['Album'].map(release_years)

# Check for any Null values
df_1.isnull().sum().sum()

0

In [42]:
# Remove tracks whose lyrics consist only of an instrumental marker.

# Row labels where Lyrics is exactly "[Instrumental]", dropped in place
is_instrumental = df_1['Lyrics'].eq("[Instrumental]")
indexInst = df_1.loc[is_instrumental].index
df_1.drop(index=indexInst, inplace=True)

# Row labels where Lyrics is exactly the instrumental-sample text, dropped in place
is_instrumental_sample = df_1['Lyrics'].eq("[Instrumental: Sample] Fuck you")
indexInst2 = df_1.loc[is_instrumental_sample].index
df_1.drop(index=indexInst2, inplace=True)

# Review
df_1.shape

(131, 4)

In [43]:
df_1.tail()

Unnamed: 0,Song Title,Album,Release Date,Lyrics
147,89.9FM Nighttrain Promo,Music for Earthworms,1997,(What's up this is Aesop Rock for WKCR) I tra...
148,Live On 89.9FM Nighttrain,Music for Earthworms,1997,"(DJ chatter) [Verse I] Check it, in a box ma..."
149,Coordinates,Music for Earthworms,1997,[Les Miserables sample] Platoon of sappers adv...
172,For The Whole Family,None Shall Pass,2007,[Verse 1: Aesop Rock] Before shooting troops w...
174,Home Again,Appleseed,1999,"Shh... [Aesop Rock] + (Dose One) (Well or, an..."


In [45]:
# Join Spotify and Genius data. A left join keeps every cleaned Genius row and
# attaches the Spotify columns where 'Song Title' matches exactly; rows with
# no match get missing values in the Spotify columns.
aesop_merged_left = pd.merge(left=df_1, right=aesop_spot_full, how='left', on='Song Title')
# Review the size of the merged frame (the original line was cut off after the dot).
aesop_merged_left.shape

(131, 18)

In [46]:
# Write full csv
aesop_merged_left.to_csv("aesop_combined_final.csv")

In [47]:
# Review column names
list(aesop_merged_left.columns)

['Song Title',
 'Album',
 'Release Date',
 'Lyrics',
 'album',
 'track_number',
 'id',
 'uri',
 'acousticness',
 'danceability',
 'energy',
 'instrumentalness',
 'liveness',
 'loudness',
 'speechiness',
 'tempo',
 'valence',
 'popularity']

In [48]:
# Write subset of columns for final analysis csv
# Columns kept for the final analysis file: song identity, lyrics, the
# Spotify album name and the audio features (track number, id and uri are
# left out).
analysis_columns = [
    'Song Title',
    'Album',
    'Release Date',
    'Lyrics',
    'album',
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
    'popularity',
]
df_2 = aesop_merged_left[analysis_columns]

df_2.to_csv("aesop_final_analysis.csv")
df_2.head()

Unnamed: 0,Song Title,Album,Release Date,Lyrics,album,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity
0,None Shall Pass,None Shall Pass,2007,[Verse 1] Flash that buttery gold Jittery zeit...,None Shall Pass,0.184,0.707,0.683,0.00108,0.108,-8.728,0.0683,116.009,0.567,56.0
1,Daylight,Labor Days,2001,"[Intro: Sample] Yes, yes, y'all, and you don't...",Labor Days,0.0242,0.564,0.758,0.00109,0.376,-6.85,0.283,92.674,0.597,53.0
2,Zero Dark Thirty,Skelethon,2012,[Intro] They did not know how long they had be...,Skelethon [Deluxe Version],0.0636,0.826,0.897,7.9e-05,0.26,-7.404,0.211,106.024,0.722,41.0
3,Rings,The Impossible Kid,2016,[Verse 1] Used to draw Hard to admit that I us...,The Impossible Kid,0.0476,0.808,0.857,0.000888,0.767,-5.404,0.0978,127.939,0.776,50.0
4,Coffee,None Shall Pass,2007,[Chorus: Aesop Rock] We don't need no walkie-t...,None Shall Pass (Bonus Edition),0.133,0.453,0.79,0.0,0.193,-8.676,0.521,77.234,0.175,35.0
