In [1]:
from secret_keys import SP_DC, SP_KEY, CLIENT_SECRET
from syrics.api import Spotify
import requests
import pandas as pd
import time

In [2]:
import spotify_token as st

# Start a session from the two secret values; the result is indexed below:
# position 0 is used as the bearer token, position 1 is kept as its expiry.
data = st.start_session(SP_DC, SP_KEY)
access_token, expiration_date = data[0], data[1]

# Lyrics client (syrics) built from SP_DC.
sp = Spotify(SP_DC)

In [3]:
# print(sp.get_lyrics("1v4m9GLt7lpFM5iOvwQZrU"))
# # print(sp.get_current_song())

In [3]:
def cleanLyrics(lyrics):
    """Flatten a lyrics payload into a plain list of lyric lines.

    Args:
        lyrics: Payload shaped like
            ``{'lyrics': {'lines': [{'words': str}, ...]}}``, or ``None``
            when no lyrics are available.

    Returns:
        A list with the ``'words'`` value of every line, in order, or
        ``None`` when ``lyrics`` is ``None``.

    Raises:
        KeyError: If the payload lacks the ``'lyrics'``, ``'lines'`` or
            ``'words'`` keys.
    """
    # Identity check rather than ``== None``: equality defers to the operand's
    # ``__eq__`` and can misreport a real payload as missing.
    if lyrics is None:
        return None
    return [line['words'] for line in lyrics['lyrics']['lines']]

In [5]:
def searchArtists(name, retry_count=8):
    """Search Spotify for an artist and return the ID of the first hit.

    Args:
        name: Artist name used as the search query.
        retry_count: Maximum number of attempts.

    Returns:
        The artist ID, taken from the last path segment of the first hit's
        ``external_urls['spotify']`` link. ``None`` when the search returns no
        artists, when the API answers 403, or when every attempt ended in a
        401/429/503 response.

    Raises:
        Exception: Whatever error the final attempt failed with (request
            error, unexpected status code, malformed response body). Errors
            on earlier attempts are retried after a 20 second pause.
    """
    global data, access_token, expiration_date, sp
    endpoint = "https://api.spotify.com/v1/search"
    params = {"q": name, "type": ["artist"]}
    for attempt in range(retry_count):
        try:
            # Built per attempt so a token refreshed after a 401 is used.
            headers = {"Authorization": f"Bearer {access_token}"}
            response = requests.get(endpoint, headers=headers, params=params)
            status = response.status_code
            if status == 200:
                items = response.json()['artists']['items']
                if not items:
                    # No match is a definitive answer; indexing [0] here used
                    # to raise IndexError and burn every retry on sleeps.
                    return None
                spotify_href = items[0]['external_urls']['spotify']
                return spotify_href.split("/")[-1]
            elif status == 401:
                # Token rejected: open a fresh session and try again.
                data = st.start_session(SP_DC, SP_KEY)
                access_token = data[0]
                expiration_date = data[1]
                sp = Spotify(SP_DC)
                continue
            elif status == 403:
                break  # Forbidden: retrying will not help, fall through to None
            elif status in (429, 503):
                time.sleep(20)  # Rate limited / unavailable: back off, then retry
            else:
                raise Exception(status, response.text)
        except Exception:
            if attempt == retry_count - 1:
                raise
            time.sleep(20)  # Pause before the next attempt
    return None

In [6]:
def getArtistsAlbums(id, limit=50, retry_count=8):
    """Fetch one page of an artist's albums (US market) from the Spotify API.

    Args:
        id: Spotify artist ID, or ``None`` when no artist was found.
        limit: Page size sent as the ``limit`` query parameter.
        retry_count: Maximum number of attempts.

    Returns:
        A list of dicts with the keys ``name``, ``id``, ``release_date`` and
        ``album_type``. An empty list when ``id`` is ``None``, when the API
        answers 403, or when every attempt ended in a 401/429/503 response.

    Raises:
        Exception: Whatever error the final attempt failed with. Errors on
            earlier attempts are retried after a 20 second pause.
    """
    global data, access_token, expiration_date, sp
    if id is None:
        # Without an artist ID the URL would be ".../artists/None/albums".
        return []
    endpoint = f"https://api.spotify.com/v1/artists/{id}/albums"
    params = {"market": "US", "limit": limit}
    for attempt in range(retry_count):
        try:
            # Built per attempt so a token refreshed after a 401 is used.
            headers = {"Authorization": f"Bearer {access_token}"}
            response = requests.get(endpoint, headers=headers, params=params)
            status = response.status_code
            if status == 200:
                return [
                    {
                        "name": album['name'],
                        "id": album['id'],
                        "release_date": album['release_date'],
                        "album_type": album['album_type'],
                    }
                    for album in response.json()['items']
                ]
            elif status == 401:
                # Token rejected: open a fresh session and try again.
                data = st.start_session(SP_DC, SP_KEY)
                access_token = data[0]
                expiration_date = data[1]
                sp = Spotify(SP_DC)
                continue
            elif status == 403:
                break  # Forbidden: fall through to the empty list
            elif status in (429, 503):
                time.sleep(20)  # Rate limited / unavailable: back off, then retry
            else:
                # Carry the status and body so the failure can be diagnosed.
                raise Exception(f"Error getting albums: {status} {response.text}")
        except Exception:
            if attempt == retry_count - 1:
                raise
            time.sleep(20)  # Pause before the next attempt
    return []

In [7]:
def getAlbumTracks(album_id, retry_count=8):
    """Fetch up to 50 tracks of an album from the Spotify API.

    Args:
        album_id: Spotify album ID.
        retry_count: Maximum number of attempts.

    Returns:
        A list of dicts with the keys ``name``, ``id`` and ``duration_ms``.
        An empty list when the API answers 403 or when every attempt ended in
        a 401/429/503 response.

    Raises:
        Exception: Whatever error the final attempt failed with. Errors on
            earlier attempts are retried after a 20 second pause.
    """
    global data, access_token, expiration_date, sp
    wanted_fields = ("name", "id", "duration_ms")
    for attempt in range(retry_count):
        try:
            response = requests.get(
                f"https://api.spotify.com/v1/albums/{album_id}/tracks",
                headers={"Authorization": f"Bearer {access_token}"},
                params={"limit": 50},
            )
            status = response.status_code
            if status == 200:
                return [
                    {field: item[field] for field in wanted_fields}
                    for item in response.json()['items']
                ]
            if status == 403:
                # Forbidden: stop retrying and return the empty list.
                break
            if status == 401:
                # Token rejected: open a fresh session, then retry.
                data = st.start_session(SP_DC, SP_KEY)
                access_token = data[0]
                expiration_date = data[1]
                sp = Spotify(SP_DC)
                continue
            if status in (429, 503):
                # Rate limited / unavailable: back off, then retry.
                time.sleep(20)
            else:
                raise Exception("Error getting tracks")
        except Exception as e:
            if attempt != retry_count - 1:
                time.sleep(20)  # Pause before the next attempt
            else:
                raise e
    return []

In [8]:
def generateLyrics(name, limit = 50):
    """Collect the lyrics of every track on an artist's albums.

    Args:
        name: Artist name; used for the search and stored verbatim in the
            ``artist`` field of every returned row.
        limit: Passed through to ``getArtistsAlbums``.

    Returns:
        A list of dicts with the keys ``artist``, ``album``, ``track``,
        ``track_id``, ``lyrics`` and ``duration_ms``. ``lyrics`` is the
        ``cleanLyrics`` result, or ``None`` when fetching them failed. Tracks
        without an ID are skipped. An empty list when the artist search
        returns ``None``.
    """
    artist_id = searchArtists(name)
    if artist_id is None:
        # No artist to query albums for.
        return []

    to_return = []
    for album in getArtistsAlbums(artist_id, limit):
        album_name = album['name']
        try:
            album_tracks = getAlbumTracks(album['id'])
        except Exception as exc:
            # Best effort: skip this album, but report it instead of hiding it.
            print(f"Skipping album {album_name!r}: {exc!r}")
            continue

        for track in album_tracks:
            track_id = track['id']
            if track_id is None:
                # Nothing to look lyrics up with; such tracks are not recorded.
                continue
            try:
                lyrics = cleanLyrics(sp.get_lyrics(track_id))
            except Exception as exc:
                # One failing track must not drop the rest of the album. Keep
                # the row with lyrics=None so a later pass can retry it.
                print(f"No lyrics for track {track_id!r}: {exc!r}")
                lyrics = None
            to_return.append({
                "artist": name,
                "album": album_name,
                "track": track['name'],
                "track_id": track_id,
                "lyrics": lyrics,
                "duration_ms": track['duration_ms']
            })

    return to_return

In [9]:
# Small artist list for quick experiments. Names are used both as the search
# query and as the stored "artist" value, so spelling matters
# ("Lana Del Rey", previously misspelled "Lana Del Ray").
sample_artists = ["Taylor Swift", "CoCoMelon", "Keshi", "Conan Gray", "Slayer", "Black Sabbath", "Khalid", "Lana Del Rey", "IU", "YOASOBI", "LilyPichu", "League of Legends"]
# Full artist list for the lyrics scrape.
top_x_artists = [
    "The Weeknd", "Taylor Swift", "Rihanna", "Ariana Grande", "Drake",
    "Kanye West", "Justin Bieber", "Dua Lipa", "Coldplay", "Bruno Mars",
    "Beyoncé", "SZA", "David Guetta", "Ed Sheeran", "Eminem", "Bad Bunny",
    "Miley Cyrus", "Marshmello", "Travis Scott", "Calvin Harris", "Billie Eilish",
    "Doja Cat", "21 Savage", "Maroon 5", "Shakira", "Imagine Dragons",
    "Tate McRae", "Post Malone", "Lady Gaga", "Katy Perry", "Olivia Rodrigo",
    "Lana Del Rey", "Harry Styles", "Adele", "Peso Pluma", "Playboi Carti",
    "KAROL G", "Nicki Minaj", "Sia", "Tiësto", "Benson Boone", "J Balvin",
    "Kendrick Lamar", "Ty Dolla $ign", "Queen", "Arctic Monkeys", "Future",
    "Metro Boomin", "Elton John", "Khalid", "Sam Smith", "Daddy Yankee",
    "Selena Gomez", "Chris Brown", "Jack Harlow", "OneRepublic", "USHER",
    "Justin Timberlake", "Pitbull", "Feid", "Shawn Mendes", "Madonna",
    "Myke Towers", "One Direction", "Halsey", "Kali Uchis", "Linkin Park",
    "J. Cole", "Maluma", "Rauw Alejandro", "Black Eyed Peas", "The Chainsmokers",
    "Ozuna", "Bebe Rexha", "Michael Jackson", "Camila Cabello", "Bizarrap",
    "James Arthur", "Teddy Swims", "Swae Lee", "Avicii", "Lil Wayne",
    "Ellie Goulding", "Jason Derulo", "Arijit Singh", "JAY-Z", "Mitski",
    "XXXTENTACION", "Britney Spears", "P!nk", "Charlie Puth", "Manuel Turizo",
    "Noah Kahan", "The Neighbourhood", "Pharrell Williams", "Lewis Capaldi",
    "Ava Max", "50 Cent", "Pritam", "CoCoMelon", "Keshi", "Conan Gray", "Slayer",
    "Black Sabbath", "LilyPichu", "League of Legends", "Billy Joel", "AC/DC", "Demi Lovato",
    "Twenty One Pilots", "Panic! At The Disco", "5 Seconds of Summer", "Fall Out Boy",
]


In [62]:
# Disabled one-off scrape, kept for reference: it calls generateLyrics for
# every name in top_x_artists and saves the rows to top_x.csv. The cell now
# only loads that saved file.
# top_x = []
# for artists in top_x_artists:
#     top_x += generateLyrics(artists)

# df = pd.DataFrame(top_x)
# df.to_csv("top_x.csv", index=True)
df = pd.read_csv("top_x.csv")

In [45]:
# Display the frame loaded from top_x.csv.
df

Unnamed: 0.1,Unnamed: 0,artist,album,track,track_id,lyrics
0,0,The Weeknd,The Highlights (Deluxe),Die For You,2vz6HIZBaQOnWCH7kKhKQH,"[""I'm findin' ways to articulate"", ""The feelin..."
1,1,The Weeknd,The Highlights (Deluxe),Starboy (feat. Daft Punk),218WdV0d4ijtTtPTKGuf1E,"[""I'm tryna put you in the worst mood, ah"", 'P..."
2,2,The Weeknd,The Highlights (Deluxe),Save Your Tears,6YckHetPOkzxtXXaYx0Gt1,"['Ooh', 'Na-na, yeah', 'I saw you dancing in a..."
3,3,The Weeknd,The Highlights (Deluxe),Blinding Lights,04948IGlqY1vSh7AHbueiQ,"['Yeah', '♪', ""I've been tryna call"", ""I've be..."
4,4,The Weeknd,The Highlights (Deluxe),In Your Eyes,4SD0V2HMxkBupk6ml9alm4,"['Oh, yeah', 'I just pretend, uh', ""That I'm i..."
...,...,...,...,...,...,...
47007,47007,Fall Out Boy,"""The Take Over, The Breaks Over""","""The Take Over, The Breaks Over""",3fzG5wagHTnKSCd7dsnd2j,
47008,47008,Fall Out Boy,"""The Take Over, The Breaks Over""",Thriller - 2006/AOL Music Sessions,3Eljf6kcc09IJPjTvUH2nA,
47009,47009,Fall Out Boy,"This Ain't a Scene, It's an Arms Race","This Ain't A Scene, It's An Arms Race",4HsgxsHOq8EsJ2dJd3wbmt,['I am an arms dealer fitting you with weapons...
47010,47010,Fall Out Boy,"This Ain't a Scene, It's an Arms Race",The Carpal Tunnel Of Love,791XoVbIt2447gVljJLwzj,"[""We take sour sips from life's lush lips"", 'A..."


In [12]:
# Keep only the rows whose lyrics are missing (None/NaN), as an independent copy.

has_no_lyrics = df['lyrics'].isna()
lyricalest_df = df.loc[has_no_lyrics].copy()

In [14]:
# Remove every row whose album name contains "Soundtrack" (case-sensitive,
# literal match). The vectorised string test treats missing or non-string
# album values as "not a soundtrack" instead of raising TypeError.
is_soundtrack = lyricalest_df['album'].str.contains("Soundtrack", regex=False, na=False)
lyricalest_df = lyricalest_df[~is_soundtrack].reset_index(drop=True)
lyricalest_df

In [16]:
# Walk lyricalest_df and request the lyrics of each track again.
data = st.start_session(SP_DC, SP_KEY)
access_token, expiration_date = data[0], data[1]
sp = Spotify(SP_DC)

new = []

for i, row in lyricalest_df.iterrows():
    # Open a fresh session every 1000 rows (this also fires on row 0).
    if i % 1000 == 0:
        data = st.start_session(SP_DC, SP_KEY)
        access_token, expiration_date = data[0], data[1]
        sp = Spotify(SP_DC)

    # Progress report every 100 rows: the previous record, then the row number.
    if i != 0 and i % 100 == 0:
        print(new[i-1])
        print(i)

    # Pause between lyric requests.
    time.sleep(0.5)

    artist, album, track, track_id = (
        row['artist'], row['album'], row['track'], row['track_id']
    )
    lyrics = cleanLyrics(sp.get_lyrics(track_id))
    new.append(dict(
        artist=artist,
        album=album,
        track=track,
        track_id=track_id,
        lyrics=lyrics,
    ))


{'artist': 'The Weeknd', 'album': 'One of the Girls', 'track': 'One Of The Girls - Instrumental', 'track_id': '7zNS5065xzKyhOBMOj7pCr', 'lyrics': None}
100
{'artist': 'Taylor Swift', 'album': "1989 (Taylor's Version)", 'track': "Now That We Don't Talk (Taylor's Version) (From The Vault)", 'track_id': '5KD6AEm19QnMbfWpfoOHMl', 'lyrics': ['You went to a party, I heard from everybody', "You part the crowd like the Red Sea, don't even get me started", 'Did you get anxious, though, on the way home?', "I guess I'll never, ever know", "Now that we don't talk", '♪', 'You grew your hair long, you got new icons', "And from the outside, it looks like you're trying lives on", "I miss the old ways, you didn't have to change", "But I guess I don't have a say", "Now that we don't talk", 'I called my mom, she said that it was for the best', "Remind myself the more I gave, you'd want me less", 'I cannot be your friend, so I pay the price of what I lost', 'And what it cost', "Now that we don't talk", 'W

In [61]:
# Disabled: build the frame from the `new` records collected by the re-fetch
# loop. The saved CSV is loaded instead.
# lyricalest_df = pd.DataFrame(new)
lyricalest_df = pd.read_csv("top_x_additional.csv")

In [31]:
# Save the re-fetched lyrics. index=False keeps the row index out of the file,
# so reloading it with pd.read_csv does not add an extra "Unnamed: 0" column.
lyricalest_df.to_csv("top_x_additional.csv", index=False)

In [63]:
# Keep one row per distinct lyrics value, discard rows without lyrics,
# then renumber the index from zero.
df = (
    df
    .drop_duplicates(subset=['lyrics'])
    .dropna(subset=['lyrics'])
    .reset_index(drop=True)
)

In [64]:
# Same clean-up for the re-fetched rows: one row per distinct lyrics value,
# no missing lyrics, index renumbered from zero.
lyricalest_df = (
    lyricalest_df
    .drop_duplicates(subset=['lyrics'])
    .dropna(subset=['lyrics'])
    .reset_index(drop=True)
)

In [3]:
# Disabled: combine the original rows with the re-fetched ones. The saved
# top_x_cleaned.csv is loaded instead; that file is written further down,
# after the genre column is added, so this read relies on an earlier run.
# clean_df = pd.concat([df, lyricalest_df], ignore_index=True)
clean_df = pd.read_csv("top_x_cleaned.csv")

In [4]:
import ast

# Drop the 'Unnamed: 0' column, keep one row per distinct non-missing lyrics
# value, and renumber the index from zero.
clean_df = (
    clean_df
    .drop(columns=['Unnamed: 0'])
    .drop_duplicates(subset=['lyrics'])
    .dropna(subset=['lyrics'])
    .reset_index(drop=True)
)


def _flatten_lyrics(serialized):
    """Parse a stringified list of lyric lines and join them with spaces."""
    return " ".join(ast.literal_eval(serialized))


# flatten lyrics
clean_df['lyrics'] = clean_df['lyrics'].apply(_flatten_lyrics)

In [6]:
# Save the flattened lyrics. The row index is written too (to_csv default).
clean_df.to_csv("final.csv")

In [75]:
# Look up the genre list of every distinct artist in clean_df.
# artist_genre maps artist name -> list of genres, or None when the artist
# could not be resolved or the request did not return 200.
unique_artists = clean_df['artist'].unique()
artist_genre = {}
for artist in unique_artists:
    genres = None
    # Named artist_id (not `id`) so the builtin id() is not shadowed.
    artist_id = searchArtists(artist)
    if artist_id is not None:
        # Only query when the search produced an ID; otherwise the URL would
        # end in ".../artists/None".
        response = requests.get(
            f"https://api.spotify.com/v1/artists/{artist_id}",
            headers={"Authorization": f"Bearer {access_token}"},
        )
        if response.status_code == 200:
            genres = response.json()['genres']

    artist_genre[artist] = genres

In [77]:
def _genres_for(artist_name):
    """Return the genres recorded for an artist in artist_genre."""
    return artist_genre[artist_name]


# Attach each row's genres, looked up by artist name.
clean_df['genre'] = clean_df['artist'].apply(_genres_for)

In [79]:
# Save the genre-annotated frame. index=True also writes the row index; when
# the file is read back with pd.read_csv and no index_col, that column shows
# up as 'Unnamed: 0' (the clean-up cell drops a column of that name).
clean_df.to_csv("top_x_cleaned.csv", index=True)

In [80]:
# Rows belonging to the artist "LilyPichu".
is_lilypichu = clean_df['artist'] == "LilyPichu"
lilypichu = clean_df.loc[is_lilypichu]

In [3]:
# Load the enriched dataset. This file is written by the notebook's final cell
# (to_csv(..., index=False)), so it must already exist from an earlier run.
df = pd.read_csv("finalfinalfinalfinalfinal.csv")

In [4]:
import requests

def getArtistFromTrackId(id):
    """Return the names of all artists credited on a track.

    Args:
        id: Spotify track ID.

    Returns:
        A list with the ``name`` of every entry in the track's ``artists``
        array, in response order.

    Raises:
        Exception: If the API does not answer with status 200.
    """
    response = requests.get(
        f"https://api.spotify.com/v1/tracks/{id}",
        headers={"Authorization": f"Bearer {access_token}"},
    )
    if response.status_code != 200:
        raise Exception(f"Request failed with status {response.status_code}")

    return [artist['name'] for artist in response.json()['artists']]

def getReleaseDateFromId(id):
    """Return the release date of the album a track belongs to.

    Args:
        id: Spotify track ID.

    Returns:
        The ``release_date`` value of the track's ``album`` object.

    Raises:
        Exception: If the API does not answer with status 200.
    """
    response = requests.get(
        f"https://api.spotify.com/v1/tracks/{id}",
        headers={"Authorization": f"Bearer {access_token}"},
    )
    if response.status_code != 200:
        raise Exception(f"Request failed with status {response.status_code}")

    return response.json()['album']['release_date']


In [14]:
# Spot-check the helper on a single track ID.
getReleaseDateFromId("11vHLsXrmCJL04hBCVGUmb")

'2022-06-02'

In [5]:
# Fill df['artists'] with the list of artist names credited on each track.
# Each row gets a bounded number of attempts; after a failure the session is
# renewed and the loop waits 20 seconds. A row that keeps failing is left as
# None and reported, rather than retried forever.
max_attempts = 5
df['artists'] = None
for i, row in df.iterrows():
    if i % 100 == 0:
        print(i)
    for attempt in range(max_attempts):
        try:
            df.at[i, 'artists'] = getArtistFromTrackId(row['track_id'])
            break
        except Exception as exc:
            # `Exception` only, so Ctrl-C (KeyboardInterrupt) still stops the cell.
            last_error = exc
            data = st.start_session(SP_DC, SP_KEY)
            access_token = data[0]
            expiration_date = data[1]
            sp = Spotify(SP_DC)
            time.sleep(20)
    else:
        print(f"Giving up on track {row['track_id']!r} after {max_attempts} attempts: {last_error!r}")



0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300


In [15]:
# Fill df['release_date'] with the album release date of each track.
# Each row gets a bounded number of attempts; after a failure the session is
# renewed and the loop waits 20 seconds. A row that keeps failing is left as
# None and reported, rather than retried forever.
max_attempts = 5
df['release_date'] = None
for i, row in df.iterrows():
    if i % 100 == 0:
        print(i)
    for attempt in range(max_attempts):
        try:
            df.at[i, 'release_date'] = getReleaseDateFromId(row['track_id'])
            break
        except Exception as exc:
            # `Exception` only, so Ctrl-C (KeyboardInterrupt) still stops the cell.
            last_error = exc
            data = st.start_session(SP_DC, SP_KEY)
            access_token = data[0]
            expiration_date = data[1]
            sp = Spotify(SP_DC)
            time.sleep(20)
    else:
        print(f"Giving up on track {row['track_id']!r} after {max_attempts} attempts: {last_error!r}")

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
18

In [16]:
# Checkpoint the frame with the artists and release_date columns. The index is
# written as the first column; the following cell reads it back via index_col=0.
df.to_csv("finalfinalfinalfinal.csv", index=True)

In [7]:
# Reload the checkpoint, using its first column as the index.
df = pd.read_csv("finalfinalfinalfinal.csv", index_col=0)

In [5]:
def isExplicit(id):
    """Return the ``explicit`` flag of a track.

    Args:
        id: Spotify track ID.

    Returns:
        The ``explicit`` value from the track response.

    Raises:
        Exception: If the API does not answer with status 200.
    """
    response = requests.get(
        f"https://api.spotify.com/v1/tracks/{id}",
        headers={"Authorization": f"Bearer {access_token}"},
    )
    if response.status_code != 200:
        raise Exception(f"Request failed with status {response.status_code}")

    return response.json()['explicit']
        

In [9]:
# Fill df["explicit"] with the explicit flag of each track.
# Each row gets a bounded number of attempts; after a failure the session is
# renewed and the loop waits 20 seconds. A row that keeps failing is left as
# None and reported, rather than retried forever.
max_attempts = 5
df["explicit"] = None
for i, row in df.iterrows():
    if i % 100 == 0:
        print(i)
    for attempt in range(max_attempts):
        try:
            df.at[i, 'explicit'] = isExplicit(row['track_id'])
            break
        except Exception as exc:
            # `Exception` only, so Ctrl-C (KeyboardInterrupt) still stops the cell.
            last_error = exc
            data = st.start_session(SP_DC, SP_KEY)
            access_token = data[0]
            expiration_date = data[1]
            sp = Spotify(SP_DC)
            time.sleep(20)
    else:
        print(f"Giving up on track {row['track_id']!r} after {max_attempts} attempts: {last_error!r}")

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
18

In [11]:
# Save the fully enriched dataset without the index column. An earlier cell
# loads a file of this name with a plain pd.read_csv.
df.to_csv("finalfinalfinalfinalfinal.csv", index=False)