# Imports

In [None]:
from secret_keys import SP_DC, SP_KEY, CLIENT_SECRET
from syrics.api import Spotify
import requests
import pandas as pd
import time

In [None]:
import spotify_token as st

data = st.start_session(SP_DC, SP_KEY)
access_token = data[0]
expiration_date = data[1]
sp = Spotify(SP_DC)

# Functions

In [None]:
def cleanLyrics(lyrics):
    if lyrics == None:
        return None
    lyrics = lyrics['lyrics']
    lines = lyrics['lines']
    to_return = []
    for line in lines:
        to_return.append(line['words'])
    return to_return

In [None]:
def searchArtists(name, retry_count=8):
    global data, access_token, expiration_date, sp
    for i in range(retry_count):
        try:
            endpoint = "https://api.spotify.com/v1/search"
            q = name
            type = ["artist"]
            headers = {
                "Authorization": f"Bearer {access_token}"
            }
            params = {
                "q": q,
                "type": type
            }
            response = requests.get(endpoint, headers=headers, params=params)
            if response.status_code == 200:
                spotify_href = response.json()['artists']['items'][0]['external_urls']['spotify']
                id = spotify_href.split("/")[-1]
                return id
            elif response.status_code == 401:
                data = st.start_session(SP_DC, SP_KEY)
                access_token = data[0]
                expiration_date = data[1]
                sp = Spotify(SP_DC)
                continue  # Retry the request with the new access token
            elif response.status_code == 403:
                break  # Break out of the loop and return None
            elif response.status_code == 429 or response.status_code == 503:
                time.sleep(20)  # Wait for a few seconds before retrying
            else:
                raise Exception(response.status_code, response.text)
        except Exception as e:
            if i == retry_count - 1:
                raise e
            else:
                time.sleep(20)  # Wait for a few seconds before retrying
    return None

In [None]:
def getArtistsAlbums(id, limit=50, retry_count=8):
    global data, access_token, expiration_date, sp
    for i in range(retry_count):
        try:
            endpoint = f"https://api.spotify.com/v1/artists/{id}/albums"
            headers = {
                "Authorization": f"Bearer {access_token}"
            }
            market = "US"
            limit = limit
            params = {
                "market": market,
                "limit": limit
            }
            response = requests.get(endpoint, headers=headers, params=params)
            if response.status_code == 200:
                albums = response.json()['items']
                to_return = []
                for album in albums:
                    return_album = {
                        "name": album['name'],
                        "id": album['id'],
                        "release_date": album['release_date'],
                        "album_type": album['album_type']
                    }
                    to_return.append(return_album)
                return to_return
            elif response.status_code == 401:
                data = st.start_session(SP_DC, SP_KEY)
                access_token = data[0]
                expiration_date = data[1]
                sp = Spotify(SP_DC)
                continue  # Retry the request with the new access token
            elif response.status_code == 403:
                break  # Break out of the loop and return an empty list
            elif response.status_code == 429 or response.status_code == 503:
                time.sleep(20)  # Wait for a few seconds before retrying
            else:
                raise Exception("Error getting albums")
        except Exception as e:
            if i == retry_count - 1:
                raise e
            else:
                time.sleep(20)  # Wait for a few seconds before retrying
    return []

In [None]:
def getAlbumTracks(album_id, retry_count=8):
    global data, access_token, expiration_date, sp
    for i in range(retry_count):
        try:
            endpoint = f"https://api.spotify.com/v1/albums/{album_id}/tracks"
            headers = {
                "Authorization": f"Bearer {access_token}"
            }
            limit = 50
            params = {
                "limit": limit
            }
            response = requests.get(endpoint, headers=headers, params=params)
            if response.status_code == 200:
                tracks = response.json()['items']
                to_return = []
                for track in tracks:
                    return_track = {
                        "name": track['name'],
                        "id": track['id'],
                        "duration_ms": track['duration_ms']
                    }
                    to_return.append(return_track)
                return to_return
            elif response.status_code == 401:
                data = st.start_session(SP_DC, SP_KEY)
                access_token = data[0]
                expiration_date = data[1]
                sp = Spotify(SP_DC)
                continue  # Retry the request with the new access token
            elif response.status_code == 403:
                break  # Break out of the loop and return an empty list
            elif response.status_code == 429 or response.status_code == 503:
                time.sleep(20)  # Wait for a few seconds before retrying
            else:
                raise Exception("Error getting tracks")
        except Exception as e:
            if i == retry_count - 1:
                raise e
            else:
                time.sleep(20)  # Wait for a few seconds before retrying
    return []

In [None]:
def generateLyrics(name, limit = 50):
    artist_id = searchArtists(name)
    albums = getArtistsAlbums(artist_id, limit)
    to_return = []
    for album in albums:
        album_name = album['name']
        album_id = album['id']
        try:
            album_tracks = getAlbumTracks(album_id)
            for track in album_tracks:
                track_id = track['id']
                if track_id == None:
                    lyrics = []
                else:
                    lyrics = cleanLyrics(sp.get_lyrics(track_id))
                    to_return.append({
                        "artist": name,
                        "album": album_name,
                        "track": track['name'],
                        "track_id": track_id,
                        "lyrics": lyrics,
                        "duration_ms": track['duration_ms']
                    })
        except:
            pass
            
    return to_return

# First Pass

In [None]:
sample_artists = ["Taylor Swift", "CoCoMelon", "Keshi", "Conan Gray", "Slayer", "Black Sabbath", "Khalid", "Lana Del Ray", "IU", "YOASOBI", "LilyPichu", "League of Legends"]
top_x_artists = [
    "The Weeknd", "Taylor Swift", "Rihanna", "Ariana Grande", "Drake",
    "Kanye West", "Justin Bieber", "Dua Lipa", "Coldplay", "Bruno Mars",
    "Beyoncé", "SZA", "David Guetta", "Ed Sheeran", "Eminem", "Bad Bunny",
    "Miley Cyrus", "Marshmello", "Travis Scott", "Calvin Harris", "Billie Eilish",
    "Doja Cat", "21 Savage", "Maroon 5", "Shakira", "Imagine Dragons",
    "Tate McRae", "Post Malone", "Lady Gaga", "Katy Perry", "Olivia Rodrigo",
    "Lana Del Rey", "Harry Styles", "Adele", "Peso Pluma", "Playboi Carti",
    "KAROL G", "Nicki Minaj", "Sia", "Tiësto", "Benson Boone", "J Balvin",
    "Kendrick Lamar", "Ty Dolla $ign", "Queen", "Arctic Monkeys", "Future",
    "Metro Boomin", "Elton John", "Khalid", "Sam Smith", "Daddy Yankee",
    "Selena Gomez", "Chris Brown", "Jack Harlow", "OneRepublic", "USHER",
    "Justin Timberlake", "Pitbull", "Feid", "Shawn Mendes", "Madonna",
    "Myke Towers", "One Direction", "Halsey", "Kali Uchis", "Linkin Park",
    "J. Cole", "Maluma", "Rauw Alejandro", "Black Eyed Peas", "The Chainsmokers",
    "Ozuna", "Bebe Rexha", "Michael Jackson", "Camila Cabello", "Bizarrap",
    "James Arthur", "Teddy Swims", "Swae Lee", "Avicii", "Lil Wayne",
    "Ellie Goulding", "Jason Derulo", "Arijit Singh", "JAY-Z", "Mitski",
    "XXXTENTACION", "Britney Spears", "P!nk", "Charlie Puth", "Manuel Turizo",
    "Noah Kahan", "The Neighbourhood", "Pharrell Williams", "Lewis Capaldi",
    "Ava Max", "50 Cent", "Pritam", "CoCoMelon", "Keshi", "Conan Gray", "Slayer",
    "Black Sabbath", "LilyPichu", "League of Legends", "Billy Joel", "AC/DC", "Demi Lovato",
    "Twenty One Pilots", "Panic! At The Disco", "5 Seconds of Summer", "Fall Out Boy",
]


In [None]:
top_x = []
for artists in top_x_artists:
    # generate lyrics also runs through all the other functions, so almost every song by the artist is added to the list
    top_x += generateLyrics(artists)

df = pd.DataFrame(top_x)
df.to_csv("top_x.csv", index=True)

In [None]:
# checkpoint because crawling takes days
df = pd.read_csv("top_x.csv")

In [None]:
df

## Re-running to catch missing lyrics

In [None]:
# show just the rows where lyrics are None or NaN
lyricalest_df = df[df['lyrics'].isnull()].copy()

In [None]:
# remove all rows where "Soundtrack" is in the album name
# soundtracks probably won't have lyrics
lyricalest_df = lyricalest_df[lyricalest_df['album'].apply(lambda x: "Soundtrack" not in x)]
lyricalest_df.reset_index(drop=True, inplace=True)
lyricalest_df

In [None]:
# loop through lyrical_df and get the lyrics for each track
data = st.start_session(SP_DC, SP_KEY)
access_token = data[0]
expiration_date = data[1]
sp = Spotify(SP_DC)

new = []

for i, row in lyricalest_df.iterrows():
    # prevent the access token from expiring
    if i % 1000 == 0:
        data = st.start_session(SP_DC, SP_KEY)
        access_token = data[0]
        expiration_date = data[1]
        sp = Spotify(SP_DC)
    
    # to check progress
    if i % 100 == 0 and i != 0:
        print(new[i-1])
        print(i)
    
    time.sleep(0.5)

    artist = row['artist']
    album = row['album']
    track = row['track']
    track_id = row['track_id']
    lyrics = cleanLyrics(sp.get_lyrics(track_id))
    new.append({
        "artist": artist,
        "album": album,
        "track": track,
        "track_id": track_id,
        "lyrics": lyrics
    })


## Merging with first pass

In [None]:
lyricalest_df = pd.DataFrame(new)
lyricalest_df.to_csv("top_x_additional.csv", index=True)

In [None]:
# checkpoint two
lyricalest_df = pd.read_csv("top_x_additional.csv")

In [None]:
# remove duplicates and NaNs
df = df.drop_duplicates(subset=['lyrics'])
df = df.dropna(subset=['lyrics'])
df = df.reset_index(drop=True)

In [None]:
clean_df = pd.concat([df, lyricalest_df], ignore_index=True)

In [None]:
clean_df = clean_df.drop(columns=['Unnamed: 0'])
clean_df = clean_df.drop_duplicates(subset=['lyrics'])
clean_df = clean_df.dropna(subset=['lyrics'])
clean_df = clean_df.reset_index(drop=True)

clean_df.to_csv("top_x_cleaned.csv", index=True)

# flatten lyrics
# lyrics were initially stored as strings, so we need to convert them back to lists
import ast

clean_df['lyrics'] = clean_df['lyrics'].apply(lambda x: " ".join(ast.literal_eval(x)))

In [None]:
# checkpoint 3
clean_df = pd.read_csv("top_x_cleaned.csv")
clean_df

# Crawling and adding genre information

In [None]:
# find unique artists
unique_artists = clean_df['artist'].unique()
artist_genre = {}
for artist in unique_artists:
    id = searchArtists(artist)
    endpoint = f"https://api.spotify.com/v1/artists/{id}"
    headers = {
        "Authorization": f"Bearer {access_token}"
    }

    response = requests.get(endpoint, headers=headers)
    if response.status_code == 200:
        genres = response.json()['genres']
    else:
        genres = None

    artist_genre[artist] = genres

In [None]:
clean_df['genre'] = clean_df['artist'].apply(lambda x: artist_genre[x])

In [None]:
clean_df.to_csv("finalfinalfinalfinalfinal.csv", index=True)

# Recleaning

In [None]:
# checkpoint 4
df = pd.read_csv("finalfinalfinalfinalfinal.csv")

## Removing Non English

In [None]:
df = df[df['lyrics'].str.contains(r'^[a-zA-Z0-9 !@#$%^&*()_+-=,♫♪\'\"?—{}–\[\]ㅤ]*$')]

In [None]:
import requests

def getArtistFromTrackId(id):
    endpoint = f"https://api.spotify.com/v1/tracks/{id}"
    headers = {
        "Authorization": f"Bearer {access_token}"
    }
    
    response = requests.get(endpoint, headers=headers)
    
    if response.status_code != 200:
        raise Exception(f"Request failed with status {response.status_code}")
    
    track_info = response.json()
    artist_names = []
    for name in track_info['artists']:
        artist_names.append(name['name'])
    
    return artist_names

def getReleaseDateFromId(id):
    endpoint = f"https://api.spotify.com/v1/tracks/{id}"
    headers = {
        "Authorization": f"Bearer {access_token}"
    }

    response = requests.get(endpoint, headers=headers)

    if response.status_code != 200:
        raise Exception(f"Request failed with status {response.status_code}")
    
    track_info = response.json()
    release_date = track_info['album']['release_date']

    return release_date

## Allowing songs to have multiple artists

In [None]:
df['artists'] = None
for i, row in df.iterrows():
    if i % 100 == 0:
        print(i)
    success = False
    while not success:
        try:
            artists = getArtistFromTrackId(row['track_id'])
            df.at[i, 'artists'] = artists
            success = True
        except:
            data = st.start_session(SP_DC, SP_KEY)
            access_token = data[0]
            expiration_date = data[1]
            sp = Spotify(SP_DC)
            time.sleep(20)


In [None]:
df.drop(columns=['artist'], inplace=True)
df

In [None]:
# checkpoint 5
df.to_csv("multiartists.csv", index=True)

## Adding additional information for Frontend

---
URL is to allow users to get to the song on Spotify.
Image is to allow users to see the album art.

In [5]:
df = pd.read_csv("multiartists.csv")

In [6]:
def getMisc(id):
    endpoint = f"https://api.spotify.com/v1/tracks/{id}"
    headers = {
        "Authorization": f"Bearer {access_token}"
    }
    response = requests.get(endpoint, headers=headers)
    data = response.json()
    url = data["external_urls"]["spotify"]
    name = data["name"]
    preview_url = data["preview_url"]
    image = data["album"]["images"][0]["url"]
    return url, name, preview_url, image


In [7]:
df['url'] = None
df['name'] = None
df['preview_url'] = None
df['image'] = None
for i, row in df.iterrows():
    if i % 100 == 0:
        print(i)
    success = False
    while not success:
        try:
            url, name, preview, image = getMisc(row['track_id'])
            df.at[i, "url"] = url
            df.at[i, "name"] = name
            df.at[i, "preview_url"] = preview
            df.at[i, "image"] = image
            success = True
        except:
            data = st.start_session(SP_DC, SP_KEY)
            access_token = data[0]
            expiration_date = data[1]
            sp = Spotify(SP_DC)
            time.sleep(20)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
18

In [13]:
df.to_csv("additional_info.csv")

## Adding release date information

In [None]:
df['release_date'] = None
for i, row in df.iterrows():
    if i % 100 == 0:
        print(i)
    success = False
    while not success:
        try:
            release_date = getReleaseDateFromId(row['track_id'])
            df.at[i, 'release_date'] = release_date
            success = True
        except:
            data = st.start_session(SP_DC, SP_KEY)
            access_token = data[0]
            expiration_date = data[1]
            sp = Spotify(SP_DC)
            time.sleep(20)

## Adding explicit flags

In [None]:
def isExplicit(id):
    endpoint = f"https://api.spotify.com/v1/tracks/{id}"
    headers = {
        "Authorization": f"Bearer {access_token}"
    }

    response = requests.get(endpoint, headers=headers)

    if response.status_code != 200:
        raise Exception(f"Request failed with status {response.status_code}")
    
    track_info = response.json()
    explicit = track_info['explicit']

    return explicit
        

In [None]:
df["explicit"] = None
for i, row in df.iterrows():
    if i % 100 == 0:
        print(i)
    success = False
    while not success:
        try:
            explicit = isExplicit(row['track_id'])
            df.at[i, 'explicit'] = explicit
            success = True
        except:
            data = st.start_session(SP_DC, SP_KEY)
            access_token = data[0]
            expiration_date = data[1]
            sp = Spotify(SP_DC)
            time.sleep(20)

## Removing remixes and re-releases
---
Example, Taylor Swift's original songs, vs Taylor's Version

In [45]:
# remove songs where "remix" is in track
df = df[~df['track'].apply(lambda x: "Remix" in x)]

# create new column called normalized_lyrics
# this column will contain the lyrics with all non-alphanumeric characters removed
import re

df['normalized_lyrics'] = df['lyrics'].apply(lambda x: re.sub(r'[^a-zA-Z0-9 ]', '', x))
df['normalized_lyrics'] = df['normalized_lyrics'].apply(lambda x: x.lower())
# remove all spaces
df['normalized_lyrics'] = df['normalized_lyrics'].apply(lambda x: x.replace(" ", ""))


# # remove songs where normalized_lyrics is duped
df = df.drop_duplicates(subset=['normalized_lyrics'])

# # remove normalized_lyrics column
df.drop(columns=['normalized_lyrics'], inplace=True)

df.to_csv("final_cleaned.csv", index=False)

In [46]:
df

Unnamed: 0,album,track,track_id,lyrics,duration,genre,release_date,explicit,artists,url,name,preview_url,image
0,The Highlights (Deluxe),Die For You,2vz6HIZBaQOnWCH7kKhKQH,I'm findin' ways to articulate The feeling I'm...,4 minutes 20 seconds,"['canadian contemporary r&b', 'canadian pop', ...",2024-02-09,False,['The Weeknd'],https://open.spotify.com/track/2vz6HIZBaQOnWCH...,Die For You,https://p.scdn.co/mp3-preview/e16852119b0d41ef...,https://i.scdn.co/image/ab67616d0000b273c87bfe...
1,The Highlights (Deluxe),Starboy (feat. Daft Punk),218WdV0d4ijtTtPTKGuf1E,"I'm tryna put you in the worst mood, ah P1 cle...",3 minutes 50 seconds,"['canadian contemporary r&b', 'canadian pop', ...",2024-02-09,True,"['The Weeknd', 'Daft Punk']",https://open.spotify.com/track/218WdV0d4ijtTtP...,Starboy (feat. Daft Punk),https://p.scdn.co/mp3-preview/508e00c3470094fe...,https://i.scdn.co/image/ab67616d0000b273c87bfe...
2,The Highlights (Deluxe),Save Your Tears,6YckHetPOkzxtXXaYx0Gt1,"Ooh Na-na, yeah I saw you dancing in a crowded...",3 minutes 35 seconds,"['canadian contemporary r&b', 'canadian pop', ...",2024-02-09,True,['The Weeknd'],https://open.spotify.com/track/6YckHetPOkzxtXX...,Save Your Tears,https://p.scdn.co/mp3-preview/59b0119674967b35...,https://i.scdn.co/image/ab67616d0000b273c87bfe...
3,The Highlights (Deluxe),Blinding Lights,04948IGlqY1vSh7AHbueiQ,Yeah ♪ I've been tryna call I've been on my ow...,3 minutes 20 seconds,"['canadian contemporary r&b', 'canadian pop', ...",2024-02-09,False,['The Weeknd'],https://open.spotify.com/track/04948IGlqY1vSh7...,Blinding Lights,https://p.scdn.co/mp3-preview/deb0fb99d88264e4...,https://i.scdn.co/image/ab67616d0000b273c87bfe...
4,The Highlights (Deluxe),In Your Eyes,4SD0V2HMxkBupk6ml9alm4,"Oh, yeah I just pretend, uh That I'm in the da...",3 minutes 57 seconds,"['canadian contemporary r&b', 'canadian pop', ...",2024-02-09,True,['The Weeknd'],https://open.spotify.com/track/4SD0V2HMxkBupk6...,In Your Eyes,https://p.scdn.co/mp3-preview/88e6d52b150be7e4...,https://i.scdn.co/image/ab67616d0000b273c87bfe...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20792,America's Suitehearts,America's Suitehearts - Acoustic,0zRX1ItX2sh7PJtKuvOuO5,You could have knocked me out with a feather I...,3 minutes 40 seconds,"['emo', 'modern rock', 'pop', 'rock']",2008-01-01,False,['Fall Out Boy'],https://open.spotify.com/track/0zRX1ItX2sh7PJt...,America's Suitehearts - Acoustic,https://p.scdn.co/mp3-preview/0f7b111fe42cea99...,https://i.scdn.co/image/ab67616d0000b2738d11ff...
20794,So Sick,So Sick - (BBC Radio 1 - Live Lounge),0Csc6pK1zv3k071tSWqJ5D,Gotta change my answering machine Now that I'm...,2 minutes 54 seconds,"['emo', 'modern rock', 'pop', 'rock']",2007-01-01,False,['Fall Out Boy'],https://open.spotify.com/track/0Csc6pK1zv3k071...,So Sick - (BBC Radio 1 - Live Lounge),https://p.scdn.co/mp3-preview/8c08158f0d9b5946...,https://i.scdn.co/image/ab67616d0000b273bbb2c5...
20795,I'm Like A Lawyer With The Way I'm Always Tryi...,I'm Like A Lawyer With The Way I'm Always Tryi...,7eOlcfmf0J6rvWQrBVhorO,Last year's wishes are this year's apologies E...,3 minutes 36 seconds,"['emo', 'modern rock', 'pop', 'rock']",2007-01-01,False,['Fall Out Boy'],https://open.spotify.com/track/7eOlcfmf0J6rvWQ...,I'm Like A Lawyer With The Way I'm Always Tryi...,https://p.scdn.co/mp3-preview/3e26c8d0d3e82e13...,https://i.scdn.co/image/ab67616d0000b2737eb920...
20796,I'm Like A Lawyer With The Way I'm Always Tryi...,Golden - Live From Hammersmith Palais,2nzAvHE8a63oX0PQTMq2UZ,How cruel is the golden rule? When the lives w...,2 minutes 35 seconds,"['emo', 'modern rock', 'pop', 'rock']",2007-01-01,False,['Fall Out Boy'],https://open.spotify.com/track/2nzAvHE8a63oX0P...,Golden - Live From Hammersmith Palais,https://p.scdn.co/mp3-preview/c1c2a7daaa3b4558...,https://i.scdn.co/image/ab67616d0000b2737eb920...


O(n^2) if I were to use difflib

In [32]:
taylor_swift = df[df['artists'].str.contains("Taylor Swift")]

In [41]:
import difflib

# create a copy of the DataFrame to avoid modifying the original DataFrame
df_no_duplicates = df.copy()

# iterate over all pairs of songs
for i in range(len(df_no_duplicates)):
    for j in range(i + 1, len(df_no_duplicates)):
        # get the lyrics of the two songs
        lyrics_i = df_no_duplicates.iloc[i]['normalized_lyrics']
        lyrics_j = df_no_duplicates.iloc[j]['normalized_lyrics']

        # calculate the similarity ratio
        similarity_ratio = difflib.SequenceMatcher(None, lyrics_i, lyrics_j).ratio()

        # if the songs are more than 90% similar, consider them duplicates
        if similarity_ratio > 0.9:
            # drop the second song
            df_no_duplicates = df_no_duplicates.drop(df_no_duplicates.index[j])

# reset the index of the DataFrame
df_no_duplicates = df_no_duplicates.reset_index(drop=True)

IndexError: single positional indexer is out-of-bounds

In [44]:
import difflib
import time

# create a copy of the DataFrame to avoid modifying the original DataFrame
taylor_swift_no_duplicates = taylor_swift.copy()

start_time = time.time()

# number of comparisons
num_comparisons = 0

# list to hold the indices of the rows to be dropped
drop_indices = []

# iterate over all pairs of songs
for i in range(len(taylor_swift_no_duplicates)):
    for j in range(i + 1, len(taylor_swift_no_duplicates)):
        # get the lyrics of the two songs
        lyrics_i = taylor_swift_no_duplicates.iloc[i]['normalized_lyrics']
        lyrics_j = taylor_swift_no_duplicates.iloc[j]['normalized_lyrics']

        # calculate the similarity ratio
        similarity_ratio = difflib.SequenceMatcher(None, lyrics_i, lyrics_j).ratio()

        # if the songs are more than 90% similar, consider them duplicates
        if similarity_ratio > 0.9:
            # add the index of the second song to the list of indices to be dropped
            drop_indices.append(j)

        num_comparisons += 1

# drop the duplicates
taylor_swift_no_duplicates = taylor_swift_no_duplicates.drop(drop_indices)

# reset the index of the DataFrame
taylor_swift_no_duplicates = taylor_swift_no_duplicates.reset_index(drop=True)

end_time = time.time()

total_time = end_time - start_time
average_time = total_time / num_comparisons

print(f"Total time taken: {total_time} seconds")
print(f"Average time per comparison: {average_time} seconds")

KeyError: '[63, 322, 321, 324, 74, 84, 92, 100, 106, 233, 331, 323, 328, 105, 322, 75, 305, 313, 78, 92, 91, 98, 232, 306, 303, 95, 97, 232, 306, 112, 271, 322, 305, 313, 321, 324, 302, 308, 309, 310, 329, 330, 306, 325, 327, 273, 277, 278, 279, 280, 281, 283, 284, 313, 324, 330] not found in axis'