In [1]:
import os
import re
from time import sleep

import lyricsgenius
import pandas as pd
import spotipy
from langdetect import detect
from spotipy.oauth2 import SpotifyClientCredentials

# Don't reload the file! 

# Data scraping, cleaning and preparing the data set

### **1.** We will start by collecting artist names and track names from Spotify with the spotipy library and, based on them, we will add the genres.

Spotify has an extensive list of genres (126); we will select only the main popular genres.*

 
 * pop
 
 * rock
 
 * metal
 
 * hip-hop
 
 * r&b
 
 * country
 
*classification of popular genres : [Wikipedia link](https://en.wikipedia.org/wiki/List_of_popular_music_genres)



In [2]:
# Read the Spotify credentials from the environment so that real secrets never
# have to be written into the notebook; the original placeholder strings are
# kept as fallbacks so the cell behaves as before when the variables are unset.
client_id = os.environ.get("SPOTIPY_CLIENT_ID", "client_id")
client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET", "client secret")
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
# Shared Spotify API client used by every request cell below.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [3]:
# Ask Spotify for its recommendation genre seeds and keep the list stored under 'genres'.
all_genres = sp.recommendation_genre_seeds()['genres']

In [4]:
# Pop seeds that are excluded from the pop group because of their lyrics
# (Asian pop genres).
excl_pop_list = [
    'cantopop',
    'mandopop',
    'pop-film',
    'k-pop',
]

# Seeds that are counted as r&b.
r_n_b_list = [
    'r-n-b',
    'blues',
    'soul',
    'disco',
]

# Seeds that are counted as hip-hop/rap.
hip_hop_list = [
    "hip-hop",
    'trip-hop',
    'hardcore',
]

In [5]:
def group_genres(x, excl_pop=None, hip_hop=None, r_n_b=None):
    """Bucket genre seed names into the six main genres used in this notebook.

    Parameters
    ----------
    x : iterable of str
        Genre seed names to classify. (The previous version ignored this
        argument and always read the global ``all_genres``.)
    excl_pop : collection of str, optional
        Seeds containing "pop" that must be left out. Defaults to the
        module-level ``excl_pop_list``.
    hip_hop : collection of str, optional
        Seeds counted as hip-hop/rap. Defaults to the module-level
        ``hip_hop_list``.
    r_n_b : collection of str, optional
        Seeds counted as r&b. Defaults to the module-level ``r_n_b_list``.

    Returns
    -------
    dict
        Maps 'pop', 'hip-hop/rap', 'rock', 'country', 'r&b' and 'metal' to the
        list of matching seeds, in input order. Seeds matching no rule are
        dropped.
    """
    if excl_pop is None:
        excl_pop = excl_pop_list
    if hip_hop is None:
        hip_hop = hip_hop_list
    if r_n_b is None:
        r_n_b = r_n_b_list

    grouped_genres = {'pop': [], 'hip-hop/rap' : [], 'rock' : [], 'country': [], 'r&b': [], 'metal' : []}
    for seed in x:
        # The order of the tests matters: the first matching rule wins.
        if "pop" in seed:
            # An excluded pop seed is dropped entirely; it is not re-tested
            # against the remaining rules.
            if seed not in excl_pop:
                grouped_genres['pop'].append(seed)
        elif 'rock' in seed:
            grouped_genres['rock'].append(seed)
        elif "country" in seed or 'bluegrass' in seed:
            grouped_genres['country'].append(seed)
        elif seed in hip_hop:
            grouped_genres['hip-hop/rap'].append(seed)
        elif seed in r_n_b:
            grouped_genres['r&b'].append(seed)
        elif 'metal' in seed:
            grouped_genres['metal'].append(seed)
    return grouped_genres

In [6]:
# Group the fetched genre seeds under the six main genres.
grouped_genres = group_genres(all_genres)

In [7]:
# Display the grouped genre seeds to check the result.
grouped_genres

{'pop': ['indie-pop', 'j-pop', 'pop', 'power-pop', 'synth-pop'],
 'hip-hop/rap': ['hardcore', 'hip-hop', 'trip-hop'],
 'rock': ['alt-rock',
  'hard-rock',
  'j-rock',
  'psych-rock',
  'punk-rock',
  'rock',
  'rock-n-roll',
  'rockabilly'],
 'country': ['bluegrass', 'country'],
 'r&b': ['blues', 'disco', 'r-n-b', 'soul'],
 'metal': ['black-metal',
  'death-metal',
  'heavy-metal',
  'metal',
  'metal-misc',
  'metalcore']}

In [8]:

# Making API requests - go through the selected genres and ask the Spotify
# search API for tracks; for each returned track keep the artist name, track
# name, track id, artist/track URLs and the main genre it was requested under.

artist_name = []
track_name = []
genre_name = []
track_id = []
artist_href = []
track_href = []
#album_image = []

for genre, subgenre in grouped_genres.items():
    for seed in subgenre:
        query = 'genre:' + str(seed)
        # 20 pages of 50 results each: offsets 0, 50, ..., 950.
        for offset in range(0, 1000, 50):
            page = sp.search(q=query, type='track', limit=50, offset=offset)
            for track in page['tracks']['items']:
                # Only the first listed artist of a track is recorded.
                lead_artist = track['artists'][0]
                artist_name.append(lead_artist['name'])
                track_name.append(track['name'])
                track_id.append(track['id'])
                artist_href.append(lead_artist['external_urls']['spotify'])
                track_href.append(track['external_urls']['spotify'])
                #album_image.append(track['album']['images'][0]['url'])
                genre_name.append(genre)

spotify_dataframe = pd.DataFrame({
    'artist_name': artist_name,
    'track_name': track_name,
    'track_id': track_id,
    'artist_href': artist_href,
    'track_href': track_href,
    'genre_name': genre_name,
})
print(spotify_dataframe.shape)

(27000, 6)


In [9]:
# Number of collected songs per main genre.
spotify_dataframe.groupby('genre_name').size()

genre_name
country        2000
hip-hop/rap    3000
metal          5000
pop            5000
r&b            4000
rock           8000
dtype: int64

In [10]:
# Data Exploration

# Count the (artist, track) pairs that occur more than once.
grouped = spotify_dataframe.groupby(['artist_name', 'track_name']).size()
repeated_pairs = grouped > 1
grouped[repeated_pairs].count()

4627

In [11]:
# Data Cleanup - keep only the first row of every (artist_name, track_name) pair.
spotify_dataframe = spotify_dataframe.drop_duplicates(
    subset=['artist_name', 'track_name']
)

In [12]:

# Data Exploration

# Verify that no (artist, track) pair occurs more than once any more.
grouped_after_dropping = spotify_dataframe.groupby(['artist_name', 'track_name']).size()
still_repeated = grouped_after_dropping > 1
grouped_after_dropping[still_repeated].count()

0

In [13]:
# Songs per main genre after the duplicates were dropped.
spotify_dataframe.groupby('genre_name').size()

genre_name
country        1687
hip-hop/rap    2665
metal          3752
pop            4415
r&b            2290
rock           5360
dtype: int64

### **2.** Audio features of the songs

In [14]:
# Making API requests - ask Spotify for the audio features of every collected
# track, `batchsize` track ids per call.

rows = []
batchsize = 100

# Work on a plain list: the frame's index has gaps after the duplicate rows
# were dropped, so positional slicing of the list is unambiguous.
track_ids = spotify_dataframe['track_id'].tolist()

for start in range(0, len(track_ids), batchsize):
    batch = track_ids[start:start + batchsize]
    feature_results = sp.audio_features(batch)
    # The Spotify endpoint is documented to return a null entry for ids it has
    # no features for; skip those so every element of `rows` is a feature dict.
    rows.extend(features for features in feature_results if features is not None)


In [15]:
# Data Exploration

# One row per audio-feature record, one column per feature key.
audio_features_df = pd.DataFrame(rows)
print("Shape of the dataset:", audio_features_df.shape)

Shape of the dataset: (20169, 18)


In [16]:

# Data Cleanup - drop the columns that are not needed and rename 'id' to
# 'track_id' so it matches the key column of spotify_dataframe.
columns_to_drop = ['analysis_url', 'type', 'uri', 'track_href', 'time_signature']
audio_features_df = (
    audio_features_df
    .drop(columns=columns_to_drop)
    .rename(columns={'id': 'track_id'})
)
audio_features_df.shape

(20169, 13)

In [17]:
# Data Cleanup - inner merge on track_id, so only tracks present in both
# frames are kept.
audio_ft_df = spotify_dataframe.merge(audio_features_df, on='track_id', how='inner')
audio_ft_df.head()

Unnamed: 0,artist_name,track_name,track_id,artist_href,track_href,genre_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,Glass Animals,Heat Waves,3USxtqRwSYz57Ewm6wWRMp,https://open.spotify.com/artist/4yvcSjfu4PC0CY...,https://open.spotify.com/track/3USxtqRwSYz57Ew...,pop,0.761,0.525,11,-6.9,1,0.0944,0.44,7e-06,0.0921,0.531,80.87,238805
1,Måneskin,Beggin',3Wrjm47oTz2sjIgck11l5e,https://open.spotify.com/artist/0lAWpj5szCSwM4...,https://open.spotify.com/track/3Wrjm47oTz2sjIg...,pop,0.714,0.8,11,-4.808,0,0.0504,0.127,0.0,0.359,0.589,134.002,211560
2,girl in red,we fell in love in october,1BYZxKSf0aTxp8ZFoeyM3d,https://open.spotify.com/artist/3uwAm6vQy7kWPS...,https://open.spotify.com/track/1BYZxKSf0aTxp8Z...,pop,0.566,0.366,7,-12.808,1,0.028,0.113,0.181,0.155,0.237,129.959,184154
3,Joy Again,Looking Out for You,3jfZ9M23l0L7RxzYMTgBTv,https://open.spotify.com/artist/4jCIAMb0zEArF3...,https://open.spotify.com/track/3jfZ9M23l0L7Rxz...,pop,0.682,0.9,9,-5.028,1,0.0874,0.0884,0.063,0.0685,0.937,100.721,179499
4,Conan Gray,Heather,4xqrdfXkTW4T0RauPLv3WA,https://open.spotify.com/artist/4Uc8Dsxct0oMqx...,https://open.spotify.com/track/4xqrdfXkTW4T0Ra...,pop,0.357,0.425,5,-7.301,1,0.0333,0.584,0.0,0.322,0.27,102.078,198040


In [18]:
# Data Cleanup - make track and artist names lower case.
# The vectorised .str accessor leaves missing values as missing instead of
# raising on them, which the previous per-row lambda did.
audio_ft_df['track_name'] = audio_ft_df['track_name'].str.lower()
audio_ft_df['artist_name'] = audio_ft_df['artist_name'].str.lower()

In [19]:
# Names that differed only by letter case are now identical; keep the first
# row of every (artist_name, track_name) pair.
audio_ft_df = audio_ft_df.drop_duplicates(subset=['artist_name', 'track_name'])

source code :[github link](https://github.com/etarakci/music-genre-prediction/blob/master/jupyter_notebooks/Spotify%20API.ipynb)

### **3.** Lyrics of the songs

We will scrape lyrics with lyricsgenius library based on the artist name and track name.

To make it easier to balance the dataset later on (i.e. to have the same number of songs for each genre), we will split the data into a separate dataframe for each music genre.

In [20]:
# Add a 'lyrics' column holding the empty string in every row.
audio_ft_df['lyrics'] = ''

In [21]:
# Read the Genius API token from the environment so that the real key never has
# to be written into the notebook; the original placeholder stays the fallback.
genius = lyricsgenius.Genius(os.environ.get('GENIUS_ACCESS_TOKEN', 'key'))
# set params
genius.verbose = False # Turn off status messages
genius.remove_section_headers = True # Remove section headers (e.g. [Chorus]) from lyrics when searching
genius.skip_non_songs = False # Include hits thought to be non-songs (e.g. track lists)

In [25]:
# Take the last 1300 rows of every genre. Each slice is copied because
# add_lyrics writes into these frames; writing into an un-copied slice of
# audio_ft_df triggers pandas' SettingWithCopyWarning.
rock_df = audio_ft_df[audio_ft_df.genre_name == "rock"].tail(1300).copy()
country_df = audio_ft_df[audio_ft_df.genre_name == "country"].tail(1300).copy()
pop_df = audio_ft_df[audio_ft_df.genre_name == "pop"].tail(1300).copy()
r_n_b_df = audio_ft_df[audio_ft_df.genre_name == "r&b"].tail(1300).copy()
metal_df = audio_ft_df[audio_ft_df.genre_name == "metal"].tail(1300).copy()
hip_hop_df = audio_ft_df[audio_ft_df.genre_name == "hip-hop/rap"].tail(1300).copy()

In [26]:
def add_lyrics(df, client=None):
    """Fill ``df['lyrics']`` in place with lyrics looked up by track and artist.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``artist_name`` and ``track_name`` columns. Lyrics are
        written to the ``lyrics`` column of the same frame.
    client : object, optional
        Object exposing ``search_song(title, artist)`` whose result has a
        ``lyrics`` attribute (or is ``None`` when nothing was found).
        Defaults to the module-level ``genius`` client.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the result of the call can be assigned.
        (The previous version returned ``None``.)
    """
    if client is None:
        client = genius

    # Snapshot the rows first so writing into the frame cannot disturb the
    # iteration. Each row is addressed by its own index label, which avoids a
    # full-frame lookup per song and also fills rows whose (artist, track)
    # pair appears more than once.
    targets = list(zip(df.index, df.artist_name, df.track_name))
    for row, artist, track in targets:
        try:
            song = client.search_song(track, artist)
        except (AttributeError, ValueError):
            # Best effort, as before: these two error types skip the song.
            continue
        if song is None:
            # No match found - leave the lyrics cell untouched.
            continue
        df.at[row, 'lyrics'] = song.lyrics
    return df

In [29]:
# Look up lyrics for the rock tracks; add_lyrics writes them into rock_df['lyrics'] in place.
rock_lyrics = add_lyrics(rock_df)

In [30]:
# Look up lyrics for the country tracks; add_lyrics writes them into country_df['lyrics'] in place.
country_lyrics = add_lyrics(country_df)

In [33]:
# Look up lyrics for the pop tracks; add_lyrics writes them into pop_df['lyrics'] in place.
pop_lyrics = add_lyrics(pop_df)

In [35]:
# Look up lyrics for the r&b tracks; add_lyrics writes them into r_n_b_df['lyrics'] in place.
r_n_b_lyrics = add_lyrics(r_n_b_df)

In [37]:
# Look up lyrics for the metal tracks; add_lyrics writes them into metal_df['lyrics'] in place.
metal_lyrics = add_lyrics(metal_df)

In [38]:
# Look up lyrics for the hip-hop/rap tracks; add_lyrics writes them into hip_hop_df['lyrics'] in place.
hip_hop_lyrics = add_lyrics(hip_hop_df)

In [273]:
# Stack the six per-genre frames (original index labels are kept).
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
with_lyrics = pd.concat([rock_df, country_df, pop_df, hip_hop_df, r_n_b_df, metal_df])

In [274]:
# Count the rows per genre in the combined frame.
with_lyrics.genre_name.value_counts()

hip-hop/rap    1300
metal          1300
pop            1300
rock           1300
country        1300
r&b            1300
Name: genre_name, dtype: int64

In [275]:
# Count the rows whose lyrics are still the empty string (True) versus filled in (False).
with_lyrics.groupby(with_lyrics.lyrics == '').size()

lyrics
False    7016
True      784
dtype: int64

In [None]:
### remove empty rows from the dataframes

In [276]:
# Keep only the rows that received lyrics. The result is copied because
# clear_duplicates later edits this frame in place; writing into an un-copied
# filter result triggers pandas' SettingWithCopyWarning.
without_empty_lyrics = with_lyrics[with_lyrics.lyrics != ''].copy()

In [277]:
# Count the rows per genre that still have lyrics.
without_empty_lyrics.genre_name.value_counts()

metal          1220
r&b            1213
country        1210
pop            1176
rock           1155
hip-hop/rap    1042
Name: genre_name, dtype: int64

In [278]:
# Check that no row with empty lyrics is left (only the False group should appear).
without_empty_lyrics.groupby(without_empty_lyrics.lyrics == '').size()

lyrics
False    7016
dtype: int64

In [101]:
#cleaned_country_df = country_df[country_df.lyrics != '']

In [103]:
#cleaned_rock_df = rock_df[rock_df.lyrics != '']

In [104]:
#cleaned_pop_df = pop_df[pop_df.lyrics != '']

In [105]:
#cleaned_r_n_b_df = r_n_b_df[r_n_b_df.lyrics != '']

In [106]:
#cleaned_metal_df = metal_df[metal_df.lyrics != '']

In [107]:
#cleaned_hip_hop_df = hip_hop_df[hip_hop_df.lyrics != '']

Some songs appear several times as remixes, remasters, alternate versions, edits or live recordings.

In [279]:

def check_duplicates(key_word, dataset):
    """Report the tracks whose name contains ``key_word`` as a whole word.

    A track name is split on single spaces and matches when one of the
    resulting tokens equals ``key_word`` exactly. The number of matches is
    printed, and the matching track names are returned grouped by artist.

    Parameters
    ----------
    key_word : str
        Token to look for, e.g. 'remix'.
    dataset : pandas.DataFrame
        Frame with ``artist_name`` and ``track_name`` columns.

    Returns
    -------
    dict
        Artist name -> list of matching track names, in row order.
    """
    matches_by_artist = {}
    hits = 0
    for performer, title in zip(dataset.artist_name, dataset.track_name):
        if key_word not in title.split(' '):
            continue
        hits += 1
        matches_by_artist.setdefault(performer, []).append(title)
    print(f'{key_word} found {hits}')
    return matches_by_artist

In [280]:
def clear_duplicates(key_word, dataset):
    """Strip the variant suffix from track names that contain ``key_word``.

    A track name matches when splitting it on single spaces yields a token
    equal to ``key_word``. A matching name is replaced by the text before its
    first ' - ' separator; a matching name without that separator is left
    unchanged.

    The whole column is rewritten in a single pass. The previous version
    located every matching row again with a full-frame comparison and wrote
    through the index label, which is slow on large frames and unreliable
    when index labels repeat.

    Parameters
    ----------
    key_word : str
        Token to look for, e.g. 'remix'.
    dataset : pandas.DataFrame
        Frame with a ``track_name`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object that was passed in.
    """
    cleaned_names = [
        song.split(' - ')[0] if key_word in song.split(' ') else song
        for song in dataset['track_name']
    ]
    # Assigning the column on the passed-in frame keeps the in-place contract.
    dataset['track_name'] = cleaned_names
    return dataset

In [281]:
# List, per artist, the track names that contain the token 'remix'.
check_duplicates('remix', without_empty_lyrics)

remix found 61


{'paul anka': ['puppy love - remix'],
 'kane brown': ['memory - feather remix', 'memory - said the sky remix'],
 'brantley gilbert': ['bottoms up - remix'],
 'lillix': ['tomorrow - radio remix'],
 'weezer': ['africa - rac remix'],
 'tove lo': ['habits (stay high) - hippie sabotage remix',
  'cool girl - nora en pure remix',
  'talking body - gryffin remix',
  'talking body - kream remix',
  'talking body - the young professionals remix',
  'habits (stay high) - oliver nelson remix'],
 'great good fine ok': ["you're the one for me - digital farm animals remix"],
 'tove styrke': ['borderline - vanic remix'],
 'la roux': ['automatic driver - tyler, the creator remix',
  'in for the kill - skrillex remix',
  "in for the kill - skream's let's get ravey remix"],
 'gunship': ['tech noir - carpenter brut remix'],
 'a-ha': ['take on me - kygo remix'],
 'miike snow': ['animal - mark ronson remix',
  'genghis khan - louis the child remix',
  'silvia - roboberget remix'],
 'chvrches': ['clearest b

In [282]:
# Cut the ' - ...' suffix off track names containing 'remix'.
# NOTE: clear_duplicates edits its argument in place and returns it, so
# without_remixes and without_empty_lyrics refer to the same frame.
without_remixes = clear_duplicates('remix',without_empty_lyrics)

In [283]:
# List, per artist, the track names that contain the token 'remastered'.
check_duplicates('remastered', without_remixes)

remastered found 50


{'u2': ['with or without you - remastered'],
 'queen': ['fat bottomed girls - remastered 2011',
  'we are the champions - remastered 2011'],
 'little river band': ['reminiscing - remastered'],
 'elton john': ['goodbye yellow brick road - remastered 2014'],
 'paul mccartney': ['wonderful christmastime - edited version / remastered 2011'],
 'the clovers': ['love potion no. 9 - remastered / single version'],
 'bruce channel': ['hey! baby - remastered'],
 'bobby vee': ['take good care of my baby - 1990 remastered'],
 'fats domino': ["ain't that a shame? - remastered 2002",
  'blueberry hill - remastered 2002',
  'walking to new orleans - remastered',
  "i'm walkin' - remastered"],
 'ricky nelson': ['never be anyone else but you - remastered'],
 'elvis presley': ['blue suede shoes - remastered',
  'always on my mind - remastered',
  'fame and fortune - remastered'],
 'dion & the belmonts': ['i wonder why - remastered'],
 'johnny rivers': ['summer rain - remastered'],
 'neil sedaka': ['calen

In [284]:
# Cut the ' - ...' suffix off track names containing 'remastered' (in place, same frame returned).
without_remastered = clear_duplicates('remastered',without_remixes)

In [285]:
# List, per artist, the track names that contain the token 'remaster'.
check_duplicates('remaster', without_remastered)

remaster found 91


{'fleetwood mac': ['never going back again - 2004 remaster'],
 'david bowie': ['rebel rebel - 2016 remaster',
  'space oddity - 2015 remaster',
  'heroes - 2017 remaster'],
 'phil collins': ["you can't hurry love - 2016 remaster"],
 'neil young': ['heart of gold - 2009 remaster'],
 'led zeppelin': ['going to california - remaster'],
 'genesis': ["that's all - 2007 remaster"],
 'cliff richard': ['devil woman - 2001 remaster',
  'summer holiday - 2003 remaster',
  'ocean deep - 2002 remaster',
  'the young ones - 2005 remaster',
  'move it - 2002 remaster'],
 'the everly brothers': ["cathy's clown - 2007 remaster",
  'love hurts - 2007 remaster'],
 'the dave clark five': ['because - 2019 - remaster',
  'glad all over - 2019 - remaster'],
 'dr. feelgood': ['milk and alcohol - 2002 remaster'],
 'eagles': ['desperado - 2013 remaster'],
 'son volt': ['tear stained eye - 2015 remaster'],
 'the replacements': ['here comes a regular - 2008 remaster'],
 'the cars': ["since you're gone - 2017 rem

In [286]:
# Cut the ' - ...' suffix off track names containing 'remaster' (in place, same frame returned).
without_remaster = clear_duplicates('remaster', without_remastered)

In [287]:
# List, per artist, the track names that contain the token 'version'.
check_duplicates('version', without_remaster)

version found 100


{'santana': ['black magic woman - single version'],
 'chuck berry': ['run rudolph run - single version',
  'let it rock - single version',
  "you can't catch me - single version"],
 'the righteous brothers': ["you've lost that lovin' feelin' - single version"],
 'jerry lee lewis': ['great balls of fire - 1964 version',
  "whole lot of shakin' going on - single version"],
 'the bobby fuller four': ['i fought the law - single version'],
 'the searchers': ['love potion number nine - stereo version',
  'when you walk in the room - mono version',
  'sweets for my sweet - stereo version'],
 'the marcels': ['blue moon - 45 version'],
 'little eva': ['the loco-motion - single version'],
 'roger miller': ['little green apples - single version'],
 'bill haley & his comets': ['shake, rattle and roll - single version',
  "(we're gonna) rock around the clock - single version"],
 'the spencer davis group': ['keep on running - mono version'],
 'martha reeves & the vandellas': ['nowhere to run - singl

In [288]:
# Cut the ' - ...' suffix off track names containing 'version' (in place, same frame returned).
without_version = clear_duplicates('version', without_remaster)

In [289]:
# List, per artist, the track names that contain the token 'edit'.
check_duplicates('edit', without_version)

edit found 34


{'chris cagle': ['chicks dig it - single edit'],
 'cheap trick': ['the flame - single edit'],
 'paul young': ['every time you go away - radio edit'],
 'erasure': ["oh l'amour - edit"],
 'cutting crew': ["i've been in love before - edit"],
 'modern talking': ["brother louie mix '98 (feat. eric singleton) - radio edit"],
 'icona pop': ['all night - cash cash remix; radio edit'],
 'rick astley': ['cry for help - single edit'],
 'ladytron': ['ghosts - single edit'],
 'naked eyes': ['promises, promises - single edit'],
 'the stranglers': ['walk on by - radio edit'],
 'dead or alive': ['brand new lover - edit'],
 'everything but the girl': ['missing - todd terry club mix / us radio edit'],
 'faithless': ['insomnia - radio edit', 'we come 1 - radio edit'],
 'little dragon': ['lover chanting - edit'],
 'röyksopp': ['remind me - radio edit'],
 'bent': ['magic love - radio edit'],
 'dj shadow': ['scale it back - radio edit'],
 'groove armada': ['i see you baby (feat. gramma funk) - fatboy slim r

In [290]:
# Cut the ' - ...' suffix off track names containing 'edit' (in place, same frame returned).
without_edit = clear_duplicates('edit', without_version)

In [291]:
# List, per artist, the track names that contain the token 'mix'.
check_duplicates('mix', without_edit)

mix found 26


{'elvis presley': ['if i can dream - stereo mix'],
 'morgan wallen': ['heartless - wallen album mix'],
 'modern talking': ["brother louie mix '98 (feat. eric singleton)"],
 'howard jones': ['what is love? - extended mix'],
 'little dragon': ['the other lover - little dragon vocal celebration mix'],
 "'til tuesday": ['voices carry - single mix'],
 'tove lo': ['habits (stay high) - the chainsmokers extended mix'],
 'the stranglers': ['always the sun - sunny side up mix'],
 'wham!': ['last christmas - pudding mix'],
 'air': ['modular mix'],
 'the orb': ['little fluffy clouds - dance mix 2'],
 'propellerheads': ['history repeating - knee length mix'],
 'faithless': ['insomnia - monster mix'],
 'björk': ['i miss you (dobie rub part one) - sunshine mix'],
 'sneaker pimps': ['6 underground - the perfecto mix',
  'spin spin sugar - armands dark garage mix'],
 'everything but the girl': ['missing - todd terry club mix'],
 'lionel richie': ['the one - radio mix',
  'the one - the mix',
  "don't 

In [292]:
# Cut the ' - ...' suffix off track names containing 'mix' (in place, same frame returned).
without_mix = clear_duplicates('mix', without_edit)

In [293]:
# List, per artist, the track names that contain the token 'live'.
check_duplicates('live', without_mix)

live found 41


{'elvis presley': ["i just can't help believin' - live",
  "trying to get to you - second 'sit-down' show - live",
  "are you lonesome tonight? - first 'sit-down' show - live",
  'sweet caroline - live',
  'the wonder of you - live',
  'burning love - live'],
 'sam cooke': ['try a little tenderness / (i love you) for sentimental reasons / you send me - live',
  'this little light of mine - live'],
 'patsy cline': ['strange - live',
  "leavin' on your mind - live",
  'blue moon of kentucky - live',
  "you're stronger than me - live",
  "why can't he be you - live",
  'imagine that - live',
  'crazy - live',
  'a church, a courtroom, and then goodbye - live'],
 'jerry lee lewis': ['great balls of fire - live'],
 'trampled by turtles': ['brown-eyed women - live'],
 'jerry garcia band': ['i shall be released - live',
  'the way you do the things you do - live',
  'after midnight - live'],
 'the wood brothers': ['ophelia - live'],
 'elephant revival': ['have a cigar - live at red rocks'],
 

In [294]:
# Cut the ' - ...' suffix off track names containing 'live' (in place, same frame returned).
without_live = clear_duplicates('live', without_mix)

In [295]:
# With the suffixes stripped, variants of one song share the same name;
# drop the repeated (artist_name, track_name) rows.
without_duplicates = without_live.drop_duplicates(subset=['artist_name','track_name'])

In [296]:
# (rows, columns) of the de-duplicated frame.
without_duplicates.shape

(6878, 19)

In [297]:
# Re-run the 'remix' check on the de-duplicated frame to see which names still contain the token.
check_duplicates('remix', without_duplicates)

remix found 2


{'a$ap ferg': ['plain jane remix (feat. nicki minaj)'],
 'goldlink': ['crew remix (feat. gucci mane, brent faiyaz & shy glizzy)']}

In [298]:
### keep only English lyrics

def map_language(data, detector=None):
    """Return a copy of ``data`` with a ``Language`` column derived from ``lyrics``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``lyrics`` column. It is not modified.
    detector : callable, optional
        Function mapping one lyrics text to a language label. Defaults to
        ``langdetect.detect`` with the detector seeded, because langdetect
        documents its algorithm as non-deterministic unless a seed is set;
        without the seed a re-run can label the same text differently.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the extra ``Language`` column.
    """
    if detector is None:
        # Imported here so the seed is set right where the default detector
        # is chosen.
        from langdetect import DetectorFactory
        DetectorFactory.seed = 0
        detector = detect
    df = data.copy()
    df['Language'] = df['lyrics'].apply(detector)
    return df

In [422]:
# Build a copy of the de-duplicated frame with a 'Language' column detected from the lyrics.
mapped_df = map_language(without_duplicates)

In [423]:
# List the distinct language codes that were detected.
mapped_df.Language.unique()

array(['en', 'ca', 'pt', 'es', 'de', 'so', 'id', 'fr', 'ro', 'nl', 'ru',
       'tl', 'af', 'sk', 'no', 'sw', 'it', 'ja', 'fi', 'pl', 'cs'],
      dtype=object)

In [440]:
# Keep only the rows whose detected language is English.
english_lyrics = mapped_df.loc[mapped_df['Language'] == 'en']

In [441]:
# English-language songs per genre.
english_lyrics.groupby('genre_name').size()

genre_name
country        1205
hip-hop/rap     967
metal          1153
pop            1096
r&b             960
rock           1056
dtype: int64

In [442]:
# Drop the rows whose lyrics contain the text 'last updated:' (any letter case).
has_last_updated = english_lyrics['lyrics'].str.contains('last updated:', case=False)
final_english_lyrics = english_lyrics[~has_last_updated]

In [443]:
# Drop the rows whose lyrics contain the upper-case text 'CHAPTER'.
has_chapter = final_english_lyrics['lyrics'].str.contains('CHAPTER', case=True)
final = final_english_lyrics[~has_chapter]

In [444]:
# Songs per genre after the text filters.
final.groupby('genre_name').size()

genre_name
country        1196
hip-hop/rap     961
metal          1147
pop            1078
r&b             957
rock           1046
dtype: int64

In [445]:
# Balance the data set: keep the last 950 rows of every genre.
final_rock_df = final[final.genre_name == "rock"].tail(950)
final_country_df = final[final.genre_name == "country"].tail(950)
final_pop_df = final[final.genre_name == "pop"].tail(950)
final_r_n_b_df = final[final.genre_name == "r&b"].tail(950)
final_metal_df = final[final.genre_name == "metal"].tail(950)
final_hip_hop_df = final[final.genre_name == "hip-hop/rap"].tail(950)

In [446]:
# Stack the six balanced per-genre frames (original index labels are kept).
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
final_with_lyrics = pd.concat([
    final_rock_df,
    final_country_df,
    final_pop_df,
    final_hip_hop_df,
    final_r_n_b_df,
    final_metal_df,
])

In [447]:
# Songs per genre in the balanced data set.
final_with_lyrics.groupby('genre_name').size()

genre_name
country        950
hip-hop/rap    950
metal          950
pop            950
r&b            950
rock           950
dtype: int64

In [450]:
# Write the balanced data set to CSV without the index column and with a header row.
# NOTE(review): the target is an absolute, machine-specific path - consider a
# configurable relative location before sharing the notebook.
csv_path = r'C:\Users\PC\Desktop\Machine Learning\ML Project\final_with_lyrics.csv'
final_with_lyrics.to_csv(csv_path, index=False, header=True)

In [417]:
# Look up the index label of one specific metal track ('breaking the mirror - acoustic').
final_english_lyrics[(final_english_lyrics.genre_name == 'metal') & (final_english_lyrics.track_name == 'breaking the mirror - acoustic')].index

Int64Index([20152], dtype='int64')