# Data Gathering

## Credentials

In [1]:
pip install spotipy



In [2]:
import os

import pandas as pd
import numpy as np

import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth

# Prefer credentials supplied through the environment so real secrets never
# have to be typed into (and saved with) the notebook. The placeholders are
# kept as the fallback, so the cell behaves as before when the variables are unset.
CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID', 'ENTER YOUR CLIENT ID HERE')
CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET', 'ENTER YOUR CLIENT SECRET HERE')

# Client-credentials manager built from the ID/secret above; `sp` is the
# client object every later cell uses for its API calls.
client_credentials_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Fetching playlists

In [3]:
def get_user_playlist(username, playlist_id, sp):
    """Return every item of a playlist, fetched page by page.

    Parameters
    ----------
    username : str
        Passed through as the first argument of ``sp.user_playlist_tracks``.
    playlist_id : str
        Identifier of the playlist to read.
    sp : object
        Client exposing ``user_playlist_tracks(username, playlist_id, limit=, offset=)``
        whose result is a mapping with an ``'items'`` list.

    Returns
    -------
    list
        All items from every page, in the order they were returned.
    """
    page_size = 100
    position = 0
    collected = []

    # Request pages of `page_size` items until one comes back empty.
    page = sp.user_playlist_tracks(username, playlist_id, limit=page_size, offset=position)['items']
    while page:
        collected.extend(page)
        position += page_size
        page = sp.user_playlist_tracks(username, playlist_id, limit=page_size, offset=position)['items']

    return collected


def get_audio_features_batch(ids):
    """Fetch audio features for ``ids`` in chunks of 100 via the global ``sp`` client.

    Parameters
    ----------
    ids : list
        Track identifiers handed to ``sp.audio_features``.

    Returns
    -------
    list of dict
        One entry per result returned by ``sp.audio_features``. A falsy result
        is replaced by a placeholder dict in which every feature is 0.
    """
    feature_names = ('danceability', 'loudness', 'tempo', 'acousticness',
                     'energy', 'valence', 'liveness', 'speechiness')
    collected = []
    start = 0
    while start < len(ids):
        chunk = ids[start:start + 100]
        for entry in sp.audio_features(chunk):
            if entry:
                collected.append(entry)
            else:
                # No features came back for this track: fill in zeros instead.
                collected.append(dict.fromkeys(feature_names, 0))
        start += 100
    return collected

def create_dataframe(playlist_songs):
    """Build a per-track DataFrame (metadata plus audio features) from playlist items.

    Parameters
    ----------
    playlist_songs : list of dict
        Playlist items such as those returned by ``get_user_playlist``. A usable
        item carries a ``'track'`` dict (with ``'album'``, ``'name'``, ``'id'``,
        ``'explicit'``, ``'duration_ms'`` and ``'popularity'``) and normally an
        ``'added_by'`` dict.

    Returns
    -------
    pandas.DataFrame
        One row per usable track with the columns ``artist``, ``artistid``,
        ``albumid``, ``name``, ``id``, ``username``, ``explicit``, ``duration``,
        ``popularity`` followed by the eight audio-feature columns.

    Notes
    -----
    Items whose ``'track'`` is missing/None, or whose track has no ``'id'``, are
    skipped: reading their fields would raise, and without an id there is
    nothing to request audio features for.
    """
    # Keep only the items that actually carry a track with an id.
    songs = [song for song in playlist_songs
             if song.get('track') and song['track'].get('id')]
    tracks = [song['track'] for song in songs]

    playlist_df = pd.DataFrame()
    # Artist name/URL are taken from the first artist listed on the track's album.
    playlist_df['artist'] = np.array([track["album"]["artists"][0]["name"] for track in tracks])
    playlist_df['artistid'] = np.array([track["album"]["artists"][0]['external_urls']['spotify'] for track in tracks])
    playlist_df['albumid'] = np.array([track['album']['external_urls']['spotify'] for track in tracks])
    playlist_df['name'] = np.array([track['name'] for track in tracks])
    playlist_df['id'] = np.array([track['id'] for track in tracks])
    # `added_by` may be absent or None; record None for the user in that case.
    playlist_df['username'] = np.array([(song.get('added_by') or {}).get('id') for song in songs])
    playlist_df['explicit'] = np.array([track['explicit'] for track in tracks])
    playlist_df['duration'] = np.array([track['duration_ms'] for track in tracks])
    playlist_df['popularity'] = np.array([track['popularity'] for track in tracks])

    ids = playlist_df['id'].tolist()
    audio_analysis = get_audio_features_batch(ids)

    # One column per audio feature, in the same order as before.
    for feature in ('danceability', 'loudness', 'tempo', 'acousticness',
                    'energy', 'valence', 'liveness', 'speechiness'):
        playlist_df[feature] = np.array([audio_info[feature] for audio_info in audio_analysis])

    return playlist_df

In [4]:
# Ask the API for this user's playlists and print the ID and name of each one
playlists = sp.user_playlists('31dqplp75hzpns3zkvrs64lncguu')

playlist_items = playlists['items']
for playlist in playlist_items:
    print(f"Playlist ID: {playlist['id']}, Name: {playlist['name']}")

Playlist ID: 6X1S5Ap5sH7c2S6EUkvlE8, Name: playlist 3: g(old) maybe
Playlist ID: 0KunxfKbKPWcVF238jSHTR, Name: persian
Playlist ID: 3C7N8WPAxab3qWnNEPod7E, Name: playlist 2: hype maybe
Playlist ID: 4I7SVIKI1Mf777N1dRWDQb, Name: playlist 1: calm maybe


In [5]:
# Fetch this listener's four playlists, turn each into a DataFrame and stack them
Negin = 'MelodicWanderer'
negin_playlist_uris = (
    'spotify:playlist:4I7SVIKI1Mf777N1dRWDQb',
    'spotify:playlist:3C7N8WPAxab3qWnNEPod7E',
    'spotify:playlist:6X1S5Ap5sH7c2S6EUkvlE8',
    'spotify:playlist:0KunxfKbKPWcVF238jSHTR',
)

# All playlists are downloaded first, then converted one by one
n1, n2, n3, n4 = [get_user_playlist(Negin, uri, sp) for uri in negin_playlist_uris]
n_df1, n_df2, n_df3, n_df4 = [create_dataframe(items) for items in (n1, n2, n3, n4)]

# Every row is labelled with the same display name, replacing the raw user id
ndf_songs = pd.concat([n_df1, n_df2, n_df3, n_df4]).assign(username='MelodicWanderer')
print('MelodicWanderer ok')

MelodicWanderer ok


In [6]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
ndf_songs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 440 entries, 0 to 25
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   artist        440 non-null    object 
 1   artistid      440 non-null    object 
 2   albumid       440 non-null    object 
 3   name          440 non-null    object 
 4   id            440 non-null    object 
 5   username      440 non-null    object 
 6   explicit      440 non-null    bool   
 7   duration      440 non-null    int64  
 8   popularity    440 non-null    int64  
 9   danceability  440 non-null    float64
 10  loudness      440 non-null    float64
 11  tempo         440 non-null    float64
 12  acousticness  440 non-null    float64
 13  energy        440 non-null    float64
 14  valence       440 non-null    float64
 15  liveness      440 non-null    float64
 16  speechiness   440 non-null    float64
dtypes: bool(1), float64(8), int64(2), object(6)
memory usage: 58.9+ KB
None


In [7]:
# Preview the first rows of the combined MelodicWanderer songs table
ndf_songs.head()

Unnamed: 0,artist,artistid,albumid,name,id,username,explicit,duration,popularity,danceability,loudness,tempo,acousticness,energy,valence,liveness,speechiness
0,Damien Rice,https://open.spotify.com/artist/14r9dR01KeBLFf...,https://open.spotify.com/album/3ADELRoZ4I8WLE7...,The Blower's Daughter,5yyRH93h4Pm6tXXYTxt7ea,MelodicWanderer,False,286653,61,0.342,-13.569,133.113,0.134,0.2,0.0743,0.341,0.0324
1,Jay-Jay Johanson,https://open.spotify.com/artist/4hzC9WUUy3cFit...,https://open.spotify.com/album/6a2gOz878OZzAA1...,Only for You,2XZPIzXlCo9PJLS8SV7TYX,MelodicWanderer,False,230240,10,0.482,-12.924,122.593,0.852,0.259,0.401,0.105,0.0603
2,Lily Allen,https://open.spotify.com/artist/13saZpZnCDWOI9...,https://open.spotify.com/album/2BWTPrj9yBBC79f...,Somewhere Only We Know,3DDGhkxq3FeUorZppSgeLE,MelodicWanderer,False,208154,54,0.314,-10.191,172.021,0.935,0.286,0.278,0.0999,0.035
3,Jacob Lee,https://open.spotify.com/artist/4xPR9p75zzDlis...,https://open.spotify.com/album/1iOMcI8A0HvLrL2...,Reality,6RbNwr6JUXMEr45kJskQlo,MelodicWanderer,False,235194,33,0.597,-5.838,85.864,0.123,0.642,0.294,0.27,0.0285
4,Stephen Sanchez,https://open.spotify.com/artist/5XKFrudbV4IiuE...,https://open.spotify.com/album/6BUPtXbb2tspYnk...,Easy On My Eyes,2E0Lr1ecydv5MjTYYM0WhN,MelodicWanderer,False,238173,46,0.501,-13.042,122.334,0.948,0.105,0.203,0.0875,0.041


In [8]:
# Ask the API for this user's playlists and print the ID and name of each one
playlists = sp.user_playlists('31najszeulc42eapvkvi2wbrdq4e')

playlist_items = playlists['items']
for playlist in playlist_items:
    print(f"Playlist ID: {playlist['id']}, Name: {playlist['name']}")

Playlist ID: 0Qpv8d228v8brZPvJNGBT5, Name: …


In [9]:
# Download the friend's single playlist and label every row with their display name
friend = 'Icecreamy008'
f1 = get_user_playlist(username=friend, playlist_id='spotify:playlist:0Qpv8d228v8brZPvJNGBT5', sp=sp)
fdf_songs = create_dataframe(f1).assign(username='Icecreamy008')
print('Icecreamy008 ok')

Icecreamy008 ok


In [10]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
fdf_songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   artist        139 non-null    object 
 1   artistid      140 non-null    object 
 2   albumid       140 non-null    object 
 3   name          140 non-null    object 
 4   id            140 non-null    object 
 5   username      140 non-null    object 
 6   explicit      140 non-null    bool   
 7   duration      140 non-null    int64  
 8   popularity    140 non-null    int64  
 9   danceability  140 non-null    float64
 10  loudness      140 non-null    float64
 11  tempo         140 non-null    float64
 12  acousticness  140 non-null    float64
 13  energy        140 non-null    float64
 14  valence       140 non-null    float64
 15  liveness      140 non-null    float64
 16  speechiness   140 non-null    float64
dtypes: bool(1), float64(8), int64(2), object(6)
memory usage: 17.8+ KB
None


In [11]:
# Keep only the rows that have an artist value
fdf_songs = fdf_songs[fdf_songs['artist'].notna()]

### Fetching top hits

In [12]:
# today's top hits
top_hits_items = get_user_playlist('spotify', 'spotify:playlist:37i9dQZF1DXcBWIGoYBM5M', sp)
# Convert the raw items and tag every row as coming from Spotify
s_2024 = create_dataframe(top_hits_items).assign(username="Spotify")
print('top-hits ok')

top-hits ok


### Fetching All Out playlists of each decade

In [13]:
# spotify All Outs
# most popular songs by decade, listed from the 50s playlist to the 20s playlist
decade_playlist_uris = (
    'spotify:playlist:37i9dQZF1DWSV3Tk4GO2fq',  # s_50
    'spotify:playlist:37i9dQZF1DXaKIA8E7WcJj',  # s_60
    'spotify:playlist:37i9dQZF1DWTJ7xPn4vNaz',  # s_70
    'spotify:playlist:37i9dQZF1DX4UtSsGT1Sbe',  # s_80
    'spotify:playlist:37i9dQZF1DXbTxeAdrVG2l',  # s_90
    'spotify:playlist:37i9dQZF1DX4o1oenSJRJd',  # s_00
    'spotify:playlist:37i9dQZF1DX5Ejj0EkURtP',  # s_10
    'spotify:playlist:37i9dQZF1DX2M1RktxUUHG',  # s_20
)

# Download every playlist first, then convert each one to a DataFrame
decade_items = [get_user_playlist('spotify', uri, sp) for uri in decade_playlist_uris]
s_50, s_60, s_70, s_80, s_90, s_00, s_10, s_20 = [create_dataframe(items) for items in decade_items]

# Stack the decades in chronological playlist order and tag the rows as Spotify's
s_decades_songs = pd.concat([s_50, s_60, s_70, s_80, s_90, s_00, s_10, s_20]).assign(username="Spotify")
print('decades ok')

decades ok


In [14]:
# Sanity check: rows per user for the two listeners, and the shapes of the Spotify tables
negin_counts = ndf_songs['username'].value_counts()
friend_counts = fdf_songs['username'].value_counts()
print(negin_counts, friend_counts, s_2024.shape, s_decades_songs.shape)

username
MelodicWanderer    440
Name: count, dtype: int64 username
Icecreamy008    139
Name: count, dtype: int64 (50, 17) (1200, 17)


In [15]:
# Main table: both listeners' songs followed by today's top hits
taste_frames = [ndf_songs, fdf_songs, s_2024]
df = pd.concat(taste_frames)

# The decade playlists are kept in their own table
df_decade = s_decades_songs

Spotify's track data does not directly include the release date or the genres. We have to extract them from the artist and album information instead.

### Get Genres from artists and add to dataframe

In [16]:
from tqdm import tqdm

# Unique artist IDs
artist_tmp = df['artistid'].unique()

# Fetch genres for each artist (at most the first five). The rows are gathered
# as plain records so the lookup table is built once after the loop instead of
# being re-concatenated on every iteration.
artist_records = []
for tmp in tqdm(artist_tmp):
    sp_artist = sp.artist(tmp)
    artist_records.append({'artistid': tmp, 'genres': sp_artist['genres'][0:5]})

# DataFrame to hold artist genres
artist_genres = pd.DataFrame(artist_records, columns=['artistid', 'genres'])

# Merge genres into main DataFrame
df = pd.merge(df, artist_genres, on='artistid', how='left')

100%|██████████| 331/331 [00:29<00:00, 11.21it/s]


### Get Dates from albums and add to dataframe

In [17]:
from tqdm import tqdm

# Unique album IDs
album_tmp = df['albumid'].unique()

# Fetch album release dates using Spotify. The rows are gathered as plain
# records so the lookup table is built once after the loop instead of being
# re-concatenated on every iteration.
album_records = []
for tmp in tqdm(album_tmp):
    sp_album = sp.album(tmp)
    album_records.append({'albumid': tmp, 'date': sp_album["release_date"]})

# DataFrame to hold album release dates
album_date = pd.DataFrame(album_records, columns=['albumid', 'date'])

# Merge the release dates back to the main dataframe
df = pd.merge(df, album_date, on='albumid', how='left')

100%|██████████| 523/523 [00:54<00:00,  9.67it/s]


In [18]:
# Convert 'date' column to datetime format and extract the year
# Parse the release-date strings (their formats vary) and keep only the year
release_dates = pd.to_datetime(df['date'], format='mixed')
df['date'] = release_dates.dt.year

In [19]:
# The lookup keys are no longer needed once genres and dates have been merged in
df = df.drop(columns=['artistid', 'albumid', 'id'])
df

Unnamed: 0,artist,name,username,explicit,duration,popularity,danceability,loudness,tempo,acousticness,energy,valence,liveness,speechiness,genres,date
0,Damien Rice,The Blower's Daughter,MelodicWanderer,False,286653,61,0.342,-13.569,133.113,0.134000,0.200,0.0743,0.3410,0.0324,"[acoustic pop, irish rock, irish singer-songwr...",2003
1,Jay-Jay Johanson,Only for You,MelodicWanderer,False,230240,10,0.482,-12.924,122.593,0.852000,0.259,0.4010,0.1050,0.0603,"[swedish electropop, swedish singer-songwriter...",2007
2,Lily Allen,Somewhere Only We Know,MelodicWanderer,False,208154,54,0.314,-10.191,172.021,0.935000,0.286,0.2780,0.0999,0.0350,"[dance pop, electropop, neo mellow]",2013
3,Jacob Lee,Reality,MelodicWanderer,False,235194,33,0.597,-5.838,85.864,0.123000,0.642,0.2940,0.2700,0.0285,[australian pop],2019
4,Stephen Sanchez,Easy On My Eyes,MelodicWanderer,False,238173,46,0.501,-13.042,122.334,0.948000,0.105,0.2030,0.0875,0.0410,[gen z singer-songwriter],2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624,Kendrick Lamar,Not Like Us,Spotify,True,274192,89,0.898,-7.001,101.061,0.010700,0.472,0.2140,0.1410,0.0776,"[conscious hip hop, hip hop, rap, west coast rap]",2024
625,Taylor Swift,Fortnight (feat. Post Malone),Spotify,False,228965,85,0.504,-10.976,192.004,0.502000,0.386,0.2810,0.0961,0.0308,[pop],2024
626,Jimin,Who,Spotify,False,170887,92,0.660,-3.743,116.034,0.002890,0.756,0.8380,0.1930,0.0320,[k-pop],2024
627,Linkin Park,The Emptiness Machine,Spotify,True,190427,90,0.466,-3.344,184.115,0.015600,0.872,0.8060,0.1210,0.0336,"[alternative metal, nu metal, post-grunge, rap...",2024


In [20]:
# Save the main table as CSV, leaving out the index column
# NOTE(review): '/content/' is an absolute path — confirm it exists where this notebook runs
df.to_csv('/content/Spotify_music_taste.csv', index=False)

# Decades

In [21]:
# Repeat the genre lookup for the decade DataFrame
artist_tmp = df_decade['artistid'].unique()

# Fetch genres for each artist (at most the first five). The rows are gathered
# as plain records so the lookup table is built once after the loop instead of
# being re-concatenated on every iteration.
artist_records = []
for tmp in tqdm(artist_tmp):
    sp_artist = sp.artist(tmp)
    artist_records.append({'artistid': tmp, 'genres': sp_artist['genres'][0:5]})

artist_genres = pd.DataFrame(artist_records, columns=['artistid', 'genres'])

# Merge genres into the decade DataFrame
df_decade = pd.merge(df_decade, artist_genres, on='artistid', how='left')

100%|██████████| 659/659 [01:03<00:00, 10.30it/s]


In [22]:
# Unique album IDs of the decade DataFrame
album_tmp = df_decade['albumid'].unique()

# Fetch each album's release date. The rows are gathered as plain records so
# the lookup table is built once after the loop instead of being
# re-concatenated on every iteration.
album_records = []
for tmp in tqdm(album_tmp):
    sp_album = sp.album(tmp)
    album_records.append({'albumid': tmp, 'date': sp_album["release_date"]})

album_date = pd.DataFrame(album_records, columns=['albumid', 'date'])

# Merge the release dates back to the decade dataframe
df_decade = pd.merge(df_decade, album_date, on='albumid', how='left')

100%|██████████| 1039/1039 [01:49<00:00,  9.49it/s]


In [23]:
# Parse the release-date strings (their formats vary)
decade_release_dates = pd.to_datetime(df_decade['date'], format='mixed')
# Extract the decade: floor the year to a multiple of ten
df_decade['date'] = decade_release_dates.dt.year.floordiv(10).mul(10)

In [24]:
# The lookup keys are no longer needed once genres and dates have been merged in
df_decade = df_decade.drop(columns=['artistid', 'albumid', 'id'])
df_decade

Unnamed: 0,artist,name,username,explicit,duration,popularity,danceability,loudness,tempo,acousticness,energy,valence,liveness,speechiness,genres,date
0,Elvis Presley,Jailhouse Rock,Spotify,False,146480,65,0.647,-9.538,167.396,0.4100,0.5820,0.915,0.0715,0.0755,"[rock-and-roll, rockabilly]",1950
1,Harry Belafonte,Banana Boat (Day-O),Spotify,False,183133,69,0.797,-16.881,122.545,0.8850,0.1050,0.419,0.1080,0.0969,[calypso],1950
2,Big Mama Thornton,Hound Dog,Spotify,False,181000,54,0.563,-10.862,133.285,0.7060,0.5420,0.785,0.3550,0.0962,"[acoustic blues, blues, harmonica blues, rhyth...",1980
3,Bobby Darin,Beyond the Sea,Spotify,False,172480,69,0.521,-7.456,136.483,0.7230,0.5160,0.569,0.2570,0.0369,"[adult standards, easy listening, lounge, rock...",1950
4,Peggy Lee,Fever,Spotify,False,201333,54,0.742,-19.187,137.473,0.0906,0.0899,0.351,0.1130,0.1580,"[adult standards, torch song, vocal jazz]",1980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,Rosa Linn,SNAP,Spotify,False,179551,60,0.565,-8.198,170.010,0.1070,0.6360,0.525,0.4470,0.0638,[alt z],2020
1196,Tom Grennan,Little Bit of Love,Spotify,False,226268,59,0.689,-4.480,106.520,0.0833,0.7060,0.623,0.1050,0.0312,[uk pop],2020
1197,Ed Sheeran,Bad Habits,Spotify,False,230746,59,0.807,-3.745,126.011,0.0451,0.8930,0.537,0.3660,0.0347,"[pop, singer-songwriter pop, uk pop]",2020
1198,Sabrina Carpenter,Nonsense,Spotify,False,163648,58,0.740,-4.912,138.992,0.0268,0.6970,0.732,0.2240,0.0340,[pop],2020


In [25]:
# Save the decade table as CSV, leaving out the index column
# NOTE(review): '/content/' is an absolute path — confirm it exists where this notebook runs
df_decade.to_csv('/content/Spotify_All_Out_decade.csv', index=False)