In [1]:
#!pip install -U python-dotenv
#!pip install spotipy
#!pip install matplotlib

In [72]:
import pandas as pd
import os
import plotly.express as px
import spotipy
from spotipy import oauth2
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyClientCredentials
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import MeanShift
from sklearn.cluster import KMeans

In [3]:
path = os.getcwd()

# setting up authentication for spotipy
load_dotenv(dotenv_path=path + f"\env_vars.env")
os.environ['SPOTIPY_CLIENT_ID'] = os.environ.get('SPOTIPY_CLIENT_ID')
os.environ['SPOTIPY_CLIENT_SECRET'] = os.environ.get('SPOTIPY_CLIENT_SECRET')
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

### Part 1: Loading in the Data
#### We will load in my personal data blah blah, you can replace the string with your file path to analyze your music

In [5]:
num_histories = 2
listening = pd.DataFrame()
for i in range(num_histories):
    temp_df = pd.read_json(path + f"\MyData\StreamingHistory{i}.json")
    listening = pd.concat([listening, temp_df])
print(listening.shape)
listening.head(5)

(18208, 4)


Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2022-10-03 16:03,Tori Templet,Butterfly Rain,167166
1,2022-10-04 18:40,Maisie Peters,The List,68976
2,2022-10-07 18:05,Catie Turner,Nothing,4335
3,2022-10-07 18:05,Billie Marten,Mice,1650
4,2022-10-07 18:06,Jacob Collier,In Too Deep (feat. Kiana Ledé),18280


In [6]:
listening.groupby(by=['artistName', 'trackName']).count().shape

(4445, 2)

In [7]:
playlists = pd.read_json(path + f"\MyData\Playlist1.json")
playlists.tail(5)

Unnamed: 0,playlists
24,"{'name': 'Cities', 'lastModifiedDate': '2023-0..."
25,"{'name': 'Saved Mix 2', 'lastModifiedDate': '2..."
26,"{'name': 'Stuck in Jess' Head', 'lastModifiedD..."
27,"{'name': 'Musicccccc', 'lastModifiedDate': '20..."
28,"{'name': 'Acoustic music', 'lastModifiedDate':..."


### Part 2: EDA

In [8]:
playlists['playlists'][3]

{'name': 'Hi',
 'lastModifiedDate': '2023-08-05',
 'items': [{'track': {'trackName': 'What A Wonderful World',
    'artistName': "Israel Kamakawiwo'ole",
    'albumName': 'Wonderful World',
    'trackUri': 'spotify:track:1Fzm9s6Fh1Eumj5tU4q20m'},
   'episode': None,
   'localTrack': None,
   'addedDate': '2023-08-05'},
  {'track': {'trackName': 'A Thousand Years',
    'artistName': 'The Piano Guys',
    'albumName': 'The Piano Guys',
    'trackUri': 'spotify:track:4eYaDRhiL5iesFp2EuoODr'},
   'episode': None,
   'localTrack': None,
   'addedDate': '2023-08-05'}],
 'description': None,
 'numberOfFollowers': 0}

In [9]:
# information based on a single song
track_name, track_artist = 'Hey There Delilah', 'Plain White T\'s'
track_uri = sp.search(q='artist:' + track_artist + ' track:' + track_name, type='track')['tracks']['items'][0]['uri']

# extended_features = sp.audio_analysis(track_uri) # a LOT of information
# extended_features

track_features = sp.audio_features(track_uri)
track_features[0]

{'danceability': 0.656,
 'energy': 0.291,
 'key': 2,
 'loudness': -10.572,
 'mode': 1,
 'speechiness': 0.0293,
 'acousticness': 0.872,
 'instrumentalness': 0,
 'liveness': 0.114,
 'valence': 0.298,
 'tempo': 103.971,
 'type': 'audio_features',
 'id': '4RCWB3V8V0dignt99LZ8vH',
 'uri': 'spotify:track:4RCWB3V8V0dignt99LZ8vH',
 'track_href': 'https://api.spotify.com/v1/tracks/4RCWB3V8V0dignt99LZ8vH',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4RCWB3V8V0dignt99LZ8vH',
 'duration_ms': 232533,
 'time_signature': 4}

In [19]:
counts = listening.groupby(by=['trackName', 'artistName']).size().reset_index()
songs_df = counts.rename(columns={0: 'count'}).query('count >= 5').sort_values('count', ascending=False).reset_index(drop=True) #.sample(n=500, weights='count', random_state=42)#.query('count >= 5').sort_values('count', ascending=False) # either listened to >5 times or top x songs or random x songs
print(songs_df.shape)
songs_df.head(5)#.to_csv('csv_exports/grouped.csv')

(937, 3)


Unnamed: 0,trackName,artistName,count
0,Means Something,Lizzy McAlpine,96
1,How,Elina,78
2,When You Lose Someone,Nina Nesbitt,74
3,ceilings,Lizzy McAlpine,73
4,hate to be lame,Lizzy McAlpine,72


In [21]:
lst_of_cols = list(track_features[0].keys())

# [lst_of_cols.remove(col) for col in ['type', 'uri', 'analysis_url', 'track_href']] # remove columns you dont want

def find_info(song):
    try:
        song_dict = sp.search(q='artist:' + song[1] + ' track:' + song[0], type='track')
        song_uri = song_dict['tracks']['items'][0]['uri']
        track_features = sp.audio_features(song_uri)[0]
        return pd.Series([track_features[i] for i in lst_of_cols])
    except:
        return pd.Series([None] * len(lst_of_cols))
    
totals = pd.DataFrame()

for i in range((songs_df.shape[0] // 100) + 1):
    songs_subset = songs_df.iloc[100*i:100*(i + 1),:]
    songs_subset[lst_of_cols] = songs_subset[['trackName', 'artistName']].apply(find_info, axis=1)
    totals = pd.concat([totals, songs_subset])

totals.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs_subset[lst_of_cols] = songs_subset[['trackName', 'artistName']].apply(find_info, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs_subset[lst_of_cols] = songs_subset[['trackName', 'artistName']].apply(find_info, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs_subset[ls

Unnamed: 0,trackName,artistName,count,danceability,energy,key,loudness,mode,speechiness,acousticness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,Means Something,Lizzy McAlpine,96,0.429,0.2,8.0,-14.629,1.0,0.0512,0.825,...,0.367,0.357,95.319,audio_features,5L3FlmnCvJY2SN2jzvUwxL,spotify:track:5L3FlmnCvJY2SN2jzvUwxL,https://api.spotify.com/v1/tracks/5L3FlmnCvJY2...,https://api.spotify.com/v1/audio-analysis/5L3F...,144632.0,4.0
1,How,Elina,78,0.502,0.127,1.0,-13.642,1.0,0.0587,0.931,...,0.122,0.485,77.198,audio_features,1M2HXn0iXRD7KDJqtdGARb,spotify:track:1M2HXn0iXRD7KDJqtdGARb,https://api.spotify.com/v1/tracks/1M2HXn0iXRD7...,https://api.spotify.com/v1/audio-analysis/1M2H...,143704.0,4.0
2,When You Lose Someone,Nina Nesbitt,74,0.527,0.415,3.0,-4.743,1.0,0.0303,0.264,...,0.093,0.341,130.005,audio_features,3ECh9S9MgoL9SrpZFh0Y5Z,spotify:track:3ECh9S9MgoL9SrpZFh0Y5Z,https://api.spotify.com/v1/tracks/3ECh9S9MgoL9...,https://api.spotify.com/v1/audio-analysis/3ECh...,200282.0,4.0
3,ceilings,Lizzy McAlpine,73,0.516,0.322,9.0,-11.762,1.0,0.0292,0.473,...,0.215,0.261,148.005,audio_features,2L9N0zZnd37dwF0clgxMGI,spotify:track:2L9N0zZnd37dwF0clgxMGI,https://api.spotify.com/v1/tracks/2L9N0zZnd37d...,https://api.spotify.com/v1/audio-analysis/2L9N...,182888.0,3.0
4,hate to be lame,Lizzy McAlpine,72,0.522,0.251,10.0,-12.626,0.0,0.0603,0.871,...,0.0769,0.303,137.366,audio_features,26MJjeJ0NSOQDKeZzrEFMl,spotify:track:26MJjeJ0NSOQDKeZzrEFMl,https://api.spotify.com/v1/tracks/26MJjeJ0NSOQ...,https://api.spotify.com/v1/audio-analysis/26MJ...,156798.0,3.0


In [47]:
songs = totals.copy()
songs.drop(['track_href', 'type', 'uri', 'analysis_url'], axis=1, inplace=True)
songs['inv_sq_energy'] = -1 * (songs['energy'] ** 2)
songs = songs.rename({'artistName': 'artist', 'trackName': 'song', 'count': 'listens'}, axis=1)

print(songs.shape)
songs.head(5)

(937, 18)


Unnamed: 0,song,artist,listens,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature,inv_sq_energy
0,Means Something,Lizzy McAlpine,96,0.429,0.2,8.0,-14.629,1.0,0.0512,0.825,2e-06,0.367,0.357,95.319,5L3FlmnCvJY2SN2jzvUwxL,144632.0,4.0,-0.04
1,How,Elina,78,0.502,0.127,1.0,-13.642,1.0,0.0587,0.931,0.0,0.122,0.485,77.198,1M2HXn0iXRD7KDJqtdGARb,143704.0,4.0,-0.016129
2,When You Lose Someone,Nina Nesbitt,74,0.527,0.415,3.0,-4.743,1.0,0.0303,0.264,0.0,0.093,0.341,130.005,3ECh9S9MgoL9SrpZFh0Y5Z,200282.0,4.0,-0.172225
3,ceilings,Lizzy McAlpine,73,0.516,0.322,9.0,-11.762,1.0,0.0292,0.473,0.00194,0.215,0.261,148.005,2L9N0zZnd37dwF0clgxMGI,182888.0,3.0,-0.103684
4,hate to be lame,Lizzy McAlpine,72,0.522,0.251,10.0,-12.626,0.0,0.0603,0.871,0.000258,0.0769,0.303,137.366,26MJjeJ0NSOQDKeZzrEFMl,156798.0,3.0,-0.063001


### PCA attempt/plotting the PCs/scree plot

In [48]:
cleaned_df = songs.set_index(['song', 'artist', 'id']).dropna()
centered_df = (cleaned_df - np.mean(cleaned_df, axis=0)) / np.std(cleaned_df, axis=0)
print(centered_df.shape)
centered_df.head(5)

(907, 15)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,listens,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,inv_sq_energy
song,artist,id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Means Something,Lizzy McAlpine,5L3FlmnCvJY2SN2jzvUwxL,6.919833,-0.870867,-0.750366,0.843709,-0.944821,0.405356,0.058407,0.475817,-0.22309,2.620118,0.161512,-0.701598,-1.076392,0.300802,0.672243
How,Elina,1M2HXn0iXRD7KDJqtdGARb,5.414186,-0.289073,-1.17259,-1.074196,-0.688236,0.405356,0.269306,0.880798,-0.223107,-0.256139,0.967138,-1.292388,-1.096565,0.300802,0.834965
When You Lose Someone,Nina Nesbitt,3ECh9S9MgoL9SrpZFh0Y5Z,5.079598,-0.089829,0.493168,-0.526223,1.625184,0.405356,-0.529298,-1.667528,-0.223107,-0.596594,0.060809,0.429253,0.133344,0.300802,-0.229096
ceilings,Lizzy McAlpine,2L9N0zZnd37dwF0clgxMGI,4.995951,-0.177497,-0.044733,1.117696,-0.199504,0.405356,-0.56023,-0.869027,-0.204644,0.835665,-0.442707,1.016098,-0.244772,-1.530255,0.238128
hate to be lame,Lizzy McAlpine,26MJjeJ0NSOQDKeZzrEFMl,4.912304,-0.129678,-0.455389,1.391682,-0.424113,-2.466969,0.314297,0.651564,-0.220651,-0.785605,-0.178361,0.66924,-0.811924,-1.530255,0.515452


In [49]:
u, s, vt = np.linalg.svd(centered_df, full_matrices = False)
u.shape, s.shape, vt.shape

((907, 15), (15,), (15, 15))

In [50]:
np.round(s**2 / sum(s**2), 2)

array([0.23, 0.1 , 0.08, 0.08, 0.07, 0.07, 0.06, 0.06, 0.06, 0.05, 0.05,
       0.04, 0.03, 0.02, 0.  ])

In [51]:
# scree plot
px.line(s**2)

In [69]:
pcs = u * s

songs_with_pcs = pd.DataFrame(pcs)
songs_with_pcs[['song', 'artist', 'id']] = centered_df.reset_index()[['song', 'artist', 'id']]

labels = {}
for i in range(15):
    labels[i] = f'pc{i+1}'

songs_with_pcs = songs_with_pcs.rename(labels, axis=1)

px.scatter_3d(data_frame=songs_with_pcs, x='pc1', y='pc2', z='pc3', hover_data=['song', 'artist'])

### Part 3: Analyzing/clustering the Data
#### PCA/plot/etc, figuring out what matters most in a song, what makes a song similar to another?

In [81]:
cluster_df = songs.set_index(['song', 'artist', 'id']).dropna()
cluster_df = (cluster_df - np.mean(cluster_df, axis=0)) / np.std(cluster_df, axis=0)
cluster_df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,listens,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,inv_sq_energy
song,artist,id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Means Something,Lizzy McAlpine,5L3FlmnCvJY2SN2jzvUwxL,6.919833,-0.870867,-0.750366,0.843709,-0.944821,0.405356,0.058407,0.475817,-0.22309,2.620118,0.161512,-0.701598,-1.076392,0.300802,0.672243
How,Elina,1M2HXn0iXRD7KDJqtdGARb,5.414186,-0.289073,-1.17259,-1.074196,-0.688236,0.405356,0.269306,0.880798,-0.223107,-0.256139,0.967138,-1.292388,-1.096565,0.300802,0.834965
When You Lose Someone,Nina Nesbitt,3ECh9S9MgoL9SrpZFh0Y5Z,5.079598,-0.089829,0.493168,-0.526223,1.625184,0.405356,-0.529298,-1.667528,-0.223107,-0.596594,0.060809,0.429253,0.133344,0.300802,-0.229096
ceilings,Lizzy McAlpine,2L9N0zZnd37dwF0clgxMGI,4.995951,-0.177497,-0.044733,1.117696,-0.199504,0.405356,-0.56023,-0.869027,-0.204644,0.835665,-0.442707,1.016098,-0.244772,-1.530255,0.238128
hate to be lame,Lizzy McAlpine,26MJjeJ0NSOQDKeZzrEFMl,4.912304,-0.129678,-0.455389,1.391682,-0.424113,-2.466969,0.314297,0.651564,-0.220651,-0.785605,-0.178361,0.66924,-0.811924,-1.530255,0.515452


In [83]:
from sklearn.metrics import silhouette_score

In [87]:
# k = 2
# largest_radius = 10
# radius = np.inf
# silhouette_scores = []

# while((radius > largest_radius) and (k < cluster_df.shape[0] // 10)):
#     cluster = KMeans(n_clusters=k, n_init='auto').fit(cluster_df)
#     labels=cluster.labels_
#     silhouette_scores.append(silhouette_score(cluster_df, labels, metric='euclidean'))
#     # largest_radius = max()
#     k += 1
# print(silhouette_scores)
# print(k)

cluster = KMeans(n_clusters=cluster_df.shape[0] // 10, n_init='auto').fit_predict(cluster_df)
cluster_w_labels = cluster_df.reset_index()
cluster_w_labels['color'] = cluster
px.scatter(cluster_w_labels, x='inv_sq_energy', y='listens', color='color', hover_data=['song', 'artist'])

In [103]:
pc_cluster = KMeans(n_clusters=songs_with_pcs.shape[0] // 10, n_init='auto').fit_predict(songs_with_pcs[['pc1', 'pc2', 'pc3']])
pc_cluster_3 = songs_with_pcs.reset_index()
pc_cluster_3['color'] = pc_cluster
px.scatter_3d(pc_cluster_3, x='pc1', y='pc2', z='pc3', hover_data=['song', 'artist'], color='color')

In [104]:
all_pc_cluster = KMeans(n_clusters=songs_with_pcs.shape[0] // 10, n_init='auto').fit_predict(songs_with_pcs.drop(columns=['song', 'artist', 'id'])) #all_pcs
pc_cluster = songs_with_pcs
pc_cluster['color'] = all_pc_cluster
px.scatter_3d(pc_cluster, x='pc1', y='pc2', z='pc3', hover_data=['song', 'artist'], color='color')

### Part 4: Selecting additional relevant songs for playlist given single seed song
#### Find most similar ones to single input song

In [109]:
def get_playlist(song, cluster_type='regular'):
    # regular uses the raw data not decomposed into principal components
    # 3pc uses the first 3 principal components
    # pc uses all the principal components
    df_to_use = {'regular': cluster_w_labels, '3pc': pc_cluster_3, 'pc': pc_cluster}
    df = df_to_use[cluster_type]

    label = df[df['song'] == song].color.iloc[0] #.loc[0, 'color']
    playlist = df[df['color'] == label]
    return playlist[['song', 'artist', 'id']]

In [111]:
# get_playlist('By Your Side')
# get_playlist('I Lived')
get_playlist('Hey It\'s Delilah')
# get_playlist('Hey There Delilah', '3pc')

Unnamed: 0,song,artist,id
151,Margaux,Matilda Mann,7r8gXkNiJJqv2Wb8i8sT1i
154,I Should Be a Bird,Nina Nesbitt,0ZRZeLNAKPHBwkalUqW1xz
168,what if it's not,Jackson Guthy,6DhWEW0TXBsLZzSllaCfi7
176,Called You Out,Tamzene,2I0hkWJEy3BhVBvkWsnW5S
192,ilym (feat. ROSIE),John K,6iT0ZdQcHwzQDNcAC2oDKX
287,Alright,Gracie Abrams,1wXqbn4OVaYBOhgj7Z4did
334,In The Kitchen,Reneé Rapp,2VFetGqLYq0Pc8ZtRYCaeL
337,String of Lights,Lindsey Lomis,6YSGJUmhAP8CxUZdX8DtrH
465,Gin & Tonics,George David,2yyDkQwTDkrf9B6t53dgb6
534,Flares,The Script,3mTEYjm3kIm4YzJ5zxJZjs
