In [2]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

In [42]:
spotify_data = pd.read_csv("/data.csv.zip")
genre_data = pd.read_csv('/data_by_genres.csv')
year_data = pd.read_csv('/data_by_year.csv')

In [43]:
!pip install spotipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [44]:
import spotipy as sp
import requests
import base64

# Set up the authorization headers
client_id = '514cdaebe3844f3b8c2a966d32d1ce9d'
client_secret = 'a52a6ccd37e245d488806c506928d38b'
auth_header = base64.b64encode(f"{client_id}:{client_secret}".encode('ascii')).decode('ascii')
headers = {'Authorization': f"Basic {auth_header}"}

# Get an access token
token_url = 'https://accounts.spotify.com/api/token'
data = {'grant_type': 'client_credentials'}
response = requests.post(token_url, headers=headers, data=data)
token = response.json()['access_token']
headers = {'Authorization': f"Bearer {token}"}

# Use the API to search for a track
search_url = 'https://api.spotify.com/v1/search'
params = {'q': 'Shape of You', 'type': 'track'}
response = requests.get(search_url, headers=headers, params=params)
results = response.json()['tracks']['items']

# Print out the track names and artists
for result in results:
    track_name = result['name']
    artist_name = result['artists'][0]['name']
    print(f"{track_name} by {artist_name}")


Shape of You by Ed Sheeran
Shape of You (feat. Nyla & Kranium) - Major Lazer Remix by Ed Sheeran
Shape of You by Fame on Fire
Shape of You by Ed Sheeran
Shape of You by Boostereo
Shape of You (feat. Zion & Lennox) - Latin Remix by Ed Sheeran
Shape of You - Acoustic by Ed Sheeran
Shape of You by Navagio
Shape of You - Galantis Remix by Ed Sheeran
Shape Of You by Lenowa Parson
Shape of You - Rock by Our Last Night
Shape of You by Nara
Shape of You - Piano Arrangement by Niko Kotoulas
Shape of You by Chill With Lofi
Shape of You by Vitamin String Quartet
Shape of You - Stormzy Remix by Ed Sheeran
Shape of You - Piano Version by Henry Smith
Shape of You - NOTD Remix by Ed Sheeran
Shape Of Your Heart by Hillsong UNITED
Shape of You by Lofi Fruits Music


In [45]:
def find_song(name, year):
    song_data = defaultdict()
    results = sp.search(q= 'track: {} year: {}'.format(name,year), limit=1)
    if results['tracks']['items'] == []:
        return None

    results = results['tracks']['items'][0]
    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]

    song_data['name'] = [name]
    song_data['year'] = [year]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]

    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)

In [63]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=10))])
X = genre_data.select_dtypes(np.number)
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)

In [64]:
from sklearn.manifold import TSNE

tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1))])
genre_embedding = tsne_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show()

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2973 samples in 0.007s...
[t-SNE] Computed neighbors for 2973 samples in 0.347s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2973
[t-SNE] Computed conditional probabilities for sample 2000 / 2973
[t-SNE] Computed conditional probabilities for sample 2973 / 2973
[t-SNE] Mean sigma: 0.777516
[t-SNE] KL divergence after 250 iterations with early exaggeration: 76.106247
[t-SNE] KL divergence after 1000 iterations: 1.391526


In [71]:
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                                  ('kmeans', KMeans(n_clusters=20))], verbose=False)

X = spotify_data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels

In [72]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

number_cols = ['valence', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']

def get_song_data(song, spotify_data):

    
    try:
      for i in range(len(song_list)):
        song_data = spotify_data[(spotify_data['name'] == song_list[i]['name']) & (spotify_data['year'] == song_list[i]['year'])].iloc[0]
        return song_data
    
    except IndexError:
        return find_song(song['name'], song['year'])

def get_mean_vector(song_list, spotify_data):
    
    song_vectors = []
    
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        song_vector = song_data[number_cols].values
        song_vectors.append(song_vector)  
    
    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)

def flatten_dict_list(dict_list):
    
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
            
    return flattened_dict

def recommend_songs( song_list, spotify_data, n_songs=10):
    
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)
    
    song_center = get_mean_vector(song_list, spotify_data)
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])
    
    rec_songs = spotify_data.iloc[index]
    rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
    return rec_songs[metadata_cols].to_dict(orient='records')

In [73]:
song_list = [{'name': 'Come As You Are', 'year':1991},
                {'name': 'Smells Like Teen Spirit', 'year': 1991},
                {'name': 'Lithium', 'year': 1992},
                {'name': 'All Apologies', 'year': 1993},
                {'name': 'Stay Away', 'year': 1993}]
recommend_songs(song_list,  spotify_data)

[{'name': 'In the End', 'year': 2000, 'artists': "['Linkin Park']"},
 {'name': 'Shimmer', 'year': 1997, 'artists': "['Fuel']"},
 {'name': 'Symphony Of Destruction', 'year': 1992, 'artists': "['Megadeth']"},
 {'name': 'Breaking the Habit', 'year': 2003, 'artists': "['Linkin Park']"},
 {'name': 'Lovers Rock', 'year': 2014, 'artists': "['TV Girl']"},
 {'name': 'Fell On Black Days', 'year': 1994, 'artists': "['Soundgarden']"},
 {'name': 'New Divide', 'year': 2009, 'artists': "['Linkin Park']"},
 {'name': 'War of Change',
  'year': 2012,
  'artists': "['Thousand Foot Krutch']"},
 {'name': 'Freak', 'year': 1997, 'artists': "['Silverchair']"}]