# Spotify Song Recommender

> This script uses Spotify's Web API to recommend new songs to users based on their existing playlist. It extracts the audio features of the songs in the playlist and utilizes a K-Nearest Neighbors (KNN) model to provide recommendations either for each track individually or for the entire playlist as a whole.

> Methods:
1. Track-Level Recommendation: Provides recommendations for each song in the playlist.
2. Playlist-Level Recommendation: Provides recommendations based on the overall characteristics of the playlist.

> Dependencies:
- spotipy: For interacting with Spotify's API.
- pandas: For data handling and processing.
- scikit-learn: For building the KNN model and scaling data.


### Choosing the Spotify playlist on which the recommendations will be based

In [1]:
# Playlist the recommendations are based on; this value is handed to
# sp.playlist_tracks() when the playlist is fetched further down.
PLAYLIST_URL = "https://open.spotify.com/playlist/2X5rp0W1YqW5CYwpF2GwCs?si=e34e90709fdf4ebd"

### Initialising the Spotify API with client_id and client_secret

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import project_hidden

# Spotify client authenticated with the client-credentials flow; the ID and
# secret are read from the local `project_hidden` module.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=project_hidden.CLIENT_ID,
        client_secret=project_hidden.CLIENT_SECRET,
    )
)

### Extracting playlist data

In [3]:
def get_playlist_tracks(playlist_url):
    """Fetch the audio features of every track in a Spotify playlist.

    The playlist is read page by page, so playlists longer than a single
    response page (100 items) are covered in full. Audio features are then
    requested in batches of at most 100 track IDs per API call.

    Parameters
    ----------
    playlist_url : str
        Playlist reference, passed unchanged to ``sp.playlist_tracks``.

    Returns
    -------
    list of dict
        One audio-features dictionary per track, in playlist order. Playlist
        entries without a track or track ID, and tracks for which the API
        returns no features (``None``), are skipped.
    """
    batch_size = 100  # upper bound on IDs sent in one audio-features request

    # Walk through every page of the playlist, not only the first one.
    track_ids = []
    page = sp.playlist_tracks(playlist_url)
    while page:
        for item in page['items']:
            track = item.get('track')
            # Skip entries that carry no usable track ID instead of crashing.
            if track and track.get('id'):
                track_ids.append(track['id'])
        page = sp.next(page) if page.get('next') else None

    track_features = []
    for start in range(0, len(track_ids), batch_size):
        features = sp.audio_features(track_ids[start:start + batch_size])
        # Drop missing results so callers can rely on every entry being a dict.
        track_features.extend(f for f in (features or []) if f is not None)

    return track_features

# Audio features for the chosen playlist (a list of dictionaries).
track_features = get_playlist_tracks(PLAYLIST_URL)

# Preview only the first three entries to keep the output short.
display(track_features[:3])


[{'danceability': 0.277,
  'energy': 0.0314,
  'key': 5,
  'loudness': -25.936,
  'mode': 0,
  'speechiness': 0.0522,
  'acousticness': 0.991,
  'instrumentalness': 0.942,
  'liveness': 0.0859,
  'valence': 0.161,
  'tempo': 80.153,
  'type': 'audio_features',
  'id': '2ZSga0mI1m1wNpxj5ZXVs6',
  'uri': 'spotify:track:2ZSga0mI1m1wNpxj5ZXVs6',
  'track_href': 'https://api.spotify.com/v1/tracks/2ZSga0mI1m1wNpxj5ZXVs6',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/2ZSga0mI1m1wNpxj5ZXVs6',
  'duration_ms': 341040,
  'time_signature': 4},
 {'danceability': 0.252,
  'energy': 0.0578,
  'key': 1,
  'loudness': -26.184,
  'mode': 1,
  'speechiness': 0.0518,
  'acousticness': 0.953,
  'instrumentalness': 0.0163,
  'liveness': 0.0721,
  'valence': 0.0391,
  'tempo': 108.013,
  'type': 'audio_features',
  'id': '1BbQIyPnuhVIlJEqPcY3Ug',
  'uri': 'spotify:track:1BbQIyPnuhVIlJEqPcY3Ug',
  'track_href': 'https://api.spotify.com/v1/tracks/1BbQIyPnuhVIlJEqPcY3Ug',
  'analysis_url': 'htt

### Data Preprocessing

In [4]:
import pandas as pd

# Audio features fed to the model. The remaining keys returned by the API
# ('type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
# 'time_signature') are left out.
desired_features_playlist = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]

# One row per playlist track, restricted to the model features above.
playlist_df = pd.DataFrame(track_features)[desired_features_playlist]

display(playlist_df.head())

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.277,0.0314,5,-25.936,0,0.0522,0.991,0.942,0.0859,0.161,80.153
1,0.252,0.0578,1,-26.184,1,0.0518,0.953,0.0163,0.0721,0.0391,108.013
2,0.28,0.287,8,-22.167,0,0.0367,0.931,0.888,0.0762,0.509,87.953
3,0.227,0.0499,9,-27.714,0,0.0415,0.986,0.945,0.108,0.0942,135.248
4,0.234,0.126,0,-25.427,1,0.0446,0.989,0.896,0.102,0.216,133.998


### Spotify Tracks Dataset

In [5]:
# Audio features the model is fitted on.
desired_features_dataset = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]

# Same features, preceded by the columns needed to present a recommendation.
desired_features_for_final_display = (
    ['artist_name', 'track_name', 'track_id'] + desired_features_dataset
)

dataset_pre_filtering_df = pd.read_csv('spotify_data.csv')

# Feature-only view for the model, and a richer view for showing results.
dataset_df = dataset_pre_filtering_df[desired_features_dataset]
dataset_for_final_display_df = dataset_pre_filtering_df[desired_features_for_final_display]

print("Name: dataset_df\nShape: ", dataset_df.shape)
display(dataset_df.head())

print("Name: dataset_for_final_display_df\nShape: ", dataset_for_final_display_df.shape)
display(dataset_for_final_display_df.head())

Name: dataset_df
Shape:  (1159764, 11)


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406
1,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182
2,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832
3,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961
4,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864


Name: dataset_for_final_display_df
Shape:  (1159764, 14)


Unnamed: 0,artist_name,track_name,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406
1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182
2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832
3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961
4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864


### Feature scaling 

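Standardising puts every feature on a comparable scale, so that features with a large numeric range (such as tempo or loudness) do not dominate the distance computation and each feature carries similar weight.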

In [6]:
from sklearn.preprocessing import StandardScaler

# The scaler learns its statistics from the reference dataset only.
myscaler = StandardScaler().fit(dataset_df)

# Both tables go through the same fitted scaler so they stay comparable.
# The results are numpy arrays, so the column names are lost.
dataset_df_scaled = myscaler.transform(dataset_df)
playlist_df_scaled = myscaler.transform(playlist_df)

display(dataset_df_scaled)
display(playlist_df_scaled)

array([[-0.29509342, -1.24461718, -0.36222436, ..., -0.53721859,
        -1.17892497,  0.40392778],
       [ 0.18734904, -0.68639325, -0.64350286, ..., -0.62475003,
         0.22134908,  0.63146506],
       [-0.69622536, -1.49969964, -0.64350286, ..., -0.66403971,
        -1.15658017,  0.6197121 ],
       ...,
       [-0.25172781, -0.73814911, -0.08094587, ..., -0.64364886,
        -1.5658624 , -0.71528966],
       [-0.31135553, -0.86753877, -1.48733834, ..., -0.48748482,
        -0.94430458,  0.42001255],
       [ 1.32569642,  0.81822357, -1.20605984, ..., -0.82020375,
         1.4950026 , -1.05407708]])

array([[-1.411758  , -2.24868089, -0.08094587, ..., -0.68194386,
        -1.09699404, -1.38430163],
       [-1.54727555, -2.15108412, -1.20605984, ..., -0.75057647,
        -1.55096587, -0.44876614],
       [-1.3954959 , -1.30376674,  0.76288961, ..., -0.73018562,
         0.19900428, -1.12237856],
       ...,
       [-1.78036573, -2.35370832, -0.92478135, ..., -0.59192574,
        -1.55133828, -1.83655548],
       [-0.96726045, -2.25422616,  0.48161112, ..., -0.69736133,
        -1.10444231, -1.61966302],
       [-1.85083485, -2.26235922, -0.36222436, ..., -0.4924582 ,
        -1.21244217,  1.55642289]])

### Recommendation model using K-Nearest Neighbors

In [7]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour searcher; 5 neighbours per query unless a call overrides it.
knn = NearestNeighbors(
    algorithm='auto',
    n_neighbors=5,
)

# Index the scaled reference dataset so it can be queried with playlist tracks.
knn.fit(dataset_df_scaled)

### Method to recommend songs based on user playlist

In [8]:
# method returning indices of recommended songs in the dataset_df
def recommend_songs(playlist_df_scaled, number_of_recommendations=5):
    """Find the nearest dataset tracks for every track of the playlist.

    Queries the module-level ``knn`` model once per playlist track.

    Parameters
    ----------
    playlist_df_scaled : iterable of 1-D numpy arrays
        Scaled feature vectors, one per playlist track.
    number_of_recommendations : int, default 5
        Number of neighbours requested for each playlist track.

    Returns
    -------
    list of numpy.ndarray
        One index array per playlist track, each as returned by
        ``knn.kneighbors`` for a single query row.
    """
    recommended_songs_indices = []

    for playlist_track in playlist_df_scaled:
        # kneighbors expects a 2-D input, so present the track as a single row.
        query = playlist_track.reshape(1, -1)
        # Honour the requested count instead of a hard-coded 5.
        _, indices = knn.kneighbors(query, n_neighbors=number_of_recommendations)
        recommended_songs_indices.append(indices)

    return recommended_songs_indices

# Dataset row indices of the neighbours found for each playlist track
# (one index array per track).
recommended_songs_indices = recommend_songs(playlist_df_scaled)

# Preview the index arrays of the first ten playlist tracks.
display(recommended_songs_indices[:10])
# print(type(recommended_songs_indices[0]))
# print(len(recommended_songs_indices))

[array([[ 715326,  795735,  610613,   91690, 1073763]]),
 array([[ 374129,  876930, 1016069,  898922,  979147]]),
 array([[1119327,  662412,  221128,  968504,  662411]]),
 array([[374179, 796101, 374427, 585618, 369124]]),
 array([[430114, 374134, 374122, 922562,  93530]]),
 array([[1119317,  215615,  887015,   58011,  293288]]),
 array([[812774, 479979, 585807, 459610, 683030]]),
 array([[ 673245,  538865, 1026788,  886963, 1073649]]),
 array([[ 887129, 1119798,  812779,  843066,  838805]]),
 array([[236415, 145659, 610378, 513191, 146015]])]

### Getting the song details from the dataset

In [9]:
def get_recommendation_details(indices, dataset_df):
    """Look up the dataset rows behind nested neighbour indices.

    Parameters
    ----------
    indices : iterable
        Nested row positions: a sequence of 2-D index arrays, as collected
        from one ``kneighbors`` call per playlist track.
    dataset_df : pandas.DataFrame
        Table to read the rows from, addressed by position.

    Returns
    -------
    list of dict
        One ``{column: value}`` dictionary per position, in the order the
        positions appear in ``indices``. Empty when ``indices`` is empty.
    """
    # Flatten the three nesting levels (list -> 2-D array -> row -> item).
    positions = [
        item
        for sublist in indices
        for nparray in sublist
        for item in nparray
    ]

    # A single positional lookup replaces one .iloc call per recommended row.
    return dataset_df.iloc[positions].to_dict(orient="records")


recommended_songs = get_recommendation_details(recommended_songs_indices, dataset_for_final_display_df)

# IDs of the tracks that are already part of the user's playlist.
playlist_track_ids = {track['id'] for track in track_features}

# Maps (track name, artist name) -> track ID, keeping the first occurrence only.
output_dict = {}
for song in recommended_songs:
    song_key = (song['track_name'], song['artist_name'])

    # Keep a song only if it is new to the output and not in the playlist.
    if song_key not in output_dict and song['track_id'] not in playlist_track_ids:
        output_dict[song_key] = song['track_id']

display(list(output_dict.items())[:5])

[(('2 Arabesques (Arr. For Flute And Harp): Arabesque No. 1 (Arr. For Flute And Harp)',
   'Nora Shulman'),
  '4NaZnaRsW9Lj7Ox6vIVGZN'),
 (('La casa nel bosco', 'Ludovico Einaudi'), '2Ta03aSFQOyrIZCRZscdaC'),
 (('I Enjoy Life To The Fullest', 'Beautiful World'),
  '7cn478BUdTXaJswe2kw5Ri'),
 (('Pure Fountain', 'Stanton Lanier'), '798AcfNtltLUTqrWHpxD1d'),
 (('2 Nocturnes, Op. 55: I. Andante', 'Frédéric Chopin'),
  '1q6of8MARmRLg8xIjexscd')]

### Alternative method: recommending from the playlist's overall (average) features rather than from individual tracks

In [10]:
#trying another method by calculating the average values for playlist features

import numpy as np
# Column-wise mean of the scaled playlist, kept two-dimensional (a single
# row) because kneighbors expects a 2-D query.
playlist_average_features = np.mean(playlist_df_scaled, axis=0, keepdims=True)

def recommend_global(playlist_average_features, numberrecs=30):
    """Find the dataset tracks closest to the playlist's average features.

    Queries the module-level ``knn`` model with a single row.

    Parameters
    ----------
    playlist_average_features : numpy.ndarray
        Single-row 2-D array holding the averaged, scaled playlist features.
    numberrecs : int, default 30
        Number of neighbours to request. The default matches the count that
        was previously hard-coded in the call, so calls without this
        argument return the same number of indices as before.

    Returns
    -------
    numpy.ndarray
        1-D array with the dataset row indices of the neighbours.
    """
    _, indices = knn.kneighbors(playlist_average_features, n_neighbors=numberrecs)
    # Only one query row was supplied, so unwrap its row of results.
    return indices[0]

# Dataset row indices of the tracks nearest to the playlist's average features.
recommended_indices_2 = recommend_global(playlist_average_features)

def get_recommendation_details_2(indices, dataset_df):
    """Return the dataset rows at the given positions as dictionaries.

    Parameters
    ----------
    indices : iterable of int
        Flat sequence of row positions.
    dataset_df : pandas.DataFrame
        Table to read the rows from, addressed by position.

    Returns
    -------
    list of dict
        One ``{column: value}`` dictionary per position, in input order.
    """
    return [dataset_df.iloc[position].to_dict() for position in indices]

recommended_songs_2 = get_recommendation_details_2(recommended_indices_2, dataset_for_final_display_df)

# IDs of the tracks that are already part of the user's playlist.
playlist_track_ids = {track['id'] for track in track_features}

# Maps (track name, artist name) -> track ID, keeping the first occurrence only.
output_dict_2 = {}
for song in recommended_songs_2:
    song_key = (song['track_name'], song['artist_name'])

    # Keep a song only if it is new to the output and not in the playlist.
    if song_key not in output_dict_2 and song['track_id'] not in playlist_track_ids:
        output_dict_2[song_key] = song['track_id']

# display_recommendations(output_dict_2, number_of_recommendations)

### Viewing final recommendations

Set `number_of_recommendations` below, or change the count passed to `display_recommendations`, to control how many recommendations are shown.

In [11]:
# Intended number of recommendations to show.
# NOTE(review): in the cells visible here this variable is only referenced in
# commented-out calls; the active display_recommendations(...) calls pass a
# literal 20 instead. Confirm which value is meant to be used.
number_of_recommendations = 10



def display_recommendations(output_dict, number_of_recommendations):
    """Print a framed summary line followed by the first recommendations.

    Parameters
    ----------
    output_dict : dict
        Recommendations keyed by ``(track name, artist name)``; only the keys
        are printed, in the dictionary's iteration order.
    number_of_recommendations : int
        Maximum number of entries to print. Nothing is listed when it is
        zero or negative.
    """
    banner = "| " +str(len(output_dict)) + " recommendations have been made |"
    banner_width = len(banner)

    # Frame drawn around the summary line.
    print(f" {'_'*(banner_width - 1)}")
    print(banner)
    print(f"|{'_'*(banner_width - 2)}|")

    for printed, key in enumerate(output_dict):
        # Stop as soon as the requested number of entries has been printed.
        if printed >= number_of_recommendations:
            break
        print(f"Track name : {key[0]}, \nArtist name: {key[1]}\n")

# display_recommendations(output_dict, number_of_recommendations)

### Display results using the individual track-based model (first cell below)

### Display results using the global playlist-based model (second cell below)

In [12]:
# Track-based model: show at most 20 of the collected recommendations.
display_recommendations(output_dict, 20)


 _____________________________________
| 463 recommendations have been made |
|____________________________________|
Track name : 2 Arabesques (Arr. For Flute And Harp): Arabesque No. 1 (Arr. For Flute And Harp), 
Artist name: Nora Shulman

Track name : La casa nel bosco, 
Artist name: Ludovico Einaudi

Track name : I Enjoy Life To The Fullest, 
Artist name: Beautiful World

Track name : Pure Fountain, 
Artist name: Stanton Lanier

Track name : 2 Nocturnes, Op. 55: I. Andante, 
Artist name: Frédéric Chopin

Track name : Dou way Robyn - Sancta Mater, 
Artist name: Traditional

Track name : Traditional: Lord, I Want to Be a Christian (Arr. Moses Hogan), 
Artist name: Traditional

Track name : Werther / Act 3: Va! Laisse couler mes larmes, 
Artist name: Jules Massenet

Track name : Violin Concerto In E Minor, Op. 64, MWV O14: II. Andante, 
Artist name: Felix Mendelssohn

Track name : Miniature on Russian theme (Variations to V.Kalinnikov music), 
Artist name: Andrei Krylov

Track name : G

In [13]:
# Playlist-based model: show at most 20 of the collected recommendations.
display_recommendations(output_dict_2, 20)

 ____________________________________
| 30 recommendations have been made |
|___________________________________|
Track name : Images, Livre II, L. 111, Book I, L. 110: III. Mouvement (avec une légèreté fantasque et précise), 
Artist name: Claude Debussy

Track name : Baby Mine, 
Artist name: Michael Allen Harrison

Track name : I See the Light, 
Artist name: Emile Pandolfi

Track name : I Have Dreamed, 
Artist name: Christian McBride Trio

Track name : 19 Hungarian Rhapsodies, S244/R106: 19 Hungarian Rhapsodies, S244/R106: No. 2 in C sharp minor, 
Artist name: Benno Moiseiwitsch

Track name : Beautiful Garden in Himalayan Mountains, Classical and Indian Guitar, Atmospheric Percussion Music for Relaxation, Yoga, Massage, Sleep Therapy, Pain and Stress Relieve, Reiki, Energy Work, Balancing, 
Artist name: Andrei Krylov

Track name : Camelot, 
Artist name: Louis Landon

Track name : My Mistake, 
Artist name: Burt Mitchell

Track name : Solitary Peace (For Will Ackerman), 
Artist name: Pe

### Verifying one example for the first (track-based) method

In [14]:
# Reference track: re-fetch the playlist and take its first entry.
results = sp.playlist_tracks(PLAYLIST_URL)
result_items = results['items']
item = result_items[0]

# audio_features takes a list of IDs and returns a list, hence the wrapping.
audio_features = sp.audio_features([item['track']['id']])
print(audio_features)

# Compared track: the first entry of the track-based recommendation details.
# NOTE(review): presumably this is the match for the playlist's first track,
# which holds only if that track produced features - confirm.
compared_song = recommended_songs[0]
id_compared_song = compared_song['track_id']

# Fetch the compared track's features live so both printouts can be compared.
audio_features_compared_song = sp.audio_features([id_compared_song])
print(audio_features_compared_song)


[{'danceability': 0.277, 'energy': 0.0314, 'key': 5, 'loudness': -25.936, 'mode': 0, 'speechiness': 0.0522, 'acousticness': 0.991, 'instrumentalness': 0.942, 'liveness': 0.0859, 'valence': 0.161, 'tempo': 80.153, 'type': 'audio_features', 'id': '2ZSga0mI1m1wNpxj5ZXVs6', 'uri': 'spotify:track:2ZSga0mI1m1wNpxj5ZXVs6', 'track_href': 'https://api.spotify.com/v1/tracks/2ZSga0mI1m1wNpxj5ZXVs6', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/2ZSga0mI1m1wNpxj5ZXVs6', 'duration_ms': 341040, 'time_signature': 4}]
[{'danceability': 0.284, 'energy': 0.05, 'key': 5, 'loudness': -26.582, 'mode': 0, 'speechiness': 0.0477, 'acousticness': 0.965, 'instrumentalness': 0.92, 'liveness': 0.0741, 'valence': 0.13, 'tempo': 85.541, 'type': 'audio_features', 'id': '4NaZnaRsW9Lj7Ox6vIVGZN', 'uri': 'spotify:track:4NaZnaRsW9Lj7Ox6vIVGZN', 'track_href': 'https://api.spotify.com/v1/tracks/4NaZnaRsW9Lj7Ox6vIVGZN', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4NaZnaRsW9Lj7Ox6vIVGZN', 'durati

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=aa4b4bb5-12fe-4f5c-aaf1-499a9c7d24ac' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>