In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [2]:
#? Load the Spotify track features from the CSV file under data/
df = pd.read_csv("data/SpotifyFeatures.csv")
df  #? Data shape (232725 rows × 18 columns)

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.61100,0.389,99373,0.910,0.000000,C#,0.3460,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.24600,0.590,137373,0.737,0.000000,F#,0.1510,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.95200,0.663,170267,0.131,0.000000,C,0.1030,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.70300,0.240,152427,0.326,0.000000,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95000,0.331,82625,0.225,0.123000,F,0.2020,-21.150,Major,0.0456,140.576,4/4,0.390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232720,Soul,Slave,Son Of Slide,2XGLdVl7lGeq8ksM6Al7jT,39,0.00384,0.687,326240,0.714,0.544000,D,0.0845,-10.626,Major,0.0316,115.542,4/4,0.962
232721,Soul,Jr Thomas & The Volcanos,Burning Fire,1qWZdkBl4UVPj9lK6HuuFM,38,0.03290,0.785,282447,0.683,0.000880,E,0.2370,-6.944,Minor,0.0337,113.830,4/4,0.969
232722,Soul,Muddy Waters,(I'm Your) Hoochie Coochie Man,2ziWXUmQLrXTiYjCg2fZ2t,47,0.90100,0.517,166960,0.419,0.000000,D,0.0945,-8.282,Major,0.1480,84.135,4/4,0.813
232723,Soul,R.LUM.R,With My Words,6EFsue2YbIG4Qkq8Zr9Rir,44,0.26200,0.745,222442,0.704,0.000000,A,0.3330,-7.137,Major,0.1460,100.031,4/4,0.489


In [3]:
#? List the column labels so they can be matched against the description table below
df.columns

Index(['genre', 'artist_name', 'track_name', 'track_id', 'popularity',
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')

## Spotify Tracks Dataset - Column Descriptions

| Column | Description |
|--------|-------------|
| `genre` | The genre of the track (e.g., Pop, Rock, Hip-Hop). This can help group songs by musical style. |
| `artist_name` | Name of the artist or band who performed the track. Useful for artist-based filtering or grouping. |
| `track_name` | Title of the song. Used to identify individual tracks. |
| `track_id` | Unique Spotify ID for the track. Often used when working with Spotify API. |
| `popularity` | A score from 0 to 100 indicating how popular the track is on Spotify. Based on play counts and recent activity. |
| `acousticness` | A float value from 0.0 to 1.0 representing the confidence that the track is acoustic. Higher means more acoustic. |
| `danceability` | Describes how suitable a track is for dancing (0.0–1.0). Higher values mean it's easier to dance to. |
| `duration_ms` | Duration of the track in milliseconds. Can be used to filter short vs long tracks. |
| `energy` | A value from 0.0 to 1.0 indicating intensity and activity. High energy tracks feel fast and loud. |
| `instrumentalness` | Predicts whether a track contains no vocals (0.0–1.0). Higher values mean more instrumental. |
| `key` | The musical key the track is in, stored as a pitch-class name string (e.g., `C`, `C#`, `F#`, `A#`). |
| `liveness` | Detects the presence of an audience in the track. Higher values suggest a live recording. |
| `loudness` | The overall loudness of a track in decibels (dB). Typically ranges from -60 to 0 dB. |
| `mode` | Indicates the modality of the track, stored as the string `Major` or `Minor`. Major mode often sounds more cheerful. |
| `speechiness` | Detects spoken words in a track (0.0–1.0). Higher values indicate more speech-like content (e.g., podcasts, rap). |
| `tempo` | The tempo (beats per minute, BPM) of the track. Higher tempo means faster-paced music. |
| `time_signature` | An estimated time signature, stored as a string (e.g., `4/4`, `5/4`). |
| `valence` | A measure of musical positiveness (0.0–1.0). Higher valence means the track sounds more happy/cheerful. |


In [4]:
#? Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
count,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0
mean,41.127502,0.36856,0.554364,235122.3,0.570958,0.148301,0.215009,-9.569885,0.120765,117.666585,0.454917
std,18.189948,0.354768,0.185608,118935.9,0.263456,0.302768,0.198273,5.998204,0.185518,30.898907,0.260065
min,0.0,0.0,0.0569,15387.0,2e-05,0.0,0.00967,-52.457,0.0222,30.379,0.0
25%,29.0,0.0376,0.435,182857.0,0.385,0.0,0.0974,-11.771,0.0367,92.959,0.237
50%,43.0,0.232,0.571,220427.0,0.605,4.4e-05,0.128,-7.762,0.0501,115.778,0.444
75%,55.0,0.722,0.692,265768.0,0.787,0.0358,0.264,-5.501,0.105,139.054,0.66
max,100.0,0.996,0.989,5552917.0,0.999,0.999,1.0,3.744,0.967,242.903,1.0


In [5]:
#? Per-column dtype and non-null count; used to spot missing values and non-numeric columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232725 entries, 0 to 232724
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   genre             232725 non-null  object 
 1   artist_name       232725 non-null  object 
 2   track_name        232724 non-null  object 
 3   track_id          232725 non-null  object 
 4   popularity        232725 non-null  int64  
 5   acousticness      232725 non-null  float64
 6   danceability      232725 non-null  float64
 7   duration_ms       232725 non-null  int64  
 8   energy            232725 non-null  float64
 9   instrumentalness  232725 non-null  float64
 10  key               232725 non-null  object 
 11  liveness          232725 non-null  float64
 12  loudness          232725 non-null  float64
 13  mode              232725 non-null  object 
 14  speechiness       232725 non-null  float64
 15  tempo             232725 non-null  float64
 16  time_signature    23

In [6]:
#? Number of missing values in each column
df.isna().sum()

genre               0
artist_name         0
track_name          1
track_id            0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
dtype: int64

In [7]:
#? Only one row has a null track_name, so that row can simply be dropped
df = df.dropna(subset=['track_name'])
df

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.61100,0.389,99373,0.910,0.000000,C#,0.3460,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.24600,0.590,137373,0.737,0.000000,F#,0.1510,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.95200,0.663,170267,0.131,0.000000,C,0.1030,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.70300,0.240,152427,0.326,0.000000,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95000,0.331,82625,0.225,0.123000,F,0.2020,-21.150,Major,0.0456,140.576,4/4,0.390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232720,Soul,Slave,Son Of Slide,2XGLdVl7lGeq8ksM6Al7jT,39,0.00384,0.687,326240,0.714,0.544000,D,0.0845,-10.626,Major,0.0316,115.542,4/4,0.962
232721,Soul,Jr Thomas & The Volcanos,Burning Fire,1qWZdkBl4UVPj9lK6HuuFM,38,0.03290,0.785,282447,0.683,0.000880,E,0.2370,-6.944,Minor,0.0337,113.830,4/4,0.969
232722,Soul,Muddy Waters,(I'm Your) Hoochie Coochie Man,2ziWXUmQLrXTiYjCg2fZ2t,47,0.90100,0.517,166960,0.419,0.000000,D,0.0945,-8.282,Major,0.1480,84.135,4/4,0.813
232723,Soul,R.LUM.R,With My Words,6EFsue2YbIG4Qkq8Zr9Rir,44,0.26200,0.745,222442,0.704,0.000000,A,0.3330,-7.137,Major,0.1460,100.031,4/4,0.489


In [8]:
# Compare the row count with the number of distinct track IDs, then keep only
# the first row seen for each track_id.
# The f-strings use double quotes on the outside: reusing the enclosing quote
# character inside a replacement field is a SyntaxError before Python 3.12.
print(f"All Track_IDs:  {len(df['track_id'])}")
print(f"Only Unique Track_IDs:  {df['track_id'].nunique()}")
data = df.drop_duplicates(subset=['track_id'], keep='first')

All Track_IDs:  232724
Only Unique Track_IDs:  176773


In [9]:
#? De-duplicated frame: one row per track_id (original row labels are kept, so the index has gaps)
data

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.6110,0.389,99373,0.910,0.000000,C#,0.3460,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.2460,0.590,137373,0.737,0.000000,F#,0.1510,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.9520,0.663,170267,0.131,0.000000,C,0.1030,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.7030,0.240,152427,0.326,0.000000,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.9500,0.331,82625,0.225,0.123000,F,0.2020,-21.150,Major,0.0456,140.576,4/4,0.390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232716,Soul,John Legend,Quickly (feat. Brandy),1U0OMWvR89Cm20vCNar50f,39,0.2310,0.736,222667,0.701,0.000000,A#,0.2030,-4.345,Minor,0.1000,99.991,4/4,0.770
232717,Soul,Belly,P.O.P.,2gGqKJWfWbToha2YmDxnnj,43,0.1040,0.802,201173,0.516,0.000485,D,0.1050,-9.014,Major,0.2130,175.666,4/4,0.482
232719,Soul,"Bobby ""Blue"" Bland",I'll Take Care Of You - Single Version,2iZf3EUedz9MPqbAvXdpdA,32,0.5660,0.423,144667,0.337,0.000000,A#,0.2760,-13.092,Minor,0.0436,80.023,4/4,0.497
232721,Soul,Jr Thomas & The Volcanos,Burning Fire,1qWZdkBl4UVPj9lK6HuuFM,38,0.0329,0.785,282447,0.683,0.000880,E,0.2370,-6.944,Minor,0.0337,113.830,4/4,0.969


In [10]:
# Audio features used to measure similarity between tracks.
features = [
    'danceability',
    'energy',
    'loudness',
    'tempo',
    'valence',
    'acousticness',
]

# Work on a copy so `data` keeps the raw values; only the feature columns are
# replaced with their standardized (zero-mean, unit-variance) versions.
scaled_data = data.copy()
scaled_data[features] = StandardScaler().fit_transform(data[features])
scaled_data

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.564734,-0.798742,99373,1.279640,0.000000,C#,0.3460,1.299282,Major,0.0525,1.588679,4/4,1.353166
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,-0.431714,0.257009,137373,0.652463,0.000000,F#,0.1510,0.715908,Minor,0.0868,1.813228,4/4,1.360634
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,1.495663,0.640441,170267,-1.544470,0.000000,C,0.1030,-0.584997,Minor,0.0362,-0.565540,5/4,-0.312143
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.815894,-1.581363,152427,-0.837536,0.000000,C#,0.0985,-0.319031,Major,0.0395,1.741560,4/4,-0.838620
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,1.490203,-1.103386,82625,-1.203691,0.123000,F,0.2020,-1.721882,Major,0.0456,0.746126,4/4,-0.229998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232716,Soul,John Legend,Quickly (feat. Brandy),1U0OMWvR89Cm20vCNar50f,39,-0.472664,1.023872,222667,0.521952,0.000000,A#,0.2030,0.905727,Minor,0.1000,-0.549482,4/4,1.188875
232717,Soul,Belly,P.O.P.,2gGqKJWfWbToha2YmDxnnj,43,-0.819374,1.370537,201173,-0.148729,0.000485,D,0.1050,0.175688,Major,0.2130,1.866316,4/4,0.113519
232719,Soul,"Bobby ""Blue"" Bland",I'll Take Care Of You - Single Version,2iZf3EUedz9MPqbAvXdpdA,32,0.441884,-0.620157,144667,-0.797658,0.000000,A#,0.2760,-0.461943,Minor,0.0436,-1.186927,4/4,0.169527
232721,Soul,Jr Thomas & The Volcanos,Burning Fire,1qWZdkBl4UVPj9lK6HuuFM,38,-1.013477,1.281244,282447,0.456697,0.000880,E,0.2370,0.499351,Minor,0.0337,-0.107695,4/4,1.931917


In [23]:
# Fit NearestNeighbors model
# Brute-force search with cosine distance over the standardized feature columns.
# Fitting on the DataFrame slice (not a bare array) lets the model record the
# feature names, and its row positions line up with `data` / `scaled_data`.
nn_model = NearestNeighbors(metric='cosine', algorithm='brute')
nn_model.fit(scaled_data[features])

# Recommend similar songs
def recommend(song_name, n=5):
    """Return the titles of up to ``n`` tracks most similar to ``song_name``.

    The query track is found by a case-insensitive exact match on
    ``track_name``. If several tracks share that title, the first one in
    ``data`` is used. Neighbours are looked up with ``nn_model`` on the
    standardized ``features`` columns of ``scaled_data``.

    Parameters
    ----------
    song_name : str
        Title of the track to use as the query.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of str or str
        Track names of the nearest neighbours, nearest first, with the query
        track itself excluded. If no track matches ``song_name``, the string
        ``"Song not found in dataset."`` is returned instead.
    """
    matches = data.index[data['track_name'].str.lower() == song_name.lower()]
    if matches.empty:
        return "Song not found in dataset."
    idx = matches[0]

    # `idx` is a row *label*; kneighbors reports row *positions* in the fitted
    # matrix, so translate the label once to recognise the query in the result.
    query_pos = data.index.get_loc(idx)

    # Query with a one-row DataFrame so the column names match what the model
    # was fitted with (a bare ndarray triggers sklearn's feature-name warning).
    song_vector = scaled_data.loc[[idx], features]

    # Ask for one extra neighbour to leave room for the query itself, but never
    # more rows than the model was fitted on.
    n_neighbors = min(n + 1, nn_model.n_samples_fit_)
    distances, indices = nn_model.kneighbors(song_vector, n_neighbors=n_neighbors)

    # Drop the query by position rather than assuming it is the first hit:
    # when another track is at the same distance, the query may not come first.
    neighbour_positions = [pos for pos in indices[0] if pos != query_pos][:n]

    recommended = data.iloc[neighbour_positions]['track_name'].tolist()
    return recommended


In [25]:
#? Example query: the 5 nearest tracks to "Shape of You" (the title match ignores case)
recommend("shape of you", n=5)



["Vote 'Em Out",
 'Goodbye, Mr. Personality',
 'Blue Faces',
 'Tanz mit mir',
 'Guayaquil']