In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import pairwise_distances 
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.neighbors import NearestNeighbors 

In [2]:
df = pd.read_csv('../dataset/SpotifyFeatures.csv') 
df.head() 

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4-Apr,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4-Apr,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,4-May,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4-Apr,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4-Apr,0.39


In [3]:
df.shape

(232725, 18)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232725 entries, 0 to 232724
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   genre             232725 non-null  object 
 1   artist_name       232725 non-null  object 
 2   track_name        232724 non-null  object 
 3   track_id          232725 non-null  object 
 4   popularity        232725 non-null  int64  
 5   acousticness      232725 non-null  float64
 6   danceability      232725 non-null  float64
 7   duration_ms       232725 non-null  int64  
 8   energy            232725 non-null  float64
 9   instrumentalness  232725 non-null  float64
 10  key               232725 non-null  object 
 11  liveness          232725 non-null  float64
 12  loudness          232725 non-null  float64
 13  mode              232725 non-null  object 
 14  speechiness       232725 non-null  float64
 15  tempo             232725 non-null  float64
 16  time_signature    23

In [5]:
df.isnull().sum() 

genre               0
artist_name         0
track_name          1
track_id            0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
df.nunique()

genre                   27
artist_name          14564
track_name          148607
track_id            176774
popularity             101
acousticness          4734
danceability          1295
duration_ms          70749
energy                2517
instrumentalness      5400
key                     12
liveness              1732
loudness             27923
mode                     2
speechiness           1641
tempo                78512
time_signature           5
valence               1692
dtype: int64

In [8]:
df.describe()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
count,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0
mean,41.127502,0.36856,0.554364,235122.3,0.570958,0.148301,0.215009,-9.569885,0.120765,117.666585,0.454917
std,18.189948,0.354768,0.185608,118935.9,0.263456,0.302768,0.198273,5.998204,0.185518,30.898907,0.260065
min,0.0,0.0,0.0569,15387.0,2e-05,0.0,0.00967,-52.457,0.0222,30.379,0.0
25%,29.0,0.0376,0.435,182857.0,0.385,0.0,0.0974,-11.771,0.0367,92.959,0.237
50%,43.0,0.232,0.571,220427.0,0.605,4.4e-05,0.128,-7.762,0.0501,115.778,0.444
75%,55.0,0.722,0.692,265768.0,0.787,0.0358,0.264,-5.501,0.105,139.054,0.66
max,100.0,0.996,0.989,5552917.0,0.999,0.999,1.0,3.744,0.967,242.903,1.0


In [9]:
df[ df['track_name'].isnull() == True]

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
208285,World,Elevation Worship,,7BGQCe62A58Q5ZgpQFX93t,44,0.019,0.287,350027,0.446,0.0,F,0.1,-7.214,Major,0.0317,138.727,4-Apr,0.0387


In [10]:
df = df.drop(208285)

In [11]:
df.isnull().sum() 

genre               0
artist_name         0
track_name          0
track_id            0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
dtype: int64

In [12]:
# features = df.drop(['genre', 'artist_name', 'track_name', 'track_id'], axis=1).columns 
# plt.subplots(figsize=(15, 40)) 

# for i, col in enumerate(features):
#     plt.subplot(7, 2, i+1) 
#     sns.histplot(x=col, data=df) 

# plt.show()

In [13]:
# features = df.drop(['genre', 'artist_name', 'track_name', 'track_id'], axis=1).columns 
# plt.subplots(figsize=(15, 40)) 

# for i, col in enumerate(features):
#     plt.subplot(7, 2, i+1) 
#     sns.barplot(x=col, data=df) 

# plt.show()

In [14]:
print(df['genre'].value_counts()) 
print("\n=====================================\n")
print(df['artist_name'].value_counts()) 

genre
Comedy              9681
Soundtrack          9646
Indie               9543
Jazz                9441
Pop                 9386
Electronic          9377
Children’s Music    9353
Folk                9299
Hip-Hop             9295
Rock                9272
Alternative         9263
Classical           9256
Rap                 9232
World               9095
Soul                9089
Blues               9023
R&B                 8992
Anime               8936
Reggaeton           8927
Ska                 8874
Reggae              8771
Dance               8701
Country             8664
Opera               8280
Movie               7806
Children's Music    5403
A Capella            119
Name: count, dtype: int64


artist_name
Giuseppe Verdi              1394
Giacomo Puccini             1137
Kimbo Children's Music       971
Nobuo Uematsu                825
Richard Wagner               804
                            ... 
Zubin Mehta                    1
Shawn Lane                     1
Claudio Arrau  

In [15]:
df_train = df.drop(['genre', 'track_name', 'track_id', 'artist_name', 'key', 'time_signature'], axis=1).copy() 

In [16]:
numerical_features = df_train.drop('mode', axis=1).columns 
cate_features = ['mode']

numerical_transformer = Pipeline([
    ('scaler', StandardScaler())
])

cate_transformer = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features), 
        ('cat', cate_transformer, cate_features)
    ]
)

pipeline = Pipeline([
    ('preprocess', preprocessor), 
    ('knn', NearestNeighbors(metric='euclidean', n_neighbors=10)) 
])

pipeline.fit(df_train) 

In [28]:
idx = 125 

def recommend_by_index(idx, top_k=10):
    query = df_train.iloc[[idx]] 

    query_transformed = pipeline.named_steps['preprocess'].transform(query) 
    
    distances, indices = pipeline.named_steps['knn'].kneighbors(
        query_transformed, 
        n_neighbors=top_k+1
    )

    similar_indices = indices[0][1:] 
    similar_distances = distances[0][1:] 

    similar_songs = df.iloc[similar_indices][['track_name', 'artist_name', 'genre']].copy() 

    similar_songs['similarity_score'] = 1 - similar_distances 
    
    return similar_songs.to_dict(orient="records") 

print(recommend_by_index(idx)) 

[{'track_name': "Avec l'ami Bidasse", 'artist_name': 'Dorothée', 'genre': 'Movie', 'similarity_score': 0.465225103774352}, {'track_name': 'Cut Me a Piece of Ham', 'artist_name': 'Nursery Rhymes ABC', 'genre': "Children's Music", 'similarity_score': 0.33544364195886156}, {'track_name': 'Press Along to the Big Corral', 'artist_name': 'Riders In The Sky', 'genre': 'Movie', 'similarity_score': 0.3263348249983835}, {'track_name': 'Chloe', 'artist_name': 'Nursery Rhymes ABC', 'genre': "Children's Music", 'similarity_score': 0.2922702231960672}, {'track_name': 'The Music Man', 'artist_name': 'Sandra Beech', 'genre': "Children's Music", 'similarity_score': 0.2418810457173769}, {'track_name': 'A Tisket a Tasket', 'artist_name': 'HooplaKidz', 'genre': "Children's Music", 'similarity_score': 0.21212298837485133}, {'track_name': 'Toot the Flute', 'artist_name': "Kimbo Children's Music", 'genre': "Children's Music", 'similarity_score': 0.20179158088586968}, {'track_name': 'Mustapha', 'artist_name':