In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from joblib import dump

In [None]:
"""
Primary:
- id (Id of track generated by Spotify)
Numerical:
- acousticness (Ranges from 0 to 1)
- danceability (Ranges from 0 to 1)
- energy (Ranges from 0 to 1)
- duration_ms (Integer typically ranging from 200k to 300k)
- instrumentalness (Ranges from 0 to 1)
- valence (Ranges from 0 to 1)
- popularity (Ranges from 0 to 100)
- tempo (Float typically ranging from 50 to 150)
- liveness (Ranges from 0 to 1)
- loudness (Float typically ranging from -60 to 0)
- speechiness (Ranges from 0 to 1)
- year (Ranges from 1921 to 2020)
Dummy:
- mode (0 = Minor, 1 = Major)
- explicit (0 = No explicit content, 1 = Explicit content)
Categorical:
- key (All keys on octave encoded as values ranging from 0 to 11, starting on C as 0, C# as 1 and so on…)
- artists (List of artists mentioned)
- release_date (Date of release mostly in yyyy-mm-dd format, however precision of date may vary)
- name (Name of the song)"""

In [2]:
# reading csv
df = pd.read_csv('data/data.csv')
# deciding on columns(subset removed 'id' 'release date') reordered columns
df = df[['artists',  'name', 'id', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'explicit', 'instrumentalness', 'key', 'liveness', 'loudness',
       'mode', 'popularity', 'speechiness', 'tempo', 'valence']]

In [3]:
# tiny amount of data cleaning
df['artists'] = df['artists'].str.strip('[]')
df['artists'] = df['artists'].str.strip('""')
df['artists'] = df['artists'].str.strip("''")

In [12]:
df.head(5)

Unnamed: 0,artists,name,id,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
0,Mamie Smith,Keep A Song In Your Soul,0cS0A1fUEUd1EW3FcF8AEI,1920,0.991,0.598,168333,0.224,0,0.000522,5,0.379,-12.628,0,12,0.0936,149.976,0.634
1,Screamin' Jay Hawkins,I Put A Spell On You,0hbkKFIJm7Z05H8Zl9w30f,1920,0.643,0.852,150200,0.517,0,0.0264,5,0.0809,-7.261,0,7,0.0534,86.889,0.95
2,Mamie Smith,Golfing Papa,11m7laMUgmOKqI3oYzuhne,1920,0.993,0.647,163827,0.186,0,1.8e-05,0,0.519,-12.098,1,4,0.174,97.6,0.689
3,Oscar Velazquez,True House Music - Xavier Santos & Carlos Gomi...,19Lc5SfJJ5O1oaxY0fpwfh,1920,0.000173,0.73,422087,0.798,0,0.801,2,0.128,-7.311,1,17,0.0425,127.997,0.0422
4,Mixe,Xuniverxe,2hJjbsLCytGsnAHfdsLejp,1920,0.295,0.704,165224,0.707,1,0.000246,10,0.402,-6.036,0,2,0.0768,122.076,0.299


In [23]:
track = df['id'][19272]
url = 'http://open.spotify.com/track/' + track
url

'http://open.spotify.com/track/6Nle9hKrkL1wQpwNfEkxjh'

In [24]:
# finding a song I want to use as an input
frank = df[df['artists'] == 'Drake'].sort_values('popularity', ascending=False)

In [33]:
frank.head(15)

Unnamed: 0,artists,name,id,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
19542,Drake,God's Plan,6DCZcSspjsKoFjzjrWoCdn,2018,0.0332,0.754,198973,0.449,1,8.3e-05,7,0.552,-9.211,1,83,0.109,77.169,0.357
58286,Drake,Toosie Slide,127QTOFJsJQp5LbJbu3A1y,2020,0.321,0.834,247059,0.454,1,6e-06,1,0.114,-9.75,0,83,0.201,81.618,0.837
19360,Drake,Passionfruit,5mCPDVBb16L4XQwDdbRUpz,2017,0.256,0.809,298941,0.463,1,0.085,11,0.109,-11.377,1,79,0.0396,111.98,0.364
93860,Drake,Toosie Slide,466cKvZn1j45IpxDdYZqdA,2020,0.289,0.83,247059,0.49,1,3e-06,1,0.113,-8.82,0,78,0.209,81.604,0.845
38914,Drake,In My Feelings,2G7V7zsVDxg1yRsu7Ew9RJ,2018,0.0589,0.835,217925,0.626,1,6e-05,1,0.396,-5.833,1,78,0.125,91.03,0.35
19600,Drake,Nonstop,0TlLq3lA83rQOYtrqBqSct,2018,0.0165,0.912,238614,0.412,1,0.0126,7,0.104,-8.074,1,78,0.123,154.983,0.423
19626,Drake,Nice For What,3CA9pLiwRIGtUBiMjbZmRw,2018,0.0891,0.585,210747,0.909,1,9.7e-05,8,0.119,-6.474,1,77,0.0707,93.372,0.758
19256,Drake,Hotline Bling,0wwPcA6wtMf6HUMpIRdeP7,2016,0.00258,0.891,267067,0.628,0,0.00019,2,0.0504,-7.863,1,76,0.0551,134.966,0.552
19460,Drake,Teenage Fever,6n3HGiq4v35D6eFOSwqYuo,2017,0.111,0.766,219692,0.283,1,0.0172,8,0.115,-10.33,0,75,0.0846,97.04,0.144
18086,Drake,Marvins Room,047fCsbO4NdmwCBn8pcUXl,2011,0.646,0.492,347227,0.26,1,0.00178,9,0.0705,-17.341,0,74,0.0921,111.519,0.312


In [14]:
# target set will be both artist and name
y_set = ['artists', 'name', 'id']

# droping target from data matrix
df_data = df.drop(y_set, axis=1)

# set target
df_target = df[y_set]

In [15]:
# fit on data, 12 neighbors
nn = NearestNeighbors(algorithm='brute', leaf_size =15, n_neighbors=12, n_jobs=-1)
nn.fit(df_data)

NearestNeighbors(algorithm='brute', leaf_size=15, metric='minkowski',
                 metric_params=None, n_jobs=-1, n_neighbors=12, p=2,
                 radius=1.0)

In [26]:
# sample a song(index) from df_data to use as our query point 
input_index = 19542 # Frank Ocean - Chanel

# vectorize 
data_vect = [df_data.iloc[input_index].values]
data_vect

[array([ 2.01800e+03,  3.32000e-02,  7.54000e-01,  1.98973e+05,
         4.49000e-01,  1.00000e+00,  8.29000e-05,  7.00000e+00,
         5.52000e-01, -9.21100e+00,  1.00000e+00,  8.30000e+01,
         1.09000e-01,  7.71690e+01,  3.57000e-01])]

In [27]:
# Query Using kneighbors 
neigh_dist, neigh_indices = nn.kneighbors(data_vect)

In [28]:
# top 12 closest data vectors to our reference vector, data_vect
neigh_dist

array([[ 0.        , 17.26820938, 18.96429301, 19.21966243, 31.04437415,
        33.90345318, 40.91531754, 45.93986968, 46.00733566, 47.14744038,
        48.18649532, 49.0469261 ]])

In [29]:
# these are the corresponding indicies of the most similar vectors 
neigh_indices

array([[ 19542, 158695, 110000,  93216,  55053, 140403, 141633, 157711,
        108408, 107330,  91636, 139995]], dtype=int64)

In [30]:
# breaking down the array to a list
indexs = neigh_indices.flat[0:12].tolist()
indexs

[19542,
 158695,
 110000,
 93216,
 55053,
 140403,
 141633,
 157711,
 108408,
 107330,
 91636,
 139995]

In [31]:
# result metrics
df_data.iloc[indexs]

Unnamed: 0,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
19542,2018,0.0332,0.754,198973,0.449,1,8.3e-05,7,0.552,-9.211,1,83,0.109,77.169,0.357
158695,2020,0.208,0.722,198966,0.658,1,0.0,0,0.249,-4.94,1,74,0.119,87.004,0.592
110000,2020,0.482,0.806,198973,0.558,1,0.0,9,0.0618,-6.46,1,78,0.0683,95.016,0.678
93216,2017,0.644,0.498,198973,0.408,0,0.0,6,0.0993,-7.946,1,64,0.0331,78.942,0.383
55053,2004,0.00165,0.548,198973,0.889,0,0.00109,9,0.197,-4.682,1,59,0.0382,90.048,0.425
140403,2007,0.00533,0.609,198947,0.743,0,0.0,1,0.32,-5.603,1,67,0.032,83.969,0.493
141633,2013,0.326,0.803,198963,0.577,1,0.000481,2,0.302,-7.823,1,53,0.0433,102.107,0.605
157711,2016,0.121,0.626,198961,0.659,0,0.0,0,0.17,-5.669,1,59,0.0337,113.551,0.594
108408,2010,0.0269,0.762,198960,0.739,0,0.0,0,0.498,-4.178,0,56,0.0328,110.002,0.852
107330,2005,0.333,0.769,199000,0.621,0,0.00233,9,0.157,-7.874,0,55,0.107,100.25,0.649


In [32]:
# result target
df_target.iloc[indexs]

Unnamed: 0,artists,name,id
19542,Drake,God's Plan,6DCZcSspjsKoFjzjrWoCdn
158695,Dixie D’Amelio,Be Happy,1KVQBPyORdhfITlixROtvC
110000,"Lauv', 'Anne-Marie","fuck, i'm lonely (with Anne-Marie)",09PGubKAMryhOWv1LHpCYz
93216,Lee Brice,Boy,7A8OfzqXBHYGk61FZTHoeo
55053,Kelly Clarkson,Behind These Hazel Eyes,0AKAxdNkwq9ZxRdW1DN9zW
140403,Nigga,Te Quiero,59HWEr0or9XkgRaWvnxc6g
141633,Priceless Da Roc,Yiken (Certified),67ncKGqScuJdNUN6bTqclA
157711,TV Girl,Cigarettes out the Window,5GKekzF1YcR2DQd9c3DI8y
108408,Jason Derulo,In My Head,0TyOpxlWwDx98bjkIVHUgY
107330,Gorillaz,Fire Coming out of the Monkey's Head,1S9tfxdFr4TqoqA14gnKj3
