In [28]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from joblib import dump

In [None]:
"""
Primary:
- id (Id of track generated by Spotify)
Numerical:
- acousticness (Ranges from 0 to 1)
- danceability (Ranges from 0 to 1)
- energy (Ranges from 0 to 1)
- duration_ms (Integer typically ranging from 200k to 300k)
- instrumentalness (Ranges from 0 to 1)
- valence (Ranges from 0 to 1)
- popularity (Ranges from 0 to 100)
- tempo (Float typically ranging from 50 to 150)
- liveness (Ranges from 0 to 1)
- loudness (Float typically ranging from -60 to 0)
- speechiness (Ranges from 0 to 1)
- year (Ranges from 1921 to 2020)
Dummy:
- mode (0 = Minor, 1 = Major)
- explicit (0 = No explicit content, 1 = Explicit content)
Categorical:
- key (All keys on octave encoded as values ranging from 0 to 11, starting on C as 0, C# as 1 and so on…)
- artists (List of artists mentioned)
- release_date (Date of release mostly in yyyy-mm-dd format, however precision of date may vary)
- name (Name of the song)"""

In [16]:
# reading csv
df = pd.read_csv('data/data.csv')
# deciding on columns(subset removed 'id' 'release date') reordered columns
df = df[['artists',  'name', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'explicit', 'instrumentalness', 'key', 'liveness', 'loudness',
       'mode', 'popularity', 'speechiness', 'tempo', 'valence']]

In [23]:
# tiny amount of data cleaning
df['artists'] = df['artists'].str.strip('[]')
df['artists'] = df['artists'].str.strip('""')
df['artists'] = df['artists'].str.strip("''")

In [24]:
df.head(20)

Unnamed: 0,artists,name,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
0,Mamie Smith,Keep A Song In Your Soul,1920,0.991,0.598,168333,0.224,0,0.000522,5,0.379,-12.628,0,12,0.0936,149.976,0.634
1,Screamin' Jay Hawkins,I Put A Spell On You,1920,0.643,0.852,150200,0.517,0,0.0264,5,0.0809,-7.261,0,7,0.0534,86.889,0.95
2,Mamie Smith,Golfing Papa,1920,0.993,0.647,163827,0.186,0,1.8e-05,0,0.519,-12.098,1,4,0.174,97.6,0.689
3,Oscar Velazquez,True House Music - Xavier Santos & Carlos Gomi...,1920,0.000173,0.73,422087,0.798,0,0.801,2,0.128,-7.311,1,17,0.0425,127.997,0.0422
4,Mixe,Xuniverxe,1920,0.295,0.704,165224,0.707,1,0.000246,10,0.402,-6.036,0,2,0.0768,122.076,0.299
5,Mamie Smith & Her Jazz Hounds,Crazy Blues - 78rpm Version,1920,0.996,0.424,198627,0.245,0,0.799,5,0.235,-11.47,1,9,0.0397,103.87,0.477
6,Mamie Smith,Don't You Advertise Your Man,1920,0.992,0.782,195200,0.0573,0,2e-06,5,0.176,-12.453,1,5,0.0592,85.652,0.487
7,Mamie Smith & Her Jazz Hounds,Arkansas Blues,1920,0.996,0.474,186173,0.239,0,0.186,9,0.195,-9.712,1,0,0.0289,78.784,0.366
8,Francisco Canaro,La Chacarera - Remasterizado,1920,0.996,0.469,146840,0.238,0,0.96,8,0.149,-18.717,1,0,0.0741,130.06,0.621
9,Meetya,Broken Puppet - Original Mix,1920,0.00682,0.571,476304,0.753,0,0.873,8,0.092,-6.943,1,0,0.0446,126.993,0.119


In [262]:
# finding a song I want to use as an input
frank = df[df['artists'] == 'Frank Ocean']

In [264]:
frank.head(20)

Unnamed: 0,artists,name,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
18130,Frank Ocean,Novacane,2011,0.0584,0.762,302347,0.508,1,0.00126,3,0.16,-9.112,1,70,0.0919,93.51,0.37
18278,Frank Ocean,Thinkin Bout You,2012,0.409,0.721,200747,0.339,0,0.00153,0,0.0973,-11.195,1,73,0.0532,129.83,0.2
18296,Frank Ocean,Lost,2012,0.0272,0.913,234093,0.603,1,0.000503,8,0.167,-4.892,1,71,0.226,123.061,0.497
19082,Frank Ocean,Nights,2016,0.42,0.466,307151,0.548,1,1e-06,5,0.113,-9.362,0,78,0.118,89.815,0.423
19084,Frank Ocean,Ivy,2016,0.782,0.567,249191,0.388,1,0.000309,9,0.248,-9.579,0,79,0.0384,116.362,0.452
19090,Frank Ocean,Pink + White,2016,0.67,0.544,184516,0.552,0,4.6e-05,9,0.415,-7.45,1,79,0.0991,159.738,0.554
19092,Frank Ocean,Godspeed,2016,0.931,0.421,177922,0.0952,0,0.000201,6,0.126,-12.561,1,79,0.0479,109.698,0.0773
19094,Frank Ocean,White Ferrari,2016,0.784,0.444,248808,0.0924,0,0.0,0,0.304,-15.605,1,77,0.0366,108.917,0.219
19148,Frank Ocean,Self Control,2016,0.765,0.572,249668,0.209,0,0.0,8,0.356,-10.413,1,74,0.0313,80.069,0.446
19272,Frank Ocean,Chanel,2017,0.874,0.776,210285,0.503,1,0.0,0,0.112,-5.732,0,81,0.237,110.134,0.473


In [149]:
# target set will be both artist and name
y_set = ['artists', 'name']

# droping target from data matrix
df_data = df.drop(y_set, axis=1)

# set target
df_target = df[y_set]

In [273]:
# fit on data, 12 neighbors
nn = NearestNeighbors(algorithm='brute', leaf_size =15, n_neighbors=12, n_jobs=-1)
nn.fit(df_data)

NearestNeighbors(algorithm='brute', leaf_size=15, metric='minkowski',
                 metric_params=None, n_jobs=-1, n_neighbors=12, p=2,
                 radius=1.0)

In [295]:
# sample a song(index) from df_data to use as our query point 
input_index = 19272 # Frank Ocean - Chanel

# vectorize 
data_vect = [df_data.iloc[input_index].values]
data_vect

[array([ 2.01700e+03,  8.74000e-01,  7.76000e-01,  2.10285e+05,
         5.03000e-01,  1.00000e+00,  0.00000e+00,  0.00000e+00,
         1.12000e-01, -5.73200e+00,  0.00000e+00,  8.10000e+01,
         2.37000e-01,  1.10134e+02,  4.73000e-01])]

In [296]:
# Query Using kneighbors 
neigh_dist, neigh_indices = nn.kneighbors(data_vect)

In [297]:
# top 12 closest data vectors to our reference vector, data_vect
neigh_dist

array([[2.76213586e-03, 2.34324959e+01, 2.63210405e+01, 3.07716034e+01,
        3.14111451e+01, 3.22390653e+01, 3.26108410e+01, 3.66270893e+01,
        3.74952521e+01, 3.89251496e+01, 3.96217930e+01, 4.31334181e+01]])

In [298]:
# these are the corresponding indicies of the most similar vectors 
neigh_indices

array([[ 19272, 125795,  92458, 107269,  34993,  56606,  17838, 142383,
         13710,  38602, 171270, 122833]], dtype=int64)

In [299]:
# breaking down the array to a list
indexs = neigh_indices.flat[0:12].tolist()
indexs

[19272,
 125795,
 92458,
 107269,
 34993,
 56606,
 17838,
 142383,
 13710,
 38602,
 171270,
 122833]

In [300]:
# result metrics
df_data.iloc[indexs]

Unnamed: 0,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
19272,2017,0.874,0.776,210285,0.503,1,0.0,0,0.112,-5.732,0,81,0.237,110.134,0.473
125795,2016,0.591,0.488,210293,0.762,0,0.0,8,0.0763,-3.842,1,66,0.0865,123.889,0.709
92458,2013,0.073,0.722,210293,0.821,0,0.0,1,0.631,-3.856,0,64,0.165,127.944,0.721
107269,2004,0.073,0.76,210293,0.77,0,0.0,5,0.109,-6.797,0,55,0.099,113.272,0.895
34993,1998,0.0431,0.677,210267,0.897,0,0.0,2,0.264,-5.458,1,64,0.0393,107.78,0.828
56606,2011,0.00379,0.696,210267,0.546,0,4e-06,5,0.332,-6.55,1,65,0.0414,130.002,0.787
17838,2009,0.0242,0.855,210307,0.668,0,0.0,11,0.102,-4.892,1,69,0.0644,125.846,0.803
142383,2017,0.853,0.389,210308,0.308,0,0.937,7,0.172,-14.46,0,63,0.0751,129.138,0.13
13710,1989,0.596,0.649,210267,0.618,0,1e-06,9,0.192,-10.728,1,68,0.0265,114.688,0.667
38602,2016,0.743,0.707,210320,0.656,0,0.0,4,0.172,-9.279,1,67,0.173,118.037,0.748


In [301]:
# result target
df_target.iloc[indexs]

Unnamed: 0,artists,name
19272,Frank Ocean,Chanel
125795,John Legend,Love Me Now
92458,Selena Gomez,Slow Down
107269,The Jacksons,Blame It on the Boogie
34993,Shakira,Si Te Vas
56606,Britney Spears,I Wanna Go
17838,"Jay Sean', 'Sean Paul', 'Lil Jon",Do You Remember
142383,Hippie Sabotage,Drifter
13710,Roy Orbison,You Got It
38602,"Christopher Jackson', 'Rachel House', 'Nicole ...",Where You Are
