In [26]:
import pandas as pd
import numpy as np
from joblib import dump
from sklearn.neighbors import NearestNeighbors
from joblib import dump

In [None]:
"""
Primary:
- id (Id of track generated by Spotify)
Numerical:
- acousticness (Ranges from 0 to 1)
- danceability (Ranges from 0 to 1)
- energy (Ranges from 0 to 1)
- duration_ms (Integer typically ranging from 200k to 300k)
- instrumentalness (Ranges from 0 to 1)
- valence (Ranges from 0 to 1)
- popularity (Ranges from 0 to 100)
- tempo (Float typically ranging from 50 to 150)
- liveness (Ranges from 0 to 1)
- loudness (Float typically ranging from -60 to 0)
- speechiness (Ranges from 0 to 1)
- year (Ranges from 1920 to 2020)
Dummy:
- mode (0 = Minor, 1 = Major)
- explicit (0 = No explicit content, 1 = Explicit content)
Categorical:
- key (All keys on octave encoded as values ranging from 0 to 11, starting on C as 0, C# as 1 and so on…)
- artists (List of artists mentioned)
- release_date (Date of release mostly in yyyy-mm-dd format, however precision of date may vary)
- name (Name of the song)"""

In [8]:
# Load the edited track dataset from disk into a DataFrame.
csv_path = 'data/edited_data.csv'
df = pd.read_csv(csv_path)

In [9]:
# Build a direct link for every track: a fixed URL prefix followed by the track id,
# then store it as a new 'url' column.
track_prefix = 'http://open.spotify.com/track/'
url = track_prefix + df['id']
df['url'] = url

In [10]:
# Keep only the columns listed below, in this order; 'id' and 'release_date'
# are intentionally not selected.
column_order = [
    'artists', 'name', 'url', 'year',
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'explicit', 'instrumentalness', 'key', 'liveness', 'loudness',
    'mode', 'popularity', 'speechiness', 'tempo', 'valence',
]
df = df[column_order]

In [11]:
# Preview the dataset (head() shows the first 5 rows by default)
df.head()

Unnamed: 0,artists,name,url,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
0,MAMIE SMITH,Keep A Song In Your Soul,http://open.spotify.com/track/0cS0A1fUEUd1EW3F...,1920,0.991,0.598,168333,0.224,0,0.000522,5,0.379,-12.628,0,12,0.0936,149.976,0.634
1,SCREAMIN' JAY HAWKINS,I Put A Spell On You,http://open.spotify.com/track/0hbkKFIJm7Z05H8Z...,1920,0.643,0.852,150200,0.517,0,0.0264,5,0.0809,-7.261,0,7,0.0534,86.889,0.95
2,MAMIE SMITH,Golfing Papa,http://open.spotify.com/track/11m7laMUgmOKqI3o...,1920,0.993,0.647,163827,0.186,0,1.8e-05,0,0.519,-12.098,1,4,0.174,97.6,0.689
3,OSCAR VELAZQUEZ,True House Music - Xavier Santos & Carlos Gomi...,http://open.spotify.com/track/19Lc5SfJJ5O1oaxY...,1920,0.000173,0.73,422087,0.798,0,0.801,2,0.128,-7.311,1,17,0.0425,127.997,0.0422
4,MIXE,Xuniverxe,http://open.spotify.com/track/2hJjbsLCytGsnAHf...,1920,0.295,0.704,165224,0.707,1,0.000246,10,0.402,-6.036,0,2,0.0768,122.076,0.299


In [15]:
# Look for a song to use as the query: all rows whose artist is exactly
# 'DRAKE', ordered from most to least popular.
is_drake = df['artists'] == 'DRAKE'
drake = df.loc[is_drake].sort_values(by='popularity', ascending=False)

In [16]:
# display the DRAKE rows selected above, sorted by popularity (descending)
drake

Unnamed: 0,artists,name,url,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
19542,DRAKE,God's Plan,http://open.spotify.com/track/6DCZcSspjsKoFjzj...,2018,0.0332,0.754,198973,0.449,1,0.000083,7,0.552,-9.211,1,83,0.1090,77.169,0.357
58286,DRAKE,Toosie Slide,http://open.spotify.com/track/127QTOFJsJQp5LbJ...,2020,0.3210,0.834,247059,0.454,1,0.000006,1,0.114,-9.750,0,83,0.2010,81.618,0.837
19360,DRAKE,Passionfruit,http://open.spotify.com/track/5mCPDVBb16L4XQwD...,2017,0.2560,0.809,298941,0.463,1,0.085000,11,0.109,-11.377,1,79,0.0396,111.980,0.364
93860,DRAKE,Toosie Slide,http://open.spotify.com/track/466cKvZn1j45IpxD...,2020,0.2890,0.830,247059,0.490,1,0.000003,1,0.113,-8.820,0,78,0.2090,81.604,0.845
38914,DRAKE,In My Feelings,http://open.spotify.com/track/2G7V7zsVDxg1yRsu...,2018,0.0589,0.835,217925,0.626,1,0.000060,1,0.396,-5.833,1,78,0.1250,91.030,0.350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140945,DRAKE,The Resistance,http://open.spotify.com/track/0llA0pYA6GpGk7fT...,2010,0.8220,0.547,225360,0.741,1,0.000000,0,0.117,-8.127,1,49,0.1630,82.589,0.637
156478,DRAKE,Say What's Real,http://open.spotify.com/track/7mPoCVGP752A5DtH...,2009,0.6320,0.462,230717,0.773,1,0.000000,8,0.212,-5.375,1,48,0.2880,86.157,0.455
156388,DRAKE,Congratulations,http://open.spotify.com/track/3SnXwQUrvSacFziU...,2009,0.0185,0.333,332530,0.859,1,0.000000,6,0.220,-2.286,1,48,0.1480,86.882,0.125
156645,DRAKE,Thank Me Now,http://open.spotify.com/track/3cBUv0RIoEyAm2b7...,2010,0.2820,0.495,328573,0.772,1,0.000000,8,0.119,-4.851,1,47,0.4250,71.295,0.748


In [17]:
# Identifying (non-feature) columns: artist, song name and track url.
y_set = ['artists', 'name', 'url']

# Target frame: just the identifying columns.
df_target = df.loc[:, y_set]

# Data matrix: every remaining column, i.e. the features used for the search.
df_data = df.drop(columns=y_set)

In [18]:
# Fit a 12-neighbour brute-force model on the feature matrix.
# NOTE(review): the columns are on very different scales (duration_ms is in the
# hundreds of thousands while most audio features lie in 0-1), so the default
# Euclidean distance is driven almost entirely by duration_ms. Consider
# standardising df_data before fitting -- any consumer of the saved model would
# then have to apply the same scaling to its queries.
# NOTE(review): leaf_size is only used by the tree-based algorithms, so it
# should have no effect with algorithm='brute' -- confirm against sklearn docs.
nn = NearestNeighbors(
    n_neighbors=12,
    algorithm='brute',
    leaf_size=15,
    n_jobs=-1,
)
nn.fit(df_data)

NearestNeighbors(algorithm='brute', leaf_size=15, metric='minkowski',
                 metric_params=None, n_jobs=-1, n_neighbors=12, p=2,
                 radius=1.0)

In [19]:
# Pick one song from df_data, by row position, to act as the query point.
input_index = 58286  # Drake - Toosie Slide

# Wrap the row's values in a list so the query is a 2-D, single-sample input.
query_row = df_data.iloc[input_index]
data_vect = [query_row.values]
data_vect

[array([ 2.02000e+03,  3.21000e-01,  8.34000e-01,  2.47059e+05,
         4.54000e-01,  1.00000e+00,  6.15000e-06,  1.00000e+00,
         1.14000e-01, -9.75000e+00,  0.00000e+00,  8.30000e+01,
         2.01000e-01,  8.16180e+01,  8.37000e-01])]

In [20]:
# Ask the fitted model for the nearest neighbours of the query vector;
# it returns the distances and the matching row positions.
neigh_dist, neigh_indices = nn.kneighbors(X=data_vect, return_distance=True)

In [21]:
# distances from our reference vector, data_vect, to its 12 closest data vectors
# (the first entry is 0 because the query song itself is in the fitted data)
neigh_dist

array([[ 0.        ,  5.08601625, 30.97209819, 40.26932408, 45.41215041,
        50.32443327, 50.39740189, 52.72150335, 55.27282865, 55.93455796,
        57.05886579, 57.85963648]])

In [22]:
# these are the corresponding row positions (indices) of the most similar vectors
neigh_indices

array([[ 58286,  93860,  55254, 107239,  15938,  56532, 155227,  33730,
         92082, 105990, 120108,  70300]], dtype=int64)

In [23]:
# Row 0 holds the neighbours of our single query point; convert it to a plain
# list of row positions. Indexing the row (rather than slicing a fixed number
# of flattened entries) keeps this in step with whatever n_neighbors the model
# was fitted with, and never mixes in neighbours of another query row.
indexs = neigh_indices[0].tolist()
indexs

[58286,
 93860,
 55254,
 107239,
 15938,
 56532,
 155227,
 33730,
 92082,
 105990,
 120108,
 70300]

In [24]:
# Feature values of the neighbouring songs (rows selected by position)
df_data.iloc[indexs, :]

Unnamed: 0,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
58286,2020,0.321,0.834,247059,0.454,1,6e-06,1,0.114,-9.75,0,83,0.201,81.618,0.837
93860,2020,0.289,0.83,247059,0.49,1,3e-06,1,0.113,-8.82,0,78,0.209,81.604,0.845
55254,2005,0.619,0.52,247053,0.379,0,2e-06,0,0.108,-9.906,1,57,0.0279,85.934,0.208
107239,2004,0.118,0.68,247040,0.659,0,0.0,0,0.107,-6.855,1,53,0.211,91.259,0.411
15938,2000,0.59,0.266,247040,0.333,0,0.0,10,0.121,-9.876,1,62,0.031,109.479,0.0796
56532,2011,0.0301,0.226,247027,0.261,0,0.0677,2,0.0628,-11.276,1,57,0.0339,108.915,0.084
155227,2002,0.518,0.553,247027,0.53,0,0.0,4,0.232,-5.332,0,49,0.0246,84.017,0.162
33730,1992,0.038,0.691,247040,0.497,0,6e-06,11,0.0859,-13.248,1,51,0.0473,103.893,0.701
92082,2011,0.846,0.414,247053,0.295,0,0.0,10,0.079,-9.271,0,53,0.033,125.833,0.218
105990,1998,0.19,0.694,247040,0.368,0,0.0,8,0.14,-7.264,1,41,0.0655,60.117,0.465


In [25]:
# Artist, name and url of the same neighbouring songs (rows selected by position)
df_target.iloc[indexs, :]

Unnamed: 0,artists,name,url
58286,DRAKE,Toosie Slide,http://open.spotify.com/track/127QTOFJsJQp5LbJ...
93860,DRAKE,Toosie Slide,http://open.spotify.com/track/466cKvZn1j45IpxD...
55254,"BRAD PAISLEY', 'DOLLY PARTON",When I Get Where I'm Going (feat. Dolly Parton),http://open.spotify.com/track/3VLCtStwYsAL4LKZ...
107239,"CASSIDY', 'R. KELLY",Hotel (feat. R. Kelly),http://open.spotify.com/track/4hHXhCRSnOKd6nMG...
15938,FAITH HILL,"Where Are You Christmas - From ""Dr. Seuss' How...",http://open.spotify.com/track/1msuiw6pnXYfxZ7E...
56532,M83,Outro,http://open.spotify.com/track/2QVmiA93GVhWNTWQ...
155227,KELLY CHEN,記事本,http://open.spotify.com/track/3FbzN8826gMAaMxU...
33730,THE BELLAMY BROTHERS,Old Hippie,http://open.spotify.com/track/5fv9qtXwNV6Xx3P9...
92082,STAIND,Something to Remind You,http://open.spotify.com/track/4wOQ8upbW1GzHCWj...
105990,DESTINY'S CHILD,"No, No, No, Pt. 1",http://open.spotify.com/track/2pdzseh7ELZCKlXX...


In [27]:
# Persist the fitted model to a compressed joblib file
dump(value=nn, filename='model.joblib', compress=True)

['model.joblib']