In [1]:
import numpy as np
import pandas as pd

#display results to 3 decimal points, not in scientific notations
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
#load dataset 
user_data = pd.read_table('D:/usersha1-artmbid-artname-plays.tsv', header = None, nrows = 2e7, names =['users', 'musicbrainz-artist-id', 'artist-name', 'plays'], usecols = ['users', 'artist-name', 'plays'] )
user_profile = pd.read_table('D:/usersha1-profile.tsv', header = None, names = ['users', 'gender', 'age', 'country', 'signup'], usecols = ['users', 'country'])

In [3]:
#display first few rows
user_data.head()

Unnamed: 0,users,artist-name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [4]:
user_profile.head()

Unnamed: 0,users,country
0,00000c289a1829a808ac09c00daf10bc3c4e223b,Germany
1,00001411dc427966b17297bf4d69e7e193135d89,Canada
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,Germany
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,Mexico
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,United States


In [5]:
#to get only null values in artist-name but complete dataset
user_data[user_data['artist-name'].isnull()]

Unnamed: 0,users,artist-name,plays
244853,039e5d61d65bbf5e6d95b07b1b3b67f7fd287a62,,18
431015,065a001be5a8a55971042077933e263d0d5cde46,,186
455721,06b17c50402d06a497cb13a0375992fd1e90b392,,3
504026,0757ac29973aab69bb31cd164c6df975bf4df9a1,,38
607282,08e102b376abe856a3d4be5ea14ad6b37395fe82,,208
...,...,...,...
17227026,fb7aec57827b2bd6152b84ef2034bc5aa023fe89,,13
17306503,fcaa2f605a2c6d2cd21942a20f80c7e1c14e1818,,5
17404362,fe1503af166a337f6a572da66b99b4cd0da362b2,,62
17429832,fe72cbf58e485fab12211834244ff8dbf314b590,,63


In [6]:
""" to drop the rows which have null value in artist-name column, we check using isnull i.e. true or false
and .sum() converts true to 1 and false to 0 after it using dropna and putting the axis = 0, it drops every row where axis is 1
specifying column name in subset, in which we need to check null values"""
if user_data['artist-name'].isnull().sum() > 0:
    user_data = user_data.dropna(axis = 0, subset = ['artist-name'])

In [7]:
user_data[user_data['artist-name'].isnull()]

Unnamed: 0,users,artist-name,plays


In [8]:
"""use groupby to get sum of plays .sum and reset it after using reset_index(), 
rename to rename column, double brackets to get specified columns"""
artist_plays = (user_data.groupby(by=['artist-name'])['plays'].sum().reset_index().rename(columns = {'plays':'total_artist_plays'})[['artist-name', 'total_artist_plays']])
artist_plays.head()

Unnamed: 0,artist-name,total_artist_plays
0,04)],6
1,2,1606
2,58725ab=>,23
3,80lİ yillarin tÜrkÇe sÖzlÜ aŞk Şarkilari,70
4,amy winehouse,23


In [10]:
#merge two tables
user_data_with_artist_plays = user_data.merge(artist_plays, on = 'artist-name', how = 'left')
user_data_with_artist_plays.head()


Unnamed: 0,users,artist-name,plays,total_artist_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,25651
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,3704875
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,180391
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,410725
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,90498


In [12]:
#stats, only used in columns with numerical values
artist_plays['total_artist_plays'].describe()

count     292363.000
mean       12907.022
std       185981.631
min            1.000
25%           53.000
50%          208.000
75%         1048.000
max     30466827.000
Name: total_artist_plays, dtype: float64

In [13]:
artist_plays['total_artist_plays'].max()

30466827

In [14]:
artist_plays[artist_plays['total_artist_plays'] == artist_plays['total_artist_plays'].max()]

Unnamed: 0,artist-name,total_artist_plays
252494,the beatles,30466827


In [16]:
# use quantile function to represent data distribution
artist_plays['total_artist_plays'].quantile(np.arange(.9, 1, .01))

0.900     6137.800
0.910     7409.420
0.920     9102.040
0.930    11474.660
0.940    14898.000
0.950    19964.500
0.960    28420.120
0.970    43541.420
0.980    79403.560
0.990   198483.660
Name: total_artist_plays, dtype: float64

In [17]:
# to count values
artist_plays['total_artist_plays'].value_counts()

total_artist_plays
1        2816
2        2724
3        2365
4        2211
5        2123
         ... 
13635       1
36958       1
47740       1
13197       1
28195       1
Name: count, Length: 28112, dtype: int64

In [18]:
#Taking only top artists(where no of plays are more)
popularity_threshold = 40000
user_data_popular_artists = user_data_with_artist_plays.query('total_artist_plays >= @popularity_threshold')
user_data_popular_artists.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,3704875
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,180391
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,410725
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,90498
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691,13547741


In [19]:
user_data_with_artist_plays.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,25651
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,3704875
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,180391
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,410725
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,90498


Unnamed: 0,users,artist-name,plays,total_artist_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,25651
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,3704875
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,180391
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,410725
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,90498


In [20]:
user_data_popular_artists.head(100)

Unnamed: 0,users,artist-name,plays,total_artist_plays
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,3704875
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,180391
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,410725
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,90498
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691,13547741
...,...,...,...,...
110,00004d2ac9316e22dc007ab2243d6fcb239e707d,nick cave & the bad seeds,135,592844
115,00004d2ac9316e22dc007ab2243d6fcb239e707d,antony and the johnsons,107,1516288
116,00004d2ac9316e22dc007ab2243d6fcb239e707d,marissa nadler,107,185751
117,00004d2ac9316e22dc007ab2243d6fcb239e707d,a silver mt. zion,106,504328


In [21]:
user_data_with_artist_plays.query('plays>40000')

Unnamed: 0,users,artist-name,plays,total_artist_plays
27358,006261139d787c1e43b4c69d304f2772367c1005,garbage,62054,2461628
43276,00a20b9791abd8b29903a8a43e343ae93a98d9fd,lil wayne,107758,2432188
166489,0268c4ff8eba994c93fc0e49644bac7b49caa068,mindless self indulgence,43251,3172270
175680,028b91859a012251da23c3dbfd2215154a789f9f,afi,59169,3918876
191656,02ccf45baa7fe62f0935b8a6a64ff8869a7b0387,christina aguilera,135392,2680164
...,...,...,...,...
17217137,fb587892425d6ce7a939ddbd84ab337aeee172d9,the used,53008,2861379
17239960,fbab9ccd006ea82729b527cdd9b1549a7314e5a6,布袋寅泰,47053,79023
17305007,fca2614e3834feb94726f6334b4948d776a767a1,oasis,60618,6348953
17346197,fd3baa3d1fc07a4e078f33773dbb1b27ae88c756,chamillionaire,57310,813382


In [24]:
# use query to filter and merge a pd function
combined = user_data_popular_artists.merge(user_profile, on = 'users', how = 'left')
usa_data = combined.query('country == "United States"')
usa_data.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays,country
156,00007a47085b9aab8af55f52ec8846ac479ac4fe,devendra banhart,456,2366807,United States
157,00007a47085b9aab8af55f52ec8846ac479ac4fe,boards of canada,407,6115545,United States
158,00007a47085b9aab8af55f52ec8846ac479ac4fe,cocorosie,386,2194862,United States
159,00007a47085b9aab8af55f52ec8846ac479ac4fe,aphex twin,213,4248296,United States
160,00007a47085b9aab8af55f52ec8846ac479ac4fe,animal collective,203,3495537,United States


In [25]:
combined.query('plays>100000')

Unnamed: 0,users,artist-name,plays,total_artist_plays,country
34568,00a20b9791abd8b29903a8a43e343ae93a98d9fd,lil wayne,107758,2432188,United States
155324,02ccf45baa7fe62f0935b8a6a64ff8869a7b0387,christina aguilera,135392,2680164,Brazil
542544,09d12dfa05a0852053a9017121034a837fa4019e,alice cooper,134993,1542185,United Kingdom
617078,0b2956b319a3ac466b0cf1a8c49fa73498d0898c,in flames,112989,11288367,Russian Federation
1159053,14ea4c6f3c2e86b4937f1158bd13d3173d780bd7,dean martin,288375,655025,United States
1298741,177653480857c3bb69b9a71b4f7166b7cd62129c,rush,100846,2518951,United States
1353498,1872585e74857e4888dfa63bd1186d210aae7681,tokio hotel,141661,952834,United States
1821162,20d54d757ff07da456dfaa26e9077f5fa12fe71a,marilyn manson,111455,6417868,Poland
1914297,228eb001a7ad5408dce7d40859e5935081518ff1,the rasmus,100080,1156417,Russian Federation
2170271,274f8ab91b73503c3a18cb5c230affa56e0a677d,u2,116025,8111215,France


In [26]:
#.shape to get rows and columns count
initial_rows = usa_data.shape[0]
print('Initial Dataframe Shape {0}'.format(usa_data.shape))
#use drop_duplicates to drop down rows having duplicate values
usa_data = usa_data.drop_duplicates(['users', 'artist-name'])
current_rows = usa_data.shape[0]
print('New Dataframe Shape {0}'.format(usa_data.shape))
print('Removed {0} Rows'.format(initial_rows - current_rows))

Initial Dataframe Shape (2788019, 5)
New Dataframe Shape (2788013, 5)
Removed 6 Rows


In [30]:
pip install scipy

Collecting scipy
  Obtaining dependency information for scipy from https://files.pythonhosted.org/packages/06/15/e73734f9170b66c6a84a0bd7e03586e87e77404e2eb8e34749fc49fa43f7/scipy-1.11.2-cp311-cp311-win_amd64.whl.metadata
  Downloading scipy-1.11.2-cp311-cp311-win_amd64.whl.metadata (59 kB)
     ---------------------------------------- 0.0/59.1 kB ? eta -:--:--
     ------ --------------------------------- 10.2/59.1 kB ? eta -:--:--
     ------ --------------------------------- 10.2/59.1 kB ? eta -:--:--
     ------------- ------------------------ 20.5/59.1 kB 108.9 kB/s eta 0:00:01
     -------------------------- ----------- 41.0/59.1 kB 163.4 kB/s eta 0:00:01
     -------------------------------- ----- 51.2/59.1 kB 201.8 kB/s eta 0:00:01
     -------------------------------------- 59.1/59.1 kB 183.7 kB/s eta 0:00:00
Downloading scipy-1.11.2-cp311-cp311-win_amd64.whl (44.0 MB)
   ---------------------------------------- 0.0/44.0 MB ? eta -:--:--
   ------------------------------------

In [32]:
from scipy.sparse import csr_matrix

In [38]:
# sampling data 
sample_usa_data = usa_data.sample(frac=0.001)

In [40]:
# Reshape data into sparse matrix
wide_artist_data = sample_usa_data.pivot(index = 'artist-name', columns = 'users', values = 'plays').fillna(0)
wide_artist_data_sparse = csr_matrix(wide_artist_data.values) 

In [41]:
wide_artist_data.head()

users,002b63c6eb63945fcbde6e842a399ce107f1bd35,002d3f566c5ed4e36bf3332285a7b2ec0d433586,0050d2483a5573b7256f84fa085c6a4682bfe8b4,0056f9124c776ff1dd158c2d12440cee874ff3cb,005d8987be11acc60b0742b48e83eaa4528b5af0,00b6d6c87345a38718aaa028647a3ad08e78cf91,00ccb469c03056eaf2bbdcbe2463690d2f793511,00ceb431517fe93dbb701752ddc51ae9d6a7123f,0155a3a7398aa65d8d924c38a74bb444b7cb80d4,01a0ed8062e1c833b3d16aa237d502347a5a93ff,...,ff3aed279ceb9a00c15b7b958cc00b1bb3bc073a,ff480c4cf197e6fd9c525cb627b287444d4d706c,ff49add8ffbf6cfcca11ccf807938e03a7af1f76,ff679c0a24b75ba27a9debf76bd536a4334fb3e6,ff70e7e0a99441c910cb53fedf055ff787aa5da8,ff8842150157e07f848801544911efb348b53808,ff9269f5debd942b86ef2a35f2c5a00fa17fc793,ff9d855523b290088795f2ff22155353b9dc092f,ffa8f4dd3ec2dba999f693c0bcfd7f4dca1bd1a7,ffd41e64d50ea0e7ef1c480faef0f2ba4bd87a0b
artist-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!!!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
*nsync,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...and you will know us by the trail of dead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 years,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
wide_artist_data.shape

(1556, 2733)

In [43]:
# use data to retrieve non-zero element and 1 for row 1
wide_artist_data_sparse[1].data

array([213.,  17.])

In [44]:
wide_artist_data_sparse

<1556x2733 sparse matrix of type '<class 'numpy.float64'>'
	with 2788 stored elements in Compressed Sparse Row format>

In [45]:
wide_artist_data_sparse.data

array([128., 213.,  17., ..., 262., 138., 391.])

In [47]:
pip install scikit-learn

Collecting scikit-learnNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/77/85/bff3a1e818ec6aa3dd466ff4f4b0a727db9fdb41f2e849747ad902ddbe95/scikit_learn-1.3.0-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.3.0-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.1.1 (from scikit-learn)
  Obtaining dependency information for joblib>=1.1.1 from https://files.pythonhosted.org/packages/10/40/d551139c85db202f1f384ba8bcf96aca2f329440a844f924c8a0040b6d02/joblib-1.3.2-py3-none-any.whl.metadata
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Obtaining dependency information for threadpoolctl>=2.0.0 from https://files.pythonhosted.org/packages/81/12/fd4dea011af9d69e1cad05c75f3f7202cdcbeac9b712eea58ca779a72865/threadpoolctl-3.2.0-py3-none-any.whl.metadata
  Downloading threadpoolctl-3.2.0-p

In [48]:
#Apply KNN model
from sklearn.neighbors  import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')

In [49]:
model_knn

In [50]:
model_knn.fit(wide_artist_data_sparse)

In [51]:
model_knn

In [52]:
query_index = np.random.choice(wide_artist_data.shape[0])

In [53]:
query_index

1177

In [54]:
# label based indexing in dataframe
wide_artist_data.loc['the beatles']

users
002b63c6eb63945fcbde6e842a399ce107f1bd35   0.000
002d3f566c5ed4e36bf3332285a7b2ec0d433586   0.000
0050d2483a5573b7256f84fa085c6a4682bfe8b4   0.000
0056f9124c776ff1dd158c2d12440cee874ff3cb   0.000
005d8987be11acc60b0742b48e83eaa4528b5af0   0.000
                                            ... 
ff8842150157e07f848801544911efb348b53808   0.000
ff9269f5debd942b86ef2a35f2c5a00fa17fc793   0.000
ff9d855523b290088795f2ff22155353b9dc092f   0.000
ffa8f4dd3ec2dba999f693c0bcfd7f4dca1bd1a7   0.000
ffd41e64d50ea0e7ef1c480faef0f2ba4bd87a0b   0.000
Name: the beatles, Length: 2733, dtype: float64

In [55]:
#accessing 1000th row and .values to convert into numpy array from pandas series and reshape to get all columns values
wide_artist_data.iloc[1000, :].values.reshape(1,-1)

array([[0., 0., 0., ..., 0., 0., 0.]])

In [60]:
query_index = 1100
# .flatten to convert 2d array to 1d
# n_neighbors to get top 6 
# .index to get artist-name stored at index
distances, indices = model_knn.kneighbors(wide_artist_data.iloc[query_index, :].values.reshape(1,-1), n_neighbors = 6)
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('recomendations {0}'.format(wide_artist_data.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}'.format(i, wide_artist_data.index[indices.flatten()[i]], distances.flatten()[i]))

recomendations roy harper
1: propagandhi, with distance of 1.0
2: pretty girls make graves, with distance of 1.0
3: primus, with distance of 1.0
4: prince, with distance of 1.0
5: prefab sprout, with distance of 1.0


In [61]:
distances

array([[0., 1., 1., 1., 1., 1.]])

In [62]:
indices

array([[1100, 1040, 1037, 1038, 1039, 1035]], dtype=int64)

In [63]:
query_index

1100

In [65]:
def print_recomendations(query_index):
    distances, indices = model_knn.kneighbors(wide_artist_data.iloc[query_index, :].values.reshape(1,-1), n_neighbors = 6)
    for i in range(0, len(distances.flatten())):
        if i == 0:
            print('recomendations {0}'.format(wide_artist_data.index[query_index]))
        else:
            print('{0}: {1}, with distance of {2}'.format(i, wide_artist_data.index[indices.flatten()[i]], distances.flatten()[i]))
    

In [67]:
inlist = wide_artist_data.index
artist_name = 'michael jackson'

query_index = [x for x in range(len(inlist)) if inlist[x]==artist_name  ]
query_index

[867]

In [68]:
print_recomendations(query_index)

recomendations Index(['michael jackson'], dtype='object', name='artist-name')
1: prince, with distance of 1.0
2: propellerheads, with distance of 1.0
3: pretty girls make graves, with distance of 1.0
4: primus, with distance of 1.0
5: prefab sprout, with distance of 1.0


In [69]:
inlist

Index(['!!!', '*nsync', '...and you will know us by the trail of dead',
       '10 years', '112', '2pac', '3', '3 doors down', '30 seconds to mars',
       '311',
       ...
       'Аквариум', 'すぎやまこういち', 'モーニング娘。', '下村陽子', '久石譲', '倖田來未', '大島ミチル',
       '菅野よう子', '菊田裕樹', '동방신기'],
      dtype='object', name='artist-name', length=1556)

In [73]:
# Make all plays count binary
wide_artist_data_zero_one = wide_artist_data.apply(np.sign)
wide_artist_data_zero_one_sparse = csr_matrix(wide_artist_data_zero_one.values)

In [74]:
wide_artist_data_zero_one.head()

users,002b63c6eb63945fcbde6e842a399ce107f1bd35,002d3f566c5ed4e36bf3332285a7b2ec0d433586,0050d2483a5573b7256f84fa085c6a4682bfe8b4,0056f9124c776ff1dd158c2d12440cee874ff3cb,005d8987be11acc60b0742b48e83eaa4528b5af0,00b6d6c87345a38718aaa028647a3ad08e78cf91,00ccb469c03056eaf2bbdcbe2463690d2f793511,00ceb431517fe93dbb701752ddc51ae9d6a7123f,0155a3a7398aa65d8d924c38a74bb444b7cb80d4,01a0ed8062e1c833b3d16aa237d502347a5a93ff,...,ff3aed279ceb9a00c15b7b958cc00b1bb3bc073a,ff480c4cf197e6fd9c525cb627b287444d4d706c,ff49add8ffbf6cfcca11ccf807938e03a7af1f76,ff679c0a24b75ba27a9debf76bd536a4334fb3e6,ff70e7e0a99441c910cb53fedf055ff787aa5da8,ff8842150157e07f848801544911efb348b53808,ff9269f5debd942b86ef2a35f2c5a00fa17fc793,ff9d855523b290088795f2ff22155353b9dc092f,ffa8f4dd3ec2dba999f693c0bcfd7f4dca1bd1a7,ffd41e64d50ea0e7ef1c480faef0f2ba4bd87a0b
artist-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!!!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
*nsync,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...and you will know us by the trail of dead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 years,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
wide_artist_data_zero_one_sparse.data

array([1., 1., 1., ..., 1., 1., 1.])

In [78]:
wide_artist_data_zero_one_sparse[7].data

array([1.])

In [83]:
# New KNN-Model on 0-1 values
from sklearn.neighbors import NearestNeighbors
model_mn_binary = NearestNeighbors(metric = 'cosine' , algorithm = 'brute' )
model_mn_binary.fit(wide_artist_data_zero_one_sparse)

In [87]:
query_index = 900
distances, indices = model_mn_binary.kneighbors(wide_artist_data_zero_one.iloc[query_index, :].values.reshape(1,-1), n_neighbors = 6)

for i in range(0,len(distances.flatten())):
    if i == 0:
        print('Recommendations with binary play data {0}\n'.format(wide_artist_data_zero_one.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}'.format(i, wide_artist_data_zero_one.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations with binary play data mstrkrft

1: mirah, with distance of 0.42264973081037416
2: primus, with distance of 1.0
3: prince, with distance of 1.0
4: propagandhi, with distance of 1.0
5: protest the hero, with distance of 1.0


In [104]:
def print_recomendations(query_index):
    distances, indices = model_mn_binary.kneighbors(wide_artist_data_zero_one.iloc[query_index, :].values.reshape(1,-1), n_neighbors = 6)
    
    for i in range(0,len(distances.flatten())):
        if i==0:
            print('Recommendations with binary play data {0}\n'.format(wide_artist_data_zero_one.index[query_index]))
        else:
            print('{0}: {1}, with distance of {2}'.format(i, wide_artist_data_zero_one.index[indices.flatten()[i]], distances.flatten()[i]))

In [105]:
in_list = wide_artist_data.index
artist_name = '2pac'

query_index=[x for x in range(len(in_list)) if in_list[x]==artist_name] 
query_index

[5]

In [106]:
print_recomendations(query_index)

Recommendations with binary play data Index(['2pac'], dtype='object', name='artist-name')

1: propagandhi, with distance of 1.0
2: pretty girls make graves, with distance of 1.0
3: primus, with distance of 1.0
4: prince, with distance of 1.0
5: prefab sprout, with distance of 1.0
