# Recommendor system using Nearest Neighbours 

In [4]:
import pandas as pd
import numpy as np

#Importing our data files
txt_file = pd.read_table(r'C:\Users\sv784\Desktop\SongRecommendor\triplet_file.txt')
#Giving name to columns.
txt_file.columns = ['user_id', 'song_id', 'listen_count']
txt_file.head()

Unnamed: 0,user_id,song_id,listen_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFRQTD12A81C233C0,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOHQWYZ12A6D4FA701,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOIYTOA12A6D4F9A23,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOIZAZL12A6701C53B,5
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOJNNUA12A8AE48C7A,1


In [5]:
excel_file = pd.read_excel(r'C:\Users\sv784\Desktop\SongRecommendor\metadata_file.xlsx')
excel_file.head()

Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,KarkuteillÃ¤,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos QuerÃ©s,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0


In [6]:
#We have song_id column as common in both files
#Merging both file wrt song_id.
mergedFile = pd.merge(excel_file, txt_file, on = 'song_id')
mergedFile.head()

Unnamed: 0,song_id,title,release,artist_name,year,user_id,listen_count
0,SOSZNRJ12A8AE46E38,Anyone Else But You,Juno - Music From The Motion Picture,Michael Cera & Ellen Page,2007,d6589314c0a9bcbca4fee0c93b14bc402363afea,6
1,SOSZNRJ12A8AE46E38,Anyone Else But You,Juno - Music From The Motion Picture,Michael Cera & Ellen Page,2007,484b69dd013df1ec0cfd504886d4f647cb32b08f,1
2,SOSZNRJ12A8AE46E38,Anyone Else But You,Juno - Music From The Motion Picture,Michael Cera & Ellen Page,2007,3f9ed694a79835c921ef6d94acd28f876c1d901e,4
3,SOGKGLB12A81C22AFA,Drunk and Hot Girls,Graduation,Kanye West / Mos Def,2007,529b42cdbc379ad2e765aec6d3bad8a192038741,2
4,SOGKGLB12A81C22AFA,Drunk and Hot Girls,Graduation,Kanye West / Mos Def,2007,73e9e981d5fc2a6453cdfb5025e2fa0a144a4142,1


In [7]:
#Number of users listened a song
#Rating = count of users who have listened the same song
combineSong_Rating = mergedFile.dropna(axis = 0, subset = ['song_id'])

#number of users listening to same song
song_ratingCount = (combineSong_Rating.
                    groupby(by = ['title'])['listen_count'].
                    count().
                    reset_index().
                    rename(columns = {'listen_count' : 'totalUser_Listened'})
                    [['title', 'totalUser_Listened']]
                   )
song_ratingCount.head()

Unnamed: 0,title,totalUser_Listened
0,A Dream,2
1,All Men Are Liars,5
2,Alley Oop (Girls Version) (2007 Digital Remaster),2
3,Amazing,2
4,Anyone Else But You,3


In [8]:
#Total number of times a song is listened
totalListen_count = mergedFile.groupby(['title']).agg({'listen_count':'count'}).reset_index()
group_sum = mergedFile['listen_count'].sum()

#grouping them in ascending order according to their title
total_listens = mergedFile.sort_values(['listen_count', 'title'], ascending = [0,1]).rename(columns = {'listen_count' : 'totalListen_count'})[['title', 'totalListen_count']]
total_listens.head()

Unnamed: 0,title,totalListen_count
105,The Maestro,35
130,Amazing,21
149,Tidal Wave,19
72,All Men Are Liars,18
103,The Maestro,15


In [9]:
#Merging the above dataset for the making of "Pivot Table"
rating_with_totalRatingCount = combineSong_Rating.merge(song_ratingCount, left_on = 'title', right_on = 'title', how = 'left')
rating_with_totalRatingCount.drop(["listen_count", "year", "release"], axis = 1, inplace = True)
rating_with_totalRatingCount.head()

Unnamed: 0,song_id,title,artist_name,user_id,totalUser_Listened
0,SOSZNRJ12A8AE46E38,Anyone Else But You,Michael Cera & Ellen Page,d6589314c0a9bcbca4fee0c93b14bc402363afea,3
1,SOSZNRJ12A8AE46E38,Anyone Else But You,Michael Cera & Ellen Page,484b69dd013df1ec0cfd504886d4f647cb32b08f,3
2,SOSZNRJ12A8AE46E38,Anyone Else But You,Michael Cera & Ellen Page,3f9ed694a79835c921ef6d94acd28f876c1d901e,3
3,SOGKGLB12A81C22AFA,Drunk and Hot Girls,Kanye West / Mos Def,529b42cdbc379ad2e765aec6d3bad8a192038741,2
4,SOGKGLB12A81C22AFA,Drunk and Hot Girls,Kanye West / Mos Def,73e9e981d5fc2a6453cdfb5025e2fa0a144a4142,2


In [10]:
pd.set_option('display.float_format', lambda x:'%.3f' % x)
print(song_ratingCount['totalUser_Listened'].describe())

count   77.000
mean     2.325
std      2.678
min      1.000
25%      1.000
50%      1.000
75%      3.000
max     21.000
Name: totalUser_Listened, dtype: float64


In [11]:
#if a song is listened by more than 5 user then only we will recommend it.
#Sorting data taking threshold value as "5"
popularity_threshold = 5
rating_popular_song = rating_with_totalRatingCount.query('totalUser_Listened >= @popularity_threshold')
rating_popular_song.head()

#rating_popular_song dataser is showing only those song which are listened by more than 5 users.

Unnamed: 0,song_id,title,artist_name,user_id,totalUser_Listened
18,SOUEGBF12AB017EFD5,Van Helsing Boombox,Man Man,15415fa2745b344bce958967c346f2a89f792f63,5
19,SOUEGBF12AB017EFD5,Van Helsing Boombox,Man Man,95942345306393998eb3a051ae5fa3c4d5afbaa4,5
20,SOUEGBF12AB017EFD5,Van Helsing Boombox,Man Man,e3937c7c32f5b68422808a854a4a7a824ee448a5,5
21,SOUEGBF12AB017EFD5,Van Helsing Boombox,Man Man,cc62e608a1a21353e1673817973235cde4116bad,5
22,SOUEGBF12AB017EFD5,Van Helsing Boombox,Man Man,8e8b32a1621d2950fe9a6384d8fb594fd25525df,5


Creating PIVOT table.

In [12]:
#Values in this table are totalUser_Listened
song_feature_df = rating_popular_song.pivot_table(index = 'title', columns = 'user_id', values = 'totalUser_Listened').fillna(0)
song_feature_df.head()

user_id,019d0d1c7a01f8736ba59a124160e5fc70666db7,0d176eb95537800a1e67ed5fe82eab3d2caafca9,0ef42a19efb74d0a05c308d00636c8d8d41bec0c,0f6b8ea5ba6f7df04932cee775b4fc2eb9281dbb,15415fa2745b344bce958967c346f2a89f792f63,16f5dc37b96c153c462bf306ceef36112d36346e,1ee591a388274035a4fd8a4ae40a9589d320bb9d,295773c4193b190d3527c6fcc228e879809fee1a,343fc3fb987ca12c3c5df154c2b4721ca111f696,38c11af0c42bb21cf5b9ffb535f76c7967241b52,...,d605b7fe1645e05ff094b364b603456ce8126643,e07a79f2d3e0db17991f6eb8d5a3314e22795748,e21477efb83bd323205ce6f5bd662f3df9d477e5,e2ab3d64f84052226d05c89c4fff779a52bff97f,e3937c7c32f5b68422808a854a4a7a824ee448a5,e3e9de8f712b435cdbe4e767ac2c414f585400f1,e697f7b095a4f7cd83f00575741cf5d1a1850b37,e934f5a246024dea38ed8067da83d50a67683e7a,ea64e003562d2f0f39e5a7dd84af5b1969e0fea3,fb6548e795d1c866f3079619da22fbcf1360e5a4
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
All Men Are Liars,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
Kiss (LP Version),0.0,0.0,0.0,6.0,0.0,0.0,0.0,6.0,0.0,0.0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0
Silent Shout,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0
SinceritÃ© Et Jalousie,21.0,21.0,21.0,0.0,0.0,21.0,0.0,0.0,21.0,21.0,...,0.0,21.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0,21.0
The Maestro,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
#Sparse matrix
from scipy.sparse import csr_matrix
song_feature_df_matrix = csr_matrix(song_feature_df.values)

#For Nearest Neighbors
from sklearn.neighbors import NearestNeighbors #Unsupervised ML

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
# fitting is equal to training. Then, after it is trained, 
#the model can be used to make predictions
model_knn.fit(song_feature_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [14]:
song_feature_df.shape

(8, 54)

To randomly pick a song we used "np.random.choice()"

Below ".Kneighbors" will find out the songs similar to randomly selected song using "Cosine Similarity"

In [15]:
#picking up a random song
query_index = np.random.choice(song_feature_df.shape[0])
print(query_index)
# "iloc" will select a particular cell of dataset
distances, indices = model_knn.kneighbors(song_feature_df.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 3)

4


In [16]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        #song which is selected randomly
        print('Recommendations for {0}:\n'.format(song_feature_df.index[query_index]))
    else:
        #Showing it's 2 nearest neighnors 
        print('{1}, with distance of {2}:'.format(i, song_feature_df.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for The Maestro:

Silent Shout, with distance of 0.7278344730240913:
SinceritÃ© Et Jalousie, with distance of 0.8545214065093385:
