In [13]:
'''
This method is based on considering features of both users and songs
We have taken feature matrix for both users and songs
We will try to predict the listen count of unlistened songs
We will then recommend songs with highest listen count
this method assumes that more listen count means more liking of user towards song
So,we are bascically treating listen count of songs as rating for movies
this method focuses on optimizing both user and song feature matrix until both fit our known data
Results can be improved by using full dataset.
'''
import numpy as np
import pandas as pd
from scipy import optimize
from sklearn.model_selection import train_test_split

In [7]:
#this function will initilize features for user and song
def initialize(train,all_user,all_song,k):
	#Y contains songs column and user rows
	Y=np.zeros((len(all_song),len(all_user)))
	#R will contain information about which Y[i][j] is not null
	R=np.zeros((len(all_song),len(all_user)))
	#theta is feature matrix for users
	#X is feature matrix for songs
	theta=np.random.randn(len(all_user),k)
	X=np.random.randn(len(all_song),k)
	for i in range(0,len(all_user)):
		#if i%1000==0:
			
		#print(len(all_user)-i)
		temp=train[['song','listen_count']][train['user_id']==all_user[i]]
		for index,row in temp.iterrows():
			Y[all_song.index(row['song'])][i]=row['listen_count']
			R[all_song.index(row['song'])][i]=1
	return Y,theta,X,R		


#this function normalizes the listen_count matrix for faster optimum results
def normalize(Y,R,n,m):
	Y_mean=np.zeros((m,1))
	Y_var=np.zeros((m,n))
	for i in range(0,m):
		#if i%1000==0:
		#print(m-i)
		idx=np.where(R[i]==1)
		Y_mean[i][0]=np.mean(Y[i][idx])
		Y_var[i][idx]=Y[i][idx]-Y_mean[i][0]
	return Y_var,Y_mean


#this function converts total feature vector in respective X and theta matrixes   
def unroll(features,user,songs,k):
	temp=features[:songs*k]
	X=temp.reshape((k,songs)).transpose()
	temp=features[songs*k:]
	theta=temp.reshape((k,user)).transpose()
	return X,theta


#this function will calculate the error between our predicted listen_count matrix and original matrix	
def cost(features,Y,R,user,songs,k,alpha):
	X,theta=unroll(features,user,songs,k)
	cost=np.sum((np.dot(X,theta.transpose())*R-Y)**2)/2
	regu=(alpha/2)*(np.sum(X**2)+np.sum(Y**2))
	return cost



#optimizing function using gradient descent algorithm
def gradient(features,Y,R,user,songs,k,alpha):
	X,theta=unroll(features,user,songs,k)
	diff=np.dot(X,theta.transpose())*R-Y
	X_grad=np.dot(diff,theta)+alpha*X
	theta_grad=np.dot(diff.transpose(),X)+alpha*theta
	return np.r_[(X_grad.transpose()).flatten(),(theta_grad.transpose()).flatten()]


#this function will predict the songs based on highest listen count
def matrix_result(train,rating,user,all_song,for_user):
	user_song=(train[['song','listen_count']][train['user_id']==for_user].drop_duplicates())
	user_song.sort_values(by=['listen_count'],ascending=False,inplace=True)
	print('#### Songs already in the user playlist(Top 10) ####')
	print()
	print(user_song['song'][0:10])
	print()
	user_song=user_song['song'].drop_duplicates().tolist()
	score_list=list(rating[:,user.index(for_user)])
	score_list=sorted([(score,index) for index,score in enumerate(score_list)],reverse=True)
	column=['user_id','song','score','rank']
	recommend=pd.DataFrame(columns=column)
	rank=0
	for i,j in score_list:
		if all_song[j] not in user_song:
			recommend.loc[rank]=[for_user,all_song[j],i,rank]
			rank=rank+1
			if rank==10:
				break	
	return recommend	


In [8]:
music_data=pd.read_csv(r'C:\Users\shashank\Desktop\music_data_for_first_2.csv')
music_data=music_data[1:1000000]
print('#### Size of full dataset ####')
print()
print(music_data.shape)
print()
train,test=train_test_split(music_data, test_size=0.2)
#taken only a subset of data for faster processing
train=music_data[1:10000]
print('#### Size of subset of dataset taken ####')
print()
print(train.shape)
print()

#### Size of full dataset ####

(999999, 6)

#### Size of subset of dataset taken ####

(9999, 6)



In [9]:
unique_user_id=train[['user_id']].drop_duplicates()
#all_song contains list of all songs
all_song=train['song'].drop_duplicates().tolist()
#user contains list of all user id
user=list(unique_user_id['user_id'])
print('#### Total number of songs in our subset ####')
print()
print(len(all_song))
print()
print('#### Total number of users in our subset ####')
print()
print(len(user))
print()

#### Total number of songs in our subset ####

5153

#### Total number of users in our subset ####

595



In [11]:
#k is total number of features we want in X and theta matrix 
k=40
#initialize
Y,theta,X,R=initialize(train,user,all_song,k)
#feature scaling
Y,mean=normalize(Y,R,len(user),len(all_song))
for i in range(1,201):
 if (i-1)%10==0:
  #X and theta converted into vector for fedding into optimizer 	
  features=np.r_[(X.transpose()).flatten(),(theta.transpose()).flatten()]
  #optimization
  vector=optimize.fmin_cg(cost,fprime=gradient,x0=features,args=(Y,R,len(user),len(all_song),k,0.01),maxiter=1,disp=False,full_output=True) 
  print('Cost at Iteration no.= '+str(i)+' = '+str(vector[1]))
  X,theta=unroll(vector[0],len(user),len(all_song),k)
  rating=np.dot(X,theta.transpose())
  rating=rating+mean
result=matrix_result(train,rating,user,all_song,user[0])
#predict songs for first user id
print('#### New songs recommended to user(Top 10) ####')
print()
print(result[0:10])

Cost at Iteration no.= 1 = 45538.9289135836
Cost at Iteration no.= 11 = 14230.520914210858
Cost at Iteration no.= 21 = 9256.65941579035
Cost at Iteration no.= 31 = 4475.330869757905
Cost at Iteration no.= 41 = 2660.492536380659
Cost at Iteration no.= 51 = 1756.2464023884845
Cost at Iteration no.= 61 = 1210.030173996894
Cost at Iteration no.= 71 = 850.712543118274
Cost at Iteration no.= 81 = 621.9439364830075
Cost at Iteration no.= 91 = 139.84961277510794
Cost at Iteration no.= 101 = 105.37528461958709
Cost at Iteration no.= 111 = 78.01664621097382
Cost at Iteration no.= 121 = 66.52340410779183
Cost at Iteration no.= 131 = 56.992678787031934
Cost at Iteration no.= 141 = 49.10185317467237
Cost at Iteration no.= 151 = 42.30289970432958
Cost at Iteration no.= 161 = 36.59364743140847
Cost at Iteration no.= 171 = 31.642013488737078
Cost at Iteration no.= 181 = 27.46642715166212
Cost at Iteration no.= 191 = 23.825817922360397
#### Songs already in the user playlist(Top 10) ####

43           

In [12]:
#results can be improved by using full dataset.