In [None]:
import pandas as pd 
import numpy as np
import os 
# Let pandas print very long string cells (up to 10000 characters) untruncated.
pd.set_option("display.max_colwidth", 10000)

# Dataset file names; they are passed as-is to pd.read_table further down.
artists = "artists.dat"
tags = "tags.txt"
user_artists = "user_artists.dat"
user_friends = "user_friends.dat"
user_taggedartists_timestamps = "user_taggedartists-timestamps.dat"
user_taggedartists = "user_taggedartists.dat"

**Data statistics**
---------------
   1892 users
   17632 artists
      
   12717 bi-directional user friend relations, i.e. 25434 (user_i, user_j) pairs
         avg. 13.443 friend relations per user
         
   92834 user-listened artist relations, i.e. tuples [user, artist, listeningCount]
         avg. 49.067 artists most listened to by each user
         avg. 5.265 users who listened to each artist
            
   11946 tags  
   
  186479 tag assignments (tas), i.e. tuples [user, tag, artist]
         avg. 98.562 tas per user
         avg. 14.891 tas per artist
         avg. 18.930 distinct tags used by each user
         avg. 8.764 distinct tags used for each artist

**Files**
----- 
   * artists.dat
   
    This file contains information about music artists listened to and tagged by the users.
   
   * tags.dat
   
   	This file contains the set of tags available in the dataset.

   * user_artists.dat
   
        This file contains the artists listened to by each user.
        
        It also provides a listening count for each [user, artist] pair.

   * user_taggedartists.dat - user_taggedartists-timestamps.dat
   
        These files contain the tag assignments of artists provided by each particular user.
        
        They also contain the timestamps when the tag assignments were done.
   
   * user_friends.dat
   
   	This file contains the friend relations between users in the database.
     

In [None]:
# Print the first rows of each table, followed by a separator line.
# Only the tags file is read with an explicit utf-8 encoding.
# The timestamps file is not previewed because the timestamps are not needed.
separator = "---------------------------------"
previews = [
    (user_artists, {}),
    (user_taggedartists, {}),
    (tags, {"encoding": "utf-8"}),
    (user_friends, {}),
    (artists, {}),
]
for path, read_kwargs in previews:
    print(pd.read_table(path, **read_kwargs).head())
    print(separator)

In [None]:
# Spot check: all rows of the tag-assignment table that belong to artist 995.
t = pd.read_table(user_taggedartists)
t.loc[t["artistID"] == 995]

In [None]:
# Load the raw tables, then build `total_table`: the tag assignments without
# their day/month/year columns, joined with the tags table on tagID.
total_table = pd.read_table(user_taggedartists)
user_artists_df = pd.read_table(user_artists)
tags_df = pd.read_table(tags, encoding="utf-8")
user_friends_df = pd.read_table(user_friends)

total_table = total_table.drop(columns=["day", "month", "year"]).merge(tags_df, on="tagID")

def fun(word):
    """Join an iterable of strings into one space-separated string."""
    return " ".join(word)

# One space-joined string per artist holding every tag value assigned to it.
artist_tags = total_table.groupby("artistID")["tagValue"].apply(fun)

# De-duplicate the words of each artist's tag text, keeping first-seen order.
# dict.fromkeys is used instead of set(): the iteration order of a set of
# strings changes between interpreter runs (hash randomization), which made
# the tag text, and the word n-grams later extracted from it, differ from one
# kernel restart to the next.
list_word = [list(dict.fromkeys(tag_text.split())) for tag_text in artist_tags]
new_artist_tags = [" ".join(words) for words in list_word]

# Frame indexed by artistID whose tagValue column holds the de-duplicated text.
artist_tags_df = pd.DataFrame(artist_tags)
artist_tags_df["tagValue"] = new_artist_tags

In [None]:
# Spot check: rows of the merged table for artist 995.
total_table.loc[total_table["artistID"] == 995]

In [None]:
# Expose the artist id (so far only the index of the grouped frame) as a
# regular column to merge on. The index name is cleared afterwards because
# pandas refuses to merge on a label that names both an index level and a
# column ("... is both an index level and a column label, which is ambiguous").
artist_tags_df["artistID"] = artist_tags_df.index
artist_tags_df.index.name = None

# Replace the per-assignment tag text with the de-duplicated per-artist text.
total_table = total_table.drop(columns="tagValue")
total_table = pd.merge(artist_tags_df, total_table, how="left", on="artistID")
total_table = total_table[["userID", "artistID", "tagID", "tagValue"]]

In [None]:
#total_table = pd.merge(total_table,user_artists_df,how="left") # merge in later, only if the weight data is needed
#total_table = pd.merge(total_table,user_friends_df,how="left") # merge in later, only if the friend data is needed

In [None]:
total_table.head() ## the final combined table

In [None]:
# Disabled experiment: the statements below sit inside a string literal, so
# they are not executed; the cell only evaluates to that string.
'''
#total_table.tagID
list_str = list(map(str,total_table.tagID))
tagID_df = pd.DataFrame(total_table.tagID)
tagID_df["str"] = list_str
'''

In [None]:
# Keep one row per (user, artist, tag text) combination. The explicit copy
# makes total_table_c an independent frame, so the in-place edits applied to
# it in later cells do not raise SettingWithCopyWarning against total_table.
total_table_c = total_table.drop_duplicates(
    subset=["userID", "artistID", "tagValue"]
).copy()

In [None]:
if __name__ == '__main__':
    # tagID is no longer meaningful once rows are de-duplicated per artist.
    # Rebinding with errors="ignore" (instead of an in-place drop) keeps the
    # cell safe to re-run: a second run no longer fails with KeyError because
    # the column is already gone.
    total_table_c = total_table_c.drop(columns="tagID", errors="ignore")

In [None]:
# Preview the de-duplicated table.
total_table_c.head()

In [None]:
# Optional enrichment of the de-duplicated table, built up in two steps.
# The second merge starts from the result of the first one; merging
# total_table_c again would silently discard the first merge.
# Both merges join on whatever columns the two frames have in common.
total_table = pd.merge(total_table_c, user_artists_df, how="left")  # add the weight data, if needed
total_table = pd.merge(total_table, user_friends_df, how="left")  # add the friend data, if needed

In [None]:
# Tag text stored on the rows of artist 2.
print(total_table_c.loc[total_table_c["artistID"] == 2, "tagValue"])

##### 方法1: 相似度评价 人和人之间的余弦值

In [None]:
# Build one tag "document" per user from the tag text of every artist that
# user tagged. The pieces are joined with a space: summing strings would
# concatenate them with no separator and fuse the last word of one artist's
# tags with the first word of the next ("rock pop" + "metal" -> "rock popmetal").
all_tags = total_table_c.groupby("userID")["tagValue"].agg(" ".join)

In [None]:
# One row per user: the combined tag text plus the user id as a regular column.
total_table_d = all_tags.to_frame().assign(userID=lambda frame: frame.index)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF features over each user's tag text: 1- to 3-word n-grams, with
# scikit-learn's built-in English stop-word list.
tf = TfidfVectorizer(stop_words="english", ngram_range=(1, 3))
dtm = tf.fit_transform(total_table_d["tagValue"])

In [None]:
from scipy.sparse.linalg import svds
# Truncated SVD of the TF-IDF matrix with k=500; show the factor shapes.
u1, s1, vt1 = svds(dtm, k=500)
tuple(factor.shape for factor in (u1, s1, vt1))

In [None]:
# Pairwise cosine similarity between the rows of u1.
pred = cosine_similarity(u1)
# Row positions ordered by ascending similarity to the first row.
print(np.argsort(pred[0]))
# Replace the userID index with a plain positional index.
total_table_d = total_table_d.reset_index(drop=True)

In [None]:
# Preview only the first rows: with display.max_colwidth raised to 10000,
# rendering the whole frame floods the notebook with long tag strings.
total_table_d.head()

In [None]:
# Tag text of two users to compare. The cell value itself is taken rather
# than str(<Series>): the Series repr also contains the index label and the
# "Name: tagValue, dtype: object" footer, which would be counted as shared
# words in the comparison below.
# Rows are addressed by position so they line up with the rows of `pred`.
# NOTE(review): 185 is presumably a position taken from the argsort output
# printed above - confirm.
a = total_table_d["tagValue"].iloc[0]
b = total_table_d["tagValue"].iloc[185]

In [None]:
# Whitespace-separated words of each text.
a_l = a.split()
b_l = b.split()

In [None]:
# Number of distinct words the two texts have in common.
len(set(a_l) & set(b_l))

In [None]:
# Show both texts, one per line.
print(a, b, sep="\n")

##### 因为标签太杂，所以这种办法，被评价为太随意
##### 但我也没有分数，没法加权去用sim矩阵乘以评分向量，得到一个加权的东西

##### 仅仅根据用户和艺术家的对应关系，来评价

In [None]:
# Mark every (user, artist) row as an implicit "like" with value 1.0.
# assign() returns a new frame instead of writing into total_table_c in
# place, which avoids SettingWithCopyWarning on a frame that was produced by
# filtering another one and leaves the cell safe to re-run.
total_table_c = total_table_c.assign(like=1.00)

In [None]:
from scipy.sparse import csc_matrix

# (An earlier idea, `sim = linear_kernel(dtm)`, is left disabled.)
# Sparse matrix whose entry (userID, artistID) holds that row's "like" value.
likes = total_table_c.like
coordinates = (total_table_c.userID, total_table_c.artistID)
m = csc_matrix((likes, coordinates))
m

In [None]:
from scipy.sparse.linalg import svds
# Truncated SVD of the user x artist matrix with k=20; show the factor shapes.
u, s, vt = svds(m, k=20)
tuple(factor.shape for factor in (u, s, vt))

In [None]:
# Column positions of the five largest entries in the first row of vt.
highest = vt[0].argsort()[-5:]
total_table_c.loc[total_table_c["artistID"].isin(highest)]
# Multiply the factors back together: u * diag(s) * vt.
pred = u @ np.diag(s) @ vt

In [None]:
# Display the reconstructed score matrix.
pred

In [None]:
# Column positions of the five highest scores in row 461 of pred.
recommended = pred[461].argsort()[-5:]

In [None]:
# Artist ids that already appear on the rows of user 461.
have_pay_attention = np.array(
    total_table_c.loc[total_table_c["userID"] == 461, "artistID"]
)

In [None]:
# Recommended artist ids that are not already among the user's own artists.
# The result is bound to a name and shown as part of the cell's final
# expression; as a bare statement above the last line it was computed and
# then discarded.
new_for_user = sorted(
    int(artist_id)
    for artist_id in set(recommended).difference(have_pay_attention)
)

# Words that occur in b's tag text but not in a's. The word lists are
# compared here; set(b) / set(a) on the raw strings compares single characters.
words_only_in_b = sorted(set(b_l).difference(a_l))

new_for_user, words_only_in_b