In [1]:
import pandas as pd 
import numpy as np
import os 

artists = "artists.dat"
tags = "tags.txt"
user_artists = "user_artists.dat"
user_friends = "user_friends.dat"
user_taggedartists_timestamps = "user_taggedartists-timestamps.dat"
user_taggedartists = "user_taggedartists.dat"

**Data statistics**
---------------
   1892 users
   17632 artists
      
   12717 bi-directional user friend relations, i.e. 25434 (user_i, user_j) pairs
         avg. 13.443 friend relations per user
         
   92834 user-listened artist relations, i.e. tuples [user, artist, listeningCount]
         avg. 49.067 artists most listened by each user
         avg. 5.265 users who listened each artist
            
   11946 tags  
   
  186479 tag assignments (tas), i.e. tuples [user, tag, artist]
         avg. 98.562 tas per user
         avg. 14.891 tas per artist
         avg. 18.930 distinct tags used by each user
         avg. 8.764 distinct tags used for each artist

**Files**
----- 
   * artists.dat
   
    This file contains information about music artists listened and tagged by the users.
   
   * tags.dat
   
   	This file contains the set of tags available in the dataset.

   * user_artists.dat
   
        This file contains the artists listened by each user.
        
        It also provides a listening count for each [user, artist] pair.

   * user_taggedartists.dat - user_taggedartists-timestamps.dat
   
        These files contain the tag assignments of artists provided by each particular user.
        
        They also contain the timestamps when the tag assignments were done.
   
   * user_friends.dat
   
   	These files contain the friend relations between users in the database.
     

In [2]:
print(pd.read_table(user_artists).head())
print("---------------------------------")
print(pd.read_table(user_taggedartists).head())
print("---------------------------------")
print(pd.read_table(tags,encoding = "utf-8").head())
print("---------------------------------")
print(pd.read_table(user_friends).head())
print("---------------------------------")
print(pd.read_table(artists).head())
print("---------------------------------")
#print(pd.read_table(user_taggedartists_timestamps).head()) #don't need timestamps

   userID  artistID  weight
0       2        51   13883
1       2        52   11690
2       2        53   11351
3       2        54   10300
4       2        55    8983
---------------------------------
   userID  artistID  tagID  day  month  year
0       2        52     13    1      4  2009
1       2        52     15    1      4  2009
2       2        52     18    1      4  2009
3       2        52     21    1      4  2009
4       2        52     41    1      4  2009
---------------------------------
   tagID           tagValue
0      1              metal
1      2  alternative metal
2      3          goth rock
3      4        black metal
4      5        death metal
---------------------------------
   userID  friendID
0       2       275
1       2       428
2       2       515
3       2       761
4       2       831
---------------------------------
   id               name                                         url  \
0   1       MALICE MIZER       http://www.last.fm/music/MALICE+MIZ

In [3]:
#Setting a total table of merge 2 sub tables
total_table = pd.read_table(user_taggedartists)
user_artists_df = pd.read_table(user_artists)
tags_df = pd.read_table(tags,encoding = "utf-8")
user_friends_df = pd.read_table(user_friends)

total_table.drop(["day","month","year"],axis = 1,inplace = True)
total_table = pd.merge(total_table,tags_df,on = "tagID")

def fun(word):
    word = "%s" % ' '.join(word)
    return word

artist_tags = total_table.groupby("artistID")["tagValue"].apply(fun)

list_word = []
for i in artist_tags[0:]:
    list_word.append(list(set(str.split(i))))
    
new_artist_tags = list(map(" ".join,list_word))
artist_tags_df = pd.DataFrame(artist_tags)
artist_tags_df["tagValue"] = new_artist_tags

In [4]:
#artist_tags_df.head()
artist_tags_df['artistID'] = artist_tags_df.index
total_table = total_table.drop("tagValue",axis = 1)
total_table = pd.merge(artist_tags_df,total_table,how = "left")
total_table = total_table[["userID","artistID","tagID","tagValue"]]

In [5]:
#total_table = pd.merge(total_table,user_artists_df,how="left") #如果需要weight的数据，再去并
#total_table = pd.merge(total_table,user_friends_df,how="left") #如果需要friend的数据，再去并

In [6]:
total_table.head() ##最终总表

Unnamed: 0,userID,artistID,tagID,tagValue
0,681,1,139,gaga gothic lady japanese visual weeabo j-rock...
1,1545,1,139,gaga gothic lady japanese visual weeabo j-rock...
2,1730,1,139,gaga gothic lady japanese visual weeabo j-rock...
3,1929,1,139,gaga gothic lady japanese visual weeabo j-rock...
4,1984,1,139,gaga gothic lady japanese visual weeabo j-rock...


In [7]:
'''
#total_table.tagID
list_str = list(map(str,total_table.tagID))
tagID_df = pd.DataFrame(total_table.tagID)
tagID_df["str"] = list_str
'''

'\n#total_table.tagID\nlist_str = list(map(str,total_table.tagID))\ntagID_df = pd.DataFrame(total_table.tagID)\ntagID_df["str"] = list_str\n'

In [8]:
total_table_c = total_table.drop_duplicates(subset = ["userID","artistID","tagValue"])

In [9]:
if __name__ == '__main__':
    total_table_c.drop("tagID",axis = 1,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [10]:
total_table_c.head()

Unnamed: 0,userID,artistID,tagValue
0,681,1,gaga gothic lady japanese visual weeabo j-rock...
1,1545,1,gaga gothic lady japanese visual weeabo j-rock...
2,1730,1,gaga gothic lady japanese visual weeabo j-rock...
3,1929,1,gaga gothic lady japanese visual weeabo j-rock...
4,1984,1,gaga gothic lady japanese visual weeabo j-rock...


In [11]:
total_table = pd.merge(total_table_c,user_artists_df,how="left") #如果需要weight的数据，再去并
total_table = pd.merge(total_table_c,user_friends_df,how="left") #如果需要friend的数据，再去并

In [35]:
print(total_table_c[total_table_c.artistID == 2].tagValue)

15    vocal industrial german gothic true goth rock ...
16    vocal industrial german gothic true goth rock ...
17    vocal industrial german gothic true goth rock ...
20    vocal industrial german gothic true goth rock ...
21    vocal industrial german gothic true goth rock ...
23    vocal industrial german gothic true goth rock ...
24    vocal industrial german gothic true goth rock ...
26    vocal industrial german gothic true goth rock ...
29    vocal industrial german gothic true goth rock ...
36    vocal industrial german gothic true goth rock ...
37    vocal industrial german gothic true goth rock ...
41    vocal industrial german gothic true goth rock ...
Name: tagValue, dtype: object


#### 相似度评价

In [13]:
total_table_c.head()

Unnamed: 0,userID,artistID,tagValue
0,681,1,gaga gothic lady japanese visual weeabo j-rock...
1,1545,1,gaga gothic lady japanese visual weeabo j-rock...
2,1730,1,gaga gothic lady japanese visual weeabo j-rock...
3,1929,1,gaga gothic lady japanese visual weeabo j-rock...
4,1984,1,gaga gothic lady japanese visual weeabo j-rock...


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

tf = TfidfVectorizer(ngram_range = (1,3),stop_words = 'english')
dtm = tf.fit_transform(total_table_c.tagValue)

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(dtm[0,0:],dtm[10000,0:])

array([[ 0.00055041]])

In [15]:
###sim = linear_kernel(dtm)

In [16]:
#from sklearn.metrics.pairwise import cosine_similarity
#sim = cosine_similarity(dtm)