In [5]:
import pandas as pd
import numpy as np
from collections import Counter
import math
import json

In [6]:
df = pd.read_csv("user-song_rating.tsv", sep='\t', header=None,
names=[
        'user_id', 'timestamp', 'artist_id', 'artist_name', 'track_id', 'track_name'
    ],
skiprows=[
        2120260-1, 2446318-1, 11141081-1,
        11152099-1, 11152402-1, 11882087-1,
        12902539-1, 12935044-1, 17589539-1
    ]
,parse_dates=["timestamp"])

In [7]:
df.head()

Unnamed: 0,user_id,timestamp,artist_id,artist_name,track_id,track_name
0,user_000001,2009-05-04 23:08:57+00:00,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04 13:54:10+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04 13:52:04+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04 13:42:52+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04 13:42:11+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)


In [8]:
df["year"]=df["timestamp"].dt.year
df["month"]=df["timestamp"].dt.month

In [9]:
# major portion of the data is from 3 years (2006-2008) and hence we will use these 3 years
df.groupby(["year"])[["user_id"]].count()

Unnamed: 0_level_0,user_id
year,Unnamed: 1_level_1
2005,1070320
2006,4251472
2007,5323832
2008,5917696
2009,2535531
2010,1
2013,1


In [10]:
df=df.loc[df.year.isin([2006,2007,2008])].reset_index(drop=True)

In [11]:
df.head()

Unnamed: 0,user_id,timestamp,artist_id,artist_name,track_id,track_name,year,month
0,user_000001,2008-12-31 16:53:59+00:00,4967c0a1-b9f3-465e-8440-4598fd9fc33c,Enya,bd915801-db41-4b75-9b0a-94c34d1fa387,Trains And Winter Rains,2008,12
1,user_000001,2008-12-31 16:50:19+00:00,4967c0a1-b9f3-465e-8440-4598fd9fc33c,Enya,,O Come O Come Emmanuel [Album]/Album,2008,12
2,user_000001,2008-12-31 16:47:19+00:00,4967c0a1-b9f3-465e-8440-4598fd9fc33c,Enya,307307e5-aa4a-4de4-a96e-5a260fb3bb80,White Is In The Winter Night,2008,12
3,user_000001,2008-12-31 16:42:32+00:00,4967c0a1-b9f3-465e-8440-4598fd9fc33c,Enya,14f3f3b8-5750-4343-ab9b-30de8ed78ebb,Journey Of The Angels,2008,12
4,user_000001,2008-12-31 16:39:16+00:00,4967c0a1-b9f3-465e-8440-4598fd9fc33c,Enya,,And Winter Came [Album]/Album,2008,12


In [12]:
df.user_id.nunique()

942

In [13]:
df.loc[df["track_id"].isna()]["track_name"].value_counts()

Intro                           2097
Untitled                        1834
Bonus Track                     1680
There There                      952
La Paloma                        881
                                ... 
Montana Ft. Deceptikon             1
02 - Into The Light                1
The Pod. El Manor De La Nova       1
A1 - Me Vs. Caplocks               1
Truly & Really                     1
Name: track_name, Length: 382498, dtype: int64

In [14]:
df=df.loc[~df["track_name"].isna()]

In [15]:
df=df.loc[~df["track_name"].isna()].reset_index(drop=True)

In [16]:
df.groupby(["year"])[["user_id"]].count()

Unnamed: 0_level_0,user_id
year,Unnamed: 1_level_1
2006,4251461
2007,5323832
2008,5917696


In [17]:
user2idx={}
song2idx={}
#count=0
user_set=set(df.user_id.values)
song_set=set(df.track_name.values)

In [18]:
count=0
for i in user_set:
    user2idx[i]=count
    count+=1

In [19]:
count=0
for i in song_set:
    song2idx[i]=count
    count+=1

In [20]:
df["user_idx"]=df.apply(lambda x: user2idx[x.user_id],axis=1)
df["song_idx"]=df.apply(lambda x: song2idx[x.track_name],axis=1)

In [21]:
pd.Series(song2idx.keys())[pd.Series(song2idx.keys()).str.contains('ghetto|Ghetto')]

5117             Symphony In C Major: Larghetto
7802             Ghetto Fabulous (Dopamine Mix)
9145                     Don'T Call It A Ghetto
9817                 Ghetto Box (Album Version)
10653                               Dave Ghetto
                          ...                  
939938    Ghetto Children Ft. Element Scripture
940013                              Ghetto Blue
940226                       Born In Tha Ghetto
941343                          Dub Inna Ghetto
949467                 Ghetto Musick (Club Mix)
Length: 605, dtype: object

In [22]:
max_year=df['year'].max()
max_month=12

In [23]:
df.loc[:,"freq"]=1

In [24]:
df_subset=df.groupby(["user_idx","song_idx","year","month"],as_index=False).agg({"freq":"sum"}).assign(rel_weight=lambda x: 1-((12*(max_year-x.year) + (max_month-x.month))/36))

In [25]:
df_subset.loc[:,"weighted_freq"]=df_subset["freq"]*df_subset["rel_weight"]

In [26]:
df_interaction=df_subset.groupby(["user_idx","song_idx"],as_index=False).agg({"freq":'sum',"weighted_freq":"sum"})

In [27]:
df_subset

Unnamed: 0,user_idx,song_idx,year,month,freq,rel_weight,weighted_freq
0,0,83,2007,12,1,0.666667,0.666667
1,0,83,2008,4,2,0.777778,1.555556
2,0,117,2007,12,4,0.666667,2.666667
3,0,117,2008,12,1,1.000000,1.000000
4,0,165,2007,9,1,0.583333,0.583333
...,...,...,...,...,...,...,...
8520524,941,946129,2008,10,3,0.944444,2.833333
8520525,941,947470,2007,9,1,0.583333,0.583333
8520526,941,949006,2007,3,1,0.416667,0.416667
8520527,941,949188,2007,3,1,0.416667,0.416667


## Computing Inverse Song Frequency

In [28]:
df_1=df_subset.groupby(["year","month"],as_index=False)["user_idx"].nunique().rename(columns={"user_idx":"n_users"})
df_2=df_subset.groupby(["year","month","song_idx"],as_index=False)["user_idx"].nunique().rename(columns={"user_idx":"df"})
df_merge=df_2.merge(df_1,how="inner",on=["year","month"])

In [29]:
df_merge["inverse_song_freq"]=df_merge.apply(lambda x:math.log((x.n_users/(1+x.df)),2),axis=1)

## Computing TF-IDF

In [30]:
df_final=df_subset.merge(df_merge,how="inner",on=["year","month","song_idx"])

In [31]:
df_final.loc[:,"tf-idf"]=df_final["weighted_freq"]*df_final["inverse_song_freq"]

In [32]:
df_final=df_final.groupby(["user_idx","song_idx"],as_index=False)["tf-idf"].sum()

In [56]:
df_final.shape

(3696755, 3)

In [35]:
# a dictionary to look up ratings
usersong2tfidf = {}
print("Calling: update_user2song_and_song2user")
count = 0

Calling: update_user2song_and_song2user


In [58]:
df_final[df_final["user_idx"]==0]

Unnamed: 0,user_idx,song_idx,tf-idf
0,0,83,17.286297
1,0,117,30.094128
2,0,165,17.772142
3,0,474,2.023054
4,0,720,9.506027
...,...,...,...
5348,0,949669,6.798296
5349,0,949700,22.020305
5350,0,950086,8.375039
5351,0,950548,12.604108


In [37]:
def update_user2song_and_song2user(row):
    
    global count
    count += 1
#     if count % 100000 == 0:
#         print("processed: %.3f" % (float(count)/cutoff))

    i = int(row.user_idx)
    j = int(row.song_idx)

    usersong2tfidf[(i,j)] = row["tf-idf"]

In [40]:
df_final.apply(update_user2song_and_song2user,axis=1)

0          None
1          None
2          None
3          None
4          None
           ... 
3696750    None
3696751    None
3696752    None
3696753    None
3696754    None
Length: 3696755, dtype: object

In [47]:
usersong2tfidf_new = {str(key): val for key, val in usersong2tfidf.items()}

In [50]:
with open('user_song_tfidf.txt', 'w') as convert_file:
     convert_file.write(json.dumps(usersong2tfidf_new))

In [51]:
data = json.load(open('user_song_tfidf.txt'))

In [54]:
import ast
a = '(0,2)'
ast.literal_eval(a)

(0, 2)

In [55]:
new_data = {ast.literal_eval(key): val for key, val in data.items()}
new_data

{(0, 83): 17.28629721437562,
 (0, 117): 30.094128078898613,
 (0, 165): 17.77214150723554,
 (0, 474): 2.023054008169132,
 (0, 720): 9.506027043518767,
 (0, 746): 7.556242818183537,
 (0, 773): 7.818535599430455,
 (0, 779): 14.536337920106739,
 (0, 800): 7.832641118955218,
 (0, 1435): 44.25023658808155,
 (0, 1461): 60.248783415427454,
 (0, 1538): 5.887761274999198,
 (0, 1559): 11.119505807354601,
 (0, 1576): 6.383233360138082,
 (0, 1702): 5.465288357811696,
 (0, 2134): 16.554792783085247,
 (0, 2374): 9.479551152853983,
 (0, 2414): 19.05597161959744,
 (0, 2418): 4.9330354286916585,
 (0, 2460): 4.963336753432104,
 (0, 2691): 11.405106600444757,
 (0, 2731): 10.859544323775845,
 (0, 2955): 5.657781197876548,
 (0, 2970): 5.429772161887922,
 (0, 3131): 7.114581672402099,
 (0, 3161): 4.743583227021795,
 (0, 3264): 7.818535599430455,
 (0, 3396): 11.210911164720606,
 (0, 3518): 14.149681249843969,
 (0, 3888): 12.731808751248991,
 (0, 3965): 8.620193878359146,
 (0, 3983): 8.829237820589636,
 (0, 40

In [68]:
g=df_final.groupby(['user_idx'])["tf-idf"]

In [69]:
df_final.loc[:,"Rating"]=g.transform(lambda x:pd.cut(x.rank(pct=True),bins=5,labels=[1,2,3,4,5]))

In [70]:
df_final[["user_idx","song_idx","Rating"]].to_csv("user_song_interacting.csv")

In [71]:
df_final.head()

Unnamed: 0,user_idx,song_idx,tf-idf,Rating
0,0,83,17.286297,5
1,0,117,30.094128,5
2,0,165,17.772142,5
3,0,474,2.023054,1
4,0,720,9.506027,3


In [54]:
with open('song2idx.txt', 'w') as convert_file:
     convert_file.write(json.dumps(song2idx))

In [55]:
with open('user2idx.txt', 'w') as convert_file:
     convert_file.write(json.dumps(user2idx))