In [1]:
import numpy as np
import pandas as pd
import networkx as nx

### Create Graph as dictionary

In [6]:
df = pd.read_csv('data/selected_triplet.csv') 
df.head()

Unnamed: 0,user,song,weight
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBNZDC12A6D4FC103,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBVFZR12A6D4F8AE3,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEGVZY12A58A7857E,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEKWEA12A6D4F5DC3,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEOBYG12A6D4F8AE2,1


In [29]:
df.sort_values('weight', ascending=False)

Unnamed: 0,user,song,weight
4011287,a263000355e6a46de29ec637820771ac7620369f,SONSTND12AB018516E,2368
629006,ebed4965c03cb572d599bd851108e7ff320c287e,SOGVKZV12A8C143863,2130
2744,0d0f80a34807aab31a3521424d456d30bf2c93d9,SOFRRFT12A8C140C5C,1890
297775,45d26d78fe726118497aca6de26a5b91a929fddf,SOZVPPQ12A8C13F7A8,1539
3449816,3eef9b511ae8b3bcf6adf0bccc6547b55bc447b2,SOXFQJW12A8C1335E2,1460
...,...,...,...
2208749,e587f88c3e16f21706f24e0265139eb7d6aac654,SOJSISI12A8AE49E2C,1
2208750,e587f88c3e16f21706f24e0265139eb7d6aac654,SOJXQSY12AB017E227,1
2208752,e587f88c3e16f21706f24e0265139eb7d6aac654,SOKOAYH12A58A7B270,1
2208754,e587f88c3e16f21706f24e0265139eb7d6aac654,SOLHUAX12A6D4F9A22,1


In [10]:
user_dic = {}
song_dic = {}
for edge in df.values.tolist():
    user_id, song_id, listened_count = edge[0], edge[1], edge[2]
    if user_id not in user_dic:
        user_dic[user_id] = {}
    user_dic[user_id][song_id] = listened_count    
    
    if song_id not in song_dic:
        song_dic[song_id] = {}
    song_dic[song_id][user_id] = listened_count


In [15]:
for user, songs in user_dic.items():
    print(user, songs)
    break

b80344d063b5ccb3212f76538f3d9e43d87dca9e {'SOBNZDC12A6D4FC103': 1, 'SOBVFZR12A6D4F8AE3': 1, 'SOEGVZY12A58A7857E': 1, 'SOEKWEA12A6D4F5DC3': 1, 'SOEOBYG12A6D4F8AE2': 1, 'SOFFJPX12A6D4F7456': 1, 'SOFTKSZ12A6D4F5DC5': 1, 'SOFZFQU12A8C13CAB8': 1, 'SOHQIAG12A8C136F64': 1, 'SOHQZCA12A6D4FB317': 1, 'SOIAOBY12A8C13BF75': 1, 'SOIQOQT12A8C136F96': 1, 'SOKSIKA12A6D4F5DC7': 1, 'SONYTAN12A8C13BF88': 1, 'SOOSIVQ12A6D4F8AE0': 1, 'SOPBCSY12A6D4F5DC4': 1, 'SOQEMEN12A8C13BF8B': 1, 'SORDDVI12A8C136F53': 1, 'SORJNVW12A8C13BF90': 1, 'SORSAJY12A6D4F7457': 1, 'SOTCPHF12A8C13BF9B': 1, 'SOTHMIK12A8C136FA1': 1, 'SOUQUBU12AF72A47B3': 1, 'SOVAJXX12A8AE47D5C': 1, 'SOVZRXZ12A58A77A88': 1, 'SOXSPON12A6D4F5DC2': 1, 'SOZGCUB12A8C133997': 1, 'SOZZHXI12A8C13BF7D': 1}


In [26]:
'''
Calculates the sum of the weights of an edgeslist of the bipartite graph
Takes in a list of edges (the ones adjacent to the user or the song.)
'''

def cal_adj_weights(dic):
#     edge_data = [G.get_edge_data(e[0],e[1]) for e in edges]
    total_weight = 0.0
    for key, weight in dic.items():
        total_weight += weight

    return (total_weight)

'''
Finds the average number of song listens by a user u.
bipart_g: is a weighted bipartite networkx Graph, where the weight = # of listens of song s by user u
It returns 1 / |S_u| Σ r_(u,s) => the average listens for a user u
'''

def mean_ru(user_u):
    songs_u = user_dic[user_u]
#     edges_u = [e for e in bipart_g.edges(user_u)]
    song_count = len(songs_u) # R_u
    listen_count = cal_adj_weights(songs_u)
    return (listen_count / song_count)


'''
Finds the 2-step random walk recommendation power for an arbitrary user v from a choosen user u
We know both users u,v
This is P(u -> s -> v), as this is a bipartite graph
v is always 2 steps from u
P(Transition from user u to song s) = r_(u,s) / R_u

P(Transition from song s to user v) = r_(v,s) / R_s
'''
def rec_power_u_v(user_u, user_v):
#     edges_u = [e for e in bipart_g.edges(user_u)]
#     edges_v = [e for e in bipart_g.edges(user_v)]
#     songs_u = [n for n in bipart_g.neighbors(user_u)]
    songs_u = user_dic[user_u]
    songs_v = user_dic[user_v]
    
    song_count_u = len(songs_u)
    listen_count_u = cal_adj_weights(songs_u) # R_u
    
    running_value = 0.0
    
    for s in songs_u:
#         users_s = [e for e in bipart_g.edges(song)] # Users that listened to song s
        users_s = song_dic[s]
        listen_count_s = cal_adj_weights(song_dic[s]) # R_s
        
        if user_v in users_s:
            user_u_listens_s = user_dic[user_u][s] # r_{u,s}
            user_v_listens_s = user_dic[user_v][s] # r_{v,s}
            running_value += user_v_listens_s * user_u_listens_s / listen_count_s
#         if bipart_g.has_edge(s, user_v):
#             user_v_listens_s = bipart_g.get_edge_data(user_v, s)['weight'] # r_v,s
#             user_u_listens_s = bipart_g.get_edge_data(user_u, s)['weight'] # r_u,s
#             running_value += user_v_listens_s * user_u_listens_s / listen_count_s
    return (running_value / listen_count_u)



'''
Predicts the number of listens that user u will give to song s
'''
def predict_rating(user_u, song_s, total_users):
    u_bar = mean_ru(user_u)
    running_value = 0.0
    for v in total_users:
        if v in song_dic[song_s]:
            v_bar = mean_ru(v)
            user_v_listens_s = user_dic[v][song_s]
            rp_u_v = rec_power_u_v(user_u, v)
            running_value += (rp_u_v * (user_v_listens_s - v_bar))            
#             user_v_listens_s = bipart_g.get_edge_data(v, s)['weight']
#             rp_u_v = rec_power_u_v(user_u, user_v, bipart_g)
#             running_value += (rp_u_v * (user_v_listens_s - v_bar))
    return u_bar + running_value

In [None]:
song_rating = []
for song in song_dic:
    rating = predict_rating('a263000355e6a46de29ec637820771ac7620369f', 'SONSTND12AB018516E', list(user_dic.keys()))
    song_rating.append((song, rating))
print(sorted(song_rating, lambda e: e[1], reversed=True)[:10])

In [4]:
G = nx.Graph()
G.add_edge('a', 'b', weight=3) 
G.add_edge('b', 'c', weight=4)
G.add_edge('a', 'c', weight=1)

In [5]:
edges = [e for e in G.edges('a')]
print(len(edges))
edge_data = [G.get_edge_data(e[0],e[1]) for e in edges]
total_weight = 0.0
for i in edge_data:
    total_weight += i['weight']
print(total_weight)

2
4.0


In [6]:
cal_adj_weights(edges)

4.0

In [6]:
mean_ru('a', G)

2.0

In [7]:
df = pd.read_csv('../data/selected_triplet_new.csv') 
df

Unnamed: 0,user,song,weight
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBNZDC12A6D4FC103,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBVFZR12A6D4F8AE3,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEGVZY12A58A7857E,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEKWEA12A6D4F5DC3,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEOBYG12A6D4F8AE2,1
...,...,...,...
5354952,cf8289419383259189afe6bb50c5115fd84f1064,SOYSPLQ12AB0185D3F,7
5354953,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOHHKGO12AC3DF57BF,1
5354954,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SONWLIS12A8C140865,2
5354955,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOQBOWE12A8C13CC2E,1


In [27]:
BiPart_G = nx.Graph()

for idx, row in df.iterrows():
    if idx % 100000 == 0: print(idx)
    if idx == 500000: break
    u = row['user']
    s = row['song']
    BiPart_G.add_node(u)
    BiPart_G.add_node(s)
    BiPart_G.add_edge(u, s, weight = int(row['weight']))


0
100000
200000
300000
400000
500000


In [30]:
predict_rating('b80344d063b5ccb3212f76538f3d9e43d87dca9e', 'SOTNMFD12A58A7789E', total_users = 500000, bipart_g = BiPart_G)

TypeError: predict_rating() missing 1 required positional argument: 'total_users'