In [1]:
!pip3 install joblib

Collecting joblib
  Using cached https://files.pythonhosted.org/packages/8f/42/155696f85f344c066e17af287359c9786b436b1bf86029bb3411283274f3/joblib-0.14.0-py2.py3-none-any.whl
Installing collected packages: joblib
Successfully installed joblib-0.14.0


In [2]:
import numpy as np
import pandas as pd
import networkx as nx
import os
from joblib import Parallel, delayed
# Ask the external `taskset` tool to set this process's CPU-affinity mask to 0xff
# (CPUs 0-7).
# NOTE(review): presumably a workaround so the joblib workers used later are not
# confined to a single core after the numeric imports above - confirm.
# The saved output (32512) is a non-zero status; under POSIX wait-status encoding
# it decodes to exit code 127, which shells use for "command not found", so the
# call does not appear to have taken effect on the recorded run.
os.system("taskset -p 0xff %d" % os.getpid())

32512

### Create Graph as dictionary

In [3]:
# Load the listening triplets; per the preview below, each row is (user, song, weight).
df = pd.read_csv('./data/selected_triplet_full_dataset.csv')
df.head()

Unnamed: 0,user,song,weight
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [4]:
# Preview the rows with the largest `weight` to see how extreme the heaviest listen counts are.
df.sort_values('weight', ascending=False).head()

Unnamed: 0,user,song,weight
31847879,093cb74eb3c517c5179ae24caf0ebec51b24d2a2,SOAOSDF12A58A779F1,9667
18651017,c11dea7d1f4d227b98c5f2a79561bf76884fcf10,SOZTEZR12A8C14204B,3534
12857055,d8e6fa08d73821f305b9a3af1cf1e0a704473d82,SOBONKR12A58A7A7E0,3532
48274295,1854daf178674bbac9a8ed3d481f95b76676b414,SOVLAWN12A81C234AB,2948
30142955,69807196f964e5b063af898fd1cb098fab2e6a1f,SOVQQJO12AB0182328,2381


In [5]:
# Build both sides of the bipartite user-song graph as nested dicts:
#   user_dic[user][song] -> listen count
#   song_dic[song][user] -> listen count
# The two dicts are filled from the same rows, so user_dic[u][s] == song_dic[s][u].
user_dic = {}
song_dic = {}

# Columns are selected by name (not by position) and streamed with zip, so the
# whole frame is never copied into an intermediate Python list of lists.
for user_id, song_id, listened_count in zip(df['user'], df['song'], df['weight']):
    user_dic.setdefault(user_id, {})[song_id] = listened_count
    song_dic.setdefault(song_id, {})[user_id] = listened_count


In [6]:
def cal_adj_weights(dic):
    '''
    Sum the weights of the edges adjacent to one node of the bipartite graph.

    dic: mapping of neighbour id -> edge weight (the adjacency dict of a user
         or of a song).
    Returns the total as a float (0.0 for an empty mapping).
    '''
    running_total = 0.0
    for edge_weight in dic.values():
        running_total = running_total + edge_weight
    return running_total

def mean_ru(user_u):
    '''
    Average number of listens per distinct song for user u.

    Looks user_u up in the module-level user_dic (user -> {song: listens}) and
    returns (1 / |S_u|) * Σ r_(u,s), where S_u is the set of songs u listened to.
    Raises KeyError if user_u is not a key of user_dic.
    '''
    listens_by_song = user_dic[user_u]
    total_listens = cal_adj_weights(listens_by_song)  # Σ r_(u,s)
    distinct_songs = len(listens_by_song)             # |S_u|
    return total_listens / distinct_songs


def rec_power_u_v(user_u, user_v):
    '''
    Two-step random-walk recommendation power of user v as seen from user u.

    The graph is bipartite, so v is always reached from u through a song s:
        P(u -> s -> v) = Σ_s [ r_(u,s) / R_u ] * [ r_(v,s) / R_s ]
    where
        r_(x,s) = listens of song s by user x,
        R_u     = total listens of user u,
        R_s     = total listens of song s over all of its listeners.
    Only songs that both users listened to contribute to the sum.

    Reads the module-level user_dic, song_dic and cal_adj_weights.
    Returns a float; 0.0 when the two users share no song.
    Raises KeyError if either user is missing from user_dic.
    '''
    songs_u = user_dic[user_u]
    songs_v = user_dic[user_v]

    listen_count_u = cal_adj_weights(songs_u)  # R_u

    running_value = 0.0
    # Walk u's songs in dict order and probe v's dict directly: no temporary
    # sets are built per call, and the summation order no longer depends on
    # set hashing.
    for s, user_u_listens_s in songs_u.items():   # r_{u,s}
        user_v_listens_s = songs_v.get(s)         # r_{v,s}; None if v never played s
        if user_v_listens_s is None:
            continue
        listen_count_s = cal_adj_weights(song_dic[s])  # R_s
        running_value += user_v_listens_s * user_u_listens_s / listen_count_s

    return running_value / listen_count_u



def predict_listens(user_u, song_s):
    '''
    Predict the number of listens that user u will give to song s.

    Starts from u's own mean listens per song and adds, for every user v that
    listened to s, v's deviation from their own mean on s, weighted by the
    recommendation power of v relative to u:
        prediction = mean_ru(u) + Σ_v rec_power_u_v(u, v) * (r_(v,s) - mean_ru(v))

    Reads the module-level user_dic and song_dic.
    Raises KeyError if user_u or song_s is unknown.
    '''
    baseline = mean_ru(user_u)
    adjustment = 0.0
    for listener in song_dic[song_s]:
        listener_mean = mean_ru(listener)
        listens_on_song = user_dic[listener][song_s]  # r_(v,s)
        weight_uv = rec_power_u_v(user_u, listener)
        adjustment += weight_uv * (listens_on_song - listener_mean)
    return baseline + adjustment

In [7]:
# Smoke-test the predictor on one (user, song) pair before scoring every song.
# NOTE(review): the saved output (~1538.8) is far larger than the listen counts
# previewed by df.head(); confirm the prediction is expected to be on that scale.
rating = predict_listens('a263000355e6a46de29ec637820771ac7620369f', 'SONSTND12AB018516E')
print(rating)

1538.8014243082944


In [8]:
def predict_wrapper(user_u, song_s):
    '''Return (song_s, predicted listens of song_s by user_u) so each score stays paired with its song.'''
    return (song_s, predict_listens(user_u, song_s))

def song_wrapper(user, n, song_dic, user_dic, n_jobs=10):
    '''
    Score every song for one user in parallel and return the n best.

    user:     id of the user to predict for.
    n:        number of top-scoring (song, predicted_listens) pairs to return.
    song_dic: mapping whose keys are the candidate song ids to score.
    user_dic: not used in this function; kept so existing calls keep working.
              predict_wrapper does not receive it.
    n_jobs:   number of concurrent joblib workers (default 10, as before).

    Returns a list of (song_id, predicted_listens) tuples sorted by predicted
    listens, highest first, truncated to n entries.
    '''
    # The threading backend is what joblib was already using: `prefer` is only
    # a soft hint that is ignored once `backend` is given explicitly, so the
    # contradictory prefer='processes' has been dropped.
    listens = Parallel(n_jobs=n_jobs, verbose=20, backend='threading')(
        delayed(predict_wrapper)(user_u=user, song_s=song) for song in song_dic
    )
    return sorted(listens, key=lambda x: x[1], reverse=True)[:n]

In [None]:
# Top-100 predicted songs for this user, scoring every song in song_dic.
# Long-running: the saved log below shows individual tasks taking seconds each.
data = song_wrapper('a263000355e6a46de29ec637820771ac7620369f', 100, song_dic, user_dic)

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 tasks      | elapsed:    2.1s
[Parallel(n_jobs=10)]: Done   2 tasks      | elapsed:    6.1s
[Parallel(n_jobs=10)]: Done   3 tasks      | elapsed:   18.6s
[Parallel(n_jobs=10)]: Done   4 tasks      | elapsed:   22.1s
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed:   32.7s
[Parallel(n_jobs=10)]: Done   6 tasks      | elapsed:   37.1s
[Parallel(n_jobs=10)]: Done   7 tasks      | elapsed:   38.0s
[Parallel(n_jobs=10)]: Done   8 tasks      | elapsed:   43.5s
[Parallel(n_jobs=10)]: Done   9 tasks      | elapsed:   45.7s
[Parallel(n_jobs=10)]: Done  10 tasks      | elapsed:   50.2s
[Parallel(n_jobs=10)]: Done  11 tasks      | elapsed:   54.0s
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:  1.1min
[Parallel(n_jobs=10)]: Done  13 tasks      | elapsed:  1.2min
[Parallel(n_jobs=10)]: Done  14 tasks      | elapsed:  1.3min
[Parallel(n_jobs=10)]: Done  15 tasks      | elaps

In [None]:
# song_wrapper already returns the pairs sorted by predicted listens (highest first),
# so the list is printed as-is.
print(data)