# Filtragem colaborativa baseada em usuario music dataset

In [1]:
#imports 
import pandas as pd
import numpy as np
import random
import time
from datetime import datetime, timedelta

#dataframe of songs
df = pd.read_csv(
    'song_data.csv', 
     names =['user_id', 'song_id', 'listen_count', 'title', 'artist']
)
df.head()

Unnamed: 0,user_id,song_id,listen_count,title,artist
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters


In [2]:
all_songs = pd.read_csv(
    'top_songs.csv' 
     )
all_songs.drop('index',1,inplace=True)

def show_song(dataframe,song_id):
    song_df = dataframe.loc[dataframe['song_id'] == song_id]
    artist = "".join(song_df['artist'].astype('str').tail(1).tolist()) 
    title = "".join(song_df['title'].astype('str').tail(1).tolist())
    return (title,artist)


all_songs.head()

Unnamed: 0,song_id,listen_count,artist,title
0,SOBONKR12A58A7A7E0,40619,Dwight Yoakam,You're The One
1,SOAUWYT12A81C206F1,36059,Björk,Undo
2,SOSXLTC12AF72A7F54,30391,Kings Of Leon,Revelry
3,SOEGIYH12A6D4FC0E3,21953,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...
4,SOFRQTD12A81C233C0,21646,Harmonia,Sehr kosmisch


Dado o dataframe all_songs e a funcao show_song podemos usar apenas a coluna song_id

In [3]:
df.drop(['title','artist'],1,inplace=True)
df.head()

Unnamed: 0,user_id,song_id,listen_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1


In [70]:
def get_user_prefs(dataframe, user):
    new_df =  dataframe.loc[dataframe['user_id'] == user]
    new_df = new_df.drop('user_id',1)
    new_df = new_df.set_index('song_id')
    quasi_dic = new_df.to_dict()
    dic = quasi_dic['listen_count']
    return dic


def sim_distance(person1,person2):
    intersection = []
    for item in person1:
        if item in person2:
            intersection.append(item)
    if len(intersection)==0:
        return 0
    sum_of_squares = np.sum([(person1[item] - person2[item])**2\
                     for item in intersection])
    return 1/(1+ sum_of_squares)


def naive_pearson(person1,person2):
    intersection = []
    for item in person1:
        if item in person2:
            intersection.append(item)
    number_of_itens = len(intersection) 
    if number_of_itens ==0:
        return 0
    sum1 = np.sum([person1[item] for item in intersection])/number_of_itens
    sum2 = np.sum([person2[item] for item in intersection])/number_of_itens
    sum1_mean = np.sum([(person1[item] - sum1)**2 for item in intersection])
    sum2_mean = np.sum([(person2[item] - sum2)**2 for item in intersection])
    pSum = np.sum([(person1[item]- sum1) * (person2[item]- sum2)\
                     for item in intersection])
    numerator = pSum 
    denominator = np.sqrt(sum1_mean)*np.sqrt(sum2_mean)
    if denominator == 0:
        return 0
    print(numerator/denominator)
    return numerator/denominator   

def sim_pearson(person1,person2):
    person1_musics = set(person1.keys()) 
    person2_musics = set(person2.keys())
    intersection = list(person1_musics.intersection(person2_musics))
    number_of_itens = len(intersection)
    if number_of_itens ==0:
        return 0
    sum1 = np.sum([person1[item] for item in intersection])
    sum2 = np.sum([person2[item] for item in intersection])
    sum1sq = np.sum([(person1[item])**2 for item in intersection])
    sum2sq = np.sum([(person2[item])**2 for item in intersection])
    pSum = np.sum([(person1[item] * person2[item])\
                     for item in intersection])
    numerator = pSum - (sum1*sum2/number_of_itens)
    denominator = np.sqrt( ((sum1sq - sum1**2)/number_of_itens) *\
                           ((sum2sq - sum2**2)/number_of_itens))
    if denominator == 0:
        return 0
    print(numerator/denominator)
    return numerator/denominator

def topMatches(prefs, person, n=5, similarity=naive_pearson):
    target_user = get_user_prefs(df,person) 
    scores = [(similarity(target_user,get_user_prefs(df,other_person)),other_person)\
              for other_person in prefs if other_person!= person]
    scores.sort(reverse=True)    
    return scores[0:n]


def list_with_no_rep(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

def recommed_items_based_on_user(prefs,person,number_of_rec=5,n=5,similarity=naive_pearson):
    start_time = time.time()
    most_similar_users = [y for (x,y) in topMatches(prefs,person,n,similarity)]
    new_films = []
    dic_target = get_user_prefs(df,person) 
    for user in most_similar_users:
        dic_user = get_user_prefs(df,user)
        for item in dic_user:
            if item not in dic_target:
                new_films.append((dic_user[item],item))
    
    new_films.sort(reverse=True)
    recomendation = list_with_no_rep([y for (x,y) in new_films])
    duration = time.time() - start_time
    sec = timedelta(seconds=int(duration))
    d_time = datetime(1,1,1) + sec
    print("duration:  %d:%d:%d:%d" % (d_time.day-1, d_time.hour, d_time.minute, d_time.second), end='')
    print(" (DAYS:HOURS:MIN:SEC)")        
    if recomendation==[]:
        return 'There is no recomendadtions for {}'.format(person)
    return recomendation[0:number_of_rec]




# Nesse caso a recomendacao baseada em usuario nao compensa

In [21]:
users = df['user_id'].unique()
size = len(users)
random1, random2 = random.sample(range(size), 2)
randon_users = [users[i] for i in random.sample(range(size), 20)]
target1, target2 = users[random1], users[random2] 
dft1 = df.loc[df['user_id'] == target1]
dft1 = dft1.sort_values('listen_count',ascending=False)
best_songs_t1 = list(map(lambda x: show_song(all_songs,x),list(dft1['song_id'])))
if len(best_songs_t1) > 5:
    print("The user target1 likes: \n", best_songs_t1[0:5])
else:
    print("The user target1 likes: \n",best_songs_t1)
print('And for him/her we recommend: \n')
print(list(map(lambda x: show_song(all_songs,x),recommed_items_based_on_user(randon_users,target1))))
print('And for him we recomend: \n')



The user target1 likes: 
 [('Invincible', 'Muse'), ('Supermassive Black Hole (Twilight Soundtrack Version)', 'Muse'), ('Creep (Explicit)', 'Radiohead'), ('Let Down', 'Radiohead'), ('Bitter Sweet Symphony', 'The Verve')]
And for him/her we recommend: 

duration:  0:0:0:2 (DAYS:HOURS:MIN:SEC)
[('Old Man Time (Album Version)', 'O.A.R.'), ('Masquerade (Reality Check Album Version)', 'Reality Check'), ("You're The One", 'Dwight Yoakam'), ('Reign Of The Tyrants', 'Jag Panzer'), ('Numb (Album Version)', 'Disturbed')]
And for him we recomend: 



# Precisamos criar o dicionario de musicas para fazer a recomendacao baseada em items que nesse caso eh bem mais factivel

In [None]:
songs = df['song_id'].unique()

dic_songs = {}

for i,song in enumerate(songs[0:100]):
    if i%10==0:
        print_parm = True
        print("\n SONG:{} ".format(i),end=' ')
    else:
        print_parm = False
    dic_songs.setdefault(song,{})
    for e,user in enumerate(users[0:100]):
        if e%10==0 and print_parm:
            print("user:{} ".format(e),end=' ')
        prefs = get_user_prefs(df,user)
        if song in prefs.keys():
            dic_songs[song][user] = prefs[song]
        

In [54]:
def sim_distance(prefs,person1,person2):
    intersection = []
    for item in prefs[person1]:
        if item in prefs[person2]:
            intersection.append(item)
    if len(intersection)==0:
        return 0
    sum_of_squares = np.sum([(prefs[person1][item] - prefs[person2][item])**2\
                     for item in intersection])
    return 1/(1+ sum_of_squares)


def topMatches(prefs, person, n=5, similarity=sim_distance):
    scores = [(similarity(prefs, person,other_person),other_person)\
              for other_person in prefs if other_person!= person]
    scores.sort(reverse=True)
    return scores[0:n]


def calculateSimilarItems(itemPrefs,n=10):
    result = {}
    c = 0
    for item in itemPrefs:
        c += 1
        if c%100==0:
            print('%d / %d' % (c,len(itemPrefs)))
        scores = topMatches(itemPrefs, item, n=n, similarity=sim_distance)
        result[item]=scores
    return result

itemsim = calculateSimilarItems(dic_songs)
print(itemsim)

100 / 100
{'SOUSQCN12A8C133302': [(0.5, 'SOKOXWU12AF72AD1BC'), (0.5, 'SOJNQZO12AF72AAE32'), (0.5, 'SOGEKGR12A6D4F81E8'), (0.20000000000000001, 'SOWEHOM12A6BD4E09E'), (0.20000000000000001, 'SOVEUVC12A6310EAF1'), (0.20000000000000001, 'SOMNTMT12A8C1400F6'), (0.10000000000000001, 'SOGDDKR12A6701E8FA'), (0.10000000000000001, 'SODGVGW12AC9075A8D'), (0.038461538461538464, 'SOWGXOP12A6701E93A'), (0.012195121951219513, 'SOXGQEM12AB0181D35')], 'SOJNQZO12AF72AAE32': [(0.5, 'SOWEHOM12A6BD4E09E'), (0.5, 'SOUSQCN12A8C133302'), (0.5, 'SOMNTMT12A8C1400F6'), (0.20000000000000001, 'SOKOXWU12AF72AD1BC'), (0.20000000000000001, 'SOGEKGR12A6D4F81E8'), (0.10000000000000001, 'SOVEUVC12A6310EAF1'), (0.058823529411764705, 'SOGDDKR12A6701E8FA'), (0.058823529411764705, 'SODGVGW12AC9075A8D'), (0.027027027027027029, 'SOWGXOP12A6701E93A'), (0.0099009900990099011, 'SOXGQEM12AB0181D35')], 'SODACBL12A8C13C273': [(1.0, 'SOZOBWN12A8C130999'), (1.0, 'SOXZQDE12A8C135833'), (1.0, 'SOXRXDG12A8C131DE5'), (1.0, 'SOXMIUS12A8C1

In [64]:
def getRecommendedItems(user, itemMatch):
    userRatings = get_user_prefs(df,user)
    scores ={}
    totalSim ={}
    for (item,rating) in userRatings.items():
        for (similarity,item2) in itemMatch[item]:
            if item2 in userRatings:
                continue
            scores.setdefault(item2,0)
            scores[item2] += similarity*rating
            totalSim.setdefault(item2,0)
            totalSim[item2] += similarity
        
    rankings = [(score/totalSim[item],item) for item,score in scores.items()]      
    rankings.sort(reverse=True)
    return rankings

In [68]:
getRecommendedItems(users[3],itemsim)



[(5.0, 'SOAUWYT12A81C206F1'),
 (nan, 'SOXHIDK12A58A7CFB3'),
 (5.0, 'SOBONKR12A58A7A7E0'),
 (nan, 'SOZOBWN12A8C130999'),
 (nan, 'SOXMIUS12A8C13CD59'),
 (nan, 'SOZMJFG12AB017BDAF'),
 (nan, 'SOXZQDE12A8C135833'),
 (nan, 'SOXRXDG12A8C131DE5'),
 (nan, 'SOXIIIM12A6D4F66C8'),
 (nan, 'SOXGQEM12AB0181D35'),
 (nan, 'SOYHEPA12A8C13097F'),
 (5.0, 'SOEGIYH12A6D4FC0E3')]

In [69]:
show_song(all_songs,'SOAUWYT12A81C206F1')

('Undo', 'Björk')

In [71]:
dft1 = df.loc[df['user_id'] == users[3]]
dft1 = dft1.sort_values('listen_count',ascending=False)
best_songs_t1 = list(map(lambda x: show_song(all_songs,x),list(dft1['song_id'])))
if len(best_songs_t1) > 5:
    print("The user target1 likes: \n", best_songs_t1[0:5])
else:
    print("The user target1 likes: \n",best_songs_t1)

The user target1 likes: 
 [('Streets On Lock', 'Young Jeezy'), ('The Way Things Go', 'Octopus Project')]


In [72]:
reco = [y for (x,y) in getRecommendedItems(users[3],itemsim)]
print(list(map(lambda x: show_song(all_songs,x),reco)))

[('Undo', 'Björk'), ("Who's Real", 'Jadakiss / Swizz Beatz / OJ Da Juiceman'), ("You're The One", 'Dwight Yoakam'), ('Holes To Heaven', 'Jack Johnson'), ('Sun Giant', 'Fleet Foxes'), ('Swimming In The Flood', 'Passion Pit'), ('Right Back', 'Sublime'), ('City Love', 'John Mayer'), ('All My Friends', 'LCD Soundsystem'), ('Speechless', 'Lady GaGa'), ('Moonshine', 'Jack Johnson'), ('Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile)', 'Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner')]


