# Recommendation by Item popularity

In [1]:
#imports 
import pandas as pd
import numpy as np

# Load the song data, supplying the five column names explicitly.
# NOTE(review): passing `names` without `header` makes pandas treat the first
# line of song_data.csv as data — confirm the file really has no header row.
df = pd.read_csv(
    'song_data.csv',
    names=['user_id', 'song_id', 'listen_count', 'title', 'artist'],
)


# top_k_items builds a dataframe of items ranked by popularity.
# The items are sorted by their total `count` in descending order.
# list_of_product_features lists the extra item columns (features)
# that should be displayed next to each item.
# If all_items=True, the function returns the dataframe with all the
# items (k is ignored); otherwise it returns only the top k items.

def top_k_items(dataframe,
                k,
                user,
                item,
                count,
                list_of_product_features=None,
                all_items=False):
    """Rank items by their summed ``count`` and return the most popular ones.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with one row per observation. It must contain the ``item`` and
        ``count`` columns and every column named in
        ``list_of_product_features``.
    k : int
        Number of top items to return. Ignored when ``all_items`` is true.
        When ``k`` is larger than the number of available rows, all rows are
        returned; a negative ``k`` yields an empty dataframe.
    user : str
        Name of the user column. Accepted for interface compatibility only:
        the user column takes no part in the aggregation.
    item : str
        Name of the column identifying the items; rows are grouped by it.
    count : str
        Name of the numeric column that is summed per item and used for
        ranking.
    list_of_product_features : list of str, optional
        Extra columns of ``dataframe`` to attach to each item. ``None`` or an
        empty list means no extra columns.
    all_items : bool, default False
        If true, return every item instead of only the first ``k``.

    Returns
    -------
    pandas.DataFrame
        Columns ``item``, ``count`` (summed per item) and then the requested
        features, sorted by ``count`` in descending order with a fresh
        ``0..n-1`` index.
    """
    # A None default avoids sharing one mutable list between calls.
    if list_of_product_features is None:
        features = []
    else:
        features = list(list_of_product_features)

    # Sum only the count column: summing the whole frame would also try to
    # "sum" text columns, and the user column is irrelevant to the ranking.
    ranked = (
        dataframe
        .groupby(item)[count]
        .sum()
        .reset_index()
        # A stable sort keeps tied items in group-key order, so the result
        # is deterministic.
        .sort_values(count, ascending=False, kind="mergesort")
        .reset_index(drop=True)
    )

    if features:
        # One row per distinct (item, features...) combination. An item that
        # occurs with several different feature values therefore produces
        # several rows after the merge.
        feature_table = dataframe[[item] + features].drop_duplicates()
        ranked = pd.merge(ranked, feature_table, on=[item])

    if not all_items:
        # head() never fails when k exceeds the number of rows, unlike a
        # label lookup with .loc[range(k)].
        ranked = ranked.head(max(k, 0))
    return ranked


This is our original dataframe

In [2]:
# Preview the first five rows of the loaded dataframe.
df.head(n=5)

Unnamed: 0,user_id,song_id,listen_count,title,artist
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters


For the first test we want the top 10 items with no extra information about them: only their ids and their total listen counts.

In [3]:
# Column roles passed to top_k_items in this and the following cells.
user = 'user_id'
item = 'song_id'
count = 'listen_count'

# Top 10 songs; no feature list is passed.
test1 = top_k_items(df, 10, user, item, count)
test1

Unnamed: 0,song_id,listen_count
0,SOBONKR12A58A7A7E0,40619
1,SOAUWYT12A81C206F1,36059
2,SOSXLTC12AF72A7F54,30391
3,SOEGIYH12A6D4FC0E3,21953
4,SOFRQTD12A81C233C0,21646
5,SOAXGDH12A8C13F8A1,15889
6,SONYKOW12AB01849C9,14149
7,SOVDSJC12A58A7A271,14029
8,SOUFTBI12AB0183F65,13701
9,SOHTKMO12AB01843B0,12506


The function can also return the dataframe with all the items; in that case the value of k is ignored.

In [4]:
# With all_items=True the k argument (10 here) is ignored; show the last rows
# of the full ranking.
test2 = top_k_items(
    df,
    10,
    user,
    item,
    count,
    all_items=True,
)
test2.tail()

Unnamed: 0,song_id,listen_count
9995,SOFFCOP12A8C1422E3,18
9996,SOOUXUD12AB0188D97,17
9997,SOBZVFU12A6702162C,16
9998,SOBDQMB12AB0189045,14
9999,SOPWWHY12A58A7B015,12


Now we add some features to have more information about the items

In [5]:
# Extra item columns to show next to each song id.
features = ['artist', 'title']

# Top 30 songs together with the selected features.
test3 = top_k_items(df, 30, user, item, count, features)
test3

Unnamed: 0,song_id,listen_count,artist,title
0,SOBONKR12A58A7A7E0,40619,Dwight Yoakam,You're The One
1,SOAUWYT12A81C206F1,36059,Björk,Undo
2,SOSXLTC12AF72A7F54,30391,Kings Of Leon,Revelry
3,SOEGIYH12A6D4FC0E3,21953,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...
4,SOFRQTD12A81C233C0,21646,Harmonia,Sehr kosmisch
5,SOAXGDH12A8C13F8A1,15889,Florence + The Machine,Dog Days Are Over (Radio Edit)
6,SONYKOW12AB01849C9,14149,OneRepublic,Secrets
7,SOVDSJC12A58A7A271,14029,Sam Cooke,Ain't Misbehavin
8,SOUFTBI12AB0183F65,13701,Tub Ring,Invalid
9,SOHTKMO12AB01843B0,12506,Lonnie Gordon,Catch You Baby (Steve Pitron & Max Sanna Radio...
