# Build a song recommender system

In [53]:
import turicreate

# Load some music data

In [54]:
# Read the song listening data from the on-disk SFrame directory.
song_data = turicreate.SFrame(data='song_data.sframe/')

# Explore our data

In [55]:
# Bare expression as the cell's last line: renders a preview of the SFrame.
song_data

user_id,song_id,listen_count,title,artist
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOAKIMP12A8C130995,1,The Cove,Jack Johnson
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBXHDL12A81C204C0,1,Stronger,Kanye West
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SODDNQT12A6D4F5F7E,5,Apuesta Por El Rock 'N' Roll ...,Héroes del Silencio
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SODXRTY12AB0180F3B,1,Paper Gangsta,Lady GaGa
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOFGUAY12AB017B0A8,1,Stacked Actors,Foo Fighters
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOFRQTD12A81C233C0,1,Sehr kosmisch,Harmonia
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOHQWYZ12A6D4FA701,1,Heaven's gonna burn your eyes ...,Thievery Corporation feat. Emiliana Torrini ...

song
The Cove - Jack Johnson
Entre Dos Aguas - Paco De Lucia ...
Stronger - Kanye West
Constellations - Jack Johnson ...
Learn To Fly - Foo Fighters ...
Apuesta Por El Rock 'N' Roll - Héroes del ...
Paper Gangsta - Lady GaGa
Stacked Actors - Foo Fighters ...
Sehr kosmisch - Harmonia
Heaven's gonna burn your eyes - Thievery ...


## Show the most popular songs in the dataset

In [56]:
# NOTE(review): the visualization call below is commented out, so this cell
# currently produces no output — confirm whether it should be re-enabled.
#song_data['song'].show()

# Count the number of unique users in the data

In [57]:
# Distinct user ids appearing anywhere in the dataset.
users = song_data.select_column('user_id').unique()

In [58]:
# Number of distinct users in the dataset.
len(users)

66346

# Create a song recommender

In [59]:
# Split the rows 80/20 into training and test sets; the fixed seed keeps the split repeatable.
train_data, test_data = song_data.random_split(fraction=0.8, seed=0)

## Create a very simple popularity recommender

In [60]:
# Popularity-based baseline recommender, trained on the training split only.
popularity_model = turicreate.popularity_recommender.create(
    observation_data=train_data,
    user_id='user_id',
    item_id='song',
)

## Use the popularity model to make some predictions

In [61]:
# Recommendations for one user: the first entry of the unique-user array.
popularity_model.recommend(users=[users[0]])

user_id,song,score,rank
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Sehr kosmisch - Harmonia,4754.0,1
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Undo - Björk,4227.0,2
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,You're The One - Dwight Yoakam ...,3781.0,3
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Dog Days Are Over (Radio Edit) - Florence + The ...,3633.0,4
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Revelry - Kings Of Leon,3527.0,5
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Horn Concerto No. 4 in E flat K495: II. Romance ...,3161.0,6
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Secrets - OneRepublic,3148.0,7
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Hey_ Soul Sister - Train,2538.0,8
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Fireflies - Charttraxx Karaoke ...,2532.0,9
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Tive Sim - Cartola,2521.0,10


In [62]:
# Same query for a second user, to compare against the previous cell's output.
popularity_model.recommend(users=[users[1]])

user_id,song,score,rank
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Sehr kosmisch - Harmonia,4754.0,1
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Undo - Björk,4227.0,2
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,You're The One - Dwight Yoakam ...,3781.0,3
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Dog Days Are Over (Radio Edit) - Florence + The ...,3633.0,4
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Revelry - Kings Of Leon,3527.0,5
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Horn Concerto No. 4 in E flat K495: II. Romance ...,3161.0,6
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Secrets - OneRepublic,3148.0,7
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Hey_ Soul Sister - Train,2538.0,8
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Fireflies - Charttraxx Karaoke ...,2532.0,9
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Tive Sim - Cartola,2521.0,10


# Build a recommender with personalization

In [63]:
# Item-similarity recommender trained on the same training split as the baseline.
personalized_model = turicreate.item_similarity_recommender.create(
    observation_data=train_data,
    user_id='user_id',
    item_id='song',
)

## Apply personalized model to make song recommendations

In [64]:
# Recommendations from the item-similarity model for the first unique user.
personalized_model.recommend(users=[users[0]])

user_id,song,score,rank
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Riot In Cell Block Number Nine - Dr Feelgood ...,0.0374999940395355,1
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Sei Lá Mangueira - Elizeth Cardoso ...,0.0331632643938064,2
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,The Stallion - Ween,0.0322580635547637,3
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Rain - Subhumans,0.0314159244298934,4
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,West One (Shine On Me) - The Ruts ...,0.0306771993637084,5
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Back Against The Wall - Cage The Elephant ...,0.0301204770803451,6
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Life Less Frightening - Rise Against ...,0.0284431129693985,7
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,A Beggar On A Beach Of Gold - Mike And The ...,0.023002490401268,8
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Audience Of One - Rise Against ...,0.0193938463926315,9
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Blame It On The Boogie - The Jacksons ...,0.0189873427152633,10


In [65]:
# Recommendations from the item-similarity model for the second unique user.
personalized_model.recommend(users=[users[1]])

user_id,song,score,rank
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Grind With Me (Explicit Version) - Pretty Ricky ...,0.0459424376487731,1
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,There Goes My Baby - Usher ...,0.0331920742988586,2
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Panty Droppa [Intro] (Album Version) - Trey ...,0.031856620311737,3
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Nobody (Featuring Athena Cage) (LP Version) - ...,0.0278467655181884,4
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Youth Against Fascism - Sonic Youth ...,0.0262914180755615,5
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Nice & Slow - Usher,0.0239639401435852,6
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Making Love (Into The Night) - Usher ...,0.0238176941871643,7
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Naked - Marques Houston,0.0228925704956054,8
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,I.nner Indulgence - DESTRUCTION ...,0.0220767498016357,9
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Love Lost (Album Version) - Trey Songz ...,0.0204497694969177,10


# Apply model to find similar songs in the data set

In [66]:
# Songs the model considers most similar to the given song.
personalized_model.get_similar_items(items=['With Or Without You - U2'])

song,similar,score,rank
With Or Without You - U2,I Still Haven't Found What I'm Looking For ...,0.0428571701049804,1
With Or Without You - U2,Hold Me_ Thrill Me_ Kiss Me_ Kill Me - U2 ...,0.033734917640686,2
With Or Without You - U2,Window In The Skies - U2,0.032835841178894,3
With Or Without You - U2,Vertigo - U2,0.030075192451477,4
With Or Without You - U2,Sunday Bloody Sunday - U2,0.0271317958831787,5
With Or Without You - U2,Bad - U2,0.0251798629760742,6
With Or Without You - U2,A Day Without Me - U2,0.0237154364585876,7
With Or Without You - U2,Another Time Another Place - U2 ...,0.0203251838684082,8
With Or Without You - U2,Walk On - U2,0.0202020406723022,9
With Or Without You - U2,Get On Your Boots - U2,0.0196850299835205,10


In [67]:
# Songs the model considers most similar to the given song.
personalized_model.get_similar_items(
    items=['Chan Chan (Live) - Buena Vista Social Club']
)

song,similar,score,rank
Chan Chan (Live) - Buena Vista Social Club ...,Murmullo - Buena Vista Social Club ...,0.1881188154220581,1
Chan Chan (Live) - Buena Vista Social Club ...,La Bayamesa - Buena Vista Social Club ...,0.1871921420097351,2
Chan Chan (Live) - Buena Vista Social Club ...,Amor de Loca Juventud - Buena Vista Social Club ...,0.1848341226577758,3
Chan Chan (Live) - Buena Vista Social Club ...,Diferente - Gotan Project,0.0214592218399047,4
Chan Chan (Live) - Buena Vista Social Club ...,Mistica - Orishas,0.0205761194229125,5
Chan Chan (Live) - Buena Vista Social Club ...,Hotel California - Gipsy Kings ...,0.0193049907684326,6
Chan Chan (Live) - Buena Vista Social Club ...,Nací Orishas - Orishas,0.0191571116447448,7
Chan Chan (Live) - Buena Vista Social Club ...,Gitana - Willie Colon,0.0187969803810119,8
Chan Chan (Live) - Buena Vista Social Club ...,Le Moulin - Yann Tiersen,0.0187969803810119,9
Chan Chan (Live) - Buena Vista Social Club ...,Criminal - Gotan Project,0.0187793374061584,10


# Compare the models quantitatively
We now formally compare the popularity and the personalized models using precision-recall curves. 

In [68]:
# Evaluate both recommenders on the held-out data using a 5% sample of users.
# Models are reported in list order (first entry, then second).
model_performance = turicreate.recommender.util.compare_models(
    dataset=test_data,
    models=[popularity_model, personalized_model],
    user_sample=.05,
)

compare_models: using 2931 users to estimate model performance
PROGRESS: Evaluate model M0





Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.022517911975435005 | 0.006259841034661911 |
|   2    | 0.019106107130672127 | 0.009426583121567758 |
|   3    | 0.018082565677243263 | 0.013516124877435028 |
|   4    | 0.01808256567724325  | 0.01817458386035667  |
|   5    | 0.016308427157966567 | 0.020488307041531217 |
|   6    | 0.015694302285909255 | 0.02397510039526417  |
|   7    | 0.015304381732222075 | 0.027605829384232675 |
|   8    | 0.014286932787444556 | 0.02964668439335789  |
|   9    | 0.013647219379051518 | 0.03254632465226122  |
|   10   | 0.01303309450699423  | 0.03515161674066485  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]

PROGRESS: Evaluate model M1





Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.021835551006482438 | 0.008020034478581047 |
|   2    | 0.01910610713067215  | 0.012622454690008432 |
|   3    | 0.017627658364608216 | 0.016410700353893793 |
|   4    | 0.016376663254861836 | 0.019217861349386453 |
|   5    | 0.015216649607642446 | 0.02143650683824686  |
|   6    | 0.01421585351984533  | 0.02418559443892095  |
|   7    | 0.013354778963786132 | 0.026029729113147743 |
|   8    | 0.012964858410098942 | 0.02769946182177504  |
|   9    | 0.012509951097463891 | 0.029383028832875326 |
|   10   | 0.012350733538041633 | 0.031843256728108324 |
+--------+----------------------+----------------------+
[10 rows x 3 columns]



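In this run, the tables do not show a clear advantage for the personalized model (M1): its recall is higher at cutoffs 1–6, but the popularity model (M0) has equal or higher precision at every cutoff and higher recall at cutoffs 7–10. The evaluation uses only a 5% sample of users (`user_sample=.05`), so the comparison should be treated as an estimate.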

# Assignment

## Counting unique users
The method .unique() can be used to select the unique elements in a column of data. In this question, you will compute the number of unique users who have listened to songs by various artists.  For example, to find out the number of unique users who listened to songs by 'Kanye West', all you need to do is select the rows of the song data where the artist is 'Kanye West', and then count the number of unique entries in the ‘user_id’ column.  Compute the number of unique users for each of these artists:  'Kanye West', 'Foo Fighters', 'Taylor Swift' and 'Lady GaGa'. 

In [69]:
# Unique users with at least one row whose artist is exactly 'Kanye West'.
# NOTE(review): the variable is spelled 'kayne_west'; name kept unchanged so
# any other references to it continue to work.
kayne_west = (
    song_data[song_data['artist'] == 'Kanye West']
    .select_column('user_id')
    .unique()
)
len(kayne_west)

2522

In [70]:
# Unique users with at least one row whose artist is exactly 'Foo Fighters'.
foo_fighters = (
    song_data[song_data['artist'] == 'Foo Fighters']
    .select_column('user_id')
    .unique()
)
len(foo_fighters)

2055

In [71]:
# Unique users with at least one row whose artist is exactly 'Taylor Swift'.
taylor_swift = (
    song_data[song_data['artist'] == 'Taylor Swift']
    .select_column('user_id')
    .unique()
)
len(taylor_swift)

3246

In [72]:
# Unique users with at least one row whose artist is exactly 'Lady GaGa'.
lady_gaGa = (
    song_data[song_data['artist'] == 'Lady GaGa']
    .select_column('user_id')
    .unique()
)
len(lady_gaGa)

2928

## Using groupby-aggregate to find the most popular and least popular artist
Each row of song_data contains the number of times a user listened to a particular song by a particular artist.  If we would like to know how many times any song by 'Kanye West' was listened to, we need to select all the rows where ‘artist’=='Kanye West' and sum the ‘listen_count’ column.  If we would like to find the most popular artist, we would need to follow this procedure for each artist, which would be very slow.  Instead, you will learn about a very important method: .groupby()<br>
The .groupby method computes an aggregate (in our case, the sum of the ‘listen_count’) for each distinct value in a column (in our case, the ‘artist’ column). <br><br>
Follow these steps to find the most popular artist in the dataset: 
- The .groupby method has two important parameters:<br>
    1. key_column_names, which takes the column we want to group by, in our case, ‘artist’<br>
    2. operations, where we define the aggregation operation we are using, in our case, we want to sum over the ‘listen_count’.  

- With this in mind, the following command will compute the sum listen_count for each artist and return an SFrame with the results:
    ```song_data.groupby(key_column_names='artist', operations={'total_count': turicreate.aggregate.SUM('listen_count')})```
    the total number of listens for each artist will be stored in ‘total_count’.
- Sort the resulting SFrame according to the ‘total_count’, and find the most popular and least popular artists in the dataset. 

In [73]:
# One row per artist, with the summed listen_count stored in a 'total_count' column.
total_count = song_data.groupby(
    key_column_names='artist',
    operations={'total_count': turicreate.aggregate.SUM('listen_count')},
)

In [74]:
# Most-listened artists first.
total_count.sort(key_column_names='total_count', ascending=False)

artist,total_count
Kings Of Leon,43218
Dwight Yoakam,40619
Björk,38889
Coldplay,35362
Florence + The Machine,33387
Justin Bieber,29715
Alliance Ethnik,26689
OneRepublic,25754
Train,25402
The Black Keys,22184


In [75]:
# Least-listened artists first.
total_count.sort(key_column_names='total_count', ascending=True)

artist,total_count
William Tabbert,14
Reel Feelings,24
Beyoncé feat. Bun B and Slim Thug ...,26
Boggle Karaoke,30
Diplo,30
harvey summers,31
Nâdiya,36
Jody Bernal,38
Aneta Langerova,38
Kanye West / Talib Kweli / Q-Tip / Common / ...,38


## [OPTIONAL] Using groupby-aggregate to find the most recommended songs:
Now that we have learned how to use .groupby() to compute aggregates for each value in a column, let’s use it to find the song that is most recommended by the personalized_model model we learned in the Jupyter notebook above.  Follow these steps to find the most recommended song:
- Split the data into 80% training, 20% testing, using seed=0, as was done in the Jupyter notebook above.
- Train an item_similarity_recommender, as done in the Jupyter notebook, using the training data.
- Next, we are going to make recommendations for the users in the test data, but there are over 200,000 rows (58,628 unique users) in the test set.  Computing recommendations for this many users can be slow on some computers.  Thus, we will use only the first 10,000 users in this question.  Use this command to select this subset of users:
    - ```subset_test_users = test_data['user_id'].unique()[0:10000]```
- Let’s compute one recommended song for each of these test users.  Use this command to compute these recommendations:
    - ```personalized_model.recommend(subset_test_users,k=1)```
- Finally, we can use .groupby() to find the most recommended song!  :)  When we used .groupby() in the previous question, we summed up the total ‘listen_count’ for each artist, by setting the parameter SUM in the aggregator:
    - ```operations={'total_count': turicreate.aggregate.SUM('listen_count')}```

For this question, we simply want to count how often each song is recommended, so we will use the COUNT aggregator instead of SUM, and store the results in a column we will call ‘count’ by using:<br>
```operations={'count': turicreate.aggregate.COUNT()}```
<br>
And, since we want to use the song titles as the key to the aggregator instead of the ‘artist’, we use:<br>
```key_column_names='song'```
<br>
By sorting the results, you will find out the most recommended song to the first 10,000 users in the test data! Due to randomness in train-test split, the most recommended song may come out differently for different people. This is why we chose not to assign a quiz question for this section.

In [76]:
# Same 80/20 split and seed as earlier in the notebook, repeated per the assignment steps.
train_data, test_data = song_data.random_split(fraction=0.8, seed=0)

In [77]:
# Train the item-similarity recommender on the training split (rebinds personalized_model).
personalized_model = turicreate.item_similarity_recommender.create(
    observation_data=train_data,
    user_id='user_id',
    item_id='song',
)

In [78]:
# First 10,000 entries of the unique user ids found in the test split.
subset_test_users = test_data.select_column('user_id').unique()[:10000]

In [79]:
# One recommendation (k=1) per selected test user.
# Despite its name, subset_test_model holds the recommendations, not a model.
subset_test_model = personalized_model.recommend(users=subset_test_users, k=1)

In [80]:
# Count how many times each song appears among the recommendations.
song_count = subset_test_model.groupby(
    key_column_names='song',
    operations={'count': turicreate.aggregate.COUNT()},
)

In [81]:
# Most frequently recommended songs first.
song_count.sort(key_column_names='count', ascending=False)

song,count
Undo - Björk,439
Secrets - OneRepublic,380
Revelry - Kings Of Leon,222
You're The One - Dwight Yoakam ...,165
Fireflies - Charttraxx Karaoke ...,110
Hey_ Soul Sister - Train,99
Sehr kosmisch - Harmonia,90
Horn Concerto No. 4 in E flat K495: II. Romance ...,89
OMG - Usher featuring will.i.am ...,63
Bigger - Justin Bieber,46
