# Build a song recommender system

In [1]:
import turicreate

# Load some music data

In [2]:
# Load the listening data from the SFrame stored at 'song_data.sframe'
# (path is relative to the notebook's working directory).
song_data = turicreate.SFrame('song_data.sframe')

# Explore our data

In [3]:
# Bare last expression: the notebook renders a preview of the SFrame.
song_data

user_id,song_id,listen_count,title,artist
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOAKIMP12A8C130995,1,The Cove,Jack Johnson
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBXHDL12A81C204C0,1,Stronger,Kanye West
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SODDNQT12A6D4F5F7E,5,Apuesta Por El Rock 'N' Roll ...,Héroes del Silencio
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SODXRTY12AB0180F3B,1,Paper Gangsta,Lady GaGa
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOFGUAY12AB017B0A8,1,Stacked Actors,Foo Fighters
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOFRQTD12A81C233C0,1,Sehr kosmisch,Harmonia
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOHQWYZ12A6D4FA701,1,Heaven's gonna burn your eyes ...,Thievery Corporation feat. Emiliana Torrini ...

song
The Cove - Jack Johnson
Entre Dos Aguas - Paco De Lucia ...
Stronger - Kanye West
Constellations - Jack Johnson ...
Learn To Fly - Foo Fighters ...
Apuesta Por El Rock 'N' Roll - Héroes del ...
Paper Gangsta - Lady GaGa
Stacked Actors - Foo Fighters ...
Sehr kosmisch - Harmonia
Heaven's gonna burn your eyes - Thievery ...


# Count the number of unique users in the data

In [4]:
# Distinct user IDs appearing anywhere in the listening data.
users = song_data['user_id'].unique()

In [5]:
# Number of distinct users in the data set.
len(users)

66346

In [6]:
def artist_rows(artist_name):
    """Return the rows of ``song_data`` whose 'artist' equals ``artist_name``.

    The comparison is an exact string match, so the name must be spelled
    exactly as it appears in the data (e.g. 'Lady GaGa').
    """
    return song_data[song_data['artist'] == artist_name]


def unique_listener_count(rows):
    """Return how many distinct 'user_id' values appear in ``rows``."""
    return len(rows['user_id'].unique())


# Per-artist subsets of the listening data.
kanye_data = artist_rows('Kanye West')
foo_data = artist_rows('Foo Fighters')
taylor_data = artist_rows('Taylor Swift')
lady_data = artist_rows('Lady GaGa')

# Number of distinct users who listened to each artist at least once.
user_kanye = unique_listener_count(kanye_data)
user_foo = unique_listener_count(foo_data)
user_taylor = unique_listener_count(taylor_data)
user_lady = unique_listener_count(lady_data)

# Report the counts in the same order and format for every artist.
for label, listener_count in (('Kanye', user_kanye),
                              ('Foo', user_foo),
                              ('Taylor', user_taylor),
                              ('Lady', user_lady)):
    print(label + ': ' + str(listener_count))

Kanye: 2522
Foo: 2055
Taylor: 3246
Lady: 2928


In [7]:
# Total listen count per artist: one output row per distinct 'artist',
# with the summed 'listen_count' stored in a 'total_count' column.
new_song_data = song_data.groupby(
    key_column_names='artist',
    operations={'total_count': turicreate.aggregate.SUM('listen_count')},
)
new_song_data

artist,total_count
The Dells,274
Lil Jon / The East Side Boyz ...,197
Tom Petty And The Heartbreakers ...,2867
Blackstreet,747
Ratatat,3727
Shotta,82
Airscape,130
Mecano,172
Moimir Papalescu & The Nihilists ...,177
Brad Paisley,2731


In [8]:
# Artists with the highest total listen counts first.
new_song_data.sort('total_count', ascending = False)

artist,total_count
Kings Of Leon,43218
Dwight Yoakam,40619
Björk,38889
Coldplay,35362
Florence + The Machine,33387
Justin Bieber,29715
Alliance Ethnik,26689
OneRepublic,25754
Train,25402
The Black Keys,22184


In [9]:
# Artists with the lowest total listen counts first.
new_song_data.sort('total_count', ascending = True)

artist,total_count
William Tabbert,14
Reel Feelings,24
Beyoncé feat. Bun B and Slim Thug ...,26
Diplo,30
Boggle Karaoke,30
harvey summers,31
Nâdiya,36
Kanye West / Talib Kweli / Q-Tip / Common / ...,38
Aneta Langerova,38
Jody Bernal,38


# Create a song recommender

In [10]:
# Random row-level split with fraction .8 going to train_data and the rest to
# test_data; the fixed seed keeps the split identical across runs.
train_data,test_data = song_data.random_split(.8, seed = 0)

## Create a very simple popularity recommender

In [11]:
# Baseline model: trained on the training split with users identified by
# 'user_id' and items identified by the combined 'song' column.
popularity_model = turicreate.popularity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song',
)

## Build a recommender with personalization

In [12]:
# Item-similarity model trained on the same split and the same
# user/item columns as the popularity baseline, so the two are comparable.
personalized_model = turicreate.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song',
)

## Apply personalized model to make song recommendations

In [13]:
# Number of distinct test-split users to generate recommendations for.
N_TEST_USERS = 10000

# Take the first N_TEST_USERS distinct user IDs found in the test split and
# ask the personalized model for a single (k = 1) recommendation per user.
# For a single user the same call looks like:
#     personalized_model.recommend(users = [users[0]])
subset_test_users = test_data['user_id'].unique()[0:N_TEST_USERS]
recommended_songs = personalized_model.recommend(users = subset_test_users, k = 1)
recommended_songs

user_id,song,score,rank
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Cuando Pase El Temblor - Soda Stereo ...,0.0194504536115206,1
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Grind With Me (Explicit Version) - Pretty Ricky ...,0.0459424376487731,1
f6c596a519698c97f1591ad89 f540d76f6a04f1a ...,Hey_ Soul Sister - Train,0.0238929539918899,1
696787172dd3f5169dc94deef 97e427cee86147d ...,Senza Una Donna (Without A Woman) - Zucchero / ...,0.0170265776770455,1
3a7111f4cdf3c5a85fd4053e3 cc2333562e1e0cb ...,Heartbreak Warfare - John Mayer ...,0.0298416515191396,1
532e98155cbfd1e1a474a28ed 96e59e50f7c5baf ...,Jive Talkin' (Album Version) - Bee Gees ...,0.0118288653237479,1
ee43b175ed753b2e2bce806c9 03d4661ad351a91 ...,Ricordati Di Noi - Valerio Scanu ...,0.0305171211560567,1
e372c27f6cb071518ae500589 ae02c126954c148 ...,Fall Out - The Police,0.0819672048091888,1
83b1428917b47a6b130ed471b 09033820be78a8c ...,Clocks - Coldplay,0.0428588390350341,1
39487deef9345b1e22881245c abf4e7c53b6cf6e ...,Black Mirror - Arcade Fire ...,0.0417737685717069,1


In [14]:
# Count how many times each song appears as a recommendation,
# then list the most frequently recommended songs first.
the_most_recommended_song_table = recommended_songs.groupby(
    key_column_names='song',
    operations={'count': turicreate.aggregate.COUNT()},
)
the_most_recommended_song_table.sort('count', ascending=False)

song,count
Undo - Björk,428
Secrets - OneRepublic,384
Revelry - Kings Of Leon,232
You're The One - Dwight Yoakam ...,160
Fireflies - Charttraxx Karaoke ...,125
Sehr kosmisch - Harmonia,99
Horn Concerto No. 4 in E flat K495: II. Romance ...,97
Hey_ Soul Sister - Train,92
OMG - Usher featuring will.i.am ...,63
The Scientist - Coldplay,42


# Apply model to find similar songs in the data set

In [15]:
# Songs the personalized model considers most similar to the given song;
# the string must match a value of the 'song' item column exactly.
personalized_model.get_similar_items(['With Or Without You - U2'])

song,similar,score,rank
With Or Without You - U2,I Still Haven't Found What I'm Looking For ...,0.0428571701049804,1
With Or Without You - U2,Hold Me_ Thrill Me_ Kiss Me_ Kill Me - U2 ...,0.033734917640686,2
With Or Without You - U2,Window In The Skies - U2,0.032835841178894,3
With Or Without You - U2,Vertigo - U2,0.030075192451477,4
With Or Without You - U2,Sunday Bloody Sunday - U2,0.0271317958831787,5
With Or Without You - U2,Bad - U2,0.0251798629760742,6
With Or Without You - U2,A Day Without Me - U2,0.0237154364585876,7
With Or Without You - U2,Another Time Another Place - U2 ...,0.0203251838684082,8
With Or Without You - U2,Walk On - U2,0.0202020406723022,9
With Or Without You - U2,Get On Your Boots - U2,0.0196850299835205,10


In [16]:
# Same similarity query for a second song, to compare against the U2 result.
personalized_model.get_similar_items(['Chan Chan (Live) - Buena Vista Social Club'])

song,similar,score,rank
Chan Chan (Live) - Buena Vista Social Club ...,Murmullo - Buena Vista Social Club ...,0.1881188154220581,1
Chan Chan (Live) - Buena Vista Social Club ...,La Bayamesa - Buena Vista Social Club ...,0.1871921420097351,2
Chan Chan (Live) - Buena Vista Social Club ...,Amor de Loca Juventud - Buena Vista Social Club ...,0.1848341226577758,3
Chan Chan (Live) - Buena Vista Social Club ...,Diferente - Gotan Project,0.0214592218399047,4
Chan Chan (Live) - Buena Vista Social Club ...,Mistica - Orishas,0.0205761194229125,5
Chan Chan (Live) - Buena Vista Social Club ...,Hotel California - Gipsy Kings ...,0.0193049907684326,6
Chan Chan (Live) - Buena Vista Social Club ...,Nací Orishas - Orishas,0.0191571116447448,7
Chan Chan (Live) - Buena Vista Social Club ...,Le Moulin - Yann Tiersen,0.0187969803810119,8
Chan Chan (Live) - Buena Vista Social Club ...,Gitana - Willie Colon,0.0187969803810119,9
Chan Chan (Live) - Buena Vista Social Club ...,Criminal - Gotan Project,0.0187793374061584,10


# Compare the models quantitatively
We now formally compare the popularity and the personalized models using mean precision and mean recall at cutoffs 1 through 10, estimated on a 5% sample of the test users.

In [17]:
# Evaluate both models on the held-out test split, using only a 5% sample of
# users (user_sample=.05) to keep the evaluation quick.
# NOTE(review): the progress output labels the models M0 and M1 — presumably in
# the order of the list below (M0 = popularity_model, M1 = personalized_model);
# confirm against the compare_models documentation.
model_performance = turicreate.recommender.util.compare_models(test_data, [popularity_model, 
                                                                           personalized_model], user_sample=.05)

compare_models: using 2931 users to estimate model performance
PROGRESS: Evaluate model M0





Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.023200272944387562 | 0.005738153767836466 |
|   2    | 0.021323780279768022 | 0.010857362622971623 |
|   3    | 0.01933356078698967  | 0.015077554842140321 |
|   4    | 0.018338451040600473 | 0.018089081868508727 |
|   5    | 0.016240191061071302 | 0.02053501248741783  |
|   6    | 0.015466848629591702 | 0.02428408384140216  |
|   7    | 0.01530438173222209  | 0.027375976595526253 |
|   8    | 0.015054588877516206 | 0.03163055764130484  |
|   9    | 0.01463285188976081  |  0.0348878958525837  |
|   10   | 0.013920163766632585 | 0.037459340670701946 |
+--------+----------------------+----------------------+
[10 rows x 3 columns]

PROGRESS: Evaluate model M1





Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.05356533606277721  | 0.017588571631560374 |
|   2    | 0.04640054588877518  | 0.029446000377423107 |
|   3    | 0.039804389855566946 | 0.03621720975866322  |
|   4    | 0.03539747526441486  | 0.041457074901291906 |
|   5    | 0.03288979870351422  | 0.04835735552214574  |
|   6    | 0.029910155805754585 |  0.0525751931074347  |
|   7    | 0.028123019934688312 | 0.05695747067292617  |
|   8    | 0.02618560218355511  | 0.06017485679716995  |
|   9    | 0.024678721710451514 | 0.06315724440084736  |
|   10   | 0.023132036847492297 | 0.06519934252789934  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]



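The two tables show that the personalized model (M1, the second model passed to `compare_models`) performs much better than the popularity model (M0): its mean precision and mean recall are higher at every cutoff from 1 to 10. For example, at cutoff 1 the mean precision is about 0.054 versus 0.023, and the mean recall is about 0.018 versus 0.006.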