# All imports necessary

In [1]:
import os

In [2]:
import sys

In [3]:
import warnings

In [4]:
import numpy as np

In [5]:
import pandas as pd

In [6]:
sys.path.append('..')

In [7]:
import seaborn as sns

In [8]:
import matplotlib.pyplot as plt

In [9]:
from sklearn.metrics import auc

In [10]:
warnings.filterwarnings("ignore")

In [11]:
from sklearn.metrics import roc_curve

In [12]:
from sklearn.model_selection import cross_val_score

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
from source.code.models.svdbasedrecommender import SVDBasedRecommender

In [15]:
from source.code.models.songfrequencybasedrecommender import SongFrequencyBasedRecommender

# Read the data

In [16]:
data_directory = '../data/datasets/'

## Song extra info

In [17]:
song_extra_info = pd.read_csv(os.path.join(data_directory, 'song_extra_info.csv'))

In [18]:
song_extra_info.head()

Unnamed: 0,song_id,name,isrc
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,我們,TWUM71200043
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,Let Me Love You,QMZSY1600015
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,原諒我,TWA530887303
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,Classic,USSM11301446
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,愛投羅網,TWA471306001


In [19]:
song_extra_info.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2295971 entries, 0 to 2295970
Data columns (total 3 columns):
song_id    2295971 non-null object
name       2295969 non-null object
isrc       2159423 non-null object
dtypes: object(3)
memory usage: 52.6+ MB


## Train

In [20]:
train = pd.read_csv(os.path.join(data_directory, 'train.csv'), engine='python')

In [21]:
train.head().T

Unnamed: 0,0,1,2,3,4
msno,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=
song_id,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=
source_system_tab,explore,my library,my library,my library,explore
source_screen_name,Explore,Local playlist more,Local playlist more,Local playlist more,Explore
source_type,online-playlist,local-playlist,local-playlist,local-playlist,online-playlist
target,1,1,1,1,1


In [22]:
train.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7377418 entries, 0 to 7377417
Data columns (total 6 columns):
msno                  7377418 non-null object
song_id               7377418 non-null object
source_system_tab     7352569 non-null object
source_screen_name    6962614 non-null object
source_type           7355879 non-null object
target                7377418 non-null int64
dtypes: int64(1), object(5)
memory usage: 337.7+ MB


In [23]:
train['data_from'] = 'train'

In [24]:
train.msno.nunique()

30755

In [25]:
train.song_id.nunique()

359966

In [26]:
train.source_system_tab.nunique()

8

In [27]:
train.source_system_tab.value_counts()

my library      3684730
discover        2179252
search           623286
radio            476701
listen with      212266
explore          167949
notification       6185
settings           2200
Name: source_system_tab, dtype: int64

In [28]:
train.source_screen_name.nunique()

20

In [29]:
train.source_screen_name.value_counts()

Local playlist more     3228202
Online playlist more    1294689
Radio                    474467
Album more               420156
Search                   298487
Artist more              252429
Discover Feature         244246
Discover Chart           213658
Others profile more      201795
Discover Genre            82202
My library                75980
Explore                   72342
Unknown                   54170
Discover New              15955
Search Trends             13632
Search Home               13482
My library_Search          6451
Self profile more           212
Concert                      47
Payment                      12
Name: source_screen_name, dtype: int64

In [30]:
train.source_type.nunique()

12

In [31]:
train.source_type.value_counts()

local-library             2261399
online-playlist           1967924
local-playlist            1079503
radio                      483109
album                      477344
top-hits-for-artist        423614
song                       244722
song-based-playlist        210527
listen-with                192842
topic-article-playlist      11194
artist                       3038
my-daily-playlist             663
Name: source_type, dtype: int64

In [32]:
train.fillna('unknown', inplace=True)

In [33]:
train.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7377418 entries, 0 to 7377417
Data columns (total 7 columns):
msno                  7377418 non-null object
song_id               7377418 non-null object
source_system_tab     7377418 non-null object
source_screen_name    7377418 non-null object
source_type           7377418 non-null object
target                7377418 non-null int64
data_from             7377418 non-null object
dtypes: int64(1), object(6)
memory usage: 394.0+ MB


## Test

In [34]:
test = pd.read_csv(os.path.join(data_directory, 'test.csv'), engine='python', index_col=0)

In [35]:
test.head().T

id,0,1,2,3,4
msno,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=
song_id,WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=,y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=,8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=,ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=,MKVMpslKcQhMaFEgcEQhEfi5+RZhMYlU3eRDpySrH8Y=
source_system_tab,my library,my library,discover,radio,radio
source_screen_name,Local playlist more,Local playlist more,,Radio,Radio
source_type,local-library,local-library,song-based-playlist,radio,radio


In [36]:
test.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2556790 entries, 0 to 2556789
Data columns (total 5 columns):
msno                  2556790 non-null object
song_id               2556790 non-null object
source_system_tab     2548348 non-null object
source_screen_name    2393907 non-null object
source_type           2549493 non-null object
dtypes: object(5)
memory usage: 117.0+ MB


In [37]:
test['target'] = None

In [38]:
test['data_from'] = 'test'

In [39]:
test.msno.nunique()

25131

In [40]:
test.song_id.nunique()

224753

In [41]:
test.source_system_tab.nunique()

8

In [42]:
test.source_system_tab.value_counts()

my library      1019492
discover         871068
search           277615
radio            212765
listen with       98628
explore           66023
notification       2124
settings            633
Name: source_system_tab, dtype: int64

In [43]:
test.source_screen_name.nunique()

22

In [44]:
test.source_screen_name.value_counts()

Local playlist more     845115
Online playlist more    529807
Radio                   211201
Album more              176129
Search                  121982
Artist more             110999
Discover Feature         93401
Others profile more      90457
Discover Chart           78999
Discover Genre           41617
Explore                  27872
My library               25559
Unknown                  23620
Discover New              5277
Search Trends             4883
Search Home               4705
My library_Search         2114
Self profile more          131
People local                13
Concert                     13
Payment                     12
People global                1
Name: source_screen_name, dtype: int64

In [45]:
test.source_type.nunique()

12

In [46]:
test.source_type.value_counts()

online-playlist           774532
local-library             582346
local-playlist            294537
radio                     215164
album                     195190
top-hits-for-artist       179360
song                      129153
song-based-playlist        87179
listen-with                84499
topic-article-playlist      5082
my-daily-playlist           2023
artist                       428
Name: source_type, dtype: int64

In [47]:
test.fillna('unknown', inplace=True)

In [48]:
test.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2556790 entries, 0 to 2556789
Data columns (total 7 columns):
msno                  2556790 non-null object
song_id               2556790 non-null object
source_system_tab     2556790 non-null object
source_screen_name    2556790 non-null object
source_type           2556790 non-null object
target                2556790 non-null object
data_from             2556790 non-null object
dtypes: object(7)
memory usage: 156.1+ MB


## Songs

In [49]:
songs = pd.read_csv(os.path.join(data_directory, 'songs.csv'))

In [50]:
songs.head().T

Unnamed: 0,0,1,2,3,4
song_id,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=
song_length,247640,197328,231781,273554,140329
genre_ids,465,444,465,465,726
artist_name,張信哲 (Jeff Chang),BLACKPINK,SUPER JUNIOR,S.H.E,貴族精選
composer,董貞,TEDDY| FUTURE BOUNCE| Bekuh BOOM,,湯小康,Traditional
lyricist,何啟弘,TEDDY,,徐世珍,Traditional
language,3,31,31,3,52


In [51]:
songs.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2296320 entries, 0 to 2296319
Data columns (total 7 columns):
song_id        2296320 non-null object
song_length    2296320 non-null int64
genre_ids      2202204 non-null object
artist_name    2296320 non-null object
composer       1224966 non-null object
lyricist       351052 non-null object
language       2296319 non-null float64
dtypes: float64(1), int64(1), object(5)
memory usage: 122.6+ MB


In [52]:
songs.lyricist.fillna('unknown', inplace=True)

In [53]:
songs.composer.fillna('unknown', inplace=True)

In [54]:
songs.language.fillna(-1, inplace=True)

In [55]:
songs.language = songs.language.astype(np.int64)

In [56]:
songs.head().T

Unnamed: 0,0,1,2,3,4
song_id,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=
song_length,247640,197328,231781,273554,140329
genre_ids,465,444,465,465,726
artist_name,張信哲 (Jeff Chang),BLACKPINK,SUPER JUNIOR,S.H.E,貴族精選
composer,董貞,TEDDY| FUTURE BOUNCE| Bekuh BOOM,unknown,湯小康,Traditional
lyricist,何啟弘,TEDDY,unknown,徐世珍,Traditional
language,3,31,31,3,52


In [57]:
songs.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2296320 entries, 0 to 2296319
Data columns (total 7 columns):
song_id        2296320 non-null object
song_length    2296320 non-null int64
genre_ids      2202204 non-null object
artist_name    2296320 non-null object
composer       2296320 non-null object
lyricist       2296320 non-null object
language       2296320 non-null int64
dtypes: int64(2), object(5)
memory usage: 122.6+ MB


In [58]:
songs.song_length.nunique()

146534

In [59]:
songs.genre_ids.nunique()

1045

In [60]:
songs.artist_name.nunique()

222363

In [61]:
songs.composer.nunique()

329823

In [62]:
songs.lyricist.nunique()

110925

In [63]:
songs.language.nunique()

10

## Members

In [64]:
members = pd.read_csv(os.path.join(data_directory, 'members.csv'))

In [65]:
members.head().T

Unnamed: 0,0,1,2,3,4
msno,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=
city,1,1,1,1,1
bd,0,0,0,0,0
gender,,,,,
registered_via,7,7,4,9,4
registration_init_time,20110820,20150628,20160411,20150906,20170126
expiration_date,20170920,20170622,20170712,20150907,20170613


In [66]:
members.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34403 entries, 0 to 34402
Data columns (total 7 columns):
msno                      34403 non-null object
city                      34403 non-null int64
bd                        34403 non-null int64
gender                    14501 non-null object
registered_via            34403 non-null int64
registration_init_time    34403 non-null int64
expiration_date           34403 non-null int64
dtypes: int64(5), object(2)
memory usage: 1.8+ MB


In [67]:
members.fillna('unknown', inplace=True)

In [68]:
members.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34403 entries, 0 to 34402
Data columns (total 7 columns):
msno                      34403 non-null object
city                      34403 non-null int64
bd                        34403 non-null int64
gender                    34403 non-null object
registered_via            34403 non-null int64
registration_init_time    34403 non-null int64
expiration_date           34403 non-null int64
dtypes: int64(5), object(2)
memory usage: 1.8+ MB


In [69]:
members.head().T

Unnamed: 0,0,1,2,3,4
msno,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=
city,1,1,1,1,1
bd,0,0,0,0,0
gender,unknown,unknown,unknown,unknown,unknown
registered_via,7,7,4,9,4
registration_init_time,20110820,20150628,20160411,20150906,20170126
expiration_date,20170920,20170622,20170712,20150907,20170613


## Sample submission

In [None]:
sample_submission = pd.read_csv(os.path.join(data_directory, 'sample_submission.csv'))

In [None]:
sample_submission.head(10)

In [None]:
sample_submission.info(verbose=True, null_counts=True)

# Solutions

In [None]:
X, y = train[train.columns[:-2]], train[train.columns[-2]]

In [None]:
X_for_submission = test[test.columns[:-2]]

In [None]:
X.head()

In [None]:
y.head()

In [None]:
X_for_submission.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y
)

## Naive frequency approach

In [None]:
naive_frequency_recommender = SongFrequencyBasedRecommender()

In [None]:
print('\t', cross_val_score(naive_frequency_recommender, X_train, y_train, cv=5))

In [None]:
naive_frequency_recommender.fit(X_train, y_train)

In [None]:
y_pred = naive_frequency_recommender.predict(X_test)

In [None]:
fpr, tpr, _ = roc_curve(y_test, y_pred)

In [None]:
plt.figure(figsize=(10, 10))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example (Naive frequency approach)')
plt.legend(loc="lower right")
plt.show()

## SVD

In [None]:
svd_recommender = SVDBasedRecommender()

In [None]:
print('\t', cross_val_score(svd_recommender, X_train, y_train, cv=5))

In [None]:
svd_recommender.fit(X_train, y_train)

In [None]:
y_pred = svd_recommender.predict(X_test)

In [None]:
fpr, tpr, _ = roc_curve(y_test, y_pred)

In [None]:
plt.figure(figsize=(10, 10))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example (SVD)')
plt.legend(loc="lower right")
plt.show()

## FM