# All imports necessary

In [1]:
import os

In [2]:
import sys

In [3]:
import warnings

In [4]:
import numpy as np

In [5]:
import pandas as pd

In [6]:
sys.path.append('..')

In [7]:
import seaborn as sns

In [8]:
import matplotlib.pyplot as plt

In [9]:
from sklearn.metrics import auc

In [10]:
from sklearn.pipeline import Pipeline

In [11]:
from sklearn.impute import SimpleImputer

In [12]:
from sklearn.preprocessing import KBinsDiscretizer

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
from sklearn.compose import ColumnTransformer

In [15]:
warnings.filterwarnings("ignore")

In [16]:
from sklearn.metrics import roc_curve

In [17]:
from sklearn.model_selection import cross_val_score

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
from source.code.models.svdbasedrecommender import SVDBasedRecommender

In [20]:
from source.code.models.songfrequencybasedrecommender import SongFrequencyBasedRecommender

In [21]:
from source.code.transformers.metafeaturesextractor import MetaFeaturesExtractor

# Read the data

In [22]:
data_directory = '../data/datasets/'

## Song extra info

In [23]:
song_extra_info = pd.read_csv(os.path.join(data_directory, 'song_extra_info.csv'))

In [24]:
song_extra_info.head()

Unnamed: 0,song_id,name,isrc
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,我們,TWUM71200043
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,Let Me Love You,QMZSY1600015
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,原諒我,TWA530887303
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,Classic,USSM11301446
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,愛投羅網,TWA471306001


In [25]:
song_extra_info.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2295971 entries, 0 to 2295970
Data columns (total 3 columns):
song_id    2295971 non-null object
name       2295969 non-null object
isrc       2159423 non-null object
dtypes: object(3)
memory usage: 52.6+ MB


## Train

In [26]:
train = pd.read_csv(os.path.join(data_directory, 'train.csv'), engine='python')

In [27]:
train.head().T

Unnamed: 0,0,1,2,3,4
msno,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=
song_id,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=
source_system_tab,explore,my library,my library,my library,explore
source_screen_name,Explore,Local playlist more,Local playlist more,Local playlist more,Explore
source_type,online-playlist,local-playlist,local-playlist,local-playlist,online-playlist
target,1,1,1,1,1


In [28]:
train.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7377418 entries, 0 to 7377417
Data columns (total 6 columns):
msno                  7377418 non-null object
song_id               7377418 non-null object
source_system_tab     7352569 non-null object
source_screen_name    6962614 non-null object
source_type           7355879 non-null object
target                7377418 non-null int64
dtypes: int64(1), object(5)
memory usage: 337.7+ MB


In [29]:
train['data_from'] = 'train'

In [30]:
train.msno.nunique()

30755

In [31]:
train.song_id.nunique()

359966

In [32]:
train.source_system_tab.nunique()

8

In [33]:
train.source_system_tab.value_counts()

my library      3684730
discover        2179252
search           623286
radio            476701
listen with      212266
explore          167949
notification       6185
settings           2200
Name: source_system_tab, dtype: int64

In [34]:
train.source_screen_name.nunique()

20

In [35]:
train.source_screen_name.value_counts()

Local playlist more     3228202
Online playlist more    1294689
Radio                    474467
Album more               420156
Search                   298487
Artist more              252429
Discover Feature         244246
Discover Chart           213658
Others profile more      201795
Discover Genre            82202
My library                75980
Explore                   72342
Unknown                   54170
Discover New              15955
Search Trends             13632
Search Home               13482
My library_Search          6451
Self profile more           212
Concert                      47
Payment                      12
Name: source_screen_name, dtype: int64

In [36]:
train.source_type.nunique()

12

In [37]:
train.source_type.value_counts()

local-library             2261399
online-playlist           1967924
local-playlist            1079503
radio                      483109
album                      477344
top-hits-for-artist        423614
song                       244722
song-based-playlist        210527
listen-with                192842
topic-article-playlist      11194
artist                       3038
my-daily-playlist             663
Name: source_type, dtype: int64

In [38]:
train.fillna('unknown', inplace=True)

In [39]:
train.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7377418 entries, 0 to 7377417
Data columns (total 7 columns):
msno                  7377418 non-null object
song_id               7377418 non-null object
source_system_tab     7377418 non-null object
source_screen_name    7377418 non-null object
source_type           7377418 non-null object
target                7377418 non-null int64
data_from             7377418 non-null object
dtypes: int64(1), object(6)
memory usage: 394.0+ MB


In [40]:
train.target.value_counts()

1    3714656
0    3662762
Name: target, dtype: int64

## Test

In [41]:
test = pd.read_csv(os.path.join(data_directory, 'test.csv'), engine='python', index_col=0)

In [42]:
test.head().T

id,0,1,2,3,4
msno,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=
song_id,WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=,y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=,8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=,ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=,MKVMpslKcQhMaFEgcEQhEfi5+RZhMYlU3eRDpySrH8Y=
source_system_tab,my library,my library,discover,radio,radio
source_screen_name,Local playlist more,Local playlist more,,Radio,Radio
source_type,local-library,local-library,song-based-playlist,radio,radio


In [43]:
test.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2556790 entries, 0 to 2556789
Data columns (total 5 columns):
msno                  2556790 non-null object
song_id               2556790 non-null object
source_system_tab     2548348 non-null object
source_screen_name    2393907 non-null object
source_type           2549493 non-null object
dtypes: object(5)
memory usage: 117.0+ MB


In [44]:
test['target'] = None

In [45]:
test['data_from'] = 'test'

In [46]:
test.msno.nunique()

25131

In [47]:
test.song_id.nunique()

224753

In [48]:
test.source_system_tab.nunique()

8

In [49]:
test.source_system_tab.value_counts()

my library      1019492
discover         871068
search           277615
radio            212765
listen with       98628
explore           66023
notification       2124
settings            633
Name: source_system_tab, dtype: int64

In [50]:
test.source_screen_name.nunique()

22

In [51]:
test.source_screen_name.value_counts()

Local playlist more     845115
Online playlist more    529807
Radio                   211201
Album more              176129
Search                  121982
Artist more             110999
Discover Feature         93401
Others profile more      90457
Discover Chart           78999
Discover Genre           41617
Explore                  27872
My library               25559
Unknown                  23620
Discover New              5277
Search Trends             4883
Search Home               4705
My library_Search         2114
Self profile more          131
Concert                     13
People local                13
Payment                     12
People global                1
Name: source_screen_name, dtype: int64

In [52]:
test.source_type.nunique()

12

In [53]:
test.source_type.value_counts()

online-playlist           774532
local-library             582346
local-playlist            294537
radio                     215164
album                     195190
top-hits-for-artist       179360
song                      129153
song-based-playlist        87179
listen-with                84499
topic-article-playlist      5082
my-daily-playlist           2023
artist                       428
Name: source_type, dtype: int64

In [54]:
test.fillna('unknown', inplace=True)

In [55]:
test.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2556790 entries, 0 to 2556789
Data columns (total 7 columns):
msno                  2556790 non-null object
song_id               2556790 non-null object
source_system_tab     2556790 non-null object
source_screen_name    2556790 non-null object
source_type           2556790 non-null object
target                2556790 non-null object
data_from             2556790 non-null object
dtypes: object(7)
memory usage: 156.1+ MB


## Songs

In [56]:
songs = pd.read_csv(os.path.join(data_directory, 'songs.csv'))

In [57]:
songs.head().T

Unnamed: 0,0,1,2,3,4
song_id,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=
song_length,247640,197328,231781,273554,140329
genre_ids,465,444,465,465,726
artist_name,張信哲 (Jeff Chang),BLACKPINK,SUPER JUNIOR,S.H.E,貴族精選
composer,董貞,TEDDY| FUTURE BOUNCE| Bekuh BOOM,,湯小康,Traditional
lyricist,何啟弘,TEDDY,,徐世珍,Traditional
language,3,31,31,3,52


In [58]:
songs.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2296320 entries, 0 to 2296319
Data columns (total 7 columns):
song_id        2296320 non-null object
song_length    2296320 non-null int64
genre_ids      2202204 non-null object
artist_name    2296320 non-null object
composer       1224966 non-null object
lyricist       351052 non-null object
language       2296319 non-null float64
dtypes: float64(1), int64(1), object(5)
memory usage: 122.6+ MB


In [59]:
songs.lyricist.fillna('unknown', inplace=True)

In [60]:
songs.composer.fillna('unknown', inplace=True)

In [61]:
songs.genre_ids.fillna('unknown', inplace=True)

In [62]:
songs.language.fillna(-1, inplace=True)

In [63]:
songs.language = songs.language.astype(np.int64)

In [64]:
songs.head().T

Unnamed: 0,0,1,2,3,4
song_id,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=
song_length,247640,197328,231781,273554,140329
genre_ids,465,444,465,465,726
artist_name,張信哲 (Jeff Chang),BLACKPINK,SUPER JUNIOR,S.H.E,貴族精選
composer,董貞,TEDDY| FUTURE BOUNCE| Bekuh BOOM,unknown,湯小康,Traditional
lyricist,何啟弘,TEDDY,unknown,徐世珍,Traditional
language,3,31,31,3,52


In [65]:
songs.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2296320 entries, 0 to 2296319
Data columns (total 7 columns):
song_id        2296320 non-null object
song_length    2296320 non-null int64
genre_ids      2296320 non-null object
artist_name    2296320 non-null object
composer       2296320 non-null object
lyricist       2296320 non-null object
language       2296320 non-null int64
dtypes: int64(2), object(5)
memory usage: 122.6+ MB


In [66]:
songs.song_length.nunique()

146534

In [67]:
songs.genre_ids.nunique()

1046

In [68]:
songs.artist_name.nunique()

222363

In [69]:
songs.composer.nunique()

329823

In [70]:
songs.lyricist.nunique()

110925

In [71]:
songs.language.nunique()

10

## Members

In [72]:
members = pd.read_csv(os.path.join(data_directory, 'members.csv'))

In [73]:
members.head().T

Unnamed: 0,0,1,2,3,4
msno,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=
city,1,1,1,1,1
bd,0,0,0,0,0
gender,,,,,
registered_via,7,7,4,9,4
registration_init_time,20110820,20150628,20160411,20150906,20170126
expiration_date,20170920,20170622,20170712,20150907,20170613


In [74]:
members.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34403 entries, 0 to 34402
Data columns (total 7 columns):
msno                      34403 non-null object
city                      34403 non-null int64
bd                        34403 non-null int64
gender                    14501 non-null object
registered_via            34403 non-null int64
registration_init_time    34403 non-null int64
expiration_date           34403 non-null int64
dtypes: int64(5), object(2)
memory usage: 1.8+ MB


In [75]:
members.fillna('unknown', inplace=True)

In [76]:
members.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34403 entries, 0 to 34402
Data columns (total 7 columns):
msno                      34403 non-null object
city                      34403 non-null int64
bd                        34403 non-null int64
gender                    34403 non-null object
registered_via            34403 non-null int64
registration_init_time    34403 non-null int64
expiration_date           34403 non-null int64
dtypes: int64(5), object(2)
memory usage: 1.8+ MB


In [77]:
members.head().T

Unnamed: 0,0,1,2,3,4
msno,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=
city,1,1,1,1,1
bd,0,0,0,0,0
gender,unknown,unknown,unknown,unknown,unknown
registered_via,7,7,4,9,4
registration_init_time,20110820,20150628,20160411,20150906,20170126
expiration_date,20170920,20170622,20170712,20150907,20170613


In [78]:
members.registration_init_time = pd.to_datetime(members.registration_init_time, format='%Y%m%d')

In [79]:
members.expiration_date = pd.to_datetime(members.expiration_date, format='%Y%m%d')

## Sample submission

In [80]:
sample_submission = pd.read_csv(os.path.join(data_directory, 'sample_submission.csv'))

In [81]:
sample_submission.head(10)

Unnamed: 0,id,target
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5
5,5,0.5
6,6,0.5
7,7,0.5
8,8,0.5
9,9,0.5


In [82]:
sample_submission.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556790 entries, 0 to 2556789
Data columns (total 2 columns):
id        2556790 non-null int64
target    2556790 non-null float64
dtypes: float64(1), int64(1)
memory usage: 39.0 MB


# Solutions

In [83]:
X, y = train[train.columns[:-2]], train[train.columns[-2]]

In [84]:
X_for_submission = test[test.columns[:-2]]

In [85]:
X.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist


In [86]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [87]:
X_for_submission.head()

Unnamed: 0_level_0,msno,song_id,source_system_tab,source_screen_name,source_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=,my library,Local playlist more,local-library
1,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=,my library,Local playlist more,local-library
2,/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=,8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=,discover,unknown,song-based-playlist
3,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=,ztCf8thYsS4YN3GcIL/bvoxLm/T5mYBVKOO4C9NiVfQ=,radio,Radio,radio
4,1a6oo/iXKatxQx4eS9zTVD+KlSVaAFbTIqVvwLC1Y0k=,MKVMpslKcQhMaFEgcEQhEfi5+RZhMYlU3eRDpySrH8Y=,radio,Radio,radio


In [88]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.9,
    random_state=42,
    stratify=y
)

In [89]:
print(X_train.shape, y_train.shape)

(737741, 5) (737741,)


In [90]:
print(type(X_train), type(y_train))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


In [91]:
print(X_test.shape, y_test.shape)

(6639677, 5) (6639677,)


In [92]:
print(type(X_test), type(y_test))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


## Naive frequency approach

In [None]:
naive_frequency_recommender = SongFrequencyBasedRecommender()

In [None]:
print('\t', cross_val_score(naive_frequency_recommender, X_train, y_train, cv=5))

In [None]:
naive_frequency_recommender.fit(X_train, y_train)

In [None]:
y_pred = naive_frequency_recommender.predict(X_test)

In [None]:
fpr, tpr, _ = roc_curve(y_test, y_pred)

In [None]:
plt.figure(figsize=(10, 10))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example (Naive frequency approach)')
plt.legend(loc="lower right")
plt.show()

## SVD

In [None]:
svd_recommender = SVDBasedRecommender()

In [None]:
print('\t', cross_val_score(svd_recommender, X_train, y_train, cv=5))

In [None]:
svd_recommender.fit(X_train, y_train)

In [None]:
y_pred = svd_recommender.predict(X_test)

In [None]:
fpr, tpr, _ = roc_curve(y_test, y_pred)

In [None]:
plt.figure(figsize=(10, 10))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example (SVD)')
plt.legend(loc="lower right")
plt.show()

## FM

In [93]:
categorical_features = [
#     'source_system_tab',
#     'source_screen_name',
    'city',
    'gender'
]

In [94]:
categorical_features_lang = [
    'language'
]

In [95]:
numerical_features = [
#     'bd',
#     'song_length',
    'days_registered'
]

In [96]:
num_features_pipeline = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('discretize', KBinsDiscretizer(n_bins=4, encode='onehot-dense'))
])

In [97]:
cat_features_pipeline = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [98]:
cat_features_pipeline_lang = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1)),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [99]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_features_pipeline, numerical_features),
        ('cat', cat_features_pipeline, categorical_features),
        ('cat_lang', cat_features_pipeline_lang, categorical_features_lang)
    ]
)

In [100]:
unified_pipeline = Pipeline(
    steps=[
        ('add_meta_info', MetaFeaturesExtractor(user_meta=members, item_meta=songs)),
        ('preprocessing', preprocessor)
    ]
)

In [101]:
X_train = unified_pipeline.fit_transform(X_train, y_train)

In [102]:
X_train.shape

(737741, 38)

In [103]:
from tffm import TFFMClassifier

In [104]:
import tensorflow as tf

In [105]:
model = TFFMClassifier(
    order=6,
    rank=10,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    n_epochs=100,
    batch_size=-1,
    init_std=0.001,
    input_type='dense'
)

In [108]:
model.fit(X_train, y_train.values, show_progress=True)


  0%|                                                                                                                                                                                                            | 0/100 [00:00<?, ?epoch/s]

InvalidArgumentError: NaN or Inf in w[4]. : Tensor had NaN values
	 [[{{node learnable_params/VerifyFinite_4/CheckNumerics}} = CheckNumerics[T=DT_FLOAT, _class=["loc:@Adam/update_learnable_params/embedding_5/ApplyAdam"], message="NaN or Inf in w[4].", _device="/job:localhost/replica:0/task:0/device:CPU:0"](learnable_params/embedding_5/read)]]

Caused by op 'learnable_params/VerifyFinite_4/CheckNumerics', defined at:
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\ipykernel\kernelapp.py", line 505, in start
    self.io_loop.start()
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 422, in run_forever
    self._run_once()
  File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 1432, in _run_once
    handle._run()
  File "C:\ProgramData\Anaconda3\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tornado\ioloop.py", line 758, in _run_callback
    ret = callback()
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tornado\gen.py", line 1233, in inner
    self.run()
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tornado\gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\ipykernel\kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\ipykernel\kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\ipykernel\kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\IPython\core\interactiveshell.py", line 2817, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\IPython\core\interactiveshell.py", line 2843, in _run_cell
    return runner(coro)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\IPython\core\async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3018, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3189, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3265, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-106-96a66af7c83c>", line 1, in <module>
    model.fit(X_train, y_train, show_progress=True)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tffm\models.py", line 60, in fit
    self._fit(X_=X, y_=used_y, w_=used_w, n_epochs=n_epochs, show_progress=show_progress)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tffm\base.py", line 209, in _fit
    self.core.build_graph()
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tffm\core.py", line 219, in build_graph
    self.init_learnable_params()
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tffm\core.py", line 125, in init_learnable_params
    msg='NaN or Inf in w[{}].'.format(i-1))
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tensorflow\python\ops\numerics.py", line 45, in verify_tensor_all_finite
    verify_input = array_ops.check_numerics(t, message=msg)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 972, in check_numerics
    "CheckNumerics", tensor=tensor, message=message, name=name)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tensorflow\python\util\deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tensorflow\python\framework\ops.py", line 3272, in create_op
    op_def=op_def)
  File "c:\users\efim_golovin\desktop\courses\ml\11. recommendation systems_2\recommendation_systems_2\venv\lib\site-packages\tensorflow\python\framework\ops.py", line 1768, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): NaN or Inf in w[4]. : Tensor had NaN values
	 [[{{node learnable_params/VerifyFinite_4/CheckNumerics}} = CheckNumerics[T=DT_FLOAT, _class=["loc:@Adam/update_learnable_params/embedding_5/ApplyAdam"], message="NaN or Inf in w[4].", _device="/job:localhost/replica:0/task:0/device:CPU:0"](learnable_params/embedding_5/read)]]
