In [166]:
import keras
from keras.activations import relu, tanh
from keras.callbacks import TensorBoard, EarlyStopping
from keras.layers import Conv2D, Dense, Flatten, MaxPool2D, LeakyReLU
from keras.losses import MSE
from keras.metrics import mse
from keras.models import Sequential
from keras.optimizers import RMSprop, Adam
import boto3
import pandas as pd
from tempfile import  NamedTemporaryFile
from librosa.feature import melspectrogram
import librosa
import numpy as np
import time
import requests
import json
import pickle
import sklearn.metrics as metrics

In [2]:
boto_session = boto3.session.Session(region_name='eu-west-3')
songs_csv_path = 'https://s3.eu-west-3.amazonaws.com/sukikana-msd/full_dataset_10k.csv'


In [2]:
meta_csv_path = '../../data/training_meta.csv'
song_factors_csv_path = '../../data/song_factors.csv'
user_factors_csv_path = '../../data/user_factors.csv'
taste_csv_path = '../../data/training_cf.{}.csv'

In [4]:
df_meta = pd.read_csv(meta_csv_path, sep=';')
df_song_factors = pd.read_csv(song_factors_csv_path, sep=';', index_col='song_id')
df_user_factors = pd.read_csv(user_factors_csv_path, sep=';', index_col='user_id')
df_taste = pd.concat([pd.read_csv(taste_csv_path.format(i), sep=';') for i in range(4)])

In [3]:
api = 'http://localhost:3099/api/v1/songs'

In [5]:
api = 'http://35.189.106.213:3000/api/v1/songs'

In [24]:
latents = []
mels = []
indices = []

In [12]:
req_get = requests.get(api, params={'skip': 0, 'limit': 50})# song.song_id})
req_get.raise_for_status()
res_get = req_get.json()
len(res_get)

50

In [48]:
for i in range(2475, 10000, 45):
    print(i)
    req_get = requests.get(api, params={'skip': i, 'limit': 45, 'select': {'msdId': 1}})# song.song_id})
    req_get.raise_for_status()
    res_get = req_get.json()
    #latents.extend(list(map(lambda song: np.array(song['latents']), res_get)))
    #mels.extend(list(map(lambda song: np.array(song['mels']).reshape(song['nComponents'], -1), res_get)))
    indices.extend(list(map(lambda song: song['msdId'], res_get)))

2475
2520
2565
2610
2655
2700
2745
2790
2835


KeyboardInterrupt: 

In [49]:
len(indices)

2835

In [25]:
with open('latents.pkl', 'wb') as lf:
    #pickle.dump(latents, lf)

In [70]:
for i, chunk in enumerate(range(0, len(mels), 100)):
    with open('mels.{}.pkl'.format(i), 'wb') as mf:
       #pickle.dump(mels[chunk:chunk+100], mf)

In [7]:
with open('latents.pkl', 'rb') as lf:
    latents = pickle.load(lf)

In [62]:
for i in range(46):
    with open('mels.{}.pkl'.format(i), 'rb') as mf:
        mels.extend(pickle.load(mf))

In [63]:
mels = mels[:2835]

In [30]:
df = pd.read_csv('./training.csv', sep=';', index_col=0)

In [35]:
df = df.latents.apply(lambda x: np.fromstring(
                               x.replace('\n','')
                                .replace('[','')
                                .replace(']','')
                                .replace('  ',' '), sep=' '))

In [69]:
mels = [mels[i] for i in np.where(~df.loc[indices].isna())[0]]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


array([   0,    1,    2, ..., 2832, 2833, 2834])

In [None]:
df.loc[indices][~df.loc[indices].isna()].values

In [107]:
latents = np.array(df.loc[indices][~df.loc[indices].isna()].values.tolist())

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


In [18]:
mels = [mel.shape[1] >= 1200 for mel in mels]

In [72]:
df = df[[value.shape[1] >= 1200 for value in df.mels.values]]

In [74]:
n_components = 128
mel_spectrum_length = 128
sampling_rate = 22050
n_factors = 50

In [168]:
model = Sequential()
model.add(Conv2D(filters=32, kernel_size=(3, 3), strides=2, use_bias=True, activation=relu, input_shape=(n_components, mel_spectrum_length, 1)))
model.add(Flatten())
model.add(Dense(256))
model.add(LeakyReLU(alpha=0.3))
model.add(Dense(n_factors))
model.add(LeakyReLU(alpha=0.3))          
model.compile(optimizer=RMSprop(),loss=MSE, metrics=[])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 63, 63, 32)        320       
_________________________________________________________________
flatten_6 (Flatten)          (None, 127008)            0         
_________________________________________________________________
dense_11 (Dense)             (None, 256)               32514304  
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 256)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 50)                12850     
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 50)                0         
Total params: 32,527,474
Trainable params: 32,527,474
Non-trainable params: 0
________________________________________________________________

In [163]:
X = np.array([value[:, 500:628] for value in mels]).reshape(-1, n_components, mel_spectrum_length, 1)

In [164]:
Y = latents

In [169]:
model.fit(x=X, y=Y, validation_split=0.3, batch_size=32, epochs=50, callbacks=[TensorBoard(log_dir='./tensorboard-logs/run-{}'.format(time.strftime('%Y/%m/%d %H:%M:%S'))), EarlyStopping()])

Train on 1980 samples, validate on 850 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


<keras.callbacks.History at 0x19212e278>

In [126]:
df_meta = df_meta[df_meta.song_id.isin(indices)]

In [127]:
df_taste = df_taste[df_taste.song_id.isin(indices)]

In [131]:
df_taste.user_id.value_counts()

ec6dfcf19485cb011e0b22637075037aae34cf26    29
8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc    17
4e11f45d732f4861772b2906f81a7d384552ad12    17
96f7b4f800cafef33eae71a6bc44f7139f63cd7a    16
fef771ab021c200187a419f5e55311390f850a50    15
5ef127be2845313b04cdab97eafd5bab866eaf35    15
9af92dd4349d57bd60506d6fc2c29c1bbeaf7400    14
7e543508a213f4f22e0cb54ecf2df9c370070a28    14
5f428dbd676d47114393f517601d0a6e85a0131e    14
44dbdad6ea3eeb7cb5c4f0e74d46319c64b17704    14
5a3417a1955d9136413e0d293cd36497f5e00238    13
9c2dfee26bbdd4fb19e9800244bea6e7181caeae    13
4e73d9e058d2b1f2dba9c1fe4a8f416f9f58364f    13
4a1af7eb442b43b622bd8f2d3c4fab76f74cf087    13
3288389bf9ef956a23a0a4ea86f60bf24ba7f69e    13
dce67cd21e628ef2757df7c46a9fd66e2ba5d79b    13
ba43e2ef9ea08ae36dbb127d9af34b61536219c9    13
736083bd7ecd162effb7668cab6c281945762e85    13
b48c847eda0f305e195d33de05d56c79692352bb    13
a05e548059abb1f77cad6cb9c3c0c48e0616f551    12
070941445cba8e8a9157f1253116cc430c31a811    12
3eae654f2e94d

In [134]:
df_taste[df_taste.user_id == '8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc'].sort_values('play_count', ascending=False).merge(df_meta, on='song_id')

Unnamed: 0,user_id,song_id,play_count,artist_id,artist_name,audio_md5,song_hotttnesss,title,track_id,preview_id,remote_preview_url,preview_artist,preview_title,preview_service
0,8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc,SOZSFWF12A6D4F93D5,2,ARVNNXD1187B9AE50D,Marvin Gaye,4ef06e9dd63989f6fb2189ac575e423d,0.85582,Let's Get It On,TRCZGBG128F4222F61,tra.1963881,http://listen.vo.llnwd.net/g3/9/3/9/4/0/128340...,Marvin Gaye,Let's Get It On,napster
1,8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc,SOWPNTP12AAF3B1D86,2,AREYHBH1187FB4537B,Ryan Bingham,242bebb36262dc9a2a94ed5dd4b8cdff,0.624834,Day Is Done,TRCCZWQ128F930900B,tra.28464890,http://listen.vo.llnwd.net/g2/9/3/9/5/3/906035...,Ryan Bingham,Day Is Done,napster
2,8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc,SOWIELR12A58A79E3D,2,AROTTEK122E64480AA,Monsters Of Folk,2d37c3074411b0716bb21aeed0d271c4,0.831586,Temazcal,TRJUYRT128F935A63F,tra.53233691,http://listen.vo.llnwd.net/g1/1/9/7/8/9/990598...,Monsters Of Folk,Temazcal,napster
3,8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc,SOFZPSJ12A8C13FADE,2,ARYO9H11187FB57945,Old Crow Medicine Show,68edd6412a305930ae5390c4f3f683a6,0.813155,Take 'Em Away,TRCCEKC12903CE8240,tra.42131532,http://listen.vo.llnwd.net/g3/6/5/5/2/4/132404...,Old Crow Medicine Show,Take 'Em Away,napster
4,8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc,SOALRMF12A8C1436A7,1,ARVV61U1187FB4FC32,The Temper Trap,3552f49e0cd023cedca98b5ab09c7d13,0.873238,Soldier On,TRCXHZZ128F92EF09E,tra.177472841,http://listen.vo.llnwd.net/g3/4/0/2/1/9/114859...,The Temper Trap,Soldier On,napster
5,8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc,SORPAJB12AB0184D93,1,AR9J2H81187B9B5253,Chic,966dd0896cd270382e58d39b68e0bfde,,Le Freak,TRJBNKQ128F930E0DE,tra.1888622,http://listen.vo.llnwd.net/g3/6/3/9/7/9/133079...,Chic,Le Freak,napster
6,8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc,SOYEQLD12AB017C713,1,ARVG4OK1187B9B6BD3,The Killers,bda9f6c47f586a52987576d2a4405a15,,Joy Ride,TRCCPZO128F92F239F,tra.24458955,http://listen.vo.llnwd.net/g3/4/2/2/1/8/106788...,The Killers,Joy Ride,napster
7,8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc,SOXBCZH12A67ADAD77,1,ARVHQNN1187B9B9FA3,Cat Stevens,936062456a9fa7790b52cf2dfadd0aaa,0.787894,Peace Train,TRCCVVV128F42838D9,tra.2066906,http://listen.vo.llnwd.net/g1/8/1/4/3/0/104230...,Yusuf / Cat Stevens,Peace Train,napster
8,8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc,SOWCNBR12A6D4F69D7,1,ART4QZC1187FB51612,Janet Jackson,093d2a9bb941844d2d28ebb976069da4,0.557313,Rope Burn,TRCCOAE128F1469927,tra.2106350,http://listen.vo.llnwd.net/g1/8/9/8/0/2/104620...,Janet Jackson,Rope Burn,napster
9,8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc,SOPRVBH12A6D4F6C4F,1,ARRFHHE1187B98FE75,Voxtrot,589fdf5f7502ef927bbdfa1bc89c2a86,0.753795,Ghost,TRZSSMW128F428E44E,tra.14385488,http://listen.vo.llnwd.net/g1/3/3/2/3/7/102207...,Voxtrot,Ghost,napster


In [136]:
df_user_factors.loc['8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc'].values

0     0.152021
1     0.077299
2     0.116307
3    -0.317888
4     0.150680
5    -0.383566
6     0.055319
7    -0.304024
8    -0.023429
9    -0.145054
10   -0.163725
11   -0.191688
12   -0.158902
13   -0.170530
14    0.065332
15   -0.089207
16    0.256355
17    0.137176
18    0.454102
19    0.213469
20   -0.138222
21    0.296328
22    0.270011
23    0.294850
24   -0.058250
25   -0.098731
26   -0.178425
27   -0.075274
28    0.269863
29    0.327501
30    0.049571
31    0.019409
32    0.215409
33   -0.065060
34    0.349223
35    0.200936
36   -0.102151
37    0.268580
38    0.042046
39    0.062064
40    0.146535
41    0.414001
42   -0.255620
43    0.392303
44   -0.018195
45   -0.147027
46   -0.197584
47    0.266280
48    0.393570
49   -0.319378
Name: 8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc, dtype: float64

In [170]:
predictions = model.predict(np.array([value[:, 500:628] for value in mels]).reshape(-1, n_components, mel_spectrum_length, 1))

In [172]:
similarities = metrics.pairwise.cosine_similarity(predictions, df_user_factors.loc['8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc'].values.reshape(1, -1))

In [179]:
similarities.reshape(-1)[1788]

0.33445562785870375

In [177]:
np.argsort(similarities.reshape(-1))

array([1419, 2403, 1941, ...,   35,  862, 1788])

In [171]:
predictions

array([[-0.00853791,  0.00517047,  0.02003692, ..., -0.001036  ,
        -0.00735511,  0.03277497],
       [-0.00822889,  0.00837904, -0.00182874, ..., -0.01182501,
        -0.01614163,  0.03235246],
       [-0.00453793, -0.00111699,  0.05427284, ...,  0.01276771,
        -0.0053461 ,  0.01369182],
       ...,
       [-0.02049732, -0.00901657,  0.01392647, ..., -0.00430805,
        -0.02603914,  0.00082757],
       [-0.00982702,  0.00085453,  0.06321926, ..., -0.00315075,
        -0.00965074,  0.05927781],
       [-0.01577126,  0.00792511,  0.02976018, ..., -0.01109297,
        -0.02430431,  0.04850259]], dtype=float32)

In [180]:
indices[1788]

'SONJYOJ12A8C13D495'

In [183]:
indices[862]

'SOPZQEP12A8C13BA51'

In [182]:
df_meta[df_meta.song_id == 'SONJYOJ12A8C13D495']

Unnamed: 0,artist_id,artist_name,audio_md5,song_hotttnesss,song_id,title,track_id,preview_id,remote_preview_url,preview_artist,preview_title,preview_service
457,ARBNU7D1187B9991A4,Truefaith,3ff1aa41092cdc72118ead443d38537f,,SONJYOJ12A8C13D495,Here I Go Again,TRCCYMF128F428B188,tra.23243903,http://listen.vo.llnwd.net/g1/5/7/9/6/8/134486...,Draftpick,Here We Go Again,napster


In [184]:
df_meta[df_meta.song_id == 'SOPZQEP12A8C13BA51']

Unnamed: 0,artist_id,artist_name,audio_md5,song_hotttnesss,song_id,title,track_id,preview_id,remote_preview_url,preview_artist,preview_title,preview_service
10203,AR1UR7X1187FB50596,Inner City,d59282fc06bfafb1a5d02ab7970997f4,0.738459,SOPZQEP12A8C13BA51,Good Life,TRCXADO128F92DF704,tra.272416115,http://listen.vo.llnwd.net/g3/0/8/2/6/2/135502...,Inner City,Good Life 2017,napster
