In [1]:
import os
import json
import pandas as pd
import numpy as np
import scipy.sparse as sp

NUM_TRAIN_SONGS = 5285871
NUM_TRAIN_PLAYLISTS = 115071
MAX_PLAYLISTS = 153429
NUM_TRACKS = 707989

class DataReader:
    def __init__(self, train_fname = "../data/data_json/train.json"):
        train_list = self.load_json(train_fname)
        self.train_df = pd.DataFrame(index=range(0, NUM_TRAIN_SONGS),
                                     columns=["playlist_id", "song_id"])
        """
        train.json에서 playlist_id와 song_id만 추출하여 dataframe 만들기
        """
        i = 0
        for dic in train_list:
            for (k, v) in dic.items():
                if(k == "id"):
                    playlist_id = v
                if(k == "songs"):
                    for song_id in v:
                        self.train_df.iloc[i][0] = playlist_id
                        self.train_df.iloc[i][1] = song_id
                        i = i + 1

        self.train_df.rename(columns={'playlist_id':'pid',
                                      'song_id':'tid'},inplace=True)
        # csv로 저장
        self.train_df.to_csv("../data/data_csv/train.csv")

    def get_urm(self):
        # collect data to build urm
        playlists = self.train_df['pid'].values
        tracks = self.train_df['tid'].values
        assert (playlists.size == tracks.size)
        n_playlists = MAX_PLAYLISTS
        n_tracks = NUM_TRACKS
        n_interactions = tracks.size

        # building the user-rating matrix(playlist-track matrix)
        urm = sp.csr_matrix((np.ones(n_interactions), (playlists,tracks)),
                            shape=(n_playlists, n_tracks), dtype=np.int32)
        # save urm
        self.__save_matrix('ptm',urm)

        return urm

    def load_json(self, fname):
        with open(fname, encoding="utf-8") as f:
            json_obj = json.load(f)

        return json_obj

    def __save_matrix(self, name, sparse_matrix):
        if not os.path.exists('../matrices/'):
            os.makedirs('../matrices/')
        print("[saving "+name+".npz in "+'../matrices'+ " ]")
        sp.save_npz('../matrices/' + name+ '.npz', sparse_matrix)

    def get_val_playlists(self):
        """
        :param name /.../num_tracks:  if true it will be in the numpy array returned
        :return:            numpy ndarray of shape (10k, 2/3/4/5 )
                                                    (playlists id, features[name...numtracks]
        """
        df = pd.read_csv(filepath_or_buffer='../data/data_csv/val.csv',
                         sep=',', header=0,
                         usecols=['pid'],
                         dtype={'pid': np.int32})

        # building info
        i = 0
        p_info = [df['pid'].values]
        order = str(i) + '-pid'
        i += 1

        p_info = np.array(p_info).T
        return p_info

In [57]:
# make DataReader object and read json file and conver to the csv
dr = DataReader()
# make playlist-track matrix and return
urm = dr.get_urm()

[saving ptm.npz in ../matrices ]


In [3]:
#test
NUM_VAL_PLAYLISTS = 23015
def load_json(fname):
    with open(fname, encoding="utf-8") as f:
        json_obj = json.load(f)

    return json_obj

val_fname = "../data/data_json/val.json"
val_list = load_json(val_fname)
val_df = pd.DataFrame(index=range(0, NUM_VAL_PLAYLISTS),
                                     columns=["playlist_id"])
i = 0
for dic in val_list:
    for (k, v) in dic.items():
        if (k == "id"):
            val_df.iloc[i][0] = v
            i = i + 1

In [4]:
val_df

Unnamed: 0,playlist_id
0,118598
1,131447
2,51464
3,45144
4,79929
...,...
23010,101722
23011,122127
23012,77438
23013,36231


In [6]:
val_df.rename(columns={'playlist_id':'pid'}, inplace=True)

In [7]:
val_df

Unnamed: 0,pid
0,118598
1,131447
2,51464
3,45144
4,79929
...,...
23010,101722
23011,122127
23012,77438
23013,36231


In [1]:
import pandas as pd
import numpy as np
def load_train_csv(train_fname):
    loaded = pd.read_csv(filepath_or_buffer=train_fname,
                     sep=',', header=0,
                     usecols=['pid', 'tid'],
                     dtype={'pid': np.int32, 'tid': np.int32})
    return loaded

df = load_train_csv("../data/data_csv/train.csv")


In [None]:

















dr.train_df.rename(columns={'playlist_id':'pid',
                                      'song_id':'tid'},inplace=True)

In [19]:
dr.train_df.to_csv("../data/data_csv/train.csv")

In [20]:
read_train = pd.read_csv(filepath_or_buffer="../data/data_csv/train.csv",
                               sep=',',header=0, usecols=['pid', 'tid'],
                               dtype={'pid': np.int32, 'tid': np.int32})



In [21]:
read_train

Unnamed: 0,pid,tid
0,61281,525514
1,61281,129701
2,61281,383374
3,61281,562083
4,61281,297861
...,...,...
5285866,100389,111365
5285867,100389,51373
5285868,100389,640239
5285869,100389,13759


In [37]:
a = read_train.iloc[read_train["pid"].idxmax()][0]
a

153428

In [23]:
a = pd.read_csv("../data/recsys/train_interactions.csv",sep='\t')
a

Unnamed: 0,pid,tid,pos
0,0,0,0
1,0,1,1
2,0,2,2
3,0,3,3
4,0,4,4
...,...,...,...
66346423,999999,72432,105
66346424,999999,288294,106
66346425,999999,141054,107
66346426,999999,141039,108


In [24]:
b = pd.read_csv("../data/recsys/tracks.csv",sep='\t')
b

Unnamed: 0,tid,arid,alid,track_uri,track_name,duration_ms
0,0,0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),226863
1,1,1,1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,198800
2,2,2,2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love,235933
3,3,3,3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Rock Your Body,267266
4,4,4,4,spotify:track:1lzr43nnXAijIGYnCT8M8H,It Wasn't Me,227600
...,...,...,...,...,...,...
2262287,2262287,93486,229637,spotify:track:2aOoiTTV0OR8DYxCk8o0JR,Forevermind - Pax217 Album Version,206400
2262288,2262288,93486,229637,spotify:track:1Uo65qTxnCg1N1X00lgcjr,Shalom - Pax217 Album Version,268266
2262289,2262289,93486,229637,spotify:track:5uEE5tii66I0cC7kZ7IMxE,Free To Be - Pax217 Album Version,334280
2262290,2262290,93486,229637,spotify:track:6A1RfnrMdxb24OYllzzTUX,Skwid - Pax217 Album Version,240453


In [42]:
# Dataframe with interactions

# collect data to build urm
playlists = dr.train_df['pid'].values
tracks = dr.train_df['tid'].values
assert (playlists.size == tracks.size)
n_playlists = MAX_PLAYLISTS
n_tracks = NUM_TRACKS
n_interactions = tracks.size

# building the urm
urm = sp.csr_matrix((np.ones(n_interactions), (playlists,tracks)),
                    shape=(n_playlists, n_tracks), dtype=np.int32)

In [43]:
urm

<153429x707989 sparse matrix of type '<class 'numpy.int32'>'
	with 5285871 stored elements in Compressed Sparse Row format>

In [48]:
def __save_matrix(name, sparse_matrix):
    if not os.path.exists('../matrices/'):
        os.makedirs('../matrices/')
    print("[saving "+name+".npz in "+'../matrices'+ " ]")
    sp.save_npz('../matrices/' + name+ '.npz', sparse_matrix)

In [49]:
# save on disk
__save_matrix('ptm',urm)

[saving ptm.npz in ../matrices ]


In [53]:
dr.get_urm()


AttributeError: 'DataReader' object has no attribute 'get_urm'

In [1]:
import pandas as pd
import numpy as np

train = pd.read_json('../data/data_json/test.json', typ = 'frame')


# 플레이리스트 아이디(id)와 수록곡(songs) 추출
plylst_song_map = train[['id', 'songs']]

# unnest songs
plylst_song_map_unnest = np.dstack(
    (
        np.repeat(plylst_song_map.id.values, list(map(len, plylst_song_map.songs))),
        np.concatenate(plylst_song_map.songs.values)
    )
)

# unnested 데이터프레임 생성 : plylst_song_map
plylst_song_map = pd.DataFrame(data = plylst_song_map_unnest[0], columns = plylst_song_map.columns)
plylst_song_map['id'] = plylst_song_map['id'].astype(str)
plylst_song_map['songs'] = plylst_song_map['songs'].astype(str)

# unnest 객체 제거
del plylst_song_map_unnest

In [2]:
plylst_song_map



Unnamed: 0,id,songs
0,70107.0,398985.0
1,70107.0,449403.0
2,70107.0,411543.0
3,70107.0,528044.0
4,70107.0,143048.0
...,...,...
197643,126651.0,525309.0
197644,126651.0,324209.0
197645,126651.0,124706.0
197646,126651.0,265060.0


In [None]:
import pandas as pd
import numpy as np

train = pd.read_json('../data/data_json/val.json', typ = 'frame')


# 플레이리스트 아이디(id)와 수록곡(songs) 추출
plylst_song_map = train[['id', 'songs']]

# unnest songs
plylst_song_map_unnest = np.dstack(
    (
        np.repeat(plylst_song_map.id.values, list(map(len, plylst_song_map.songs))),
        np.concatenate(plylst_song_map.songs.values)
    )
)

# unnested 데이터프레임 생성 : plylst_song_map
plylst_song_map = pd.DataFrame(data = plylst_song_map_unnest[0], columns = plylst_song_map.columns)
plylst_song_map['id'] = plylst_song_map['id'].astype(str)
plylst_song_map['songs'] = plylst_song_map['songs'].astype(str)

# unnest 객체 제거
del plylst_song_map_unnest


In [1]:
from utils.datareader import DataReader
from recommender.dot_product import dot_product_similarity, dot_product
from utils.post_processing import eurm_to_recommendation_list

# make DataReader object and read json file and convert to the csv
dr = DataReader(only_load=True)
# make playlist-track matrix and return
urm = dr.get_urm(only_load=True)

s = dot_product_similarity(urm.T,k=100)

r = dot_product(urm, s, k=500)
r = r.tocsr()
pid = dr.get_val_playlists().transpose()[0]

"\nConvert the eurm = (10.000, 2M) into a recommendation list if cat is set to 'all', otherwhise\nConvert the eurm = (10.000, 2M) into a recommendation list if a category is specified. #TODO @seba 1k o 10k?\n:param eurm: the estimated user rating matrix\n:param remove_seed: remove seed tracks from playlists\n:param datareader: a Datareader object for seeds removing\n:param cat: 'all' or a value between 1 and 10\n:return: recommendation_list: a list of list of recommendations of shape (10k,500)\n"

In [20]:
pid

array([118598, 131447,  51464, ...,  77438,  36231,  65189])

In [16]:
eurm = r[pid]


In [17]:
eurm.d

array([], dtype=float32)

In [None]:
from tqdm import tqdm
import numpy as np

"""
Convert the eurm = (10.000, 2M) into a recommendation list if cat is set to 'all', otherwhise
Convert the eurm = (10.000, 2M) into a recommendation list if a category is specified. #TODO @seba 1k o 10k?
:param eurm: the estimated user rating matrix
:param remove_seed: remove seed tracks from playlists
:param datareader: a Datareader object for seeds removing
:param cat: 'all' or a value between 1 and 10
:return: recommendation_list: a list of list of recommendations of shape (10k,500)
"""

In [2]:
# Convert eurm
# shape is 23015 x 707989 sparse matrix
eurm = eurm.tocsr()

In [3]:
# shape is 23015 x 707989 sparse matrix
eurm

<23015x707989 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Row format>

In [15]:
print(eurm.data)


[]


In [4]:
"""
Remove seed tracks from the eurm (10K, 2M)
:param eurm: original eurm
:param datareader: a Datareader object, the same used to build the original eurm
:return: eurm: eurm with no seed tracks
"""

# Get urm with shape of eurm
# shape is 153429 x 707989
urm = dr.get_urm(only_load=True)

In [6]:
# shape is ndarray: (23015,)
pids = dr.get_val_pids()

In [8]:
urm_test = urm[pids]

In [9]:
# shape is 23015 x 707989
urm_test

<23015x707989 sparse matrix of type '<class 'numpy.int32'>'
	with 0 stored elements in Compressed Sparse Row format>

In [12]:
max_value = eurm.max()

In [13]:
max_value

0.0

In [None]:
new_data = np.ones(len(urm_test.data)) * max_value
urm_test.data = new_data

# Remove seen
eurm = eurm - urm_test

if eliminate_negative:
    eurm.data[eurm.data <= 0] = 0
    eurm.eliminate_zeros()


# Remove seeds
eurm = eurm_remove_seed(eurm, datareader)
print('Seeds removed!')


# Initialize rec_list
recommendation_list = [[] for x in range(100)]

for row in tqdm((range(eurm.shape[0])), desc='Converting eurm'):
    print("print row")
    print(row)
    print(row.shape)
    val = eurm.data[eurm.indptr[row]:eurm.indptr[row+1]]
    ind = val.argsort()[-500:][::-1]
    ind = list(eurm[row].indices[ind])

    recommendation_list[row] = ind
