# Setup

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset files under
# /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# arena_util.py
# -*- coding: utf-8 -*-

import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np


def write_json(data, fname):
    """Serialize ``data`` as UTF-8 JSON into ``./arena_data/<fname>``.

    Missing parent directories are created on every call (so the output
    directory may be deleted between calls without breaking later writes).

    Args:
        data: JSON-serializable object; NumPy scalars and arrays are converted
            to their plain Python equivalents.
        fname: Path of the output file relative to ``./arena_data/``.

    Raises:
        TypeError: If ``data`` contains an object that is neither natively
            JSON-serializable nor a supported NumPy type.
    """
    def _conv(o):
        # json.dumps only calls this hook for objects it cannot encode itself.
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        raise TypeError(
            f"Object of type {type(o).__name__} is not JSON serializable")

    path = "./arena_data/" + fname
    # os.makedirs replaces distutils.dir_util.mkpath, which is deprecated and
    # remembers previously created paths instead of re-checking the disk.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with io.open(path, "w", encoding="utf8") as f:
        f.write(json.dumps(data, ensure_ascii=False, default=_conv))


def load_json(fname):
    """Return the object decoded from the UTF-8 JSON file at ``fname``."""
    with open(fname, encoding='utf8') as handle:
        return json.loads(handle.read())


def debug_json(r):
    """Print ``r`` to stdout as 4-space-indented JSON without escaping non-ASCII text."""
    pretty = json.dumps(r, indent=4, ensure_ascii=False)
    print(pretty)



In [None]:
# evaluate.py
# -*- coding: utf-8 -*-
# import fire
import numpy as np

# from arena_util import load_json


class CustomEvaluator:
    """Scores recommended songs and tags against ground truth with nDCG.

    The final score is ``0.85 * music_nDCG + 0.15 * tag_nDCG``, each nDCG being
    averaged over the playlists found in the recommendation file.
    """

    def _idcg(self, l):
        """Return the ideal DCG of a ground-truth list holding ``l`` items."""
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        # Cache ideal DCG values for ground-truth sizes 0..100.
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        """Return the nDCG of the ranked list ``rec`` against ``gt``.

        Args:
            gt: Ground-truth items (hashable).
            rec: Recommended items in rank order.

        Returns:
            DCG of ``rec`` divided by the ideal DCG for ``len(gt)`` items, or
            ``0.0`` when ``gt`` is empty (nothing can be hit, and the ideal
            DCG would be zero).
        """
        n_gt = len(gt)
        if n_gt == 0:
            return 0.0

        gt_items = set(gt)  # O(1) membership instead of scanning the list
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt_items:
                dcg += 1.0 / np.log(i + 2)

        # Ground truths longer than the cached range are computed on demand
        # rather than failing with an IndexError.
        if n_gt < len(self._idcgs):
            idcg = self._idcgs[n_gt]
        else:
            idcg = self._idcg(n_gt)
        return dcg / idcg

    def _eval(self, gt_fname, rec_fname):
        """Load both JSON files and return ``(music_ndcg, tag_ndcg, score)``.

        Only the first 100 recommended songs and the first 10 recommended tags
        of each playlist are scored.
        """
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        """Print music nDCG, tag nDCG and the combined score.

        Any exception raised while evaluating is printed instead of propagated.
        """
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(e)


# if __name__ == "__main__":
#     fire.Fire(CustomEvaluator)


In [None]:
from collections import Counter

import numpy as np
import pandas as pd

import scipy.sparse as spr
import pickle

- train/val split

In [None]:
# -*- coding: utf-8 -*-
import copy
import random
import numpy as np


class ArenaSplitter:
    """Splits playlists into train/validation sets and masks the validation set.

    Masking produces a "question" copy (part of the songs/tags hidden) and an
    "answer" copy (the hidden part) for every validation playlist.
    """

    def _split_data(self, playlists):
        """Return ``(train, val)``: the first 80% and the remaining 20%."""
        tot = len(playlists)
        train = playlists[:int(tot*0.80)]
        val = playlists[int(tot*0.80):]

        return train, val

    def _mask(self, playlists, mask_cols, del_cols):
        """Build question/answer copies of ``playlists``.

        Args:
            playlists: List of playlist dicts; not modified.
            mask_cols: Columns split at random: ``len // 2`` items stay in the
                question, the rest go to the answer.
            del_cols: Columns emptied in the question; the answer keeps at most
                the first 100 songs / 10 tags.

        Returns:
            ``(q_pl, a_pl)`` lists aligned with ``playlists``.
        """
        q_pl = copy.deepcopy(playlists)
        a_pl = copy.deepcopy(playlists)

        for i in range(len(playlists)):
            for del_col in del_cols:
                q_pl[i][del_col] = []
                if del_col == 'songs':
                    a_pl[i][del_col] = a_pl[i][del_col][:100]
                elif del_col == 'tags':
                    a_pl[i][del_col] = a_pl[i][del_col][:10]

            for col in mask_cols:
                mask_len = len(playlists[i][col])
                mask = np.full(mask_len, False)
                mask[:mask_len//2] = True
                np.random.shuffle(mask)

                # Select with plain Python so items keep their original types
                # (a NumPy round-trip would yield np.int64 / np.str_ objects).
                q_pl[i][col] = [x for x, keep in zip(q_pl[i][col], mask) if keep]
                a_pl[i][col] = [x for x, keep in zip(a_pl[i][col], mask) if not keep]

        return q_pl, a_pl

    def _mask_data(self, playlists):
        """Mask validation playlists by scenario and shuffle the result.

        The input order decides the scenario: first 30% keep songs only, next
        50% keep songs and tags, next 15% keep tags only, last 5% keep neither.

        Returns:
            ``(q, a)`` question and answer lists shuffled with the same order.
        """
        playlists = copy.deepcopy(playlists)
        tot = len(playlists)
        song_only = playlists[:int(tot * 0.3)]
        song_and_tags = playlists[int(tot * 0.3):int(tot * 0.8)]
        tags_only = playlists[int(tot * 0.8):int(tot * 0.95)]
        title_only = playlists[int(tot * 0.95):]

        print(f"Total: {len(playlists)}, "
              f"Song only: {len(song_only)}, "
              f"Song & Tags: {len(song_and_tags)}, "
              f"Tags only: {len(tags_only)}, "
              f"Title only: {len(title_only)}")

        song_q, song_a = self._mask(song_only, ['songs'], ['tags'])
        songtag_q, songtag_a = self._mask(song_and_tags, ['songs', 'tags'], [])
        tag_q, tag_a = self._mask(tags_only, ['tags'], ['songs'])
        title_q, title_a = self._mask(title_only, [], ['songs', 'tags'])

        q = song_q + songtag_q + tag_q + title_q
        a = song_a + songtag_a + tag_a + title_a

        shuffle_indices = np.arange(len(q))
        np.random.shuffle(shuffle_indices)

        # Apply the same permutation to questions and answers.
        q = [q[i] for i in shuffle_indices]
        a = [a[i] for i in shuffle_indices]

        return q, a

    def run(self, fname):
        """Split the playlists in ``fname`` and write all derived JSON files.

        Writes ``orig/train.json``, ``orig/val.json``, ``questions/val.json``
        and ``answers/val.json`` through ``write_json``.
        """
        # Seed both generators: `random` drives the playlist shuffle below,
        # while masking and the final shuffle draw from NumPy's global RNG.
        random.seed(777)
        np.random.seed(777)

        print("Reading data...\n")
        playlists = load_json(fname)
        random.shuffle(playlists)
        print(f"Total playlists: {len(playlists)}")

        print("Splitting data...")
        train, val = self._split_data(playlists)

        print("Original train...")
        write_json(train, "orig/train.json")
        print("Original val...")
        write_json(val, "orig/val.json")

        print("Masked val...")
        val_q, val_a = self._mask_data(val)
        write_json(val_q, "questions/val.json")
        write_json(val_a, "answers/val.json")

In [None]:
# Create the train/validation split from the raw playlist file; the results
# are written under ./arena_data (orig/, questions/, answers/).
split = ArenaSplitter()
split.run("/content/drive/MyDrive/KUBIG 2021-2/추천시스템 프로젝트/멜론데이터/train.json")

Reading data...

Total playlists: 115071
Splitting data...
Original train...
Original val...
Masked val...
Total: 23015, Song only: 6904, Song & Tags: 11508, Tags only: 3452, Title only: 1151


In [None]:
# Genre code -> genre name lookup, loaded as a Series.
genre_gn_all = pd.read_json('/content/drive/MyDrive/KUBIG 2021-2/추천시스템 프로젝트/멜론데이터/genre_gn_all.json', typ = 'series')
# Per-song metadata (genres, album, artists, ...).
song_meta = pd.read_json("/content/drive/MyDrive/KUBIG 2021-2/추천시스템 프로젝트/멜론데이터/song_meta.json")
# Split produced by ArenaSplitter: the untouched train part and the masked
# validation "questions", which serve as the test set in this notebook.
train = pd.read_json("/content/arena_data/orig/train.json")
test = pd.read_json("/content/arena_data/questions/val.json")

In [None]:
train.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[힐링, 휴식, 밤, 새벽]",147668,To. 힘들고 지친 분들에게,"[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27.000
1,[팝],50422,130807-7,"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11.000
2,[뉴에이지],116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50.000
3,"[하드락, 록스피릿, 댄스]",55076,당신을 하얗게 불태울 곡들,"[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25.000
4,"[힐링, 휴식, 기분전환]",125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50.000


In [None]:
song_meta.head()

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2
3,"[GN1102, GN1101]",20151016,Feeling Right (Everything Is Nice) (Feat. Popc...,2644882,[838543],Feeling Right (Everything Is Nice) (Feat. Popc...,[GN1100],[Matoma],3
4,"[GN1802, GN1801]",20110824,그남자 그여자,2008470,[560160],그남자 그여자,[GN1800],[Jude Law],4


In [None]:
# Flag the origin of each row so train and test can be told apart after concat.
train['istrain'] = 1
test['istrain'] = 0

n_train = len(train)
n_test = len(test)

# Stack train on top of test: rows [0, n_train) are train, the rest are test.
plylst = pd.concat([train, test], ignore_index=True)

# nid: dense 0-based playlist index, later used as the matrix row index.
plylst["nid"] = range(n_train + n_test)

# Lookup tables between the original playlist id and the dense nid.
plylst_id_nid = dict(zip(plylst["id"],plylst["nid"]))
plylst_nid_id = dict(zip(plylst["nid"],plylst["id"]))

In [None]:
plylst.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain,nid
0,"[힐링, 휴식, 밤, 새벽]",147668,To. 힘들고 지친 분들에게,"[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27.000,1,0
1,[팝],50422,130807-7,"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11.000,1,1
2,[뉴에이지],116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50.000,1,2
3,"[하드락, 록스피릿, 댄스]",55076,당신을 하얗게 불태울 곡들,"[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25.000,1,3
4,"[힐링, 휴식, 기분전환]",125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50.000,1,4


In [None]:
plylst.dtypes

tags            object
id               int64
plylst_title    object
songs           object
like_cnt         int64
updt_date       object
istrain          int64
nid              int64
dtype: object

# 플레이리스트별 장르 데이터(곡 메타 데이터) 매핑

In [None]:
# Genre lookup table with two columns: gnr_code (genre code) and gnr_name (genre name).
genre_gn_all = pd.DataFrame({'gnr_code': genre_gn_all.index, 'gnr_name': genre_gn_all.values})

In [None]:
genre_gn_all

Unnamed: 0,gnr_code,gnr_name
0,GN0100,발라드
1,GN0101,세부장르전체
2,GN0102,'80
3,GN0103,'90
4,GN0104,'00
...,...,...
249,GN2900,뮤지컬
250,GN2901,세부장르전체
251,GN2902,국내뮤지컬
252,GN2903,국외뮤지컬


In [None]:
# Top-level genres are the codes whose last two characters are '00'.
big_gnr_code = genre_gn_all.loc[lambda df: df['gnr_code'].str.endswith('00')].reset_index(drop=True)
big_gnr_code

Unnamed: 0,gnr_code,gnr_name
0,GN0100,발라드
1,GN0200,댄스
2,GN0300,랩/힙합
3,GN0400,R&B/Soul
4,GN0500,인디음악
5,GN0600,록/메탈
6,GN0700,성인가요
7,GN0800,포크/블루스
8,GN0900,POP
9,GN1000,록/메탈


In [None]:
def mapping_function(data, col1, col2):
    """Unnest a list-valued column into one row per (key, element) pair.

    Args:
        data: DataFrame holding a key column and a list-valued column.
        col1: Name of the key column (e.g. the playlist id).
        col2: Name of the list-valued column (e.g. the songs of the playlist).

    Returns:
        DataFrame with integer columns ``col1`` and ``col2``; each key is
        repeated once per element of its list, so rows with an empty list
        do not appear.
    """
    # How many times each key has to be repeated.
    list_lengths = [len(items) for items in data[col2]]
    repeated_keys = np.repeat(data[col1].values, list_lengths)
    flat_items = np.concatenate(data[col2].values)

    unnested = pd.DataFrame({col1: repeated_keys, col2: flat_items})
    for name in (col1, col2):
        unnested[name] = unnested[name].astype(int)
    return unnested

In [None]:
# One row per (playlist, song) pair, with the key column named plylst_id.
playlst_song_map = mapping_function(plylst, 'id', 'songs').rename(columns={'id': 'plylst_id'})
playlst_song_map

Unnamed: 0,plylst_id,songs
0,147668,663185
1,147668,649626
2,147668,6855
3,147668,188486
4,147668,348451
...,...,...
4653524,80661,153991
4653525,80661,193909
4653526,80661,236205
4653527,80661,473382


In [None]:
playlst_song_map.dtypes

plylst_id    int64
songs        int64
dtype: object

In [None]:
song_meta.dtypes

song_gn_dtl_gnr_basket    object
issue_date                 int64
album_name                object
album_id                   int64
artist_id_basket          object
song_name                 object
song_gn_gnr_basket        object
artist_name_basket        object
id                         int64
dtype: object

In [None]:
# Attach each song's top-level genre list (song_gn_gnr_basket) from song_meta;
# a left join keeps every (playlist, song) pair.
playlst_song_map = playlst_song_map.merge(song_meta[['id', 'song_gn_gnr_basket']], how = 'left', left_on = 'songs', right_on = 'id')

In [None]:
# Drop the redundant join key 'id' and expose the genre list as 'gnr_code'.
playlst_song_map = playlst_song_map[['plylst_id', 'songs', 'song_gn_gnr_basket']].rename(columns = {'song_gn_gnr_basket':'gnr_code'})

In [None]:
playlst_song_map

Unnamed: 0,plylst_id,songs,gnr_code
0,147668,663185,[GN1500]
1,147668,649626,[GN0600]
2,147668,6855,[GN0600]
3,147668,188486,"[GN0500, GN0100]"
4,147668,348451,[GN0200]
...,...,...,...
4653524,80661,153991,[GN1700]
4653525,80661,193909,[GN1700]
4653526,80661,236205,[GN1900]
4653527,80661,473382,[GN1700]


In [None]:
# For every playlist, concatenate the genre lists of all its songs into one
# array (a genre appears once per song that carries it).
plylst_gnr_map = playlst_song_map.groupby('plylst_id').apply(lambda x: np.concatenate(x['gnr_code'].values))

In [None]:
# Turn the per-playlist Series into a frame with columns plylst_id and big_gnr_code.
plylst_gnr_map = pd.DataFrame(plylst_gnr_map, columns = ['big_gnr_code']).reset_index()

In [None]:
plylst_gnr_map

Unnamed: 0,plylst_id,big_gnr_code
0,1,"[GN0300, GN0300]"
1,2,"[GN0500, GN0600, GN0900, GN1000, GN0900, GN100..."
2,4,"[GN1300, GN0300, GN2500, GN0300, GN2500, GN020..."
3,5,"[GN0200, GN0500, GN0800, GN0600, GN0100, GN010..."
4,6,"[GN2700, GN1100, GN2700, GN1100, GN1100, GN110..."
...,...,...
110460,153422,"[GN1300, GN1200, GN0500, GN0100, GN0900, GN090..."
110461,153423,"[GN0500, GN0600, GN0500, GN0400, GN0500, GN040..."
110462,153425,"[GN1600, GN1600, GN1600, GN1000, GN1600, GN160..."
110463,153426,"[GN1200, GN1200, GN1200, GN1200, GN1200, GN120..."


In [None]:
# Add the genre array to every playlist. Playlists without songs have no row in
# plylst_gnr_map, so the left join leaves plylst_id / big_gnr_code as NaN for them.
plylst = plylst.merge(plylst_gnr_map, how = 'left', left_on = 'id', right_on = 'plylst_id')

In [None]:
plylst

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain,nid,plylst_id,big_gnr_code
0,"[힐링, 휴식, 밤, 새벽]",147668,To. 힘들고 지친 분들에게,"[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27.000,1,0,147668.0,"[GN1500, GN0600, GN0600, GN0500, GN0100, GN020..."
1,[팝],50422,130807-7,"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11.000,1,1,50422.0,"[GN1700, GN0900, GN1700, GN1400, GN1400, GN190..."
2,[뉴에이지],116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50.000,1,2,116432.0,"[GN1800, GN1800, GN1800, GN1800, GN1800, GN180..."
3,"[하드락, 록스피릿, 댄스]",55076,당신을 하얗게 불태울 곡들,"[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25.000,1,3,55076.0,"[GN1000, GN2500, GN0300, GN1000, GN2500, GN030..."
4,"[힐링, 휴식, 기분전환]",125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50.000,1,4,125064.0,"[GN1200, GN1200, GN1200, GN1200, GN1200, GN120..."
...,...,...,...,...,...,...,...,...,...,...
115066,[],86994,ㅎㅎㅎㅎㅎㅎ,"[30712, 284814, 114387, 117793, 462530, 601315...",1,2016-06-24 15:33:51.000,0,115066,86994.0,"[GN2500, GN0300, GN0300, GN2500, GN0200, GN020..."
115067,"[밤, 새벽]",80661,연인들을 위한 주말 데이트 음악,"[166091, 321101, 630122, 464984, 401055, 15399...",48,2015-07-24 09:44:06.000,0,115067,80661.0,"[GN1600, GN1700, GN1500, GN1700, GN1000, GN170..."
115068,[국힙],138752,ChoiceForYou°단체곡/국내힙합°,[],3,2019-06-17 16:14:08.000,0,115068,,
115069,[알앤비],117885,toctoc dinner,[],1,2014-02-11 18:08:48.000,0,115069,,


# 모델링을 위한 전처리

playlist, song, tag의 id(각각 nid, sid, tid)를 새로 생성하는 이유는, 새로 생성할 id를 matrix의 row, column index로 사용할 것이기 때문입니다.

- plylst_id_nid : playlist id -> nid
- plylst_nid_id : playlist nid -> id
- song_id_sid : song id -> sid
- song_sid_id : song sid -> id
- tag_id_tid : tag id -> tid
- tag_tid_id : tag tid -> id
- song_dict : song id -> count
- tag_dict : tag id -> count

In [None]:
# --- Tag vocabulary: occurrence counts and tag <-> dense tid lookups ---
plylst_tag = plylst['tags']
tag_counter = Counter(tg for tgs in plylst_tag for tg in tgs)
tag_dict = dict(tag_counter)

# tids are assigned in first-seen order.
tag_id_tid = {tag: tid for tid, tag in enumerate(tag_dict)}
tag_tid_id = dict(enumerate(tag_dict))

n_tags = len(tag_dict)

# --- Song vocabulary: occurrence counts and song id <-> dense sid lookups ---
plylst_song = plylst['songs']
song_counter = Counter(sg for sgs in plylst_song for sg in sgs)
song_dict = dict(song_counter)

# sids are assigned in first-seen order.
song_id_sid = {song: sid for sid, song in enumerate(song_dict)}
song_sid_id = dict(enumerate(song_dict))

n_songs = len(song_dict)

plylst의 songs와 tags를 새로운 id로 변환하여 DataFrame에 추가합니다

In [None]:
# Translate raw song ids / tag strings into their dense matrix indices (sid / tid).
# Unknown items are skipped; membership is tested with `in` (rather than comparing
# the looked-up value with None), which also keeps index 0.
plylst['songs_id'] = plylst['songs'].map(lambda x: [song_id_sid[s] for s in x if s in song_id_sid])
plylst['tags_id'] = plylst['tags'].map(lambda x: [tag_id_tid[t] for t in x if t in tag_id_tid])

In [None]:
plylst.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain,nid,plylst_id,big_gnr_code,songs_id,tags_id
0,"[힐링, 휴식, 밤, 새벽]",147668,To. 힘들고 지친 분들에게,"[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27.000,1,0,147668.0,"[GN1500, GN0600, GN0600, GN0500, GN0100, GN020...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3]"
1,[팝],50422,130807-7,"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11.000,1,1,50422.0,"[GN1700, GN0900, GN1700, GN1400, GN1400, GN190...","[66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 7...",[4]
2,[뉴에이지],116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50.000,1,2,116432.0,"[GN1800, GN1800, GN1800, GN1800, GN1800, GN180...","[80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 9...",[5]
3,"[하드락, 록스피릿, 댄스]",55076,당신을 하얗게 불태울 곡들,"[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25.000,1,3,55076.0,"[GN1000, GN2500, GN0300, GN1000, GN2500, GN030...","[95, 96, 97, 98, 99, 100, 101, 102, 103, 104, ...","[6, 7, 8]"
4,"[힐링, 휴식, 기분전환]",125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50.000,1,4,125064.0,"[GN1200, GN1200, GN1200, GN1200, GN1200, GN120...","[116, 117, 118, 119, 120, 121, 122, 123, 124, ...","[0, 1, 9]"


In [None]:
# Keep only the columns needed for modelling. The explicit copy makes plylst_use
# an independent frame, so adding columns no longer raises SettingWithCopyWarning.
plylst_use = plylst[['istrain','nid','updt_date','songs_id','tags_id','big_gnr_code']].copy()
plylst_use['num_songs'] = plylst_use['songs_id'].map(len)
plylst_use['num_tags'] = plylst_use['tags_id'].map(len)
# Index by nid so a playlist can be looked up by its matrix row index.
plylst_use = plylst_use.set_index('nid')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [None]:
plylst_use

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,big_gnr_code,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,2016-06-23 10:06:27.000,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3]","[GN1500, GN0600, GN0600, GN0500, GN0100, GN020...",66,4
1,1,2013-08-15 13:17:11.000,"[66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 7...",[4],"[GN1700, GN0900, GN1700, GN1400, GN1400, GN190...",14,1
2,1,2015-09-03 16:51:50.000,"[80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 9...",[5],"[GN1800, GN1800, GN1800, GN1800, GN1800, GN180...",15,1
3,1,2017-01-09 15:41:25.000,"[95, 96, 97, 98, 99, 100, 101, 102, 103, 104, ...","[6, 7, 8]","[GN1000, GN2500, GN0300, GN1000, GN2500, GN030...",21,3
4,1,2016-02-22 12:32:50.000,"[116, 117, 118, 119, 120, 121, 122, 123, 124, ...","[0, 1, 9]","[GN1200, GN1200, GN1200, GN1200, GN1200, GN120...",35,3
...,...,...,...,...,...,...,...
115066,0,2016-06-24 15:33:51.000,"[4218, 3461, 3442, 4562, 339, 18516, 45319, 23...",[],"[GN2500, GN0300, GN0300, GN2500, GN0200, GN020...",52,0
115067,0,2015-07-24 09:44:06.000,"[179155, 494519, 278615, 57372, 4489, 226672, ...","[2, 3]","[GN1600, GN1700, GN1500, GN1700, GN1000, GN170...",10,2
115068,0,2019-06-17 16:14:08.000,[],[332],,0,1
115069,0,2014-02-11 18:08:48.000,[],[38],,0,1


In [None]:
# Per-playlist {genre code: occurrence count} mapping; playlists whose genre
# list is missing (NaN, i.e. no songs) are excluded.
plylst_not_na = plylst_use[plylst_use['big_gnr_code'].notna()]
gnr_cnt_dict = plylst_not_na['big_gnr_code'].map(lambda codes: Counter(codes))

In [None]:
gnr_cnt_dict

nid
0         {'GN1500': 7, 'GN0600': 17, 'GN0500': 14, 'GN0...
1         {'GN1700': 5, 'GN0900': 1, 'GN1400': 2, 'GN190...
2                               {'GN1800': 14, 'GN1500': 1}
3         {'GN1000': 15, 'GN2500': 2, 'GN0300': 2, 'GN09...
4         {'GN1200': 24, 'GN1300': 7, 'GN0900': 1, 'GN15...
                                ...                        
115063    {'GN2600': 70, 'GN1100': 21, 'GN2700': 9, 'GN0...
115064    {'GN0100': 18, 'GN0500': 5, 'GN0600': 4, 'GN04...
115065    {'GN2600': 1, 'GN0900': 2, 'GN0100': 5, 'GN040...
115066    {'GN2500': 20, 'GN0300': 11, 'GN0200': 33, 'GN...
115067    {'GN1600': 1, 'GN1700': 6, 'GN1500': 1, 'GN100...
Name: big_gnr_code, Length: 110465, dtype: object

In [None]:
for i in gnr_cnt_dict.iloc[:4]:
  print(i.keys())

dict_keys(['GN1500', 'GN0600', 'GN0500', 'GN0100', 'GN0200', 'GN0800', 'GN0300', 'GN1000', 'GN0900', 'GN0400', 'GN2500', 'GN1700'])
dict_keys(['GN1700', 'GN0900', 'GN1400', 'GN1900', 'GN0500', 'GN0300', 'GN1300', 'GN0100'])
dict_keys(['GN1800', 'GN1500'])
dict_keys(['GN1000', 'GN2500', 'GN0300', 'GN0900', 'GN0600'])


In [None]:
gnr_cnt_dict[0].keys()

dict_keys(['GN1500', 'GN0600', 'GN0500', 'GN0100', 'GN0200', 'GN0800', 'GN0300', 'GN1000', 'GN0900', 'GN0400', 'GN2500', 'GN1700'])

In [None]:
gnr_cnt_dict[0].values()

dict_values([7, 17, 14, 21, 3, 5, 2, 6, 2, 8, 6, 1])

In [None]:
# Number of distinct genres contained in each playlist
n_gnr = gnr_cnt_dict.map(len)
n_gnr

nid
0         12
1          8
2          2
3          5
4          5
          ..
115063     5
115064    16
115065     4
115066     7
115067     5
Name: big_gnr_code, Length: 110465, dtype: int64

- 여기서부터 train/test 다시 나눔

In [None]:
# plylst was built as train rows followed by test rows, so a positional split
# at n_train recovers the two parts.
plylst_train = plylst_use.iloc[:n_train,:]
plylst_test = plylst_use.iloc[n_train:,:]

In [None]:
plylst_train

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,big_gnr_code,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,2016-06-23 10:06:27.000,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3]","[GN1500, GN0600, GN0600, GN0500, GN0100, GN020...",66,4
1,1,2013-08-15 13:17:11.000,"[66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 7...",[4],"[GN1700, GN0900, GN1700, GN1400, GN1400, GN190...",14,1
2,1,2015-09-03 16:51:50.000,"[80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 9...",[5],"[GN1800, GN1800, GN1800, GN1800, GN1800, GN180...",15,1
3,1,2017-01-09 15:41:25.000,"[95, 96, 97, 98, 99, 100, 101, 102, 103, 104, ...","[6, 7, 8]","[GN1000, GN2500, GN0300, GN1000, GN2500, GN030...",21,3
4,1,2016-02-22 12:32:50.000,"[116, 117, 118, 119, 120, 121, 122, 123, 124, ...","[0, 1, 9]","[GN1200, GN1200, GN1200, GN1200, GN1200, GN120...",35,3
...,...,...,...,...,...,...,...
92051,1,2020-01-15 15:15:45.000,"[7846, 25985, 8369, 25484, 18175, 47640, 64639...","[3155, 60, 58, 418, 23797, 1429]","[GN0100, GN0100, GN0100, GN0800, GN0800, GN080...",200,6
92052,1,2010-03-23 00:03:00.000,"[549717, 312106, 44486, 73897, 549718, 75445, ...",[4],"[GN1000, GN0900, GN0900, GN1200, GN0900, GN130...",12,1
92053,1,2019-05-15 13:26:07.000,"[61348, 16816, 1589, 83322, 8238, 152861, 1528...","[19, 18, 9, 37]","[GN0400, GN0600, GN0400, GN0100, GN0300, GN060...",99,4
92054,1,2013-12-24 14:40:01.000,"[191659, 182526, 549722, 183503, 35025, 79397,...","[159, 38]","[GN1300, GN1300, GN1300, GN1300, GN1300, GN130...",32,2


In [None]:
sum(plylst_train['big_gnr_code'].isnull()) # train data에는 장르, 곡에 결측값 없음

0

In [None]:
np.repeat(range(n_train), n_gnr[plylst_train.index])

array([    0,     0,     0, ..., 92055, 92055, 92055])

In [None]:
gnr_cnt_dict[:n_train]

nid
0        {'GN1500': 7, 'GN0600': 17, 'GN0500': 14, 'GN0...
1        {'GN1700': 5, 'GN0900': 1, 'GN1400': 2, 'GN190...
2                              {'GN1800': 14, 'GN1500': 1}
3        {'GN1000': 15, 'GN2500': 2, 'GN0300': 2, 'GN09...
4        {'GN1200': 24, 'GN1300': 7, 'GN0900': 1, 'GN15...
                               ...                        
92051    {'GN0100': 141, 'GN0800': 15, 'GN0400': 19, 'G...
92052    {'GN1000': 1, 'GN0900': 7, 'GN1200': 1, 'GN130...
92053    {'GN0400': 18, 'GN0600': 13, 'GN0100': 46, 'GN...
92054    {'GN1300': 24, 'GN1200': 4, 'GN0900': 2, 'GN11...
92055    {'GN1200': 24, 'GN1500': 2, 'GN2200': 1, 'GN09...
Name: big_gnr_code, Length: 92056, dtype: object

In [None]:
# Flat array of the distinct genre codes of every train playlist, in playlist
# order (one entry per (playlist, genre) pair).
# NOTE(review): the [:n_train] slice selects the train rows only because no
# train playlist was dropped from gnr_cnt_dict (train has no missing genres).
gnr_list_all = np.concatenate(list(map(lambda x: list(x.keys()), gnr_cnt_dict[:n_train])))
gnr_list_all

array(['GN1500', 'GN0600', 'GN0500', ..., 'GN2200', 'GN0900', 'GN1300'],
      dtype='<U32')

In [None]:
np.concatenate(list(map(lambda x: list(x.values()), gnr_cnt_dict[:n_train])))

array([ 7., 17., 14., ...,  1.,  1.,  1.])

- train data에는 있는데 장르 코드 데이터에는 없는 장르코드: GN9000 는 뭘까? 일단 그냥 포함해서 데이터 만들기

In [None]:
# Genre code -> column index, numbered in order of first appearance in the train data.
gnr_to_idx = {gnr: idx for idx, gnr in enumerate(Counter(gnr_list_all))}
gnr_to_idx

{'GN0100': 3,
 'GN0200': 4,
 'GN0300': 6,
 'GN0400': 9,
 'GN0500': 2,
 'GN0600': 1,
 'GN0700': 20,
 'GN0800': 5,
 'GN0900': 8,
 'GN1000': 7,
 'GN1100': 18,
 'GN1200': 16,
 'GN1300': 14,
 'GN1400': 12,
 'GN1500': 0,
 'GN1600': 23,
 'GN1700': 11,
 'GN1800': 15,
 'GN1900': 13,
 'GN2000': 21,
 'GN2100': 22,
 'GN2200': 25,
 'GN2300': 29,
 'GN2400': 27,
 'GN2500': 10,
 'GN2600': 17,
 'GN2700': 19,
 'GN2800': 28,
 'GN2900': 24,
 'GN9000': 26}

In [None]:
# bm25 weighting 코드 

def bm25_row(X, K1=1.2, B=0.75):
    """Re-weight the stored entries of a sparse matrix with Okapi BM25.

    Rows are treated as documents (playlists) and columns as terms (songs,
    tags or genres).

    Args:
        X: Sparse matrix (anything accepted by ``scipy.sparse.coo_matrix``).
        K1: Term-frequency saturation parameter.
        B: Strength of the row-length normalisation.

    Returns:
        CSR matrix of the same shape holding the BM25 weights.
    """
    coo = spr.coo_matrix(X)
    n_rows = float(coo.shape[0])

    # Inverse document frequency per column, from the number of stored entries.
    entries_per_col = np.bincount(coo.col)
    idf = np.log(n_rows / (1 + entries_per_col))

    # Row-length normalisation relative to the mean row sum.
    row_totals = np.ravel(coo.sum(axis=1))
    length_norm = (1.0 - B) + B * row_totals / row_totals.mean()

    # BM25 weight of every stored entry.
    tf = coo.data
    saturation = K1 * length_norm[coo.row] + tf
    coo.data = tf * (K1 + 1.0) / saturation * idf[coo.col]
    return coo.tocsr()

In [None]:
# Build the sparse matrix for the train data: rows are playlists (nid), columns
# are top-level genre codes, values are genre occurrence counts.

row = np.repeat(range(n_train), n_gnr[plylst_train.index])
col = [gnr_to_idx[gnr] for gnr in gnr_list_all]
dat = np.concatenate(list(map(lambda x: list(x.values()), gnr_cnt_dict[:n_train])))

# The column count follows the genre vocabulary instead of a hard-coded 30, so
# the matrix stays valid if the set of genres seen in train changes.
train_gnrs_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, len(gnr_to_idx)))
train_gnrs_A = bm25_row(train_gnrs_A)

In [None]:
train_gnrs_A.shape

(92056, 30)

test set에서 샘플 300개만 뽑아 테스트해봅니다.

In [None]:
# sample test: evaluate on a fixed random subset of the test playlists.
# NOTE: this rebinds `test`, which previously held the raw questions DataFrame.
np.random.seed(33)
n_sample = 300

# Draw n_sample distinct row positions from the test part.
test = plylst_test.iloc[np.random.choice(range(n_test), n_sample, replace=False),:]

# real test: uncomment to run on every test playlist instead of the sample
# test = plylst_test
# print(len(test))

row가 playlist(nid)이고 column이 item(sid or tid)인 sparse matrix A를 만듭니다.

- 각 플레이리스트(row)별로 포함하는 수록곡/태그의 컬럼에 해당하는 값이 1, 나머지는 0인 sparse matrix 생성 (2개 matrix)
- 각 sparse matrix에 bm25 weighting 적용

In [None]:
# Playlist x song matrix: one entry of 1 per (playlist, song) pair, then BM25 weighting.
row = np.repeat(np.arange(n_train), plylst_train['num_songs'])
col = [sid for sid_list in plylst_train['songs_id'] for sid in sid_list]
dat = np.ones(plylst_train['num_songs'].sum(), dtype=int)
train_songs_A = bm25_row(spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs)))

# Playlist x tag matrix, built the same way.
row = np.repeat(np.arange(n_train), plylst_train['num_tags'])
col = [tid for tid_list in plylst_train['tags_id'] for tid in tid_list]
dat = np.ones(plylst_train['num_tags'].sum(), dtype=int)
train_tags_A = bm25_row(spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags)))

In [None]:
# Item x playlist views (CSR) used to turn playlist similarities into item scores.
train_songs_A_T = train_songs_A.T.tocsr()
train_tags_A_T = train_tags_A.T.tocsr()

In [None]:
train_songs_A.data

array([ 6.45773177,  5.24387366,  4.68657376, ..., 12.22118728,
       12.57123369, 13.0645964 ])

In [None]:
train_songs_A.shape

(92056, 576169)

In [None]:
# Column-wise concatenation of the (playlist x genre index) sparse matrix and
# the (playlist x song id) sparse matrix; genre columns come first.

train_songs_gnrs_A = spr.hstack([train_gnrs_A, train_songs_A])
train_songs_gnrs_A.shape

(92056, 576199)

# 협업필터링: 유사도 계산 및 추천 결과

## 유사도를 내적으로 계산

In [None]:
test.head()

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,big_gnr_code,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
101788,0,2016-02-05 00:27:32.000,"[5738, 8029, 2110, 80206, 21932, 116312, 20924...",[],"[GN0900, GN0900, GN0900, GN0900, GN0900, GN090...",19,0
99118,0,2016-06-23 10:13:57.000,"[316087, 9145, 116469, 45328, 46344, 40948, 52...","[4893, 52, 3490, 50]","[GN1900, GN1700, GN0900, GN1000, GN1700, GN110...",9,4
103058,0,2015-09-09 13:30:26.000,"[144048, 134389, 22478, 71311, 71292, 259684, ...",[75],"[GN1800, GN1800, GN1800, GN1800, GN1800, GN180...",16,1
111522,0,2017-09-04 22:26:20.000,"[222405, 12406, 18133, 359, 73123, 4512, 7539]",[],"[GN0400, GN0100, GN0100, GN0500, GN0800, GN040...",7,0
94927,0,2008-05-09 13:31:44.000,"[153571, 166209, 265198, 161510, 553077]",[],"[GN1500, GN0100, GN1500, GN0100, GN1500, GN150...",5,0


##### pid == 101788 인 경우 테스트 코드

In [None]:
pid = test.index[0]

In [None]:
test.loc[pid,'big_gnr_code']

array(['GN0900', 'GN0900', 'GN0900', 'GN0900', 'GN0900', 'GN0900',
       'GN0900', 'GN1300', 'GN0900', 'GN0900', 'GN0900', 'GN1300',
       'GN0900', 'GN0900', 'GN0900', 'GN0900', 'GN1000', 'GN1100',
       'GN0900', 'GN1200'], dtype='<U6')

In [None]:
gnr_cnt_dict[pid]

Counter({'GN0900': 15, 'GN1000': 1, 'GN1100': 1, 'GN1200': 1, 'GN1300': 2})

In [None]:
p_gnr_col = [gnr_to_idx[gnr] for gnr in gnr_cnt_dict[pid]]
p_gnr_col

[8, 14, 7, 18, 16]

In [None]:
p_gnr_cnt = list(gnr_cnt_dict[pid].values())

In [None]:
p = np.zeros((30,1))      # zero column vector, one slot per top-level genre (30 in total)
p[p_gnr_col, 0] = p_gnr_cnt   # write each genre's occurrence count into its slot

- test data 중에서 장르와 곡이 결측값인 플레이리스트가 있음에 주의  
- 곡과 태그가 모두 없는 테스트 데이터의 플레이리스트는 어떻게 유사도를 계산하는 거지...?  
- 밑의 코드에서 곡 데이터가 없으면 모든 플레이리스트와의 유사도가 0이 되지 않나..?

### 플레이리스트에 포함된 장르만 가지고 유사도 계산

In [None]:
# 유사도: 내적 활용 
from tqdm import tqdm

def rec(pids):
  """Recommend songs and tags using genre-profile similarity only.

  For each playlist the genre-count vector is multiplied with the BM25-weighted
  train genre matrix to get one similarity per train playlist; songs and tags
  are then scored by the similarity-weighted train matrices.

  Args:
    pids: Iterable of playlist nids present in the global ``test`` frame.

  Returns:
    List of ``{"id", "songs", "tags"}`` dicts with up to 100 songs and 10 tags
    per playlist, excluding items the playlist already contains.
  """
  n_genres = train_gnrs_A.shape[1]
  res = []

  for tt, pid in enumerate(pids, start=1):
    # Genre-count column vector of the query playlist. It stays all-zero for
    # playlists without songs/genres (they are absent from gnr_cnt_dict).
    p = np.zeros((n_genres, 1))
    if pid in gnr_cnt_dict.index:
      for gnr, cnt in gnr_cnt_dict.loc[pid].items():
        gnr_col = gnr_to_idx.get(gnr)
        if gnr_col is not None:  # ignore genre codes never seen in train
          p[gnr_col] = cnt

    # Similarity between this playlist and every train playlist.
    val = train_gnrs_A.dot(p).reshape(-1)

    songs_already = test.loc[pid, "songs_id"]  # songs already in the playlist
    tags_already = test.loc[pid, "tags_id"]    # tags already in the playlist

    # Score every song, then take enough top candidates that 100 remain even
    # if every already-contained song ranks among them.
    cand_song = train_songs_A_T.dot(val)
    n_song_cand = 100 + len(songs_already)
    cand_song_idx = cand_song.reshape(-1).argsort()[-n_song_cand:][::-1]
    cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)][:100]
    rec_song_idx = [song_sid_id[i] for i in cand_song_idx]  # sid -> original song id

    # Same procedure for tags, keeping the best 10.
    cand_tag = train_tags_A_T.dot(val)
    n_tag_cand = 10 + len(tags_already)
    cand_tag_idx = cand_tag.reshape(-1).argsort()[-n_tag_cand:][::-1]
    cand_tag_idx = cand_tag_idx[~np.isin(cand_tag_idx, tags_already)][:10]
    rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

    res.append({
                "id": plylst_nid_id[pid],
                "songs": rec_song_idx,
                "tags": rec_tag_idx
            })

    if tt % 1000 == 0:  # progress output every 1000 playlists
      print(tt)

  return res

In [None]:
answers = rec(test.index)

In [None]:
answers

In [None]:
write_json(answers, "results/results.json")

In [None]:
evaluator = CustomEvaluator()
evaluator.evaluate("arena_data/answers/val.json", "arena_data/results/results.json")

Music nDCG: 0.0356492
Tag nDCG: 0.25943
Score: 0.0692164


- 장르 데이터만 가지고 유사도 계산한 결과, 태그의 score는 어느 정도 나왔지만, 곡의 score가 낮아서 전체적으로 매우 낮게 나옴  
Music nDCG: 0.0356492  
Tag nDCG: 0.25943  
Score: 0.0692164  

### 각 플레이리스트에 포함된 곡+장르 데이터를 활용하여 유사도 계산하기

In [None]:
# 유사도: 내적 활용
from tqdm import tqdm

def rec(pids):
  """Recommend songs and tags using combined genre + song similarity.

  The query vector is the playlist's genre-count vector stacked on top of its
  binary song vector; it is multiplied with the horizontally stacked train
  matrix (genre columns first, then song columns) to get one similarity per
  train playlist.

  Args:
    pids: Iterable of playlist nids present in the global ``test`` frame.

  Returns:
    List of ``{"id", "songs", "tags"}`` dicts with up to 100 songs and 10 tags
    per playlist, excluding items the playlist already contains.
  """
  n_genres = train_gnrs_A.shape[1]
  res = []

  for tt, pid in enumerate(pids, start=1):
    # Genre-count column vector; all-zero for playlists without songs/genres.
    p_gnr = np.zeros((n_genres, 1))
    if pid in gnr_cnt_dict.index:
      for gnr, cnt in gnr_cnt_dict.loc[pid].items():
        gnr_col = gnr_to_idx.get(gnr)
        if gnr_col is not None:  # ignore genre codes never seen in train
          p_gnr[gnr_col] = cnt

    songs_already = test.loc[pid, "songs_id"]  # songs already in the playlist
    tags_already = test.loc[pid, "tags_id"]    # tags already in the playlist

    # Binary song vector: 1 for every song the playlist contains.
    p_song = np.zeros((n_songs, 1))
    p_song[songs_already] = 1

    p_concat = np.concatenate((p_gnr, p_song))

    # Similarity to every train playlist based on genres and songs together.
    val = train_songs_gnrs_A.dot(p_concat).reshape(-1)

    # Score every song, then take enough top candidates that 100 remain even
    # if every already-contained song ranks among them.
    cand_song = train_songs_A_T.dot(val)
    n_song_cand = 100 + len(songs_already)
    cand_song_idx = cand_song.reshape(-1).argsort()[-n_song_cand:][::-1]
    cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)][:100]
    rec_song_idx = [song_sid_id[i] for i in cand_song_idx]  # sid -> original song id

    # Same procedure for tags, keeping the best 10.
    cand_tag = train_tags_A_T.dot(val)
    n_tag_cand = 10 + len(tags_already)
    cand_tag_idx = cand_tag.reshape(-1).argsort()[-n_tag_cand:][::-1]
    cand_tag_idx = cand_tag_idx[~np.isin(cand_tag_idx, tags_already)][:10]
    rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

    res.append({
                "id": plylst_nid_id[pid],
                "songs": rec_song_idx,
                "tags": rec_tag_idx
            })

    if tt % 1000 == 0:  # progress output every 1000 playlists
      print(tt)

  return res

In [None]:
answers = rec(test.index)

In [None]:
write_json(answers, "results/results.json")

In [None]:
evaluator = CustomEvaluator()
evaluator.evaluate("arena_data/answers/val.json", "arena_data/results/results.json")

Music nDCG: 0.0448148
Tag nDCG: 0.261376
Score: 0.077299


- 오히려 곡만 가지고 유사도 구했을 때보다 score가 더 떨어짐

### 곡 추천할 때는 곡 기반으로 유사도를 구하여 추천하고, 태그 추천할 때는 장르 기반 유사도로 추천

In [None]:
# 유사도: 내적 활용 
from tqdm import tqdm

def rec(pids):
  """Recommend songs via song similarity and tags via genre similarity.

  Two similarity vectors over the train playlists are computed per query
  playlist: one from shared songs (used to score songs) and one from the genre
  profile (used to score tags).

  Args:
    pids: Iterable of playlist nids present in the global ``test`` frame.

  Returns:
    List of ``{"id", "songs", "tags"}`` dicts with up to 100 songs and 10 tags
    per playlist, excluding items the playlist already contains.
  """
  n_genres = train_gnrs_A.shape[1]
  res = []

  for tt, pid in enumerate(pids, start=1):
    # Genre-count column vector; all-zero for playlists without songs/genres.
    p_gnr = np.zeros((n_genres, 1))
    if pid in gnr_cnt_dict.index:
      for gnr, cnt in gnr_cnt_dict.loc[pid].items():
        gnr_col = gnr_to_idx.get(gnr)
        if gnr_col is not None:  # ignore genre codes never seen in train
          p_gnr[gnr_col] = cnt

    songs_already = test.loc[pid, "songs_id"]  # songs already in the playlist
    tags_already = test.loc[pid, "tags_id"]    # tags already in the playlist

    # Binary song vector: 1 for every song the playlist contains.
    p_song = np.zeros((n_songs, 1))
    p_song[songs_already] = 1

    # Playlist-to-playlist similarities: genre based and song based.
    val_gnr = train_gnrs_A.dot(p_gnr).reshape(-1)
    val_song = train_songs_A.dot(p_song).reshape(-1)

    # Songs are scored with the song-based similarity. Take enough top
    # candidates that 100 remain even if every already-contained song ranks
    # among them.
    cand_song = train_songs_A_T.dot(val_song)
    n_song_cand = 100 + len(songs_already)
    cand_song_idx = cand_song.reshape(-1).argsort()[-n_song_cand:][::-1]
    cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)][:100]
    rec_song_idx = [song_sid_id[i] for i in cand_song_idx]  # sid -> original song id

    # Tags are scored with the genre-based similarity, keeping the best 10.
    cand_tag = train_tags_A_T.dot(val_gnr)
    n_tag_cand = 10 + len(tags_already)
    cand_tag_idx = cand_tag.reshape(-1).argsort()[-n_tag_cand:][::-1]
    cand_tag_idx = cand_tag_idx[~np.isin(cand_tag_idx, tags_already)][:10]
    rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

    res.append({
                "id": plylst_nid_id[pid],
                "songs": rec_song_idx,
                "tags": rec_tag_idx
            })

    if tt % 1000 == 0:  # progress output every 1000 playlists
      print(tt)

  return res

In [None]:
answers = rec(test.index)

In [None]:
write_json(answers, "results/results.json")

In [None]:
evaluator = CustomEvaluator()
evaluator.evaluate("arena_data/answers/val.json", "arena_data/results/results.json")

Music nDCG: 0.202075
Tag nDCG: 0.25943
Score: 0.210678
