In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve (requires Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Setup

In [None]:
# arena_util.py
# -*- coding: utf-8 -*-

import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np


def write_json(data, fname):
    """Write ``data`` as UTF-8 JSON to ``./arena_data/<fname>``.

    Missing parent directories are created. The payload is serialized before
    the file is opened, so a serialization error does not leave a truncated file.

    Args:
        data: JSON-serializable object. NumPy integer and floating scalars are
            converted to plain Python numbers.
        fname: Output path relative to ``./arena_data/`` (e.g. ``"orig/train.json"``).

    Raises:
        TypeError: If ``data`` contains a value that cannot be encoded as JSON.
    """
    def _conv(o):
        # json calls this only for objects it cannot encode natively.
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")

    target = "./arena_data/" + fname
    json_str = json.dumps(data, ensure_ascii=False, default=_conv)

    # os.makedirs replaces distutils.dir_util.mkpath (distutils was removed in Python 3.12).
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with io.open(target, "w", encoding="utf8") as f:
        f.write(json_str)


def load_json(fname):
    """Read the UTF-8 encoded JSON file at ``fname`` and return the decoded object."""
    with open(fname, encoding='utf8') as fp:
        return json.load(fp)


def debug_json(r):
    """Pretty-print ``r`` as JSON with 4-space indentation, keeping non-ASCII text readable."""
    rendered = json.dumps(r, indent=4, ensure_ascii=False)
    print(rendered)



### Custom evaluation (weak)

In [None]:
# evaluate.py
# -*- coding: utf-8 -*-
# import fire
import numpy as np

# from arena_util import load_json


class CustomEvaluator:
    """Score a recommendation file against ground truth using nDCG.

    Song nDCG is computed on the first 100 recommended songs and tag nDCG on
    the first 10 recommended tags; the final score is
    ``0.85 * song_ndcg + 0.15 * tag_ndcg``.
    """

    def _idcg(self, l):
        """Return the ideal DCG when ``l`` relevant items occupy the top ``l`` ranks."""
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        # Cache ideal DCG values for 0..100 relevant items; _ndcg indexes this
        # list by len(gt), so a ground truth longer than 100 raises IndexError.
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        """Return the nDCG of the ranked list ``rec`` against ground truth ``gt``.

        Args:
            gt: Collection of relevant (hashable) items.
            rec: Recommended items in rank order.

        Returns:
            DCG of ``rec`` divided by the ideal DCG for ``len(gt)`` items, or
            ``0.0`` when ``gt`` is empty (the ideal DCG would be zero).
        """
        if len(gt) == 0:
            # Nothing can be hit; avoid dividing by the zero ideal DCG.
            return 0.0

        relevant = set(gt)  # O(1) membership instead of scanning the list per item
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in relevant:
                dcg += 1.0 / np.log(i + 2)

        return dcg / self._idcgs[len(gt)]

    def _eval(self, gt_fname, rec_fname):
        """Return ``(music_ndcg, tag_ndcg, score)`` averaged over the playlists in ``rec_fname``.

        Every recommended playlist id must exist in the ground-truth file,
        otherwise a ``KeyError`` is raised.
        """
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        """Print song nDCG, tag nDCG and the combined score.

        Errors are reported on stdout instead of being raised so the notebook
        keeps running; the exception type is included because a bare message
        (e.g. a missing id from a ``KeyError``) is otherwise hard to interpret.
        """
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(f"Evaluation failed: {type(e).__name__}: {e}")


# if __name__ == "__main__":
#     fire.Fire(ArenaEvaluator)


In [None]:
from collections import Counter

import numpy as np
import pandas as pd

import scipy.sparse as spr
import pickle

- train/val split

In [None]:
# -*- coding: utf-8 -*-
import copy
import random
import numpy as np


class ArenaSplitter:
    """Split playlists into train/validation sets and build masked validation questions."""

    def _split_data(self, playlists):
        """Return ``(train, val)``: the first 80% of ``playlists`` and the remaining 20%."""
        cut = int(len(playlists) * 0.80)
        return playlists[:cut], playlists[cut:]

    def _mask(self, playlists, mask_cols, del_cols):
        """Build question/answer copies of ``playlists``.

        Args:
            playlists: List of playlist dicts; not modified.
            mask_cols: Columns to split randomly: half of the items (rounded
                down) go to the question, the rest to the answer.
            del_cols: Columns emptied in the question. In the answer, a deleted
                ``'songs'`` column keeps its first 100 items and a deleted
                ``'tags'`` column its first 10.

        Returns:
            ``(questions, answers)`` as two lists of playlist dicts holding
            plain Python values (no NumPy scalars).
        """
        q_pl = copy.deepcopy(playlists)
        a_pl = copy.deepcopy(playlists)

        for i in range(len(playlists)):
            for del_col in del_cols:
                q_pl[i][del_col] = []
                if del_col == 'songs':
                    a_pl[i][del_col] = a_pl[i][del_col][:100]
                elif del_col == 'tags':
                    a_pl[i][del_col] = a_pl[i][del_col][:10]

            for col in mask_cols:
                items = playlists[i][col]
                mask = np.full(len(items), False)
                mask[:len(items) // 2] = True
                np.random.shuffle(mask)

                # Select with plain list comprehensions rather than NumPy fancy
                # indexing so items keep their original Python types.
                q_pl[i][col] = [item for item, shown in zip(items, mask) if shown]
                a_pl[i][col] = [item for item, shown in zip(items, mask) if not shown]

        return q_pl, a_pl

    def _mask_data(self, playlists):
        """Mask ``playlists`` into four question types and return shuffled ``(q, a)``.

        By position: the first 30% keep only (half of) their songs, the next
        50% keep half of their songs and tags, the next 15% keep only (half of)
        their tags, and the last 5% keep neither songs nor tags.
        """
        # _mask deep-copies its input, so the slices can be passed directly.
        tot = len(playlists)
        song_only = playlists[:int(tot * 0.3)]
        song_and_tags = playlists[int(tot * 0.3):int(tot * 0.8)]
        tags_only = playlists[int(tot * 0.8):int(tot * 0.95)]
        title_only = playlists[int(tot * 0.95):]

        print(f"Total: {len(playlists)}, "
              f"Song only: {len(song_only)}, "
              f"Song & Tags: {len(song_and_tags)}, "
              f"Tags only: {len(tags_only)}, "
              f"Title only: {len(title_only)}")

        song_q, song_a = self._mask(song_only, ['songs'], ['tags'])
        songtag_q, songtag_a = self._mask(song_and_tags, ['songs', 'tags'], [])
        tag_q, tag_a = self._mask(tags_only, ['tags'], ['songs'])
        title_q, title_a = self._mask(title_only, [], ['songs', 'tags'])

        q = song_q + songtag_q + tag_q + title_q
        a = song_a + songtag_a + tag_a + title_a

        # Apply the same permutation to questions and answers so they stay aligned.
        shuffle_indices = np.arange(len(q))
        np.random.shuffle(shuffle_indices)

        q = [q[idx] for idx in shuffle_indices]
        a = [a[idx] for idx in shuffle_indices]

        return q, a

    def run(self, fname):
        """Read playlists from ``fname`` and write the split and masked files via ``write_json``.

        Writes ``orig/train.json``, ``orig/val.json``, ``questions/val.json``
        and ``answers/val.json``.
        """
        # Seed both generators: the playlist shuffle uses `random`, while the
        # masking in _mask/_mask_data uses `np.random`. Seeding only `random`
        # made the masked validation set differ from run to run.
        random.seed(777)
        np.random.seed(777)

        print("Reading data...\n")
        playlists = load_json(fname)
        random.shuffle(playlists)
        print(f"Total playlists: {len(playlists)}")

        print("Splitting data...")
        train, val = self._split_data(playlists)

        print("Original train...")
        write_json(train, "orig/train.json")
        print("Original val...")
        write_json(val, "orig/val.json")

        print("Masked val...")
        val_q, val_a = self._mask_data(val)
        write_json(val_q, "questions/val.json")
        write_json(val_a, "answers/val.json")

In [None]:
# Build ./arena_data/{orig,questions,answers} from the train.json stored on the mounted Drive.
# The input path is specific to the author's Drive layout; adjust it when running elsewhere.
split = ArenaSplitter()
split.run("/content/drive/MyDrive/KUBIG 2021-2/추천시스템 프로젝트/멜론데이터/train.json")

Reading data...

Total playlists: 115071
Splitting data...
Original train...
Original val...
Masked val...
Total: 23015, Song only: 6904, Song & Tags: 11508, Tags only: 3452, Title only: 1151


In [None]:
# Load the song metadata plus the train split and the masked validation questions written by ArenaSplitter.
# Note: `test` here is the masked validation set (questions/val.json).
song_meta = pd.read_json("/content/drive/MyDrive/KUBIG 2021-2/추천시스템 프로젝트/멜론데이터/song_meta.json")
train = pd.read_json("/content/arena_data/orig/train.json")
test = pd.read_json("/content/arena_data/questions/val.json")

playlist, song, tag의 id(각각 nid, sid, tid)를 새로 생성하는 이유는, 새로 생성할 id를 matrix의 row, column index로 사용할 것이기 때문입니다.

- plylst_id_nid : playlist id -> nid
- plylst_nid_id : playlist nid -> id
- song_id_sid : song id -> sid
- song_sid_id : song sid -> id
- tag_id_tid : tag id -> tid
- tag_tid_id : tag tid -> id
- song_dict : song id -> count
- tag_dict : tag id -> count

In [None]:
song_meta

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2
3,"[GN1102, GN1101]",20151016,Feeling Right (Everything Is Nice) (Feat. Popc...,2644882,[838543],Feeling Right (Everything Is Nice) (Feat. Popc...,[GN1100],[Matoma],3
4,"[GN1802, GN1801]",20110824,그남자 그여자,2008470,[560160],그남자 그여자,[GN1800],[Jude Law],4
...,...,...,...,...,...,...,...,...,...
707984,[GN2001],19991219,The Best Best Of The Black President,65254,[166499],Coffin For Head Of State,[GN2000],[Fela Kuti],707984
707985,[GN0901],19860000,True Colors,44141,[11837],Change Of Heart,[GN0900],[Cyndi Lauper],707985
707986,"[GN0105, GN0101]",20160120,행보 2015 윤종신 / 작사가 윤종신 Live Part.1,2662866,[437],스치듯 안녕,[GN0100],[윤종신],707986
707987,"[GN1807, GN1801]",20131217,명상의 시간을 위한 뉴에이지 음악,2221722,[729868],숲의 빛,[GN1800],[Nature Piano],707987


In [None]:
train

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[힐링, 휴식, 밤, 새벽]",147668,To. 힘들고 지친 분들에게,"[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27.000
1,[팝],50422,130807-7,"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11.000
2,[뉴에이지],116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50.000
3,"[하드락, 록스피릿, 댄스]",55076,당신을 하얗게 불태울 곡들,"[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25.000
4,"[힐링, 휴식, 기분전환]",125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50.000
...,...,...,...,...,...,...
92051,"[90년생, 회상, 추억, 좋은노래, 80년생, 옛날노래]",149690,옛날노래 * 좋은노래 8090년생 노래 모음,"[292099, 513963, 174225, 287212, 140444, 62469...",155,2020-01-15 15:15:45.000
92052,[팝],35004,LOVE 1,"[62596, 359718, 596004, 668790, 291212, 148977...",8,2010-03-23 00:03:00.000
92053,"[여행, 발라드, 기분전환, 사랑]",59765,추억의 2004년 발라드 베스트,"[214372, 145150, 407082, 160552, 102445, 50845...",3,2019-05-15 13:26:07.000
92054,"[소울, 알앤비]",9867,All Music Guide 선정 90s R&B: 1997,"[561958, 397574, 250915, 110345, 426772, 10698...",51,2013-12-24 14:40:01.000


In [None]:
test

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[듣기좋은, 재즈추천, 재즈음악, 분위기좋은재즈]",120361,재즈가 알고싶은 분들을 위한 쉽고 듣기 좋은 재즈곡 모음,"[640591, 481141, 489042, 13991, 389776, 411147...",8,2020-03-16 10:30:47.000
1,[사랑],72736,여자가 들으면 설레는 달콤한 노래,"[483384, 114686, 532888, 634908, 134855, 27541...",5,2016-07-06 16:19:54.000
2,[],137352,리얼 카페 뮤직 vol.1,"[650557, 402769, 596045, 530193, 131923, 64837...",15,2014-06-26 15:02:32.000
3,"[스윙노래, swing, 스윙, 스윙음악]",132990,[스윙재즈] 신나는 패스트 스윙음악_#1,"[692618, 288502, 149966, 536896]",34,2019-04-29 22:54:50.000
4,[],46882,완전 소중한 인디 음악,"[556069, 560903, 90244, 105637, 38832, 395669,...",11,2015-09-15 15:54:43.000
...,...,...,...,...,...,...
23010,"[일어나세요, 아침, 상큼하게, 일상시작]",52552,우리 다같이 아침을 상큼하게 시작해봐요 : ),"[33130, 638133, 87119, 674998, 536793, 530267,...",9,2017-08-17 00:25:12.000
23011,[잔잔한],55298,잠 안오는 밤에 듣는 잔잔한 음악4,"[668321, 612907, 78766, 694755, 162521, 323212...",5,2015-09-20 23:29:39.000
23012,"[국외뮤지컬, 고음, 스트레스해소]",54148,막힌 도로 뻥! 가창력 소오름 돋는 뮤지컬 넘버,"[79037, 31596, 609629, 454236, 55192, 153662, ...",62,2019-03-15 14:57:49.000
23013,[],122565,Jazz-경쾌함_090827,[],17,2009-08-27 10:36:49.000


In [None]:
train.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[힐링, 휴식, 밤, 새벽]",147668,To. 힘들고 지친 분들에게,"[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27.000
1,[팝],50422,130807-7,"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11.000
2,[뉴에이지],116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50.000
3,"[하드락, 록스피릿, 댄스]",55076,당신을 하얗게 불태울 곡들,"[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25.000
4,"[힐링, 휴식, 기분전환]",125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50.000


In [None]:
# Whitespace-split each title, prepend the playlist's tags, and join everything
# into one space-separated string per playlist (input text for TF-IDF).
title_tokens = train['plylst_title'].map(str.split)
train['title_tag_token'] = (train['tags'] + title_tokens).map(" ".join)
train = train.drop(columns=['plylst_title', 'like_cnt', 'updt_date'])
train.head()

Unnamed: 0,tags,id,songs,title_tag_token
0,"[힐링, 휴식, 밤, 새벽]",147668,"[663185, 649626, 6855, 188486, 348451, 169945,...",힐링 휴식 밤 새벽 To. 힘들고 지친 분들에게
1,[팝],50422,"[627035, 256438, 603324, 200889, 441319, 21689...",팝 130807-7
2,[뉴에이지],116432,"[129204, 369497, 649743, 344619, 110281, 63266...",뉴에이지 숙면을 위한 슬픈 마음을 달래 줄 피아노
3,"[하드락, 록스피릿, 댄스]",55076,"[677591, 420396, 104934, 119279, 251988, 58850...",하드락 록스피릿 댄스 당신을 하얗게 불태울 곡들
4,"[힐링, 휴식, 기분전환]",125064,"[704455, 694036, 508043, 154933, 57614, 645195...",힐링 휴식 기분전환 [스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙...


In [None]:
# Same text preparation as for train: tags + whitespace-split title tokens,
# joined into one space-separated string per playlist.
title_tokens = test['plylst_title'].map(str.split)
test['title_tag_token'] = (test['tags'] + title_tokens).map(" ".join)
test = test.drop(columns=['plylst_title', 'like_cnt', 'updt_date'])
test.head()

Unnamed: 0,tags,id,songs,title_tag_token
0,[],8241,[],"가을, 해질녘, 산책길의 BGM"
1,"[electronica, Techno]",10045,"[190813, 528117, 674944, 461713, 294436, 598660]","electronica Techno [장르] 테크노 : 뇌, 심장, 그리고 신경을 자..."
2,[],73777,[],My Best Rock 2
3,[],85239,"[157900, 105231, 121014, 585388, 530886, 82893...",카페에서 듣기 좋은 음악들 모음!
4,[],36107,"[47106, 705134, 297064, 406388, 12328, 466529,...",연애하는 느낌드는 달달한 노래


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from train only, then project test into
# the SAME feature space. Calling fit_transform on test would refit the
# vectorizer, so column j would mean a different term in each matrix and any
# train-vs-test similarity would be meaningless.
lv = TfidfVectorizer(max_features=500)
train_tfidf_matrix = lv.fit_transform(train['title_tag_token'])
test_tfidf_matrix = lv.transform(test['title_tag_token'])

In [None]:
train_tfidf_matrix.shape

(92056, 500)

In [None]:
test_tfidf_matrix.shape

(23015, 500)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity of every train playlist's TF-IDF row against the first test playlist only.
similarity = cosine_similarity(train_tfidf_matrix, test_tfidf_matrix[0])

In [None]:
similarity 

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [None]:
len(similarity)

92056

In [None]:
test_tfidf_matrix[0]

<1x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

### 플레이리스트 제목 전처리 및 Word2Vec 임베딩(테스트)

In [None]:
from konlpy.tag import Okt
okt = Okt()

In [None]:
train.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[힐링, 휴식, 밤, 새벽]",147668,To. 힘들고 지친 분들에게,"[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27.000
1,[팝],50422,130807-7,"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11.000
2,[뉴에이지],116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50.000
3,"[하드락, 록스피릿, 댄스]",55076,당신을 하얗게 불태울 곡들,"[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25.000
4,"[힐링, 휴식, 기분전환]",125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50.000


In [None]:
# Tokenize the training titles with Okt and drop tokens whose POS tag marks a
# function word or punctuation (used here as stopword removal).
X_train = [
    [token for token, pos in okt.pos(str(title))
     if pos not in ['Josa', 'Eomi', 'Punctuation', 'KoreanParticle', 'Suffix']]
    for title in train['plylst_title']
]

train['tokenized_title'] = X_train

- mecab으로 토큰화하는 게 계산 속도는 빠른데, 문제는 품사 태그가 여러 개 붙어서 나오기 때문에 불용어 제거를 어떻게 해야 할지 고민
- 일단 okt로 토큰화하고 품사 태그로 불용어 제거함

In [None]:
train.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,tokenized_title
0,"[힐링, 휴식, 밤, 새벽]",147668,To. 힘들고 지친 분들에게,"[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27.000,"[To, 힘들고, 지친, 분]"
1,[팝],50422,130807-7,"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11.000,[130807-7]
2,[뉴에이지],116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50.000,"[숙면, 위, 슬픈, 마음, 달래, 줄, 피아노]"
3,"[하드락, 록스피릿, 댄스]",55076,당신을 하얗게 불태울 곡들,"[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25.000,"[당신, 하얗게, 불, 태울, 곡]"
4,"[힐링, 휴식, 기분전환]",125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50.000,"[스피커, 필수, HIPHOP, 듣고, 있음, 꿀렁꿀렁, 싶은, 힙합, 음악]"


In [None]:
# Concatenate each playlist's tags with its Okt title tokens, then keep only the modelling columns.
train = train.assign(
    title_tag_token=train['tags'] + train['tokenized_title']
).drop(columns=['plylst_title', 'like_cnt', 'updt_date', 'tokenized_title'])
train.head()

Unnamed: 0,tags,id,songs,title_tag_token
0,"[힐링, 휴식, 밤, 새벽]",147668,"[663185, 649626, 6855, 188486, 348451, 169945,...","[힐링, 휴식, 밤, 새벽, To, 힘들고, 지친, 분]"
1,[팝],50422,"[627035, 256438, 603324, 200889, 441319, 21689...","[팝, 130807-7]"
2,[뉴에이지],116432,"[129204, 369497, 649743, 344619, 110281, 63266...","[뉴에이지, 숙면, 위, 슬픈, 마음, 달래, 줄, 피아노]"
3,"[하드락, 록스피릿, 댄스]",55076,"[677591, 420396, 104934, 119279, 251988, 58850...","[하드락, 록스피릿, 댄스, 당신, 하얗게, 불, 태울, 곡]"
4,"[힐링, 휴식, 기분전환]",125064,"[704455, 694036, 508043, 154933, 57614, 645195...","[힐링, 휴식, 기분전환, 스피커, 필수, HIPHOP, 듣고, 있음, 꿀렁꿀렁, ..."


In [None]:
from gensim.models import Word2Vec

In [None]:
# Train a CBOW (sg=0) Word2Vec model on the tag + title token lists; min_count=1 keeps every token.
# `size=` is the gensim 3.x keyword (gensim 4 renamed it to `vector_size`).
word2vec_model = Word2Vec(train['title_tag_token'], size=100, window=5, min_count=1, workers=4, sg=0)

In [None]:
def vectors(document_list, model=None):
    """Embed each document as the mean of its in-vocabulary word vectors.

    Args:
        document_list: Iterable of token lists.
        model: Object exposing ``wv`` with ``token in wv`` and ``wv[token]``
            (e.g. a gensim ``Word2Vec`` model). Defaults to the notebook-level
            ``word2vec_model``.

    Returns:
        List of mean vectors, one per document that contains at least one
        known token. Documents with no known token are skipped, so the result
        can be shorter than ``document_list`` and is then no longer aligned
        with it by position.
    """
    if model is None:
        model = word2vec_model
    # Go through `wv` for both lookup and membership: indexing the model
    # directly is deprecated in gensim 3.x and `wv.vocab` is gone in gensim 4.
    keyed_vectors = model.wv

    document_embedding_list = []

    for line in document_list:
        doc2vec = None
        count = 0
        for word in line:
            if word in keyed_vectors:
                count += 1
                # Accumulate the sum of all known word vectors of this document.
                if doc2vec is None:
                    doc2vec = keyed_vectors[word]
                else:
                    doc2vec = doc2vec + keyed_vectors[word]

        if doc2vec is not None:
            # Divide the summed vector by the number of known words.
            document_embedding_list.append(doc2vec / count)

    return document_embedding_list

In [None]:
document_embedding_list = vectors(train['title_tag_token'])
print('문서 벡터의 수 :',len(document_embedding_list))

  del sys.path[0]
  from ipykernel import kernelapp as app


문서 벡터의 수 : 92056


In [None]:
word2vec_model.wv.vectors.shape

(54934, 100)

In [None]:
test.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[듣기좋은, 재즈추천, 재즈음악, 분위기좋은재즈]",120361,재즈가 알고싶은 분들을 위한 쉽고 듣기 좋은 재즈곡 모음,"[640591, 481141, 489042, 13991, 389776, 411147...",8,2020-03-16 10:30:47.000
1,[사랑],72736,여자가 들으면 설레는 달콤한 노래,"[483384, 114686, 532888, 634908, 134855, 27541...",5,2016-07-06 16:19:54.000
2,[],137352,리얼 카페 뮤직 vol.1,"[650557, 402769, 596045, 530193, 131923, 64837...",15,2014-06-26 15:02:32.000
3,"[스윙노래, swing, 스윙, 스윙음악]",132990,[스윙재즈] 신나는 패스트 스윙음악_#1,"[692618, 288502, 149966, 536896]",34,2019-04-29 22:54:50.000
4,[],46882,완전 소중한 인디 음악,"[556069, 560903, 90244, 105637, 38832, 395669,...",11,2015-09-15 15:54:43.000


In [None]:
# Tokenize the test titles with Okt and drop tokens whose POS tag marks a
# function word or punctuation (used here as stopword removal).
X_test = [
    [token for token, pos in okt.pos(str(title))
     if pos not in ['Josa', 'Eomi', 'Punctuation', 'KoreanParticle', 'Suffix']]
    for title in test['plylst_title']
]

test['tokenized_title'] = X_test

In [None]:
# Concatenate each playlist's tags with its Okt title tokens, then keep only the modelling columns.
test = test.assign(
    title_tag_token=test['tags'] + test['tokenized_title']
).drop(columns=['plylst_title', 'like_cnt', 'updt_date', 'tokenized_title'])
test.head()

Unnamed: 0,tags,id,songs,title_tag_token
0,"[듣기좋은, 재즈추천, 재즈음악, 분위기좋은재즈]",120361,"[640591, 481141, 489042, 13991, 389776, 411147...","[듣기좋은, 재즈추천, 재즈음악, 분위기좋은재즈, 재즈, 알고싶은, 분, 위, 쉽고..."
1,[사랑],72736,"[483384, 114686, 532888, 634908, 134855, 27541...","[사랑, 여자, 들으면, 설레는, 달콤한, 노래]"
2,[],137352,"[650557, 402769, 596045, 530193, 131923, 64837...","[리얼, 카페, 뮤직, vol, 1]"
3,"[스윙노래, swing, 스윙, 스윙음악]",132990,"[692618, 288502, 149966, 536896]","[스윙노래, swing, 스윙, 스윙음악, 스윙재즈, 신나는, 패스트, 스윙음악, 1]"
4,[],46882,"[556069, 560903, 90244, 105637, 38832, 395669,...","[완전, 소중한, 인디, 음악]"


In [None]:
# NOTE(review): this trains a second, independent model on the test tokens; its vectors are not in
# the same space as word2vec_model's, so the two cannot be compared directly - confirm this is intended.
word2vec_model_test = Word2Vec(test['title_tag_token'], size=100, window=5, min_count=1, workers=4, sg=0)

- 아레나 노트북: playlist embedding using word2vec 코드 참고!
- 제목+태그 토큰 활용해서 플레이리스트 임베딩하기

### most_popular.py 코드 돌리기(베이스 라인)

In [None]:
def most_popular(playlists, col, topk_count):
    """Count how often each item of column ``col`` occurs across ``playlists``.

    Returns:
        ``(counter, top_items)`` where ``counter`` is the full ``Counter`` and
        ``top_items`` lists the ``topk_count`` most frequent items, most
        frequent first.
    """
    counts = Counter()
    for playlist in playlists:
        counts.update(playlist[col])

    ranked = [item for item, _ in counts.most_common(topk_count)]
    return counts, ranked

In [None]:
def remove_seen(seen, l):
    """Return the items of ``l`` that do not occur in ``seen``, preserving their order."""
    blocked = set(seen)
    return list(filter(lambda item: item not in blocked, l))

In [None]:
from tqdm import tqdm

class MostPopular:
    """Baseline recommender: suggest the globally most frequent songs and tags."""

    def _generate_answers(self, train, questions):
        """Return one answer dict per question.

        Each answer holds up to 100 of the 200 most common training songs and
        up to 10 of the 100 most common training tags, excluding items the
        question playlist already contains.
        """
        song_mp = most_popular(train, "songs", 200)[1]
        tag_mp = most_popular(train, "tags", 100)[1]

        return [
            {
                "id": question["id"],
                "songs": remove_seen(question["songs"], song_mp)[:100],
                "tags": remove_seen(question["tags"], tag_mp)[:10],
            }
            for question in tqdm(questions)
        ]

    def run(self, train_fname, question_fname):
        """Load both files, build the answers and write them to ``results/results_most_pop.json``."""
        print("Loading train file...")
        train_playlists = load_json(train_fname)

        print("Loading question file...")
        question_playlists = load_json(question_fname)

        print("Writing answers...")
        write_json(
            self._generate_answers(train_playlists, question_playlists),
            "results/results_most_pop.json",
        )

In [None]:
train_fname = "/content/arena_data/orig/train.json"
question_fname = "/content/arena_data/questions/val.json"

mostpop = MostPopular()
mostpop.run(train_fname, question_fname)

Loading train file...
Loading question file...
Writing answers...


100%|██████████| 23015/23015 [00:00<00:00, 34625.77it/s]


### Word2Vec 활용하여 플레이리스트 임베딩 

In [None]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.1 MB/s 
[?25hCollecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.8 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 43.2 MB/s 
[?25hCollecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: JPype1, colorama, beautifulsoup4, konlpy
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed JPype1-1.3.0 beautifulsoup4-4.6.0 colorama-0.4.4 konlpy-0.5.2


In [None]:
import os
import json

import pandas as pd

from tqdm import tqdm
from gensim.models import Word2Vec
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors

from konlpy.tag import Okt
okt = Okt()

- 아래 코드에서 self.most_results는 카카오아레나 베이스라인 코드의 결과 파일을 나타냄

In [None]:
class PlaylistEmbedding:
    """Recommend songs and tags from the nearest playlists in a Word2Vec-based space.

    Each playlist is treated as a "sentence" made of its song ids plus its tag
    and title tokens. A playlist vector is the sum of the Word2Vec vectors of
    those tokens, and recommendations are the most frequent songs/tags among
    the 200 most similar playlists. Playlists without a vector fall back to
    ``most_results`` (the most-popular baseline output).
    """

    # Okt POS tags dropped when tokenizing titles (used as stopword removal).
    _EXCLUDED_POS = ('Josa', 'Eomi', 'Punctuation', 'KoreanParticle', 'Suffix')

    def __init__(self, FILE_PATH):
        """Load the train split, validation questions and baseline results from ``FILE_PATH``."""
        self.FILE_PATH = FILE_PATH
        self.min_count = 3
        self.size = 100
        # Presumably chosen to span a whole playlist "sentence" - TODO confirm.
        self.window = 210
        # 1 = skip-gram. gensim documents only 0 and 1; the previous value 5
        # selected skip-gram only because it is truthy.
        self.sg = 1
        self.p2v_model = WordEmbeddingsKeyedVectors(self.size)
        with open(os.path.join(FILE_PATH, 'orig/train.json'), encoding="utf-8") as f:
            self.train = json.load(f)
        with open(os.path.join(FILE_PATH, 'questions/val.json'), encoding="utf-8") as f:
            self.val = json.load(f)
        with open(os.path.join(FILE_PATH, 'results/results_most_pop.json'), encoding="utf-8") as f:
            self.most_results = json.load(f)

    def _tokenize_title(self, title):
        """Return the Okt tokens of ``title`` whose POS tag is not in ``_EXCLUDED_POS``."""
        return [token for token, pos in okt.pos(str(title)) if pos not in self._EXCLUDED_POS]

    def get_dic(self, train, val):
        """Tokenize titles and build the lookup tables and the Word2Vec corpus.

        Adds ``tokenized_title`` and ``title_tag_token`` to every playlist dict
        (in place) and sets:

        * ``self.song_dic``: playlist id (str) -> songs
        * ``self.tag_dic``: playlist id (str) -> tags
        * ``self.total``: one token list per playlist (song ids as str + tag and
          title tokens), keeping only lists with more than one token.
        """
        song_dic = {}
        tag_dic = {}
        data = train + val

        for q in tqdm(data):
            # Title tokenization / preprocessing.
            q['tokenized_title'] = self._tokenize_title(q['plylst_title'])
            q['title_tag_token'] = list(q['tags']) + list(q['tokenized_title'])

            song_dic[str(q['id'])] = q['songs']
            # Only real tags are candidates for tag recommendation. Title tokens
            # still feed the embedding through `title_tag_token`, but recommending
            # them as tags can never match the ground-truth tags.
            tag_dic[str(q['id'])] = q['tags']

        self.song_dic = song_dic
        self.tag_dic = tag_dic

        total = [list(map(str, q['songs'])) + list(q['title_tag_token']) for q in data]
        self.total = [tokens for tokens in total if len(tokens) > 1]

    def get_w2v(self, total, min_count, size, window, sg):
        """Train Word2Vec on ``total`` and store the model in ``self.w2v_model``."""
        w2v_model = Word2Vec(total, min_count=min_count, size=size, window=window, sg=sg)
        self.w2v_model = w2v_model

    def update_p2v(self, train, val, w2v_model):
        """Add one vector per playlist to ``self.p2v_model``.

        A playlist vector is the sum of the vectors of its songs and
        ``title_tag_token`` entries that exist in ``w2v_model``. Playlists with
        no songs, or with no known token, get no vector.
        """
        ID = []
        vec = []
        for q in tqdm(train + val):
            if len(q['songs']) < 1:
                continue

            playlist_vec = None
            for token in q['songs'] + q['title_tag_token']:
                try:
                    token_vec = w2v_model.wv.get_vector(str(token))
                except KeyError:
                    # Token was dropped by min_count or never seen; skip it.
                    continue
                if playlist_vec is None:
                    playlist_vec = token_vec.copy()
                else:
                    playlist_vec = playlist_vec + token_vec

            if playlist_vec is not None:
                ID.append(str(q['id']))
                vec.append(playlist_vec)

        if ID:
            self.p2v_model.add(ID, vec)

    def get_result(self, p2v_model, song_dic, tag_dic, most_results, val):
        """Build ``self.answers``: 100 songs and 10 tags for every playlist in ``val``.

        ``most_results`` must be aligned with ``val`` by position; it provides
        the fallback answer and pads answers that come out short.
        """
        answers = []
        for n, q in tqdm(enumerate(val), total=len(val)):
            try:
                neighbours = p2v_model.most_similar(str(q['id']), topn=200)
            except KeyError:
                # No vector for this playlist: use the baseline answer. Copy the
                # lists so the padding step below cannot mutate `most_results`.
                answers.append({
                    "id": most_results[n]["id"],
                    "songs": list(most_results[n]['songs']),
                    "tags": list(most_results[n]["tags"]),
                })
                continue

            get_song = []
            get_tag = []
            for ID, _similarity in neighbours:
                get_song += song_dic[ID]
                get_tag += tag_dic[ID]

            # Rank candidates by how many neighbouring playlists contain them.
            get_song = list(pd.Series(get_song).value_counts().index[:200])
            get_tag = list(pd.Series(get_tag).value_counts().index[:20])
            answers.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], get_song)[:100],
                "tags": remove_seen(q["tags"], get_tag)[:10],
            })

        # Pad short answers with baseline items that are not already present.
        for n, ans in enumerate(answers):
            if len(ans['songs']) != 100:
                ans['songs'] += remove_seen(ans['songs'], most_results[n]['songs'])[:100 - len(ans['songs'])]
            if len(ans['tags']) != 10:
                ans['tags'] += remove_seen(ans['tags'], most_results[n]['tags'])[:10 - len(ans['tags'])]
        self.answers = answers

    def run(self):
        """Run the full pipeline and write the answers to ``results/results.json``."""
        self.get_dic(self.train, self.val)
        self.get_w2v(self.total, self.min_count, self.size, self.window, self.sg)
        self.update_p2v(self.train, self.val, self.w2v_model)
        self.get_result(self.p2v_model, self.song_dic, self.tag_dic, self.most_results, self.val)
        write_json(self.answers, 'results/results.json')

In [None]:
FILE_PATH = '/content/arena_data/'
U_space = PlaylistEmbedding(FILE_PATH)
U_space.run()

100%|██████████| 115071/115071 [00:00<00:00, 466625.37it/s]
100%|██████████| 115071/115071 [00:23<00:00, 4834.70it/s] 
100%|██████████| 23015/23015 [05:29<00:00, 69.91it/s]


In [None]:
evaluator = CustomEvaluator()
evaluator.evaluate("arena_data/answers/val.json", "arena_data/results/results.json")

Music nDCG: 0.162752
Tag nDCG: 0.38736
Score: 0.196444


- 곡+태그만 임베딩한 원래 word2vec 모델보다 tag nDCG 값이 줄어듦
- 제목 데이터에 추가 전처리 및 다른 토큰화 방법 활용 필요

