In [1]:
# colab drive 연동을 위한 code
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


<div style="font-size:50px;font-weight:bold" align="center">사용 Model 도식</div>

![model](https://user-images.githubusercontent.com/50820635/88534733-a9e58900-d043-11ea-821b-1166c64e2b42.png)

In [2]:
import os
import sys
import pandas as pd
import numpy as np
import  collections 

project_path = "/gdrive/MyDrive/colab/melon_playlist_continuation/"
default_file_path = os.path.join(project_path,"data")

# sys.path.append(default_file_path)
!pip install fire

# 대회에서 제공한 custom package
# from arena_util import load_json
# from evaluate import ArenaEvaluator

Collecting fire
[?25l  Downloading https://files.pythonhosted.org/packages/34/a7/0e22e70778aca01a52b9c899d9c145c6396d7b613719cd63db97ffa13f2f/fire-0.3.1.tar.gz (81kB)
[K     |████                            | 10kB 14.4MB/s eta 0:00:01[K     |████████                        | 20kB 7.0MB/s eta 0:00:01[K     |████████████▏                   | 30kB 3.8MB/s eta 0:00:01[K     |████████████████▏               | 40kB 3.9MB/s eta 0:00:01[K     |████████████████████▏           | 51kB 2.7MB/s eta 0:00:01[K     |████████████████████████▎       | 61kB 2.9MB/s eta 0:00:01[K     |████████████████████████████▎   | 71kB 3.1MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.0MB/s 
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.3.1-py2.py3-none-any.whl size=111005 sha256=14ae1142e58e86997928c5a061b73f6d20fdad654ce233180a58e0367177ea46
  Stored in directory: /root/.cache/pip/whe

## arena_util.py
모델 평가를 위해 대회 측에서 제공한 custom python package

In [3]:
import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np


# branch test
def write_json(data, fname):
    def _conv(o):
        if isinstance(o, (np.int64, np.int32)):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath(parent)
    with io.open(fname, "w", encoding="utf-8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)


def load_json(fname):
    with open(fname, encoding='utf-8') as f:
        json_obj = json.load(f)

    return json_obj


def debug_json(r):
    print(json.dumps(r, ensure_ascii=False, indent=4))


def remove_seen(seen, l):
    seen = set(seen)
    return [x for x in l if not (x in seen)]


def most_popular(playlists, col, topk_count):
    c = Counter()

    for doc in playlists:
        c.update(doc[col])

    topk = c.most_common(topk_count)
    return c, [k for k, v in topk]


## evaluator.py
모델 평가를 위해 대회 측에서 제공한 custom python package


In [4]:
# -*- coding: utf-8 -*-
import fire
import numpy as np

# from arena_util import load_json


class ArenaEvaluator:
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)
        if len(gt)>100:
            gt = gt[:100]
        return dcg / self._idcgs[len(gt)]

    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)
        gt_ids = set([g["id"] for g in gt_playlists])
        rec_ids = set([r["id"] for r in rec_playlists])
        if gt_ids != rec_ids:
            raise Exception("결과의 플레이리스트 수가 올바르지 않습니다.")

        rec_song_counts = [len(p["songs"]) for p in rec_playlists]
        rec_tag_counts = [len(p["tags"]) for p in rec_playlists]
        if set(rec_song_counts) != set([100]):
            raise Exception("추천 곡 결과의 개수가 맞지 않습니다.")

        if set(rec_tag_counts) != set([10]):
            raise Exception("추천 태그 결과의 개수가 맞지 않습니다.")

        rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
        rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]

        if set(rec_unique_song_counts) != set([100]):
            raise Exception("한 플레이리스트에 중복된 곡 추천은 허용되지 않습니다.")

        if set(rec_unique_tag_counts) != set([10]):
            raise Exception("한 플레이리스트에 중복된 태그 추천은 허용되지 않습니다.")

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate_with_save(self, gt_fname, rec_fname, model_file_path, default_file_path):
        # try:
        music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
        with open(f'{default_file_path}/results.txt','a') as f:
            f.write(model_file_path)
            f.write(f"\nMusic nDCG: {music_ndcg:.6}\n")
            f.write(f"Tag nDCG: {tag_ndcg:.6}\n")
            f.write(f"Score: {score:.6}\n\n")
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        # except Exception as e:
        #     print(e)

    def evaluate(self, gt_fname, rec_fname):
        # try:
        music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
        print(f"Music nDCG: {music_ndcg:.6}")
        print(f"Tag nDCG: {tag_ndcg:.6}")
        print(f"Score: {score:.6}")
        # except Exception as e:
        #     print(e)

## data_util.py

* tags_ids_convert: playlist에 있는 감정 Tag를 새로운 id를 부여한 tag2id, id2tag dictionary를 생성
* save_freq_song_id_dict: 특정 빈도 이상의 노래만을 추출하여 새로운 id를 부여하는 song2id, id2song dictionary를 생성
* genre_gn_all_preprocessing: 장르 정보 Dataframe을 대분류 / 소분류 장르 Dataframe으로 분리하여 생성
* genre_DicGenerator: {대분류 장르: 대분류 id}, {상세 장르: 상세 id}, {song_id: 대분류 id}, {song_id, 소분류 id} dictionary를 생성

In [5]:
# Playlist에 있는 감정 Tag를 dictionary 형태로 새로운 id를 부여하며 변환하여 저장
def tags_ids_convert(json_data, tag2id_filepath, id2tag_filepath):
    playlist_df = pd.DataFrame(json_data)
    tags_list = playlist_df['tags'].to_list()
    _id = 0
    tags_dict = dict()
    ids_dict = dict()
    tags_set = set()
    for tags in tags_list:
        for tag in tags:
            if tag not in tags_set:
                tags_set.add(tag)
                tags_dict[tag] = _id
                ids_dict[_id] = tag
                _id += 1
    with open(tag2id_filepath, 'wb') as f:
        np.save(f, tags_dict)
        print('{} is created'.format(tag2id_filepath))
    with open(id2tag_filepath, 'wb') as f:
        np.save(f, ids_dict)
        print('{} is created'.format(id2tag_filepath))
    return True

# 특정 빈도수 이상의 노래를 선정 후 이를 새로운 song2id id2song dict로 재매핑
def save_freq_song_id_dict(train, thr, default_file_path, model_postfix):
    song_counter = collections.Counter()
    for plylst in train:
        song_counter.update(plylst['songs'])

    selected_songs = []
    song_counter = list(song_counter.items())
    for k, v in song_counter:
        if v > thr:
            selected_songs.append(k)

    # thr시 선정되는 곡의 수를 확인
    print(f'{len(song_counter)} songs to {len(selected_songs)} songs')

    freq_song2id = {song: _id for _id, song in enumerate(selected_songs)}
    np.save(f'{default_file_path}/freq_song2id_thr{thr}_{model_postfix}', freq_song2id)
    print(f'{default_file_path}/freq_song2id_thr{thr}_{model_postfix} is created')
    id2freq_song = {v: k for k, v in freq_song2id.items()}
    np.save(f'{default_file_path}/id2freq_song_thr{thr}_{model_postfix}', id2freq_song)
    print(f'{default_file_path}/id2freq_song_thr{thr}_{model_postfix} is created')
    

def genre_gn_all_preprocessing(genre_gn_all):
    # genre_gn_all: pd.DataFrame
    ## 대분류 장르코드
    # 장르코드 뒷자리 두 자리가 00인 코드를 필터링
    gnr_code = genre_gn_all[genre_gn_all['gnr_code'].str[-2:] == '00']

    ## 상세 장르코드
    # 장르코드 뒷자리 두 자리가 00이 아닌 코드를 필터링
    dtl_gnr_code = genre_gn_all[genre_gn_all['gnr_code'].str[-2:] != '00'].copy()
    dtl_gnr_code.rename(columns={'gnr_code': 'dtl_gnr_code', 'gnr_name': 'dtl_gnr_name'}, inplace=True)

    return gnr_code, dtl_gnr_code

# {대분류 장르: 대분류 id}, {상세 장르: 상세 id}, {song_id: 대분류 id}, {song_id, 소분류 id}
def genre_DicGenerator(gnr_code, dtl_gnr_code, song_meta):
    ## gnr_dic (key: 대분류 장르 / value: 대분류 장르 id)
    gnr_dic = {}
    i = 0
    for gnr in gnr_code['gnr_code']:
        gnr_dic[gnr] = i
        i += 1

    ## dtl_dic (key: 상세 장르 / value: 상세 장르 id)
    dtl_dic = {}
    j = 0
    for dtl in dtl_gnr_code['dtl_gnr_code']:
        dtl_dic[dtl] = j
        j += 1

    ## song_gnr_dic (key: 곡 id / value: 해당 곡의 대분류 장르)
    ## song_dtl_dic (key: 곡 id / value: 해당 곡의 상세 장르)
    song_gnr_dic = {}
    song_dtl_dic = {}

    for s in song_meta:
        song_gnr_dic[s['id']] = s['song_gn_gnr_basket']
        song_dtl_dic[s['id']] = s['song_gn_dtl_gnr_basket']

    return gnr_dic, dtl_dic, song_gnr_dic, song_dtl_dic

## MelonDataset.py
AutoEncoder 및 Word2Vector 학습시 사용할 Pytorch Dataset을 생성하는 python script

각 playlist에 대해 수록된 노래 및 태그를 전체 노래/태그 벡터에서 해당되는 값만 1로 변환한 벡터를 출력하고 이를 하나의 벡터로 concatenate

* SongTagDataset: 매 스텝마다 playlist_id, [song_vector, tag_vector]를 반환하는 데이터셋
* SongTagGenreDataset: 매 스텝마다 playlist_id, [song_vector, tag_vector], gnr_vector, detail_gnr_vector를 반환하는 데이터셋

~~~
.
├── SongTagDataset
│   ├── _song_ids2vec: 플레이리스트에 수록된 노래를 vector화 하는 함수
│   ├── _tag_ids2vec: 플레이리스트에 있는 태그를 vector화 하는 함수
├── SongTagGenreDataset
│   ├── _init_song_meta: 대분류 / 소분류 장르에 대한 vector화 작업전 수반되는 dictonary 및 DataFrame 생성 함수
│   ├── _song_ids2vec: 플레이리스트에 수록된 노래를 vector화 하는 함수
│   ├── _tag_ids2vec: 플레이리스트에 있는 태그를 vector화 하는 함수
│   ├── _get_gnr_vector: 플레이리스트의 노래들에 대해 대분류 장르를 각각 추출하여 vector화 하는 함수
│   └── _get_dtl_gnr_vector: 플레이리스트의 노래들에 대해 소분류 장르를 각각 추출하여 vector화 하는 함수
.
~~~


In [6]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
# from arena_util import load_json
# from data_util import genre_gn_all_preprocessing, genre_DicGenerator
import torch


class SongTagDataset(Dataset):
    def __init__(self, json_dataset, tag2id_file_path, prep_song2id_file_path):
        self.train = json_dataset
        self.tag2id = dict(np.load(tag2id_file_path, allow_pickle=True).item())
        self.prep_song2id = dict(np.load(prep_song2id_file_path, allow_pickle=True).item())
        self.num_songs = len(self.prep_song2id)
        self.num_tags = len(self.tag2id)

    def __len__(self):
        return len(self.train)

    # song_vec, tag_vec를 AE에 넣기 위해 하나의 vector로 concatenate
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        _id = self.train[idx]['id']
        song_vector = self._song_ids2vec(self.train[idx]['songs'])
        tag_vector = self._tag_ids2vec(self.train[idx]['tags'])
        _input = torch.from_numpy(
            np.concatenate([song_vector, tag_vector]).astype(np.float32))

        return _id, _input

    def _song_ids2vec(self, songs):
        songs = [self.prep_song2id[song] for song in songs if song in self.prep_song2id.keys()]

        songs = np.asarray(songs, dtype=np.int)
        bin_vec = np.zeros(self.num_songs)
        if len(songs) > 0:
            bin_vec[songs] = 1
        return np.array(bin_vec)

    def _tag_ids2vec(self, tags):
        tags = [self.tag2id[tag] for tag in tags if tag in self.tag2id.keys()]
        tags = np.asarray(tags, dtype=np.int)
        bin_vec = np.zeros(self.num_tags)
        bin_vec[tags] = 1
        return np.array(bin_vec)


class SongTagGenreDataset(Dataset):
    def __init__(self, json_dataset, tag2id_file_path, prep_song2id_file_path):
        
        project_path = "/gdrive/MyDrive/colab/melon_playlist_continuation/"
        self.default_file_path = os.path.join(project_path,"data")

        self.train = json_dataset
        self.tag2id = dict(np.load(tag2id_file_path, allow_pickle=True).item())
        self.prep_song2id = dict(np.load(prep_song2id_file_path, allow_pickle=True).item())
        self.num_songs = len(self.prep_song2id)
        self.num_tags = len(self.tag2id)
        self._init_song_meta()

    def __len__(self):
        return len(self.train)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        _id = self.train[idx]['id']
        song_vector = self._song_ids2vec(self.train[idx]['songs'])
        tag_vector = self._tag_ids2vec(self.train[idx]['tags'])
        gnr_vector = self._get_gnr_vector(self.train[idx]['songs'], self.gnr_code, self.gnr_dic, self.song_gnr_dic)
        dtl_gnr_vector = self._get_dtl_gnr_vector(self.train[idx]['songs'], self.dtl_gnr_code, self.dtl_dic, self.song_dtl_dic)
        _input = torch.from_numpy(np.concatenate([song_vector, tag_vector]).astype(np.float32))

        return _id, _input, gnr_vector, dtl_gnr_vector

    def _init_song_meta(self):
        song_meta = load_json(os.path.join(self.default_file_path,'song_meta.json'))

        genre_gn_all = pd.read_json(os.path.join(self.default_file_path,'genre_gn_all.json'), encoding='utf8', typ='series')
        genre_gn_all = pd.DataFrame(genre_gn_all, columns=['gnr_name']).reset_index().rename(
            columns={'index': 'gnr_code'})

        self.gnr_code, self.dtl_gnr_code = genre_gn_all_preprocessing(genre_gn_all)
        self.num_gnr = len(self.gnr_code)
        self.num_dtl_gnr = len(self.dtl_gnr_code)
        # {노래 - 대분류 장르}, {노래 - 소분류 장르} 데이터 생성
        self.gnr_dic, self.dtl_dic, self.song_gnr_dic, self.song_dtl_dic = genre_DicGenerator(
            self.gnr_code, self.dtl_gnr_code, song_meta)

    # 노래를 등장 빈도로 구분한 song2id dict를 활용하여 id 값을 재부여
    # & 23만곡 가운데에 등장한 노래에 대해서 1을 부여
    def _song_ids2vec(self, songs):
        songs = [self.prep_song2id[song] for song in songs if song in self.prep_song2id.keys()]

        songs = np.asarray(songs, dtype=np.int)
        bin_vec = np.zeros(self.num_songs)
        # 등장한 노래에 대해 1값부여
        if len(songs) > 0:
            bin_vec[songs] = 1
        return np.array(bin_vec)

    def _tag_ids2vec(self, tags):
        tags = [self.tag2id[tag] for tag in tags if tag in self.tag2id.keys()]
        tags = np.asarray(tags, dtype=np.int)
        bin_vec = np.zeros(self.num_tags)
        bin_vec[tags] = 1
        return np.array(bin_vec)

    # 특정 플레이리스트에 있는 곡별로 대분류 장르가 포함되는 비율을 벡터로 표현
    def _get_gnr_vector(self, songs, gnr_code, gnr_dic, song_gnr_dic):
        # v_gnr (각 플레이리스트의 수록곡 장르 비율을 담은 30차원 vector)
        v_gnr = np.zeros(len(gnr_code))
        for t_s in songs:
            for g in song_gnr_dic[t_s]:
                if g in gnr_code['gnr_code'].values:
                    v_gnr[gnr_dic[g]] += 1
        if v_gnr.sum() > 0:
            v_gnr = v_gnr / v_gnr.sum()
        return v_gnr

    def _get_dtl_gnr_vector(self, songs, dtl_gnr_code, dtl_dic, song_dtl_dic):
        ## plylst_dtl (각 플레이리스트의 수록곡 상세 장르 비율을 담은 224차원 vector)
        v_dtl = np.zeros(len(dtl_gnr_code))
        for t_s in songs:
            for g in song_dtl_dic[t_s]:
                if g in dtl_gnr_code['dtl_gnr_code'].values:
                    v_dtl[dtl_dic[g]] += 1
        if v_dtl.sum() > 0:
            v_dtl = v_dtl / v_dtl.sum()
        return v_dtl

## Models.py

임베딩에 사용될 AutoEncode Model을 선언하는 python script

In [7]:
import torch
import torch.nn as nn

class AutoEncoder(nn.Module):
  def __init__(self, D_in, H, D_out, dropout):
    super(AutoEncoder, self).__init__()
    encoder_layer = nn.Linear(D_in, H, bias=True)
    decoder_layer = nn.Linear(H, D_out, bias=True)

    # 빠른 최적화 및 높은 성능을 위해 각 가중치들에 대해 정규화를 수행
    torch.nn.init.xavier_uniform_(encoder_layer.weight)
    torch.nn.init.xavier_uniform_(decoder_layer.weight)

    self.encoder = nn.Sequential(
                              nn.Dropout(dropout),
                              encoder_layer,
                              nn.BatchNorm1d(H),
                              nn.LeakyReLU()
                            )
    
    self.decoder = nn.Sequential(
                              decoder_layer,
                              nn.Sigmoid()
                          )
    
  def forward(self, x):
    encoded = self.encoder(x)
    decoded = self.decoder(encoded)
    return decoded

## w2v.py
임베딩시 사용하는 Word2Vec 학습 및 수행을 위한 python script

각 playlist에 대해 수록된 노래 및 태그를 전체 노래/태그 벡터에서 해당되는 값만 1로 변환한 벡터를 출력하고 이를 하나의 벡터로 concatenate

~~~
.
├── make_input4tokenizer
│   ├── _wv_tags: 전체 플레이리스트의 tags list에서 각 tags를 하나의 string으로 변환하여 새로이 list에 append하여 반환
│   ├── _wv_genre: 전체 장르 Dataframe에 대해서 세부 장르를 [전체장르, 세부장르] 형태로 list를 생성하여 반환
├── train_tokenizer: w2v전 string token화 사용시 tokenizer 학습을 위해 사용하는 함수
├── get_tokens_from_sentences: string으로 변환된 문장들을 하나의 element로 변환한 list를 element로 하는 list를 생성하여 바환
├── get_tokens_from_sentence
├── string2vec: w2v 모델 실행을 위한 class
│   ├── set_model
│   ├── save_embeddings
│   ├── save_model
│   ├── show_similar_words
├── title_tokenizer: playlist title을 token화 하기 위한 class
│   ├── make_input_file:
│   ├── train_tokenizer:
│   ├── get_tokens:
├── train_tokenizer_w2v: tokenizer 및 w2v모델 학습 수행을 위한 함수
.
~~~


In [8]:

!pip install sentencepiece

import os
import sys
import json
import torch
import io
import os
import copy
import random
import math
import datetime as dt
import distutils.dir_util
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sentencepiece as spm

from collections import defaultdict
from tqdm import tqdm
from gensim.models import Word2Vec as w2v
from collections import Counter
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
from torch import nn
# from arena_util import write_json, load_json

vocab_size = 24000
method = 'bpe'


def load_json(fname):
    with open(fname, encoding='utf8') as f:
        json_obj = json.load(f)

    return json_obj


def make_input4tokenizer(train_file_path, genre_file_path, result_file_path, valid_file_path=None, test_file_path=None):
    # 특정 playlist의 태그를 vector화
    def _wv_tags(tags_list):
        taS = []
        for tags in tags_list:
            taS.append(' '.join(tags))

        return(taS)
        
    # 특정 playlist의 장르를 vector화
    def _wv_genre(genre):
        genre_dict = dict()
        for code, value in genre:
            code_num = int(code[2:])
            # 상위 장르의 경우
            if not code_num % 100:
                cur_genre = value
                genre_dict[cur_genre] = []
            # 하위 장르의 경우
            else:
                value = ' '.join(value.split('/'))
                genre_dict[cur_genre].append(value)
        genre_sentences = []
        for key in genre_dict:
            sub_list = genre_dict[key]
            key = ' '.join(key.split('/'))
            # 상위장르 key의 value list가 0인 경우
            if not len(sub_list):
                continue
            for sub in sub_list:
                genre_sentences.append(key+' '+sub)
        return genre_sentences

    try:
        plylsts = load_json(train_file_path)
        if valid_file_path is not None:
            val_plylsts = load_json(valid_file_path)
            plylsts += val_plylsts
        if test_file_path is not None:
            test_plylsts = load_json(test_file_path)
            plylsts += test_plylsts

        genre_all = load_json(genre_file_path)
        genre_all_lists = []
        for code, gnr in genre_all.items():
            if gnr != '세부장르전체':
                genre_all_lists.append([code, gnr])
        genre_all_lists = np.asarray(genre_all_lists)

        sentences = []
        for plylst in plylsts:
            tiS = plylst['plylst_title']
            taS = ' '.join(plylst['tags'])
            upS = ' '.join(plylst['updt_date'][:7].split('-'))
            sentences.append(' '.join([tiS, taS, upS]))

        geS = _wv_genre(genre_all_lists)
        # 각노래와 플레이리스트에 대해서 장르를 붙이는게 아니라 그냥 전체 장르 데이터를 붙여버림,,, ???
        sentences = sentences + geS
        with open(result_file_path, 'w', encoding='utf8') as f:
            for sentence in sentences:
                f.write(sentence+'\n')
    except Exception as e:
        print(e.with_traceback())
        return False
    return sentences


def train_tokenizer(input_file_path, model_file_path, vocab_size, model_type):
    templates = ' --input={} \
        --pad_id=0 \
        --bos_id=1 \
        --eos_id=2 \
        --unk_id=3 \
        --model_prefix={} \
        --vocab_size={} \
        --character_coverage=1.0 \
        --model_type={}'

    cmd = templates.format(input_file_path,
                model_file_path,    # output model 이름
                vocab_size,# 작을수록 문장을 잘게 쪼갬
                model_type)# unigram (default), bpe, char

    spm.SentencePieceTrainer.Train(cmd)
    print("tokenizer {} is generated".format(model_file_path))

def get_tokens_from_sentences(sp, sentences):
    tokenized_sentences = []
    for sentence in sentences:
        tokens = sp.EncodeAsPieces(sentence)
        new_tokens = []
        for token in tokens:
            token = token.replace("▁", "")
            if len(token) > 1:
                new_tokens.append(token)
        if len(new_tokens) > 1:
            tokenized_sentences.append(new_tokens)

    return tokenized_sentences


def get_tokens_from_sentence(sp, sentence):
    new_tokens = []
    tokens = sp.EncodeAsPieces(sentence)
    for token in tokens:
        token = token.replace("▁", "")
        if len(token) > 1:
            new_tokens.append(token)
    return new_tokens


class string2vec():
    def __init__(self, train_data, size=200, window=5, min_count=2, workers=8, sg=1, hs=1):
        self.model = w2v(train_data, size=size, window=window, min_count=min_count, workers=workers, sg=sg, hs=hs)

    def set_model(self, model_fn):
        self.model = w2v.load(model_fn)

    def save_embeddings(self, emb_fn):
        word_vectors = self.model.wv

        vocabs = []
        vectors = []
        for key in word_vectors.vocab:
            vocabs.append(key)
            vectors.append(word_vectors[key])

        df = pd.DataFrame()
        df['voca'] = vocabs
        df['vector'] = vectors

        df.to_csv(emb_fn,index=False)

    def save_model(self, md_fn):
        self.model.save(md_fn)
        print("word embedding model {} is trained".format(md_fn))

    def show_similar_words(self,word, topn):
        print(self.model.most_similar(positive=[word], topn=topn))


class title_tokenizer():
    def make_input_file(self, input_fn, sentences):
        with open(input_fn, 'w', encoding='utf8') as f:
            for sentence in sentences:
                f.write(sentence + '\n')

    def train_tokenizer(self, input_fn, prefix, vocab_size, model_type):
        templates = '--input={}         --pad_id=0         --bos_id=1         --eos_id=2         --unk_id=3         --model_prefix={}         --vocab_size={}         --character_coverage=1.0         --model_type={}'

        print(input_fn)

        cmd = templates.format(input_fn,
                               prefix,  # output model 이름
                               vocab_size,  # 작을수록 문장을 잘게 쪼갬
                               model_type)  # unigram (default), bpe, char

        spm.SentencePieceTrainer.Train(cmd)
        print("tokenizer model {} is trained".format(prefix + ".model"))

    def get_tokens(self, sp, sentences):
        tokenized_sentences = []

        for sentence in sentences:
            tokens = sp.EncodeAsPieces(sentence)
            new_tokens = []
            for token in tokens:
                token = token.replace("▁", "")
                if len(token) > 1:
                    new_tokens.append(token)
            if len(new_tokens) > 1:
                tokenized_sentences.append(new_tokens)

        return tokenized_sentences


def train_tokenizer_w2v(_train_file_path, _val_file_path, _test_file_path, _genre_file_path, _tokenize_input_file_path,
                        _submit_type):

    sentences = make_input4tokenizer(_train_file_path, _genre_file_path, _tokenize_input_file_path, _val_file_path,
                                     _test_file_path)

    project_path = "/gdrive/MyDrive/colab/melon_playlist_continuation/"
    default_file_path = os.path.join(project_path,"data")

    if not sentences:
        sys.exit(1)

    tokenizer_name = default_file_path+'/models/tokenizer_{}_{}_{}'.format(method, vocab_size, _submit_type)
    tokenizer_name_model = default_file_path+'/models/tokenizer_{}_{}_{}.model'.format(method, vocab_size, _submit_type)
    print("start train_tokenizer...w.")
    train_tokenizer(_tokenize_input_file_path, tokenizer_name, vocab_size, method)
    sp = spm.SentencePieceProcessor()
    sp.Load(tokenizer_name_model)
    tokenized_sentences = get_tokens_from_sentences(sp, sentences)

    w2v_name = default_file_path+'/models/w2v_{}_{}_{}.model'.format(method, vocab_size, _submit_type)
    print("start train_w2v....")
    model = string2vec(tokenized_sentences, size=200, window=5, min_count=1, workers=8, sg=1, hs=1)
    model.save_model(w2v_name)

    return tokenized_sentences

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 6.6MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.94


## train.py

~~~
.
├── train: AE 학습 수행을 위한 함수
├── AE 및 w2v 학습 수행을 위한 코드
.
~~~

In [10]:
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook

def train(train_dataset, model_file_path, id2prep_song_file_path, id2tag_file_path, question_dataset, answer_file_path):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 변환한 tag dict 및 노래 빈도수 dict load
    id2tag_dict = dict(np.load(id2tag_file_path, allow_pickle=True).item())
    id2prep_song_dict = dict(np.load(id2prep_song_file_path, allow_pickle=True).item())

    # parameters
    num_songs = train_dataset.num_songs
    num_tags = train_dataset.num_tags

    # hyper parameters
    D_in = D_out = num_songs + num_tags

    #local_val mode인 경우 중간 중간 결과 확인 => mode 2 => None
    q_data_loader = None
    check_every = 5
    tmp_result_file_path = default_file_path+'/results/tmp_results.json'
    evaluator = ArenaEvaluator()
    # mode 2 => None
    if question_dataset is not None:
        q_data_loader = DataLoader(question_dataset, shuffle=True, batch_size=batch_size, num_workers=num_workers)

    # Auto encoder 투입용 DataLoader
    data_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=num_workers)

    model = AutoEncoder(D_in, H, D_out, dropout=dropout).to(device)

    parameters = model.parameters()
    loss_func = nn.BCELoss()
    optimizer = torch.optim.Adam(parameters, lr=learning_rate)


    try:
        model = torch.load(model_file_path)
        print("\n--------model restored--------\n")
    except:
        print("\n--------model not restored--------\n")
        pass

    # temp_fn = 'arena_data/answers/temp.json'
    temp_fn = os.path.join(default_file_path,"answers","temp.json")
    if os.path.exists(temp_fn):
        os.remove(temp_fn)

    for epoch in range(epochs):
        print()
        print('epoch: ', epoch)
        running_loss = 0.0
        for idx, (_id, _data) in enumerate(tqdm_notebook(data_loader, desc='training...')):
            _data = _data.to(device)

            optimizer.zero_grad()
            output = model(_data)
            loss = loss_func(output, _data)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print('loss: %d %d%% %.4f' % (epoch, epoch / epochs * 100, running_loss))

        torch.save(model, model_file_path)

        if mode == 0:
            if epoch % check_every == 0:
                if os.path.exists(tmp_result_file_path):
                    os.remove(tmp_result_file_path)
                elements = []
                for idx, (_id, _data) in enumerate(tqdm(q_data_loader, desc='testing...')):
                    with torch.no_grad():
                        _data = _data.to(device)
                        output = model(_data)

                        songs_input, tags_input = torch.split(_data, num_songs, dim=1)
                        songs_output, tags_output = torch.split(output, num_songs, dim=1)

                        songs_ids = binary_songs2ids(songs_input, songs_output, id2prep_song_dict)
                        tag_ids = binary_tags2ids(tags_input, tags_output, id2tag_dict)

                        _id = list(map(int, _id))
                        for i in range(len(_id)):
                            element = {'id': _id[i], 'songs': list(songs_ids[i]), 'tags': tag_ids[i]}
                            elements.append(element)

                write_json(elements, tmp_result_file_path)
                evaluator.evaluate(answer_file_path, tmp_result_file_path)
                os.remove(tmp_result_file_path)


In [11]:
# dimension
H=450
epochs=2
batch_size=256
learning_rate=0.0005
dropout=0.2
num_workers=20
freq_thr=2
# parser.add_argument('-mode', type=int, help="local_val: 0, val: 1, test: 2", default=2)
mode = 2

# mode에 따른 train dataset과 관련 데이터 로드
question_data = None
question_dataset = None
answer_file_path = None

if mode == 0: # split data에 대해서는 훈련 중간 중간 성능 확인을 위해서 question, answer 불러옴
    default_file_path = 'arena_data/'
    model_postfix = 'local_val'

    train_file_path = f'{default_file_path}/orig/train.json'
    question_file_path = f'{default_file_path}/questions/val.json'
    answer_file_path = f'{default_file_path}/answers/val.json'

    train_data = load_json(train_file_path)
    question_data = load_json(question_file_path)

elif mode == 1:
    # default_file_path = 'res'
    model_postfix = 'val'

    train_file_path = f'{default_file_path}/train.json'
    val_file_path = f'{default_file_path}/val.json'
    train_data = load_json(train_file_path) + load_json(val_file_path)

elif mode == 2:
    # default_file_path = 'res'
    model_postfix = 'test'

    train_file_path = f'{default_file_path}/train.json'
    val_file_path = f'{default_file_path}/val.json'
    test_file_path = f'{default_file_path}/test.json'
    train_data = load_json(train_file_path) + load_json(val_file_path) + load_json(test_file_path)

else:
    print('mode error! local_val: 0, val: 1, test: 2')
    sys.exit(1)

# Autoencoder의 input: song, tag binary vector의 concatenate, tags는 str이므로 id로 변형할 필요 있음
tag2id_file_path = f'{default_file_path}/tag2id_{model_postfix}.npy'
id2tag_file_path = f'{default_file_path}/id2tag_{model_postfix}.npy'
# Song이 너무 많기 때문에 frequency에 기반하여 freq_thr번 이상 등장한 곡들만 남김, 남은 곡들에게 새로운 id 부여
prep_song2id_file_path = f'{default_file_path}/freq_song2id_thr{freq_thr}_{model_postfix}.npy'
id2prep_song_file_path = f'{default_file_path}/id2freq_song_thr{freq_thr}_{model_postfix}.npy'
# 관련 데이터들이 없으면 default file path에 새로 만들음
if not (os.path.exists(tag2id_file_path) & os.path.exists(id2tag_file_path)):
    tags_ids_convert(train_data, tag2id_file_path, id2tag_file_path)

if not (os.path.exists(prep_song2id_file_path) & os.path.exists(id2prep_song_file_path)):
    save_freq_song_id_dict(train_data, freq_thr, default_file_path, model_postfix)

train_dataset = SongTagDataset(train_data, tag2id_file_path, prep_song2id_file_path)
if question_data is not None:
    question_dataset = SongTagDataset(question_data, tag2id_file_path, prep_song2id_file_path)

model_file_path = default_file_path+'/models/autoencoder_{}_{}_{}_{}_{}_{}.pkl'. \
    format(H, batch_size, learning_rate, dropout, freq_thr, model_postfix)

# 전체 song에 대한 playlist의 존재여부 벡터와 전체 Tag에 대한 playlist 존재여부 벡터를 concat && 학습
train(train_dataset, model_file_path, id2prep_song_file_path, id2tag_file_path, question_dataset, answer_file_path)

# w2v 학습 시작
vocab_size = 24000
# About bite pair encoding
# https://wikidocs.net/22592 
method = 'bpe'
if model_postfix == 'val':
    default_file_path = 'res'
    question_file_path = 'res/val.json'
    train_file_path = 'res/train.json'
elif model_postfix == 'test':
    # default_file_path = 'res'
    # project_path = "/gdrive/My Drive/colab/melon_playlist_continuation/"
    # default_file_path = os.path.join(project_path,"data")
    val_file_path = os.path.join(default_file_path,'val.json')
    question_file_path = os.path.join(default_file_path,'test.json')
    train_file_path = os.path.join(default_file_path, 'train.json')
elif model_postfix == 'local_val':
    default_file_path = 'arena_data'
    train_file_path = f'{default_file_path}/orig/train.json'
    question_file_path = f'{default_file_path}/questions/val.json'
    default_file_path = f'{default_file_path}/orig'

genre_file_path = os.path.join(default_file_path, 'genre_gn_all.json')

tokenize_input_file_path = default_file_path+f'/models/tokenizer_input_{method}_{vocab_size}_{model_postfix}.txt'

if model_postfix == 'local_val':
    val_file_path = None
    test_file_path = None
    train = load_json(train_file_path)
    question = load_json(question_file_path)
elif model_postfix == 'val':
    test_file_path = None
    val_file_path = question_file_path
    train = load_json(train_file_path)
    question = load_json(question_file_path)
elif model_postfix == 'test':
    val_file_path = val_file_path
    test_file_path = question_file_path
    train = load_json(train_file_path)
    val = load_json(val_file_path)
    test = load_json(test_file_path)
    train = train + val
    question = test

tokenized_sentences = train_tokenizer_w2v(train_file_path, val_file_path, test_file_path, genre_file_path, tokenize_input_file_path, model_postfix)

print('train completed')


--------model restored--------


epoch:  0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, description='training...', max=582.0, style=ProgressStyle(description_…


loss: 0 0% 2.1929

epoch:  1


HBox(children=(FloatProgress(value=0.0, description='training...', max=582.0, style=ProgressStyle(description_…


loss: 1 50% 2.1896
start train_tokenizer...w.
tokenizer /gdrive/MyDrive/colab/melon_playlist_continuation/data/models/tokenizer_bpe_24000_test is generated
start train_w2v....
word embedding model /gdrive/MyDrive/colab/melon_playlist_continuation/data/models/w2v_bpe_24000_test.model is trained
train completed


## get_autoencoder_scores.py

~~~
.
├── AE_get_plylsts_embeddings: 학습한 AE 모델을 활용하여 플레이리스트의 벡터들에 대해서 embedding을 실시하는 함수
├── AE_save_scores: embedding된 벡터를 가지고 유사도 점수를 추출하는 함수
├── get_autoencoder_scores: 위의 두 함수를 활용하는 함수
.
~~~

autoencoder로 임베딩한 벡터 기준

numpy형태로 저장되는 test_scores_bias_cos.npy, test_scores_bias_cos_gnr.npy 는 dictonary 형태로 저장되며, 형태는 아래와 같다.

* key: valid dataset의 playlist_id
* values: list([train dataset에서 가장 유사도가 높은 상위 1000개의 train playlist)id ],[train dataset에서 가장 유사도가 높은 상위 1000개의 유사도 점수])

In [11]:
import random
import torch.nn as nn
import sentencepiece as spm
import matplotlib.pyplot as plt

import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
# from MelonDataset import SongTagDataset, SongTagGenreDataset
# from data_util import *
# from arena_util import write_json, load_json
# from evaluate import ArenaEvaluator
from collections import Counter, defaultdict
# from Models import AutoEncoder

from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity

random.seed(777)
np.random.seed(777)


def AE_get_plylsts_embeddings(_model_file_path, _submit_type, genre=False):

    project_path = "/gdrive/MyDrive/colab/melon_playlist_continuation/"
    default_file_path = os.path.join(project_path,"data")

    if _submit_type == 'val':
        default_file_path = 'res'
        question_file_path = 'res/val.json'
        train_file_path = 'res/train.json'
        val_file_path = 'res/val.json'
        train_dataset = load_json(train_file_path)
    elif _submit_type == 'test':
        # default_file_path = 'res'
        question_file_path = os.path.join(default_file_path,'test.json')
        train_file_path = os.path.join(default_file_path,'train.json')
        val_file_path = os.path.join(default_file_path,'val.json')
        train_dataset = load_json(train_file_path) + load_json(val_file_path)
    elif _submit_type == 'local_val':
        default_file_path = 'arena_data'
        train_file_path = f'{default_file_path}/orig/train.json'
        question_file_path = f'{default_file_path}/questions/val.json'
        default_file_path = f'{default_file_path}/orig'
        train_dataset = load_json(train_file_path)

    tag2id_file_path = f'{default_file_path}/tag2id_{_submit_type}.npy'
    id2tag_file_path = f'{default_file_path}/id2tag_{_submit_type}.npy'
    prep_song2id_file_path = f'{default_file_path}/freq_song2id_thr2_{_submit_type}.npy'
    id2prep_song_file_path = f'{default_file_path}/id2freq_song_thr2_{_submit_type}.npy'

    if genre:
        train_dataset = SongTagGenreDataset(train_dataset, tag2id_file_path, prep_song2id_file_path)
        question_dataset = SongTagGenreDataset(load_json(question_file_path), tag2id_file_path, prep_song2id_file_path)
    else:
        train_dataset = SongTagDataset(train_dataset, tag2id_file_path, prep_song2id_file_path)
        question_dataset = SongTagDataset(load_json(question_file_path), tag2id_file_path, prep_song2id_file_path)

    plylst_embed_weight = []
    plylst_embed_bias = []

    model_file_path = _model_file_path

    model = torch.load(model_file_path)
    for name, param in model.named_parameters():
        if param.requires_grad:
            if name == 'encoder.1.weight':
                plylst_embed_weight = param.data
            elif name == 'encoder.1.bias':
                plylst_embed_bias = param.data

    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=256, num_workers=4)
    question_loader = DataLoader(question_dataset, shuffle=True, batch_size=256, num_workers=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # AE로 학습한 모델을 evaluation / test를 위해 변환
    plylst_emb_with_bias = dict()

    # 대분류 / 소분류 장르 벡터를 AE로 변환한 vector와 concat
    if genre:
        for idx, (_id, _data, _dnr, _dtl_dnr) in enumerate(tqdm(train_loader, desc='get train vectors...')):
            with torch.no_grad():
                _data = _data.to(device)
                #???????????? why??
                output_with_bias = (torch.matmul(_data, plylst_embed_weight.T) + plylst_embed_bias).tolist()
                output_with_bias = np.concatenate([output_with_bias, _dnr, _dtl_dnr], axis=1)

                _id = list(map(int, _id))
                for i in range(len(_id)):
                    plylst_emb_with_bias[_id[i]] = output_with_bias[i]

        for idx, (_id, _data, _dnr, _dtl_dnr) in enumerate(tqdm(question_loader, desc='get question vectors...')):
            with torch.no_grad():
                _data = _data.to(device)
                output_with_bias = (torch.matmul(_data, plylst_embed_weight.T) + plylst_embed_bias).tolist()
                output_with_bias = np.concatenate([output_with_bias, _dnr, _dtl_dnr], axis=1)

                _id = list(map(int, _id))
                for i in range(len(_id)):
                    plylst_emb_with_bias[_id[i]] = output_with_bias[i]
    # AE로 변환한 vector만.
    else:
        for idx, (_id, _data) in enumerate(tqdm(train_loader, desc='get train vectors...')):
            with torch.no_grad():
                _data = _data.to(device)
                output_with_bias = (torch.matmul(_data, plylst_embed_weight.T) + plylst_embed_bias).tolist()

                _id = list(map(int, _id))
                for i in range(len(_id)):
                    plylst_emb_with_bias[_id[i]] = output_with_bias[i]

        for idx, (_id, _data) in enumerate(tqdm(question_loader, desc='get question vectors...')):
            with torch.no_grad():
                _data = _data.to(device)
                output_with_bias = (torch.matmul(_data, plylst_embed_weight.T) + plylst_embed_bias).tolist()

                _id = list(map(int, _id))
                for i in range(len(_id)):
                    plylst_emb_with_bias[_id[i]] = output_with_bias[i]

    # 훈련셋과 테스트셋에 대한 벡터값 추출
    return plylst_emb_with_bias


def AE_save_scores(_autoencoder_embs, _score_type, _submit_type, genre=False):

    project_path = "/gdrive/MyDrive/colab/melon_playlist_continuation/"
    default_file_path = os.path.join(project_path,"data")

    if _submit_type == 'val':
        question_file_path = 'res/val.json'
        train_file_path = 'res/train.json'
        val_file_path = 'res/val.json'
        train_dataset = load_json(train_file_path)
    elif _submit_type == 'test':
        question_file_path = os.path.join(default_file_path,'test.json')
        train_file_path = os.path.join(default_file_path, 'train.json')
        val_file_path = os.path.join(default_file_path, 'val.json')
        train_dataset = load_json(train_file_path) + load_json(val_file_path)
    elif _submit_type == 'local_val':
        default_file_path = 'arena_data'
        train_file_path = f'{default_file_path}/orig/train.json'
        question_file_path = f'{default_file_path}/questions/val.json'
        train_dataset = load_json(train_file_path)

    _train = train_dataset
    _val = load_json(question_file_path)

    def pcc(_x, _y):
        vx = _x - torch.mean(_x)
        vy = _y - torch.mean(_y, axis=1).reshape(-1, 1)
        return torch.sum((vx * vy), axis=1) / (
                    torch.sqrt(torch.sum(vx ** 2)) * torch.sqrt(torch.sum((vy ** 2), axis=1)))

    def euclidean(_x, _y):
        return torch.sqrt(torch.sum((_y - _x) ** 2, axis=1))

    # 합쳐진 emb를 playlist id로 구분
    all_train_ids = [plylst['id'] for plylst in _train]
    all_val_ids = [plylst['id'] for plylst in _val]

    train_ids = []
    train_embs = []
    val_ids = []
    val_embs = []

    for plylst_id, emb in tqdm(_autoencoder_embs.items()):
        if plylst_id in all_train_ids:
            train_ids.append(plylst_id)
            train_embs.append(emb)
        elif plylst_id in all_val_ids:
            val_ids.append(plylst_id)
            val_embs.append(emb)

    gpu = torch.device('cuda')
    cos = nn.CosineSimilarity(dim=1)

    train_tensor = torch.tensor(train_embs).to(gpu)
    val_tensor = torch.tensor(val_embs).to(gpu)

    # scores 와 sorted_id는 순서대로 기록
    # simliarity 점수를 출력 (val 개체 대상 수 X train 개체 대상 수)
    scores = torch.zeros([val_tensor.shape[0], train_tensor.shape[0]], dtype=torch.float64)
    # simliarity 점수가 높은 순서를 기록
    sorted_idx = torch.zeros([val_tensor.shape[0], train_tensor.shape[0]], dtype=torch.int32)

    # 하나의 val_vector와 전체 train_tensor간의 cosine simillarity 계산
    for idx, val_vector in enumerate(tqdm(val_tensor)):
        if _score_type == 'pcc':
            output = pcc(val_vector.reshape(1, -1), train_tensor)
        elif _score_type == 'cos':
            output = cos(val_vector.reshape(1, -1), train_tensor)
        elif _score_type == 'euclidean':
            output = euclidean(val_vector.reshape(1, -1), train_tensor)
        # simliarity 값이 높은 index를 앞으로 정렬
        index_sorted = torch.argsort(output, descending=True)
        scores[idx] = output
        sorted_idx[idx] = index_sorted

    # val 대상 playlist_id를 key로 하여 
    results = defaultdict(list)
    for i, val_id in enumerate(tqdm(val_ids)):
        # 가장 상관성이 높은 상위 1000개의 플레이리스트
        for j, train_idx in enumerate(sorted_idx[i][:1000]):
            # (상위 1000개의 유사한 playlist_id, 상위 1000개의 유사도 점수)
            results[val_id].append((train_ids[train_idx], scores[i][train_idx].item()))
    if genre:
        if _submit_type == 'val':
            np.save(default_file_path+f'/results/val_scores_bias_{_score_type}_gnr', results)
        elif _submit_type == 'test':
            np.save(default_file_path+f'/results/test_scores_bias_{_score_type}_gnr', results)
        else:
            np.save(default_file_path+f'/results/local_val_scores_bias_{_score_type}_gnr', results)
    else:
        if _submit_type == 'val':
            np.save(default_file_path+f'/results/val_scores_bias_{_score_type}', results)
        elif _submit_type == 'test':
            np.save(default_file_path+f'/results/test_scores_bias_{_score_type}', results)
        else:
            np.save(default_file_path+f'/results/local_val_scores_bias_{_score_type}', results)


def get_autoencoder_scores(model_file_path, submit_type):
    print("get autoencoder's latent embeddings")
    plylst_emb_with_bias = AE_get_plylsts_embeddings(model_file_path, submit_type, False)

    print("get autoencoder's latent embeddings (genre embeddings are concated)")
    plylst_emb_with_bias_gnr = AE_get_plylsts_embeddings(model_file_path, submit_type, True)

    print("save cos-similarity scores between test embeddings")
    AE_save_scores(plylst_emb_with_bias, 'cos', submit_type, False)

    print("save cos-similarity scores between (test + genre) embeddings and train embeddings")
    AE_save_scores(plylst_emb_with_bias_gnr, 'cos', submit_type, True)


## get_w2v_scores.py

~~~
.
├── W2V_get_plylsts_embeddings: 학습한 w2v 모델을 활용하여 플레이리스트의 벡터들에 대해서 embedding을 실시하는 함수
├── W2V_save_scores: embedding된 벡터를 가지고 유사도 점수를 추출하는 함수
├── get_w2v_scores: 위의 두 함수를 활용하는 함수
.
~~~

w2v 모델로 임베딩한 벡터 기준

numpy형태로 저장되는 test_scores_bias_cos.npy, test_scores_bias_cos_gnr.npy 는 dictonary 형태로 저장되며, 형태는 아래와 같다.

* key: valid dataset의 playlist_id
* values: list([train dataset에서 가장 유사도가 높은 상위 1000개의 train playlist)id ],[train dataset에서 가장 유사도가 높은 상위 1000개의 유사도 점수])

In [12]:
import os
import sys
import json
import torch
import io
import os
import copy
import random
import math
import datetime as dt
import distutils.dir_util
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sentencepiece as spm

from collections import defaultdict
from tqdm import tqdm
from gensim.models import Word2Vec as w2v
from collections import Counter
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
from torch import nn
# from arena_util import write_json, load_json
# from w2v import title_tokenizer

vocab_size = 24000
method = 'bpe'

# 학습시킨 W2V에 대해서 embedding하는 벡터를 가져오는 함수
def W2V_get_plylsts_embeddings(_train, _question, _submit_type):

    
    project_path = "/gdrive/MyDrive/colab/melon_playlist_continuation/"
    default_file_path = os.path.join(project_path,"data")

    print('saving embeddings')

    # toekenizer model
    tokenizer_name = default_file_path+'/models/tokenizer_{}_{}_{}.model'.format(method, vocab_size, _submit_type)
    sp = spm.SentencePieceProcessor()
    sp.Load(tokenizer_name)
    tt = title_tokenizer()

    # w2v model
    w2v_model_name = default_file_path+'/models/w2v_{}_{}_{}.model'.format(method, vocab_size, _submit_type)
    w2v_model = w2v.load(w2v_model_name)

    # train plylsts to vectors
    t_plylst_title_tag_emb = {}  # plylst_id - vector dictionary
    for plylst in tqdm(_train):
        p_id = plylst['id']
        p_title = plylst['plylst_title']
        p_title_tokens = tt.get_tokens(sp, [p_title])
        if len(p_title_tokens):
          # ?????
            p_title_tokens = p_title_tokens[0]
        else:
            p_title_tokens = []
        p_tags = plylst['tags']
        p_times = plylst['updt_date'][:7].split('-')
        p_words = p_title_tokens + p_tags + p_times
        word_embs = []
        for p_word in p_words:
            try:
                word_embs.append(w2v_model.wv[p_word])
            except KeyError:
                pass
        if len(word_embs):
            p_emb = np.average(word_embs, axis=0).tolist()
        else:
            p_emb = np.zeros(200).tolist()

        t_plylst_title_tag_emb[p_id] = p_emb

    # val plylsts to vectors
    for plylst in tqdm(_question):
        p_id = plylst['id']
        p_title = plylst['plylst_title']
        p_title_tokens = tt.get_tokens(sp, [p_title])
        p_songs = plylst['songs']
        if len(p_title_tokens):
            p_title_tokens = p_title_tokens[0]
        else:
            p_title_tokens = []
        p_tags = plylst['tags']
        p_times = plylst['updt_date'][:7].split('-')
        p_words = p_title_tokens + p_tags + p_times
        word_embs = []
        for p_word in p_words:
            try:
                word_embs.append(w2v_model.wv[p_word])
            except KeyError:
                pass
        if len(word_embs):
            p_emb = np.average(word_embs, axis=0).tolist()
        else:
            p_emb = np.zeros(200).tolist()
        t_plylst_title_tag_emb[p_id] = p_emb

    return t_plylst_title_tag_emb


def W2V_save_scores(_train, _question, _autoencoder_embs, _score_type, _submit_type):

    project_path = "/gdrive/MyDrive/colab/melon_playlist_continuation/"
    default_file_path = os.path.join(project_path,"data")

    print('saving scores...')

    def pcc(_x, _y):
        vx = _x - torch.mean(_x)
        vy = _y - torch.mean(_y, axis=1).reshape(-1, 1)
        return torch.sum((vx * vy), axis=1) / (
                    torch.sqrt(torch.sum(vx ** 2)) * torch.sqrt(torch.sum((vy ** 2), axis=1)))

    def euclidean(_x, _y):
        return torch.sqrt(torch.sum((_y - _x) ** 2, axis=1))

    all_train_ids = [plylst['id'] for plylst in _train]
    all_val_ids = [plylst['id'] for plylst in _question]

    train_ids = []
    train_embs = []
    val_ids = []
    val_embs = []

    for plylst_id, emb in tqdm(_autoencoder_embs.items()):
        if plylst_id in all_train_ids:
            train_ids.append(plylst_id)
            train_embs.append(emb)
        elif plylst_id in all_val_ids:
            val_ids.append(plylst_id)
            val_embs.append(emb)

    gpu = torch.device('cuda')
    cos = nn.CosineSimilarity(dim=1)
    train_tensor = torch.tensor(train_embs).to(gpu)
    val_tensor = torch.tensor(val_embs).to(gpu)

    scores = torch.zeros([val_tensor.shape[0], train_tensor.shape[0]], dtype=torch.float64)
    sorted_idx = torch.zeros([val_tensor.shape[0], train_tensor.shape[0]], dtype=torch.int32)

    for idx, val_vector in enumerate(tqdm(val_tensor)):
        if _score_type == 'pcc':
            output = pcc(val_vector.reshape(1, -1), train_tensor)
        elif _score_type == 'cos':
            output = cos(val_vector.reshape(1, -1), train_tensor)
        elif _score_type == 'euclidean':
            output = euclidean(val_vector.reshape(1, -1), train_tensor)
        index_sorted = torch.argsort(output, descending=True)
        scores[idx] = output
        sorted_idx[idx] = index_sorted

    results = defaultdict(list)

    for i, val_id in enumerate(tqdm(val_ids)):
        for j, train_idx in enumerate(sorted_idx[i][:1000]):
            results[val_id].append((train_ids[train_idx], scores[i][train_idx].item()))

    if _submit_type == 'val':
        np.save(default_file_path+f'/results/val_scores_title_{_score_type}_24000', results)
    elif _submit_type == 'test':
        np.save(default_file_path+f'/results/test_scores_title_{_score_type}_24000', results)
    elif _submit_type == 'local_val':
        np.save(default_file_path+f'/results/local_val_scores_title_{_score_type}_24000', results)
    else:
        np.save(default_file_path+f'/results/test_scores_title_{_score_type}_24000', results)

    


def get_w2v_scores(submit_type):

    project_path = "/gdrive/MyDrive/colab/melon_playlist_continuation/"
    default_file_path = os.path.join(project_path,"data")

    if submit_type == 'val':
        default_file_path = 'res'
        question_file_path = 'res/val.json'
        train_file_path = 'res/train.json'
    elif submit_type == 'test':
        # default_file_path = 'res'
        val_file_path = os.path.join(default_file_path,'val.json')
        question_file_path = os.path.join(default_file_path,'test.json')
        train_file_path = os.path.join(default_file_path,'train.json')
    elif submit_type == 'local_val':
        default_file_path = 'arena_data'
        train_file_path = f'{default_file_path}/orig/train.json'
        question_file_path = f'{default_file_path}/questions/val.json'
        default_file_path = f'{default_file_path}/orig'

    genre_file_path = os.path.join(default_file_path,'genre_gn_all.json')

    tokenize_input_file_path = default_file_path+f'/models/tokenizer_input_{method}_{vocab_size}_{submit_type}.txt'

    if submit_type == 'local_val':
        val_file_path = None
        test_file_path = None
        train = load_json(train_file_path)
        question = load_json(question_file_path)
    elif submit_type == 'val':
        test_file_path = None
        val_file_path = question_file_path
        train = load_json(train_file_path)
        question = load_json(question_file_path)
    elif submit_type == 'test':
        val_file_path = val_file_path
        test_file_path = question_file_path
        train = load_json(train_file_path)
        val = load_json(val_file_path)
        test = load_json(test_file_path)
        train = train + val
        question = test

    plylst_title_tag_emb = W2V_get_plylsts_embeddings(train, question, submit_type)
    W2V_save_scores(train, question, plylst_title_tag_emb, 'cos', submit_type)
    

## recommender.py

~~~
.
├── DicGenerator: 학습한 w2v 모델을 활용하여 플레이리스트의 벡터들에 대해서 embedding을 실시하는 함수
├── Recommender: 추천 프로세스를 실행하는 함수
│   ├── most_similar: Counter 객체에서 가장 유사한 k개의 element를 반환하는함수
│   ├── most_similar_emb: 미리 계산한 유사도 기준 top k 개의 playlist와 score를 반환하는 함수
│   ├── get_new_song_plylst_dict: 들어오는 플레이리스트에 대해 새 song : playlist dictionary 생성
.
~~~

In [13]:
## 라이브러리 불러오기
import copy
import random
import numpy as np
import datetime as dt

from tqdm import tqdm
from collections import Counter
from collections import defaultdict
# from arena_util import write_json, load_json, remove_seen, most_popular

## 모델 튜닝을 위해 시드 고정
random.seed(777)
np.random.seed(777)


## 빠른 접근을 위한 Dictionary 생성 
def DicGenerator(train, x):
    # key: song / value: issue_date
    song_issue_dic = defaultdict(lambda: '')

    for i in range(len(song_meta)):
        song_issue_dic[song_meta[i]['id']] = song_meta[i]['issue_date']

    # key: song / value: artist_id_basket
    song_artist_dic = defaultdict(lambda: [])

    for i in range(len(song_meta)):
        lt_art_id = song_meta[i]['artist_id_basket']
        song_artist_dic[song_meta[i]['id']] = lt_art_id

    # key: song / value: playlist
    song_plylst_dic = defaultdict(lambda: [])

    for i in range(len(train)):
        for t_s in train[i]['songs']:
            song_plylst_dic[t_s] += [train[i]['id']]

    # key: song / value: tag
    song_tag_dic = defaultdict(lambda: [])

    for i in range(len(train)):
        for t_s in train[i]['songs']:
            song_tag_dic[t_s] += train[i]['tags']

    # key: plylst / value: song
    plylst_song_dic = defaultdict(lambda: [])

    for i in range(len(train)):
        plylst_song_dic[train[i]['id']] += train[i]['songs']

    # key: plylst / value: tag
    plylst_tag_dic = defaultdict(lambda: [])

    for i in range(len(train)):
        plylst_tag_dic[train[i]['id']] += train[i]['tags']

    # key: tag / value: plylst
    tag_plylst_dic = defaultdict(lambda: [])

    for i in range(len(train)):
        for t_q in train[i]['tags']:
            tag_plylst_dic[t_q] += [train[i]['id']]

    # key: tag / value: song
    tag_song_dic = defaultdict(lambda: [])

    for i in range(len(train)):
        for t_q in train[i]['tags']:
            tag_song_dic[t_q] += train[i]['songs']

    return song_plylst_dic, song_tag_dic, plylst_song_dic, plylst_tag_dic, tag_plylst_dic, tag_song_dic, song_issue_dic, song_artist_dic


## 추천 함수

'''
input
 > train: 학습에 사용할 playlist들
 > questions: 일부가 가려진 question용 playlist들
 > n_msp, n_mtp, sim_measure: 하이퍼파라미터
 > song_meta: song의 meta 정보
 > save: 파일 저장 여부
output
 > questions에 대한 최종 추천 리스트
'''


def Recommender(train, questions, n_msp, n_mtp, mode, sim_measure, song_meta, freq_song, save=False):

    project_path = "/gdrive/MyDrive/colab/melon_playlist_continuation/"
    default_file_path = os.path.join(project_path,"data")

    ## 최종 추천리스트
    rec_list = []

    ## 1단계: 전처리
    # 1) 추천 결과가 없거나 모자란 경우를 위해 most_popular 생성
    _, song_mp = most_popular(train, "songs", 200)
    _, tag_mp = most_popular(train, "tags", 20)

    # 2) 빠른 접근을 위한 Dictionary 생성
    # {song_id ,:소속_plylst_id}, {song_id : tag_id}, {plylst_id: song_id}, {plylst_id : tag_id}, {tag_id : plylst_id}, {tag_id : song_id}, {song_id, issue_date}, {song_id : artist_id}
    song_plylst_dic, song_tag_dic, plylst_song_dic, plylst_tag_dic, tag_plylst_dic, tag_song_dic, song_issue_dic, song_artist_dic = DicGenerator(
        train, song_meta)

    # 3) 미리 계산한 플레이리스트 유사도 불러오기
    '''
    sim_scores: 입력으로 들어온 questions과 train간 유사도 (Autoencoder 기반)
    gnr_scores: 입력으로 들어온 questions과 train간 유사도 ([Autoencoder] genre 정보 추가) 
    : 초기 AE 학습시에는 SongTagDataset으로 해놓고 왜... 그 모델을 가지고 SongTagGenreDataset으로 스코어를 낸거를 활용하지??
    title_scores: 입력으로 들어온 questions과 train간 유사도 (Word2vec 기반)
    '''
    sim_scores = np.load(default_file_path+f'/results/{mode}_scores_bias_{sim_measure}.npy', allow_pickle=True).item()
    gnr_scores = np.load(default_file_path+f'/results/{mode}_scores_bias_{sim_measure}_gnr.npy', allow_pickle=True).item()
    title_scores = np.load(default_file_path+f'/results/{mode}_scores_title_{sim_measure}_24000.npy', allow_pickle=True).item()

    ## 2단계: 함수 정의
    # 1) Counter 객체에서 빈도수 기준 topk개 출력
    def most_similar(cnt, topk):
        cnt_topk = cnt.most_common(topk)
        return [k for k, v in cnt_topk]

    # 2) 미리 계산한 유사도 기준 topk개의 플레이리스트의 plylsts와 scores 출력
    def most_similar_emb(q_id, topk, title=False, genre=False):
        # title_scores 기준
        if title:
            plylsts = [t[0] for t in title_scores[q_id][:topk]]
            scores = [t[1] for t in title_scores[q_id][:topk]]
        # gnr_scores 기준
        elif genre:
            plylsts = [t[0] for t in gnr_scores[q_id][:topk]]
            scores = [t[1] for t in gnr_scores[q_id][:topk]]
        # sim_scores 기준
        else:
            plylsts = [t[0] for t in sim_scores[q_id][:topk]]
            scores = [t[1] for t in sim_scores[q_id][:topk]]
        return plylsts, scores

    # 유사도 점수로 추출한 playlist_id들에 수록된 노래들을 plylst_song_dic로 추출하고 이를 {song_id : playlist_id} dictionary로 반환
    # 3) new_song_plylst_dict
    def get_new_song_plylst_dict(plylst_ms):
        new_song_plylst_dict = defaultdict(set)
        for plylst in plylst_ms:
            for _song in plylst_song_dic[plylst]:
                new_song_plylst_dict[_song].add(plylst)
        return new_song_plylst_dict

    ## 3단계: 입력으로 들어온 questions 플레이리스트에 대해 추천
    for q in tqdm(questions):

        # 1) question 플레이리스트의 정보
        # 수록 song/tag
        q_songs = q['songs']
        q_tags = q['tags']

        # 수록 song/tag와 함께 등장한 song/tag/plylst 빈도 수
        song_plylst_C = Counter()
        song_tag_C = Counter()
        tag_plylst_C = Counter()
        tag_song_C = Counter()

        # 수록 song/tag가 둘 다 없거나 적을 때
        no_songs_tags, few_songs_tags = False, False
        if len(q_songs) == 0 and len(q_tags) == 0:
            no_songs_tags = True
        elif len(q_songs) <= 3:
            few_songs_tags = True

        # 2) 빈도수 기반 추천을 위해 카운트
        # 수록 song에 대해
        for q_s in q_songs:
            song_plylst_C.update(song_plylst_dic[q_s])
            song_tag_C.update(song_tag_dic[q_s])
        # 수록 tag에 대해
        for q_t in q_tags:
            tag_plylst_C.update(tag_plylst_dic[q_t])
            tag_song_C.update(tag_song_dic[q_t])
        # 수록곡 수로 나눠서 비율로 계산 
        # (특정 플레이리스트에 수록된 노래들이 다른 플레이리스트들에 몇번 등장했는지 카운트하고 등장한 플레이리스트의 전체 노래 중에 해당 노래의 비율을 계산)
        for i, j in list(song_plylst_C.items()):
            if len(plylst_song_dic[i]) > 0:
                song_plylst_C[i] = (j / len(plylst_song_dic[i]))

        # 3) 유사도 기반 추천을 위해 점수 계산
        plylst_song_scores = defaultdict(lambda: 0)
        plylst_tag_scores = defaultdict(lambda: 0)

        # Case 1: song과 tag가 둘 다 없는 경우
        if no_songs_tags:
            # plylst_ms / plylst_mt: title_scores 기준 유사한 플레이리스트 n_msp / n_mtp개
            # w2v
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp, title=True)
            plylst_mt, tag_scores = most_similar_emb(q['id'], n_mtp, title=True)
            # ae
            plylst_add, add_scores = most_similar_emb(q['id'], n_mtp)

        # Case 2: song과 tag가 부족한 경우
        elif few_songs_tags:
            # plylst_ms / plylst_mt: sim_scores 기준 n_msp개 / title_scores 기준 n_mtp개
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp)
            plylst_mt, tag_scores = most_similar_emb(q['id'], n_mtp, title=True)
            plylst_add, add_scores = most_similar_emb(q['id'], n_mtp, genre=True)

        # Case 3: song과 tag가 충분한 경우
        else:
            # plylst_ms / plylst_mt: sim_scores 기준 유사한 플레이리스트 n_msp / n_mtp개
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp)
            plylst_mt, tag_scores = most_similar_emb(q['id'], n_mtp, genre=True)
            plylst_add, add_scores = most_similar_emb(q['id'], n_mtp, title=True)

        new_song_plylst_dict = get_new_song_plylst_dict(plylst_ms)

        # 3-1. plylst_song_scores 계산
        # plylst_ms: 유사도 점수로 추출한 상위 k개의 playlist
        for idx, ms_p in enumerate(plylst_ms):
            for song in plylst_song_dic[ms_p]:
                song_score = 0
                for q_s in q_songs:
                    try:
                        # question으로 들어온 노래와 추천을 통해 추출한 playlist 중 노래에 있는경우 = 1 -- a
                        # a / 추천을 통해 얻은 playlist로 생성한 song_id : playlist_id 의 특정 song의 playlist 개수
                        song_score += len(new_song_plylst_dict[q_s] & new_song_plylst_dict[song]) / len(
                            new_song_plylst_dict[q_s])
                    except:
                        pass
                if song in freq_song:
                  # plylst_ms에서 상위에 등장한 플레이리스트의 song 일수록 가중치 부여 &  n번이상 등장하는 노래에 대항 가중치 부여
                    plylst_song_scores[song] += song_plylst_C[ms_p] * song_score * song_scores[idx] * (n_msp - idx) * 4
                else:
                    plylst_song_scores[song] += song_plylst_C[ms_p] * song_score * song_scores[idx] * (n_msp - idx)
            # 왜 tag_score 계산이 song_score 계산에 있는가?
            for tag in plylst_tag_dic[ms_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_msp - idx)

        # 3-2. plylst_tag_scores 계산
        for idx, mt_p in enumerate(plylst_mt):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_mtp - idx)
            # 왜 song_score 계산이 tag_score 계산에 있는가?
            for song in plylst_song_dic[mt_p]:
                plylst_song_scores[song] += tag_scores[idx]

        # 3-3. plylst_{song/tag}_scores 보정
        for idx, mt_p in enumerate(plylst_add):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += add_scores[idx] * (n_mtp - idx)

        # 4) song과 tag 둘 다 없거나 적은 경우 예측해서 채워넣기
        if no_songs_tags:
            # q_songs 새롭게 채워넣기 (원래는 song가 없지만 title_scores 기준 유사한 플레이리스트로부터 song 예측)
            pre_songs = sorted(plylst_song_scores.items(), key=lambda x: x[1], reverse=True)
            pre_songs = [scores[0] for scores in pre_songs][:200]
            pre_songs = pre_songs + remove_seen(pre_songs, song_mp)
            q_songs = pre_songs[:100]

            # q_tags 새롭게 채워넣기 (원래는 tag가 없지만 title_scores 기준 유사한 플레이리스트로부터 tag 예측)
            pre_tags = sorted(plylst_tag_scores.items(), key=lambda x: x[1], reverse=True)
            pre_tags = [scores[0] for scores in pre_tags][:20]
            pre_tags = pre_tags + remove_seen(pre_tags, tag_mp)
            q_tags = pre_tags[:10]

        # 5) questions 플레이리스트에 대해 추천
        ## song 추천
        # song 있을 때
        lt_song_art = []
        if len(q_songs) > 0:
            plylst_song_scores = sorted(plylst_song_scores.items(), key=lambda x: x[1], reverse=True)

            lt_artist = []
            for w_song in q_songs:
                # artist는 여러명일 수 있기 때문에 extend
                lt_artist.extend(song_artist_dic[w_song])
            counter_artist = Counter(lt_artist)
            counter_artist = sorted(counter_artist.items(), key=lambda x: x[1], reverse=True)
            if few_songs_tags:
                artist = [art[0] for art in counter_artist]
            else:
                artist = [x[0] for x in counter_artist if x[1] > 1]
            # 가수의 수와 candidate의 개수를 변경하는 것과는 무슨 연관???
            cand_ms = [scores[0] for scores in plylst_song_scores][(100 - len(artist)):1000]
            for cand in cand_ms:
                if artist == []:
                    break
                if cand in q_songs:
                    break
                for art in song_artist_dic[cand]:
                    if art in artist:
                        lt_song_art.append(cand)
                        artist.remove(art)
                        break
            song_ms = [scores[0] for scores in plylst_song_scores][:200]


        # song 없고, tag 있을 때
        else:
            song_ms = most_similar(tag_song_C, 200)

        ## tag 추천
        # tag 있을 때
        if len(q_tags) > 0:
            plylst_tag_scores = sorted(plylst_tag_scores.items(), key=lambda x: x[1], reverse=True)
            tag_ms = [scores[0] for scores in plylst_tag_scores][:20]

        # tag 없고, song 있을 때
        else:
            plylst_tag_scores = sorted(plylst_tag_scores.items(), key=lambda x: x[1], reverse=True)
            tag_ms = [scores[0] for scores in plylst_tag_scores][:20]

        ## issue date 늦은 song 제거
        if q['updt_date']:
            q_updt_date = q['updt_date'][:4] + q['updt_date'][5:7] + q['updt_date'][8:10]
            song_ms = [x for x in song_ms if song_issue_dic[x] < q_updt_date]

        ## 중복 제거 및 부족하면 most_popular로 채워넣기
        song_candidate = song_ms + remove_seen(song_ms, song_mp)
        tag_candidate = tag_ms + remove_seen(tag_ms, tag_mp)

        # 기존에 test set에 들어가 있는 태그나 노래는 제거
        song_remove = q_songs
        tag_remove = q_tags

        # 노래나 태그가 test set에 없던 경우라면 바로 상위 100개를 사용; 아니라면 candidate에서 제거한 후 상위 100개
        song_candidate = song_candidate[:100] if no_songs_tags else remove_seen(song_remove, song_candidate)[:100]
        if len(lt_song_art) > 0:
            # 계산한 song_candidate를 제외한 노래
            lt_song_art = [x for x in lt_song_art if x not in song_candidate]
            # 100개 중의 위의 해당하는것들을 하위 n개로 변경
            song_candidate[(100 - len(lt_song_art)):100] = lt_song_art

        rec_list.append({
            "id": q["id"],
            "songs": song_candidate,
            "tags": tag_candidate[:10] if no_songs_tags else remove_seen(tag_remove, tag_candidate)[:10]
        })

    # 6) results.json 파일 저장 여부
    if save == True:
        write_json(rec_list, default_file_path+'/results/results_' + dt.datetime.now().strftime("%y%m%d-%H%M%S") + '_' + mode + '.json')

    return rec_list

## Inference.py

학습된 모델을 가지고 추천을 수행하는 python script

In [14]:
import sys
# from MelonDataset import SongTagDataset, SongTagGenreDataset
# from arena_util import write_json, load_json
# from get_autoencoder_scores import get_autoencoder_scores
# from get_w2v_scores import get_w2v_scores
# import argparse
# from recommender import Recommender
import numpy as np
import os

# 유사도 측정 방식
sim_measure = 'cos'

# 상위 song / tag 추출 개수
n_msp = 50
n_mtp = 90
freq_thr = 2

mode = 2
_submit_type = mode

# if __name__ == '__main__':
#     parser = argparse.ArgumentParser()
#     parser.add_argument('-mode', type=int, help="local_val: 0, val: 1, test: 2", default=2)
#     args = parser.parse_args()
#     _submit_type = args.mode

if _submit_type == 0:  # split data에 대해서는 훈련 중간 중간 성능 확인을 위해서 question, answer 불러옴
    default_file_path = 'arena_data/'
    model_postfix = 'local_val'

    train_file_path = f'{default_file_path}/orig/train.json'
    question_file_path = f'{default_file_path}/questions/val.json'
    answer_file_path = f'{default_file_path}/answers/val.json'

    train_data = load_json(train_file_path)
    question_data = load_json(question_file_path)
    model_file_path = "model/autoencoder_450_256_0.0005_0.2_2_local_val.pkl"
    auto_score_file_path = "scores/local_val_scores_bias_cos"
    w2v_score_file_path = 'scores/local_val_scores_title_cos_24000'

elif _submit_type == 1:
    default_file_path = 'res'
    model_postfix = 'val'

    train_file_path = f'{default_file_path}/train.json'
    val_file_path = f'{default_file_path}/val.json'
    train_data = load_json(train_file_path) + load_json(val_file_path)
    question_data = load_json(val_file_path)
    model_file_path = "model/autoencoder_450_256_0.0005_0.2_2_val.pkl"
    auto_score_file_path = "scores/val_scores_bias_cos"
    w2v_score_file_path = 'scores/val_scores_title_cos_24000'

elif _submit_type == 2:
    # default_file_path = 'res'
    model_postfix = 'test'

    train_file_path = f'{default_file_path}/train.json'
    val_file_path = f'{default_file_path}/val.json'
    test_file_path = f'{default_file_path}/test.json'
    train_data = load_json(train_file_path) + load_json(val_file_path) + load_json(val_file_path) + load_json(
        test_file_path)
    question_data = load_json(test_file_path)
    model_file_path = default_file_path+"/models/autoencoder_450_256_0.0005_0.2_2_test.pkl"
    auto_score_file_path = default_file_path+"/results/test_scores_bias_cos"
    w2v_score_file_path = default_file_path+'/results/test_scores_title_cos_24000'

else:
    print('mode error! local_val: 0, val: 1, test: 2')
    sys.exit(1)

# Autoencoder의 input: song, tag binary vector의 concatenate, tags는 str이므로 id로 변형할 필요 있음
tag2id_file_path = f'{default_file_path}/tag2id_{model_postfix}.npy'
id2tag_file_path = f'{default_file_path}/id2tag_{model_postfix}.npy'
# Song이 너무 많기 때문에 frequency에 기반하여 freq_thr번 이상 등장한 곡들만 남김, 남은 곡들에게 새로운 id 부여
prep_song2id_file_path = f'{default_file_path}/freq_song2id_thr{freq_thr}_{model_postfix}.npy'
id2prep_song_file_path = f'{default_file_path}/id2freq_song_thr{freq_thr}_{model_postfix}.npy'

tokenizer_model_path = default_file_path+'/models/tokenizer_bpe_24000_{}.model'.format(model_postfix)
w2v_model_path = default_file_path+'/models/w2v_bpe_24000_{}.model'.format(model_postfix)
if (not os.path.exists(model_file_path)) or (not os.path.exists(tokenizer_model_path)) \
        or (not os.path.exists(w2v_model_path)):
    print("Error: there is no autoencoder model. Please execute train.py first")
    sys.exit(1)

if (not os.path.exists(auto_score_file_path + '.npy')) or (not os.path.exists(auto_score_file_path + '_gnr.npy')):
    get_autoencoder_scores(model_file_path, model_postfix)
if not os.path.exists(w2v_score_file_path + '.npy'):
    get_w2v_scores(model_postfix)

song_meta = load_json(os.path.join(default_file_path,'song_meta.json'))
prep_song2id = dict(np.load(prep_song2id_file_path, allow_pickle=True).item())
freq_song = set(prep_song2id.keys())

rec_list = Recommender(train_data, question_data, n_msp, n_mtp, model_postfix, sim_measure, song_meta, freq_song,
                        save=True)


get autoencoder's latent embeddings


get train vectors...: 100%|██████████| 540/540 [02:31<00:00,  3.57it/s]
get question vectors...: 100%|██████████| 42/42 [00:19<00:00,  2.12it/s]


get autoencoder's latent embeddings (genre embeddings are concated)


get train vectors...: 100%|██████████| 540/540 [07:56<00:00,  1.13it/s]
get question vectors...: 100%|██████████| 42/42 [00:33<00:00,  1.26it/s]


save cos-similarity scores between test embeddings


100%|██████████| 148826/148826 [04:54<00:00, 505.37it/s]
100%|██████████| 10740/10740 [00:51<00:00, 207.36it/s]
100%|██████████| 10740/10740 [01:34<00:00, 114.19it/s]


save cos-similarity scores between (test + genre) embeddings and train embeddings


100%|██████████| 148826/148826 [04:52<00:00, 508.53it/s]
100%|██████████| 10740/10740 [01:49<00:00, 97.74it/s]
100%|██████████| 10740/10740 [01:37<00:00, 109.64it/s]


saving embeddings


100%|██████████| 138086/138086 [00:13<00:00, 10162.32it/s]
100%|██████████| 10740/10740 [00:00<00:00, 15526.89it/s]
  2%|▏         | 3208/148826 [00:00<00:04, 32059.01it/s]

saving scores...


100%|██████████| 148826/148826 [05:09<00:00, 480.79it/s]
100%|██████████| 10740/10740 [00:33<00:00, 316.69it/s]
100%|██████████| 10740/10740 [01:37<00:00, 110.12it/s]
100%|██████████| 10740/10740 [17:23<00:00, 10.29it/s]
