In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the data paths
# used by later cells point into this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 협업 필터링 (Collaborative filtering)

- 주어진 train_data를 8:2 비율로 나누어서 검증 데이터(question/answer) 만드는 작업 필요 (여기서 쓰는 val.json 데이터는 대회 제출 시 사용하는 val.json 데이터와 다름)
- 카카오 아레나 멜론 깃허브 베이스라인 코드 참고
 - https://github.com/kakao-arena/melon-playlist-continuation

In [None]:
# arena_util.py
# -*- coding: utf-8 -*-

import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np


def write_json(data, fname):
    """Serialize ``data`` as UTF-8 JSON to ``./arena_data/<fname>``.

    Missing parent directories are created. Non-ASCII text (e.g. Korean tags)
    is written as-is rather than escaped.

    Args:
        data: JSON-serializable object. NumPy integers, floats, booleans and
            arrays nested inside it are converted to plain Python values.
        fname: Path relative to ``./arena_data`` (e.g. ``"orig/train.json"``).

    Raises:
        TypeError: If ``data`` contains an object that cannot be converted.
    """
    def _conv(o):
        # json.dumps calls this hook only for objects it cannot encode itself.
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        raise TypeError(
            f"Object of type {type(o).__name__} is not JSON serializable")

    target = os.path.join("./arena_data", fname)
    # os.makedirs replaces distutils.dir_util.mkpath (distutils is removed in
    # Python 3.12); exist_ok keeps repeated writes idempotent.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with io.open(target, "w", encoding="utf8") as f:
        f.write(json.dumps(data, ensure_ascii=False, default=_conv))


def load_json(fname):
    """Parse the UTF-8 encoded JSON file at ``fname`` and return its content."""
    with open(fname, encoding='utf8') as handle:
        return json.load(handle)


def debug_json(r):
    """Print ``r`` as 4-space indented JSON, keeping non-ASCII text readable."""
    rendered = json.dumps(r, ensure_ascii=False, indent=4)
    print(rendered)



In [None]:
# evaluate.py
# -*- coding: utf-8 -*-
# import fire
import numpy as np

# from arena_util import load_json


class CustomEvaluator:
    """Scores recommendation files with nDCG for songs (top 100) and tags (top 10).

    The final score is ``0.85 * music_ndcg + 0.15 * tag_ndcg``.
    """

    def _idcg(self, l):
        """Return the ideal DCG when ``l`` relevant items occupy the top ranks."""
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        # Pre-compute ideal DCG values for 0..100 relevant items.
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        """Return the nDCG of the ranked list ``rec`` against ground truth ``gt``.

        Args:
            gt: Collection of relevant (hashable) items.
            rec: Recommended items, best first.

        Returns:
            DCG of ``rec`` divided by the ideal DCG for ``len(gt)`` items, or
            ``0.0`` when ``gt`` is empty (nothing could have been hit).
        """
        n_gt = len(gt)
        if n_gt == 0:
            # The ideal DCG is 0 here; avoid dividing by zero.
            return 0.0

        relevant = set(gt)  # O(1) membership instead of scanning the list
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in relevant:
                dcg += 1.0 / np.log(i + 2)

        # Fall back to computing the ideal DCG when gt exceeds the cached table.
        idcg = self._idcgs[n_gt] if n_gt < len(self._idcgs) else self._idcg(n_gt)
        return dcg / idcg

    def _eval(self, gt_fname, rec_fname):
        """Compute the mean song nDCG, mean tag nDCG and the combined score.

        Only playlists present in the recommendation file are scored; each one
        is matched to its ground truth through the ``"id"`` field.

        Raises:
            ValueError: If the recommendation file contains no playlists.
            KeyError: If a recommended playlist id is missing from the ground truth.
        """
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)

        if not rec_playlists:
            raise ValueError("No playlists found in the recommendation file.")

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        """Print the song nDCG, tag nDCG and combined score for ``rec_fname``.

        Best-effort: any error raised during evaluation is printed, not re-raised.
        """
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(e)


# if __name__ == "__main__":
#     fire.Fire(ArenaEvaluator)


In [None]:
from collections import Counter

import numpy as np
import pandas as pd

import scipy.sparse as spr
import pickle

- train/val split

In [None]:
# -*- coding: utf-8 -*-
import copy
import random
import numpy as np


class ArenaSplitter:
    """Splits playlists 80/20 and derives masked question/answer validation sets."""

    def _split_data(self, playlists):
        """Return ``(train, val)``: the first 80% and the remaining 20% of ``playlists``."""
        cut = int(len(playlists) * 0.80)
        return playlists[:cut], playlists[cut:]

    def _mask(self, playlists, mask_cols, del_cols):
        """Build question/answer copies of ``playlists``.

        Args:
            playlists: List of playlist dicts; the input is not modified.
            mask_cols: Columns to split at random. ``len // 2`` randomly chosen
                items stay in the question, the rest go to the answer.
            del_cols: Columns emptied in the question. The answer keeps at most
                the first 100 items for ``'songs'`` and 10 for ``'tags'``.

        Returns:
            Tuple ``(questions, answers)`` of lists aligned with ``playlists``.
        """
        q_pl = copy.deepcopy(playlists)
        a_pl = copy.deepcopy(playlists)

        for i in range(len(playlists)):
            for del_col in del_cols:
                q_pl[i][del_col] = []
                if del_col == 'songs':
                    a_pl[i][del_col] = a_pl[i][del_col][:100]
                elif del_col == 'tags':
                    a_pl[i][del_col] = a_pl[i][del_col][:10]

            for col in mask_cols:
                mask_len = len(playlists[i][col])
                mask = np.full(mask_len, False)
                mask[:mask_len // 2] = True
                np.random.shuffle(mask)

                # Filter with plain lists so the items keep their Python types
                # instead of turning into numpy scalars.
                q_pl[i][col] = [v for v, keep in zip(q_pl[i][col], mask) if keep]
                a_pl[i][col] = [v for v, keep in zip(a_pl[i][col], mask) if not keep]

        return q_pl, a_pl

    def _mask_data(self, playlists):
        """Mask ``playlists`` into the four question types and shuffle them.

        By position: first 30% keep songs only, next 50% keep songs and tags,
        next 15% keep tags only, last 5% keep neither (title only).

        Returns:
            Tuple ``(questions, answers)``, shuffled with the same permutation.
        """
        playlists = copy.deepcopy(playlists)
        tot = len(playlists)
        song_only = playlists[:int(tot * 0.3)]
        song_and_tags = playlists[int(tot * 0.3):int(tot * 0.8)]
        tags_only = playlists[int(tot * 0.8):int(tot * 0.95)]
        title_only = playlists[int(tot * 0.95):]

        print(f"Total: {len(playlists)}, "
              f"Song only: {len(song_only)}, "
              f"Song & Tags: {len(song_and_tags)}, "
              f"Tags only: {len(tags_only)}, "
              f"Title only: {len(title_only)}")

        song_q, song_a = self._mask(song_only, ['songs'], ['tags'])
        songtag_q, songtag_a = self._mask(song_and_tags, ['songs', 'tags'], [])
        tag_q, tag_a = self._mask(tags_only, ['tags'], ['songs'])
        title_q, title_a = self._mask(title_only, [], ['songs', 'tags'])

        q = song_q + songtag_q + tag_q + title_q
        a = song_a + songtag_a + tag_a + title_a

        shuffle_indices = np.arange(len(q))
        np.random.shuffle(shuffle_indices)

        # Apply one permutation to both lists so question i still matches answer i.
        q = [q[i] for i in shuffle_indices]
        a = [a[i] for i in shuffle_indices]

        return q, a

    def run(self, fname, seed=777):
        """Split the playlists in ``fname`` and write all derived files.

        Writes ``orig/train.json``, ``orig/val.json``, ``questions/val.json``
        and ``answers/val.json`` through ``write_json``.

        Args:
            fname: Path to the JSON file holding the full playlist list.
            seed: Seed for both ``random`` (playlist shuffle) and ``numpy.random``
                (masking and final shuffle), so repeated runs give the same files.
        """
        random.seed(seed)
        # The masking step draws from numpy's global RNG, which random.seed
        # does not touch; seed it too or the question/answer split varies per run.
        np.random.seed(seed)

        print("Reading data...\n")
        playlists = load_json(fname)
        random.shuffle(playlists)
        print(f"Total playlists: {len(playlists)}")

        print("Splitting data...")
        train, val = self._split_data(playlists)

        print("Original train...")
        write_json(train, "orig/train.json")
        print("Original val...")
        write_json(val, "orig/val.json")

        print("Masked val...")
        val_q, val_a = self._mask_data(val)
        write_json(val_q, "questions/val.json")
        write_json(val_a, "answers/val.json")

In [None]:
# Split the competition train.json 80/20 and write the original and masked
# (question/answer) validation files under ./arena_data via write_json.
split = ArenaSplitter()
split.run("/content/drive/MyDrive/KUBIG 2021-2/추천시스템 프로젝트/멜론데이터/train.json")

Reading data...

Total playlists: 115071
Splitting data...
Original train...
Original val...
Masked val...
Total: 23015, Song only: 6904, Song & Tags: 11508, Tags only: 3452, Title only: 1151


In [None]:
# Load the song metadata from Drive plus the files produced by the splitter:
# `train` is the 80% training split, `test` the masked validation questions.
song_meta = pd.read_json("/content/drive/MyDrive/KUBIG 2021-2/추천시스템 프로젝트/멜론데이터/song_meta.json")
train = pd.read_json("/content/arena_data/orig/train.json")
test = pd.read_json("/content/arena_data/questions/val.json")

playlist, song, tag의 id(각각 nid, sid, tid)를 새로 생성하는 이유는, 새로 생성할 id를 matrix의 row, column index로 사용할 것이기 때문입니다.

- plylst_id_nid : playlist id -> nid
- plylst_nid_id : playlist nid -> id
- song_id_sid : song id -> sid
- song_sid_id : song sid -> id
- tag_id_tid : tag id -> tid
- tag_tid_id : tag tid -> id
- song_dict : song id -> count
- tag_dict : tag id -> count

In [None]:
# Preview the first rows of the training playlists.
train.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,"[힐링, 휴식, 밤, 새벽]",147668,To. 힘들고 지친 분들에게,"[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27.000
1,[팝],50422,130807-7,"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11.000
2,[뉴에이지],116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50.000
3,"[하드락, 록스피릿, 댄스]",55076,당신을 하얗게 불태울 곡들,"[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25.000
4,"[힐링, 휴식, 기분전환]",125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50.000


In [None]:
# Flag each frame's origin so rows can still be told apart after stacking.
train['istrain'] = 1
test['istrain'] = 0

n_train, n_test = len(train), len(test)

# Stack train on top of test and number the rows 0..n-1; this running number
# (nid) serves as the playlist row index in later cells.
plylst = pd.concat([train, test], ignore_index=True)
plylst["nid"] = range(len(plylst))

# Lookup tables in both directions: original playlist id <-> nid.
plylst_id_nid = dict(zip(plylst["id"], plylst["nid"]))
plylst_nid_id = dict(zip(plylst["nid"], plylst["id"]))

In [None]:
# Occurrence count of every tag over all playlists (train + test).
plylst_tag = plylst['tags']
tag_counter = Counter(tg for tgs in plylst_tag for tg in tgs)
tag_dict = dict(tag_counter)

# Dense tag index (tid) in first-seen order, with lookups in both directions.
tag_id_tid = {tag: tid for tid, tag in enumerate(tag_dict)}
tag_tid_id = dict(enumerate(tag_dict))

n_tags = len(tag_dict)

# Same construction for songs: counts, then a dense song index (sid).
plylst_song = plylst['songs']
song_counter = Counter(sg for sgs in plylst_song for sg in sgs)
song_dict = dict(song_counter)

song_id_sid = {song: sid for sid, song in enumerate(song_dict)}
song_sid_id = dict(enumerate(song_dict))

n_songs = len(song_dict)

plylst의 songs와 tags를 새로운 id로 변환하여 DataFrame에 추가합니다

In [None]:
# Translate raw song/tag ids into their dense indices, silently dropping any
# id that has no entry in the lookup table.
plylst['songs_id'] = plylst['songs'].map(
    lambda songs: [song_id_sid[s] for s in songs if s in song_id_sid])
plylst['tags_id'] = plylst['tags'].map(
    lambda tags: [tag_id_tid[t] for t in tags if t in tag_id_tid])

In [None]:
# Keep only the columns the recommender needs. The explicit .copy() makes
# plylst_use an independent frame, so adding columns below no longer triggers
# pandas' SettingWithCopyWarning.
plylst_use = plylst[['istrain','nid','updt_date','songs_id','tags_id']].copy()
plylst_use['num_songs'] = plylst_use['songs_id'].map(len)
plylst_use['num_tags'] = plylst_use['tags_id'].map(len)
plylst_use = plylst_use.set_index('nid')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [None]:
# plylst was built as train rows followed by test rows, so a positional cut at
# n_train separates the two again.
plylst_train = plylst_use.iloc[:n_train,:]
plylst_test = plylst_use.iloc[n_train:,:]

In [None]:
# Display the re-indexed playlist table (train and test rows together).
plylst_use

Unnamed: 0_level_0,istrain,updt_date,songs_id,tags_id,num_songs,num_tags
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,2016-06-23 10:06:27.000,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3]",66,4
1,1,2013-08-15 13:17:11.000,"[66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 7...",[4],14,1
2,1,2015-09-03 16:51:50.000,"[80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 9...",[5],15,1
3,1,2017-01-09 15:41:25.000,"[95, 96, 97, 98, 99, 100, 101, 102, 103, 104, ...","[6, 7, 8]",21,3
4,1,2016-02-22 12:32:50.000,"[116, 117, 118, 119, 120, 121, 122, 123, 124, ...","[0, 1, 9]",35,3
...,...,...,...,...,...,...
115066,0,2015-04-07 11:18:51.000,"[111028, 576173, 9706, 29137, 12406, 5087, 462...","[2, 3]",27,2
115067,0,2020-04-20 07:40:58.000,"[354, 2045, 7872, 1380, 4885, 7875, 312, 2081,...","[126, 129, 60, 58, 34]",36,5
115068,0,2007-04-08 16:54:15.000,"[462, 42948, 385640, 120811, 1555, 420, 16865]",[],7,0
115069,0,2010-01-04 15:35:11.000,"[326412, 240343, 223948, 329234, 244908, 57617...",[],55,0


test set에서 샘플 300개만 뽑아 테스트해봅니다.

In [None]:
# sample test
# Draw n_sample distinct test playlists with a fixed numpy seed.
# Note: this rebinds `test` (previously the raw question frame) to the sampled
# slice of plylst_test; the rec() functions below read this global.
np.random.seed(33)
n_sample = 300

test = plylst_test.iloc[np.random.choice(range(n_test), n_sample, replace=False),:]

# real test
# Uncomment to score every test playlist instead of the sample.
# test = plylst_test
# print(len(test))

row가 playlist(nid)이고 column이 item(sid or tid)인 sparse matrix A를 만듭니다.

- 각 플레이리스트(row)별로 포함하는 수록곡/태그의 컬럼에 해당하는 값이 1, 나머지는 0인 sparse matrix 생성 (2개 matrix)
- 각 sparse matrix에 bm25 weighting 적용

In [None]:
# bm25 weighting 코드 

def bm25_row(X, K1=1.2, B=0.75):
    """Re-weight the stored entries of a sparse matrix with Okapi BM25.

    Rows play the role of documents (playlists) and columns of terms (songs
    or tags).

    Args:
        X: Matrix accepted by ``scipy.sparse.coo_matrix``.
        K1: BM25 saturation parameter.
        B: BM25 length-normalization strength.

    Returns:
        CSR matrix with the same sparsity pattern and BM25-weighted values.
    """
    weighted = spr.coo_matrix(X)

    # Inverse document frequency per column, from its number of stored entries.
    n_rows = float(weighted.shape[0])
    entries_per_col = np.bincount(weighted.col)
    idf = np.log(n_rows / (1 + entries_per_col))

    # Length normalization per row, relative to the mean row sum.
    row_totals = np.ravel(weighted.sum(axis=1))
    mean_total = row_totals.mean()
    length_norm = (1.0 - B) + B * row_totals / mean_total

    # BM25 weight for every stored entry.
    numerator = weighted.data * (K1 + 1.0)
    denominator = K1 * length_norm[weighted.row] + weighted.data
    weighted.data = numerator / denominator * idf[weighted.col]
    return weighted.tocsr()

- bm25 관련 자료: https://simonezz.tistory.com/41 

In [None]:
# Playlist x song incidence matrix (1 where the playlist holds the song),
# re-weighted with BM25. Row i is repeated once per song of playlist i.
row = np.repeat(np.arange(n_train), plylst_train['num_songs'])
col = [sid for sids in plylst_train['songs_id'] for sid in sids]
dat = np.ones(plylst_train['num_songs'].sum(), dtype=int)
train_songs_A = bm25_row(
    spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs)))

# Playlist x tag matrix, built the same way.
row = np.repeat(np.arange(n_train), plylst_train['num_tags'])
col = [tid for tids in plylst_train['tags_id'] for tid in tids]
dat = np.ones(plylst_train['num_tags'].sum(), dtype=int)
train_tags_A = bm25_row(
    spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags)))

In [None]:
# Item x playlist transposes in CSR form; rec() multiplies them with the
# per-playlist similarity vector to score every song / tag.
train_songs_A_T = train_songs_A.T.tocsr()
train_tags_A_T = train_tags_A.T.tocsr()

In [None]:
# Inspect the stored (BM25-weighted) values of the playlist x song matrix.
train_songs_A.data

array([ 6.45773177,  5.24387366,  4.68657376, ..., 12.22118728,
       12.57123369, 13.0645964 ])

In [None]:
# (number of training playlists, number of distinct songs)
train_songs_A.shape

(92056, 576192)

### 유사도 계산
1) 내적  
2) 자카드 계수 

#### 유사도를 내적으로 계산

In [None]:
# 유사도: 내적 활용
from tqdm import tqdm

def rec(pids):
  """Recommend songs and tags for each playlist, using dot-product similarity.

  Reads the notebook globals ``test``, ``n_songs``, ``train_songs_A``,
  ``train_songs_A_T``, ``train_tags_A_T``, ``song_sid_id``, ``tag_tid_id`` and
  ``plylst_nid_id``.

  Args:
    pids: Iterable of playlist nids present in the index of ``test``.

  Returns:
    List of dicts ``{"id", "songs", "tags"}`` holding the original playlist id,
    up to 100 recommended song ids and up to 10 recommended tags, best first.
    Songs and tags the playlist already contains are never recommended.
  """
  n_rec_songs, n_rec_tags = 100, 10
  res = []

  for tt, pid in enumerate(pids, start=1):
    songs_already = test.loc[pid, "songs_id"]  # songs already in the playlist
    tags_already = test.loc[pid, "tags_id"]    # tags already in the playlist

    # Indicator column vector over all songs: 1 for songs in the playlist.
    p = np.zeros((n_songs, 1))
    p[songs_already] = 1

    # Similarity between this playlist and every training playlist.
    val = train_songs_A.dot(p).reshape(-1)

    # Score each song by the similarity-weighted sum over training playlists.
    cand_song = train_songs_A_T.dot(val)
    # Take enough candidates that removing the known songs still leaves a
    # full list, whatever the size of the seed playlist.
    n_song_cand = n_rec_songs + len(songs_already)
    cand_song_idx = cand_song.reshape(-1).argsort()[-n_song_cand:][::-1]
    cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)][:n_rec_songs]
    rec_song_idx = [song_sid_id[i] for i in cand_song_idx]

    # Same procedure for tags.
    cand_tag = train_tags_A_T.dot(val)
    n_tag_cand = n_rec_tags + len(tags_already)
    cand_tag_idx = cand_tag.reshape(-1).argsort()[-n_tag_cand:][::-1]
    cand_tag_idx = cand_tag_idx[~np.isin(cand_tag_idx, tags_already)][:n_rec_tags]
    rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

    res.append({
                "id": plylst_nid_id[pid],
                "songs": rec_song_idx,
                "tags": rec_tag_idx
            })

    if tt % 1000 == 0:  # progress: print the count every 1000 playlists
      print(tt)

  return res

In [None]:
# Generate recommendations for every playlist in the current `test` frame.
answers = rec(test.index)

In [None]:
# NOTE(review): this displays the whole result list (one dict per playlist);
# consider slicing, e.g. answers[:3], to keep the output short.
answers

In [None]:
# Saved as ./arena_data/results/results.json (write_json prefixes ./arena_data/).
write_json(answers, "results/results.json")

In [None]:
# Score the saved recommendations against the held-out answers. Only the
# playlists present in results.json contribute to the averages.
evaluator = CustomEvaluator()
evaluator.evaluate("arena_data/answers/val.json", "arena_data/results/results.json")

Music nDCG: 0.174681
Tag nDCG: 0.374618
Score: 0.204671


#### 유사도를 자카드 계수로 계산 
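- 자카드 계수: 두 집합 $A$, $B$의 교집합 크기를 합집합 크기로 나눈 값, 즉 $J(A, B) = \dfrac{|A \cap B|}{|A \cup B|} = \dfrac{|A \cap B|}{|A| + |B| - |A \cap B|}$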
- 참고 자료: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.jaccard_score.html  
https://wooono.tistory.com/132  

In [None]:
# 유사도: 자카드 계수 활용: 수정 필요
from tqdm import tqdm
from sklearn.metrics import jaccard_score

def rec(pids):
  """Recommend songs and tags for each playlist, using a Jaccard-style similarity.

  Reads the notebook globals ``test``, ``n_songs``, ``train_songs_A``,
  ``train_songs_A_T``, ``train_tags_A_T``, ``song_sid_id``, ``tag_tid_id`` and
  ``plylst_nid_id``.

  The similarity to each training playlist is
  ``overlap / (row_sum + n_query_songs - overlap)``, where ``overlap`` and
  ``row_sum`` are taken from ``train_songs_A``.
  NOTE(review): if ``train_songs_A`` holds BM25 weights rather than 0/1
  values, this is a weighted variant and not the plain set Jaccard coefficient.

  Args:
    pids: Iterable of playlist nids present in the index of ``test``.

  Returns:
    List of dicts ``{"id", "songs", "tags"}`` holding the original playlist id,
    up to 100 recommended song ids and up to 10 recommended tags, best first.
    Songs and tags the playlist already contains are never recommended.
  """
  n_rec_songs, n_rec_tags = 100, 10
  res = []

  # Row sums of the training matrix do not depend on the query playlist, so
  # compute them once instead of once per playlist.
  sum_a = np.ravel(train_songs_A.sum(axis=1))

  for tt, pid in enumerate(pids, start=1):
    songs_already = test.loc[pid, "songs_id"]  # songs already in the playlist
    tags_already = test.loc[pid, "tags_id"]    # tags already in the playlist

    # Indicator column vector over all songs: 1 for songs in the playlist.
    p = np.zeros((n_songs, 1))
    p[songs_already] = 1

    # Overlap between this playlist and every training playlist.
    a_intersect_b = train_songs_A.dot(p).reshape(-1)
    # Number of songs in the query playlist (vectorized; a scalar broadcasts).
    sum_b = p.sum()

    # Leave the similarity at 0 wherever the denominator is 0 rather than
    # producing NaN/inf values that would corrupt the ranking.
    denom = sum_a + sum_b - a_intersect_b
    jaccard_sim = np.divide(a_intersect_b, denom,
                            out=np.zeros_like(denom, dtype=float),
                            where=denom != 0)

    # Score each song by the similarity-weighted sum over training playlists.
    cand_song = train_songs_A_T.dot(jaccard_sim)
    # Take enough candidates that removing the known songs still leaves a
    # full list, whatever the size of the seed playlist.
    n_song_cand = n_rec_songs + len(songs_already)
    cand_song_idx = cand_song.reshape(-1).argsort()[-n_song_cand:][::-1]
    cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)][:n_rec_songs]
    rec_song_idx = [song_sid_id[i] for i in cand_song_idx]

    # Same procedure for tags.
    cand_tag = train_tags_A_T.dot(jaccard_sim)
    n_tag_cand = n_rec_tags + len(tags_already)
    cand_tag_idx = cand_tag.reshape(-1).argsort()[-n_tag_cand:][::-1]
    cand_tag_idx = cand_tag_idx[~np.isin(cand_tag_idx, tags_already)][:n_rec_tags]
    rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

    res.append({
                "id": plylst_nid_id[pid],
                "songs": rec_song_idx,
                "tags": rec_tag_idx
            })

    if tt % 1000 == 0:  # progress: print the count every 1000 playlists
      print(tt)

  return res

##### 자카드 계수로 유사도 구하여 추천한 결과 score

In [None]:
# Re-run the recommendation with the most recently defined rec().
answers = rec(test.index)

In [None]:
# Writes to the same ./arena_data/results/results.json path as the earlier
# run, replacing that file.
write_json(answers, "results/results.json")

In [None]:
# Score the newly saved recommendations against the held-out answers.
evaluator = CustomEvaluator()
evaluator.evaluate("arena_data/answers/val.json", "arena_data/results/results.json")

Music nDCG: 0.21745
Tag nDCG: 0.370077
Score: 0.240344


- 자카드 계수로 유사도를 구한 결과, 내적으로 유사도를 계산한 모델에 비해 score가 약 0.04(0.2047 → 0.2403) 상승!
- tag nDCG는 비슷하나, song nDCG score가 상승함
