In [1]:
from operator import itemgetter
from scipy.sparse import coo_matrix, save_npz
import numpy as np
from collections import defaultdict

import os
import io
import distutils.dir_util
from collections import Counter
import json
import pickle

In [2]:
def pickle_dump(data, fname):
    """Serialize `data` into the file `fname` with the highest pickle protocol available."""
    with open(fname, mode='wb') as sink:
        pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
def write_json(data, fname):
    """Write `data` to `fname` as UTF-8 JSON, creating the parent directory if needed.

    Args:
        data: Any JSON-serializable object. NumPy integers, floats, booleans
            and arrays nested inside it are converted to built-in types.
        fname: Destination path. Relative paths resolve against the current
            working directory; absolute paths are used as given.

    Raises:
        TypeError: If `data` contains an object that cannot be serialized.
    """
    def _conv(o):
        # json has no native support for NumPy types; map them to built-ins.
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        raise TypeError(
            "Object of type {} is not JSON serializable".format(type(o).__name__)
        )

    # os.makedirs replaces distutils.dir_util.mkpath (distutils was removed
    # from the standard library in Python 3.12).
    parent = os.path.dirname(fname)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Serialize fully before touching the file contents so a serialization
    # error never leaves partially written JSON behind.
    json_str = json.dumps(data, ensure_ascii=False, default=_conv)
    with io.open(fname, "w", encoding="utf-8") as f:
        f.write(json_str)

In [4]:
def load_json(fname):
    """Parse the UTF-8 encoded JSON file `fname` and return the decoded object."""
    with open(fname, encoding="utf-8") as source:
        return json.load(source)


In [5]:
def most_popular(playlists, col, topk_count):
    """Count how often each item appears in `col` across `playlists`.

    Returns:
        A pair `(counter, top_items)`: the full `Counter` of occurrences and
        the `topk_count` most frequent items, most frequent first.
    """
    frequency = Counter()
    for playlist in playlists:
        frequency.update(playlist[col])
    ranked_items = [item for item, _count in frequency.most_common(topk_count)]
    return frequency, ranked_items

In [8]:
def remove_seen(seen, l):
    """Return the elements of `l`, in their original order, that are not in `seen`."""
    excluded = set(seen)
    return [item for item in l if item not in excluded]

In [22]:
def title_into_words(title, word_list=None, popularity=None):
    """Extract known vocabulary words from a playlist title.

    Every vocabulary word that occurs in `title` is located, the matches are
    ordered by position (ties: more popular first), and consecutive matches
    that contain each other are collapsed into the single most popular word.

    Args:
        title: Playlist title to scan.
        word_list: Iterable of `(word, count)` pairs forming the vocabulary.
            Defaults to the module-level `all_word_list`.
        popularity: Mapping from word to popularity count (a `Counter` in this
            notebook). Defaults to the module-level `tags_mp_counter`.

    Returns:
        List of selected words in order of appearance in the title.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    vocabulary = all_word_list if word_list is None else word_list
    counts = tags_mp_counter if popularity is None else popularity

    matches = []
    for entry in vocabulary:
        word = entry[0]
        position = title.find(word)
        if position >= 0:
            # Negated count so that an ascending sort puts popular words first.
            matches.append((word, position, counts[word] * -1))
    ordered = [match[0] for match in sorted(matches, key=itemgetter(1, 2))]

    selected = []
    start = 0
    while start < len(ordered):
        anchor = ordered[start]
        # Group the following words while each overlaps (substring either way)
        # with the first word of the group.
        group = [anchor]
        for idx in range(start + 1, len(ordered)):
            other = ordered[idx]
            if anchor in other or other in anchor:
                group.append(other)
            else:
                break
        start += len(group)

        # Keep only the most popular word of the group (first one wins ties).
        best_word, best_count = None, 0
        for candidate in group:
            if counts[candidate] > best_count:
                best_word, best_count = candidate, counts[candidate]
        if best_word is not None:
            selected.append(best_word)
    return selected

In [11]:
# Make sure the output directory for intermediate artifacts exists.
os.makedirs('data/', exist_ok=True)

In [12]:
# Load the three dataset splits.
train = load_json('res/train.json')
val = load_json('res/val.json')
test = load_json('res/test.json')
# From here on `train` is the concatenation of all splits; it holds the very
# same playlist dicts as `val` and `test`, not copies.
train = [*train, *val, *test]

In [15]:
print('rank popular songs/tags...')
# Bucket playlists by the first 4 characters ('YYYY') and the first 7
# characters ('YYYY-MM') of their 'updt_date' string.
data_by_yearmonth = defaultdict(list)
for q in train:
    updt_date = q.get('updt_date')
    # Skip playlists without a usable date explicitly instead of swallowing
    # every exception with a bare `except`.
    if not isinstance(updt_date, str):
        continue
    data_by_yearmonth[updt_date[0:4]].append(q)
    data_by_yearmonth[updt_date[0:7]].append(q)
# Freeze into a plain dict so later lookups cannot create empty buckets.
data_by_yearmonth = dict(data_by_yearmonth)

rank popular songs/tags...


In [24]:
# Popularity tables: global top-200 songs / top-50 tags under the keys 'songs'
# and 'tags', plus one pair per period under 'songs<period>' / 'tags<period>'.
most_popular_results = {}
songs_mp_counter, most_popular_results['songs'] = most_popular(train, "songs", 200)
tags_mp_counter, most_popular_results['tags'] = most_popular(train, "tags", 50)
for y, period_playlists in data_by_yearmonth.items():
    most_popular_results['songs' + y] = most_popular(period_playlists, "songs", 200)[1]
    most_popular_results['tags' + y] = most_popular(period_playlists, "tags", 50)[1]


In [31]:
print('split title into words...')
# Use tags as the word dictionary: keep tags seen at least 5 times that are
# longer than one character, most frequent first.
all_word_list = [t for t in tags_mp_counter.most_common() if t[1] >= 5 and len(t[0]) > 1]
# Annotate every playlist with the dictionary words found in its title.
for playlists in (train, test):
    for q in playlists:
        q['title_words'] = title_into_words(q['plylst_title'])

split title into words...


In [64]:
# Scratch inspection: shows the songs of whichever playlist `q` was last bound
# to by a previously executed loop (this cell defines no `q` of its own).
q['songs']

[685111, 20039, 634240, 519385, 321994, 525309, 324209, 124706, 265060, 317738]

In [34]:
print('write train matrix...')
# (row, column) index pairs of the playlist x item matrix, filled in three passes.
playlist_song_train_matrix = []
p_encode, s_encode, p_decode, s_decode = {}, {}, {}, {}
playlist_idx = song_idx = 0
for q in train:
    # Playlists with no songs, tags or title words get no row at all.
    if len(q['songs']) + len(q['tags']) + len(q['title_words']) < 1:
        continue
    p_encode[q['id']] = playlist_idx
    for s in q['songs']:
        # Assign the next free column to songs seen for the first time.
        if s not in s_encode:
            s_encode[s] = song_idx
            song_idx += 1
        playlist_song_train_matrix.append([playlist_idx, s_encode[s]])
    playlist_idx += 1

write train matrix...


In [35]:
# Columns from this index onward belong to tags.
s_decode['@tag_start_idx'] = song_idx
for q in train:
    if len(q['songs']) + len(q['tags']) + len(q['title_words']) < 1:
        continue
    row = p_encode[q['id']]
    for s in q['tags']:
        # Tags share the column encoder with songs and continue the numbering.
        if s not in s_encode:
            s_encode[s] = song_idx
            song_idx += 1
        playlist_song_train_matrix.append([row, s_encode[s]])

In [36]:
# Columns from this index onward belong to title words.
s_decode['@tag_title_start_idx'] = song_idx
for q in train:
    if len(q['songs']) + len(q['tags']) + len(q['title_words']) < 1:
        continue
    row = p_encode[q['id']]
    for s in q['title_words']:
        # The '!title_' prefix keeps title words distinct from identical tags.
        title_key = '!title_' + str(s)
        if title_key not in s_encode:
            s_encode[title_key] = song_idx
            song_idx += 1
        playlist_song_train_matrix.append([row, s_encode[title_key]])


In [37]:
# Turn the collected (row, column) pairs into a sparse matrix of ones.
playlist_song_train_matrix = np.array(playlist_song_train_matrix)
pair_rows = playlist_song_train_matrix[:, 0]
pair_cols = playlist_song_train_matrix[:, 1]
playlist_song_train_matrix = coo_matrix(
    (np.ones(playlist_song_train_matrix.shape[0]), (pair_rows, pair_cols)),
    shape=(playlist_idx, song_idx),
)
save_npz('data/playlist_song_train_matrix.npz', playlist_song_train_matrix)
# Invert the item encoder; the two '@..._start_idx' markers stay in s_decode.
for s, encoded in s_encode.items():
    s_decode[encoded] = s
pickle_dump(s_decode, 'data/song_label_decoder.pickle')
pickle_dump(p_encode, 'data/playlist_label_encoder.pickle')

In [41]:
# Occurrence counts of title words over all playlists; only the Counter is
# kept, the top-50 list is discarded.
title_words_mp_counter, _ = most_popular(train, "title_words", 50)

In [42]:
print('write test item indices...')


def _popular_fallback(seen, period_top, global_top, limit):
    """Return up to `limit` popular items not in `seen`: period-specific first, then global."""
    candidates = period_top + remove_seen(period_top, global_top)
    return remove_seen(seen, candidates)[:limit]


for q in test:
    if len(q['songs']) + len(q['tags']) + len(q['title_words']) >= 1:
        # Only playlists whose known items are, on average, seen more than once
        # get an 'items' list.
        popularity = ([songs_mp_counter[i] for i in q['songs']]
                      + [tags_mp_counter[i] for i in q['tags']]
                      + [title_words_mp_counter[i] for i in q['title_words']])
        if np.mean(popularity) > 1:
            items = [s_encode[s] for s in q['songs'] + q['tags']]
            # Title words are added only when they have an encoded column; the
            # membership test makes a KeyError impossible, so no try/except.
            for s in q['title_words']:
                title_key = '!title_' + str(s)
                if title_key in s_encode:
                    items.append(s_encode[title_key])
            q['items'] = items

    # Pick the most specific period that has popularity statistics:
    # 'YYYY-MM' first, then 'YYYY'; otherwise only the global lists are used.
    month_key = q['updt_date'][0:7]
    year_key = q['updt_date'][0:4]
    if 'songs' + month_key in most_popular_results:
        period_songs = most_popular_results['songs' + month_key]
        period_tags = most_popular_results['tags' + month_key]
    elif 'songs' + year_key in most_popular_results:
        period_songs = most_popular_results['songs' + year_key]
        period_tags = most_popular_results['tags' + year_key]
    else:
        period_songs, period_tags = [], []

    q['songs_mp'] = _popular_fallback(q['songs'], period_songs, most_popular_results['songs'], 100)
    # Tags are padded from the global *tag* list (previously the global song
    # list was used here, which could leak song ids into 'tags_mp').
    q['tags_mp'] = _popular_fallback(q['tags'], period_tags, most_popular_results['tags'], 10)


write test item indices...


In [19]:
# Persist the test playlists together with the keys added above
# ('title_words', 'songs_mp', 'tags_mp' and, where set, 'items').
write_json(test, 'data/test_items.json')

In [49]:
# Read the file back for inspection.
test_items=load_json('data/test_items.json')

In [68]:
# Scratch inspection: `q` is whatever playlist a previously executed loop left bound.
q['updt_date']

'2015-10-26 17:43:30.000'

In [50]:
# Show a single record only; displaying the whole list floods the notebook output.
test_items[:1]

[{'tags': [],
  'id': 70107,
  'plylst_title': '',
  'songs': [398985,
   449403,
   411543,
   528044,
   143048,
   98020,
   316600,
   621133,
   674580,
   205149,
   199134,
   347303,
   394079,
   551588,
   346913,
   534183,
   169026,
   377279,
   454218,
   271922,
   179503,
   225247,
   153362,
   148992,
   489337,
   552203,
   231614,
   559041],
  'like_cnt': 6,
  'updt_date': '2012-09-29 01:57:26.000',
  'title_words': [],
  'items': [35711,
   56843,
   31473,
   11995,
   47022,
   144371,
   15000,
   102344,
   33956,
   176163,
   9969,
   9767,
   31747,
   7406,
   1863,
   4364,
   4365,
   138921,
   6525,
   31776,
   22993,
   19558,
   3907,
   3418,
   2156,
   27216,
   16030,
   9782],
  'songs_mp': [132247,
   246984,
   539406,
   264162,
   77612,
   454528,
   158021,
   288044,
   55466,
   685316,
   611695,
   545231,
   210145,
   470684,
   348250,
   81605,
   333562,
   504520,
   158153,
   231154,
   525998,
   195300,
   284179,
   4295

In [73]:
import pandas as pd
# Load the same JSON file as a DataFrame for tabular inspection.
test_items_pd=pd.read_json('data/test_items.json',typ='frame')

In [74]:
# Display the DataFrame.
test_items_pd

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,title_words,items,songs_mp,tags_mp
0,[],70107,,"[398985, 449403, 411543, 528044, 143048, 98020...",6,2012-09-29 01:57:26.000,[],"[35711, 56843, 31473, 11995, 47022, 144371, 15...","[132247, 246984, 539406, 264162, 77612, 454528...","[발라드, 락, 힙합, 일렉, 랩, 까페, 팝, 댄스, 힐링, 휴식]"
1,"[나만의Best3, 인디아티스트들의추천음악]",7461,,"[196298, 269984, 267805, 175867, 529244, 63825...",0,2019-12-17 14:06:45.000,[],"[197466, 12578, 10595, 544260, 201545, 154599,...","[205179, 215411, 463173, 680366, 235773, 52009...","[감성, 겨울, 기분전환, 드라이브, 카페, 휴식, 사랑, 잔잔한, 발라드, 힙합]"
2,[드라이브],90348,,"[273433, 331003, 68432, 411659, 117793, 616860...",21,2015-05-23 10:44:48.000,[],"[15105, 839, 37066, 48737, 30419, 7402, 7406, ...","[625875, 236393, 326376, 663256, 107853, 45925...","[힐링, 기분전환, 휴식, 설렘, 까페, 사랑, 잔잔한, 밤, 새벽, 회상]"
3,[분위기],58617,,"[702227, 48152, 440008, 358488, 701041, 540721...",0,2019-03-14 09:47:34.000,[],"[451019, 14021, 85718, 66322, 2155, 461833, 37...","[246531, 701557, 650494, 357367, 571790, 36296...","[Pop, 기분전환, 감성, 재즈, 드라이브, 힙합, 잔잔한, 힐링, 봄, 휴식]"
4,[],102395,,"[630683, 481582, 528550, 285114, 506667, 17922...",38,2018-07-11 16:43:32.000,[],"[176233, 176235, 163658, 201143, 58834, 210708...","[346619, 357367, 144663, 704707, 93143, 222272...","[기분전환, 여름, 드라이브, 감성, 잔잔한, 신나는, 새벽, 휴식, 여행, 카페]"
...,...,...,...,...,...,...,...,...,...,...
10735,[추억],137930,,"[323755, 397594, 445908, 570242, 221853, 20018...",16,2016-04-18 11:02:09.000,[],"[5575, 52074, 294413, 16408, 177249, 53691, 29...","[701557, 650494, 333595, 132994, 376435, 20818...","[기분전환, 힐링, 휴식, 새벽, 밤, 봄, 회상, 사랑, 설렘, 잔잔한]"
10736,"[띵곡의, 우울, 분위기, 드라이브, 산책]",936,,"[105140, 582252, 199262, 422915, 547967, 48791...",1,2020-04-08 07:15:59.000,[],"[10541, 1986, 5304, 10543, 4016, 10544, 10547,...","[205179, 106034, 235773, 680366, 427724, 46317...","[감성, 기분전환, 휴식, 잔잔한, 신나는, 봄, 사랑, 카페, 발라드, 힐링]"
10737,[기분전환],110589,,"[21976, 207746, 40025, 31635, 567462, 641799, ...",6,2016-06-29 00:57:21.000,[],"[132480, 163536, 13476, 74636, 97836, 59338, 1...","[360825, 24087, 153807, 505036, 627363, 12466,...","[힐링, 휴식, 밤, 새벽, 잔잔한, 여름, 사랑, 힙합, 팝, 설렘]"
10738,[여름],2605,,"[234554, 265033, 507260, 83092, 366757, 497097...",4,2015-06-06 09:52:01.000,[],"[1926, 33466, 118295, 48143, 158055, 172396, 7...","[672550, 586653, 418935, 26083, 695494, 236393...","[기분전환, 휴식, 힐링, 새벽, 밤, 까페, 잔잔한, 회상, 사랑, 설렘]"


In [75]:
import numpy as np
import gc
from scipy.sparse import coo_matrix, lil_matrix, csr_matrix, load_npz
from multiprocessing import Pool
from time import time
from sklearn.svm import LinearSVC

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes

In [76]:
def pickle_load(fname):
    """Read and return the object pickled in the file `fname`."""
    with open(fname, mode='rb') as source:
        return pickle.load(source)

In [77]:
def neighbor_based_cf(playlist_id):
    """Rank all items for one test playlist and return the top song and tag indices.

    Reads the module-level names `test_item_indices`, `song_playlist_train_matrix_raw`,
    `p_encode`, `I_list`, `I_song`, `tag_start_idx` and `tag_title_start_idx`.

    Args:
        playlist_id: Key into `test_item_indices` and `p_encode`.

    Returns:
        A NumPy array of encoded item indices: first up to 400 indices below
        `tag_start_idx`, then up to 100 indices in
        `[tag_start_idx, tag_title_start_idx)`, each group in descending score order.
    """
    # Encoded indices of the items already known for this playlist (the "seed" items).
    item_indices = test_item_indices[playlist_id]

    alpha, beta, theta = 0.9, 0.7, 0.99
    # SVM regularization strength: shrinks linearly as the number of seed items
    # grows, clamped to the range [0.2, 1].
    Cr = 0.4 + (100 - np.shape(item_indices)[0]) * 0.0055
    if Cr < 0.2:
        Cr = 0.2
    elif Cr > 1:
        Cr = 1
    
    # Work on a fresh copy of the item x playlist matrix with this playlist's
    # own column zeroed (presumably so it cannot act as its own neighbour).
    song_playlist_train_matrix = lil_matrix(song_playlist_train_matrix_raw)
    song_playlist_train_matrix[:,p_encode[playlist_id]] = 0

    # Per-playlist weight: sum over the seed-item rows of the entries scaled by
    # (0.1 + I_list) ** (beta - 1), then raised to the power theta.
    weight = song_playlist_train_matrix[item_indices, :].multiply(np.power(1e-1 + I_list, beta - 1)).sum(axis=0)
    weight = np.array(weight).flatten()
    weight = np.power(weight,theta)
    # Seed-item x item matrix: weighted seed rows multiplied with the transposed
    # item x playlist matrix.
    value = song_playlist_train_matrix[item_indices, :].multiply(weight)
    value = value.dot(song_playlist_train_matrix.transpose()) 
    # Scale rows by (0.1 + I_song[seed]) ** -alpha and columns by
    # (0.1 + I_song) ** (alpha - 1).
    I_song_i = np.power(1e-1+I_song[item_indices],-alpha)
    value = value.multiply(I_song_i.reshape((-1,1)))
    value = value.multiply(np.power(1e-1+I_song,alpha-1))
    value = csr_matrix(value)

    # After transposing, each item is one sample with one feature per seed item;
    # the label is 1 for the seed items themselves and 0 for everything else.
    predictions = lil_matrix(value)
    label = np.zeros(song_playlist_train_matrix.shape[0])
    label[item_indices] = 1
    
    # Fit a linear SVM on those samples and score the same samples with it.
    # NOTE(review): no random_state is passed to LinearSVC; confirm whether
    # run-to-run determinism is required.
    clf = LinearSVC(C=Cr,class_weight={0:1,1:1},tol=1e-6, dual = True, max_iter=360000)
    clf.fit(predictions.transpose(),label)
    predictions = clf.decision_function(predictions.transpose())
    # Item indices in descending order of (minimum-shifted) decision value.
    predictions = np.argsort(np.array(predictions).flatten() - min(predictions))[::-1]

    # Keep the first 400 indices below tag_start_idx and the first 100 in
    # [tag_start_idx, tag_title_start_idx); indices at or above
    # tag_title_start_idx are dropped.
    return np.array(list(predictions[predictions < tag_start_idx][:400]) + list(predictions[(predictions >= tag_start_idx) & (predictions < tag_title_start_idx)][:100]))  


In [78]:
# Item decoder and playlist encoder from the pickled artifacts.
s_decode = pickle_load('data/song_label_decoder.pickle')
p_encode = pickle_load('data/playlist_label_encoder.pickle')
# Column boundaries stored in the decoder under two marker keys.
tag_start_idx, tag_title_start_idx = s_decode['@tag_start_idx'], s_decode['@tag_title_start_idx']

In [79]:
print("load train matrix...")
playlist_song_train_matrix = load_npz('data/playlist_song_train_matrix.npz')
# Item x playlist orientation, in LIL format.
song_playlist_train_matrix_raw = playlist_song_train_matrix.transpose().tolil()

load train matrix...


In [80]:
# Display the summary (shape, dtype, stored elements) of the loaded matrix.
playlist_song_train_matrix

<148590x682732 sparse matrix of type '<class 'numpy.float64'>'
	with 6774644 stored elements in COOrdinate format>

In [81]:
# Display the summary of the transposed matrix.
song_playlist_train_matrix_raw

<682732x148590 sparse matrix of type '<class 'numpy.float64'>'
	with 6774644 stored elements in List of Lists format>

In [82]:
# Force a garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect() 

60

In [83]:
# Row sums (one value per item) and column sums (one value per playlist) of the
# item x playlist matrix, as flat 1-D arrays.
I_song = np.asarray(song_playlist_train_matrix_raw.sum(axis=1)).ravel()
I_list = np.asarray(song_playlist_train_matrix_raw.sum(axis=0)).ravel()

In [84]:
print("load test data...")
test = load_json('data/test_items.json')
# Only playlists that received an 'items' list are scored by the model.
playlists_with_items = [q for q in test if 'items' in q]
test_playlist_id = [q['id'] for q in playlists_with_items]
test_item_indices = {q['id']: q['items'] for q in playlists_with_items}

load test data...


In [None]:
print("predictions begin...")
# The context manager tears the worker processes down even when map() raises or
# the cell is interrupted, so no orphaned workers are left behind.
with Pool(10) as pool:
    results = pool.map(neighbor_based_cf, test_playlist_id)
# The pool was terminated on leaving the block; wait for the workers to exit.
pool.join()

predictions begin...


In [None]:
# Decode each ranked index array: the first 400 entries are songs, the rest tags.
prediction_results = {
    playlist_id: {
        "songs": [s_decode[s] for s in ranked[:400]],
        "tags": [s_decode[s] for s in ranked[400:]],
    }
    for playlist_id, ranked in zip(test_playlist_id, results)
}


In [None]:
print("write results.json...")
answers = []
for q in test:
    # Dict membership is O(1); prediction_results has exactly the ids of
    # test_playlist_id as keys, so this replaces the former list scan.
    if q['id'] in prediction_results:
        predicted = prediction_results[q['id']]
        songs = remove_seen(q['songs'], predicted['songs'])[:100]
        tags = remove_seen(q['tags'], predicted['tags'])[:10]
    else:
        # No model prediction for this playlist: use the popularity fallback.
        songs = remove_seen(q['songs'], q['songs_mp'])[:100]
        tags = remove_seen(q['tags'], q['tags_mp'])[:10]
    # Top up short answers with popular items that are neither already in the
    # playlist nor already in the answer.
    if len(songs) < 100 or len(tags) < 10:
        songs = (songs + remove_seen(q['songs'] + songs, q['songs_mp']))[:100]
        tags = (tags + remove_seen(q['tags'] + tags, q['tags_mp']))[:10]
    answers.append({'id': q['id'], 'songs': songs, 'tags': tags})
write_json(answers, 'results.json')

In [None]:
# Completion marker for the run.
print('end')

In [None]:
import pandas as pd
# Read the written answers back as a pandas Series and display them.
result=pd.read_json('results.json',typ='series')
result