## 데이터 로드

In [1]:
import json
import pandas as pd

with open('../Datasets/train.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

In [2]:
train_data = pd.DataFrame(json_data)
train_data = train_data.drop(['id', 'plylst_title', 'updt_date', 'like_cnt'], axis=1)
train_data.head()

Unnamed: 0,tags,songs
0,[락],"[525514, 129701, 383374, 562083, 297861, 13954..."
1,"[추억, 회상]","[432406, 675945, 497066, 120377, 389529, 24427..."
2,"[까페, 잔잔한]","[83116, 276692, 166267, 186301, 354465, 256598..."
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...","[394031, 195524, 540149, 287984, 440773, 10033..."
4,[댄스],"[159327, 553610, 5130, 645103, 294435, 100657,..."


In [3]:
with open('../Datasets/song_meta.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

In [4]:
song_data = pd.DataFrame(json_data)
song_data = song_data.drop(['album_name', 'song_name', 'artist_name_basket', 'song_gn_gnr_basket'], axis=1)
song_data.head()

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_id,artist_id_basket,id
0,[GN0901],20140512,2255639,[2727],0
1,"[GN1601, GN1606]",20080421,376431,[29966],1
2,[GN0901],20180518,4698747,[3361],2
3,"[GN1102, GN1101]",20151016,2644882,[838543],3
4,"[GN1802, GN1801]",20110824,2008470,[560160],4


## 데이터 열 이름 변경

In [5]:
train_data.rename(columns={'songs':'song_id'}, inplace=True)
train_data.head()

Unnamed: 0,tags,song_id
0,[락],"[525514, 129701, 383374, 562083, 297861, 13954..."
1,"[추억, 회상]","[432406, 675945, 497066, 120377, 389529, 24427..."
2,"[까페, 잔잔한]","[83116, 276692, 166267, 186301, 354465, 256598..."
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...","[394031, 195524, 540149, 287984, 440773, 10033..."
4,[댄스],"[159327, 553610, 5130, 645103, 294435, 100657,..."


In [6]:
song_data.rename(columns={'id':'song_id', 'song_gn_dtl_gnr_basket': 'gnr'}, inplace=True)
song_data.head()

Unnamed: 0,gnr,issue_date,album_id,artist_id_basket,song_id
0,[GN0901],20140512,2255639,[2727],0
1,"[GN1601, GN1606]",20080421,376431,[29966],1
2,[GN0901],20180518,4698747,[3361],2
3,"[GN1102, GN1101]",20151016,2644882,[838543],3
4,"[GN1802, GN1801]",20110824,2008470,[560160],4


## 데이터 추출

- 500개의 플레이리스트 추출

In [7]:
train_data_sample = train_data[:500]

## 태그 병합

- 같은 노래에 부여된 서로 다른 태그들을 합친다
- 그 결과 동일한 태그 리스트가 거의 모든 노래에 부여되었다

In [8]:
train_data_sample = train_data_sample.explode('song_id', ignore_index=True)
train_data_sample.head()

Unnamed: 0,tags,song_id
0,[락],525514
1,[락],129701
2,[락],383374
3,[락],562083
4,[락],297861


In [9]:
train_dict = dict()

for i in range(len(train_data_sample)):
    song = train_data_sample['song_id'][i]
    tag = train_data_sample['tags'][i]
    
    if song in train_dict:
        for j in tag:
            train_dict[song].add(j)
    
    else:
        train_dict[song] = set(tag)
        
print(train_dict[157435])

{'여자아이돌', '댄스', '스트레스해소', 'kpop', '걸그룹댄스'}


In [10]:
train_data_sample.drop_duplicates(subset='song_id', keep='first',inplace=False)
train_data_sample.shape

(21425, 2)

In [11]:
for i in range(len(train_data_sample)):
    song = train_data_sample['song_id'][i]
    
    train_data_sample['tags'][i] = list(train_dict[song])

train_data_sample.head()

Unnamed: 0,tags,song_id
0,[락],525514
1,[락],129701
2,[락],383374
3,[락],562083
4,[락],297861


In [12]:
merge = pd.merge(train_data_sample, song_data)
merge.head()

Unnamed: 0,tags,song_id,gnr,issue_date,album_id,artist_id_basket
0,[락],525514,"[GN1402, GN1401]",20130506,2200223,[734201]
1,[락],129701,"[GN0901, GN0902, GN1001]",20130917,2201802,[536907]
2,[락],383374,"[GN1012, GN1005, GN1001]",19911021,2216938,[166978]
3,[락],562083,"[GN1013, GN0901, GN0902, GN1001]",20000919,43227,[19035]
4,[락],297861,"[GN1013, GN0901, GN0902, GN1001]",20050306,303657,[170117]


## Word2Vec 사용

- 태그 리스트들을 word2vec로 학습시켜 태그 하나와 연관된 다른 태그들을 유추

In [13]:
train_data_sample2 = train_data[:500]

In [14]:
from gensim.models.word2vec import Word2Vec

w2v = Word2Vec(sentences = train_data_sample2['tags'], vector_size = 100, 
               window = 5, min_count = 5, workers = 4, sg = 0)

w2v.wv.vectors.shape

(66, 100)

In [15]:
print(w2v.wv.most_similar('스트레스'))

[('분위기', 0.22454456984996796), ('힙합', 0.17852607369422913), ('국힙', 0.16757139563560486), ('이별', 0.1608145534992218), ('팝', 0.15153266489505768), ('휴식', 0.13615798950195312), ('여름', 0.13451433181762695), ('설렘', 0.11924611777067184), ('추억', 0.1166408360004425), ('회상', 0.1142653077840805)]


## 코사인 유사도 사용

- 세부 장르를 사용해 코사인 유사도 측정한다
- 그후 유사도를 행렬로 저장한다

In [16]:
train_data_explode = train_data_sample2.explode('song_id', ignore_index=True)
train_data_explode.head()

Unnamed: 0,tags,song_id
0,[락],525514
1,[락],129701
2,[락],383374
3,[락],562083
4,[락],297861


In [17]:
train_data_explode.drop_duplicates(subset='song_id', keep='first',inplace=False)
train_data_explode.head()

Unnamed: 0,tags,song_id
0,[락],525514
1,[락],129701
2,[락],383374
3,[락],562083
4,[락],297861


In [18]:
merge2 = pd.merge(train_data_explode['song_id'], song_data)
merge2.head()

Unnamed: 0,song_id,gnr,issue_date,album_id,artist_id_basket
0,525514,"[GN1402, GN1401]",20130506,2200223,[734201]
1,129701,"[GN0901, GN0902, GN1001]",20130917,2201802,[536907]
2,383374,"[GN1012, GN1005, GN1001]",19911021,2216938,[166978]
3,562083,"[GN1013, GN0901, GN0902, GN1001]",20000919,43227,[19035]
4,297861,"[GN1013, GN0901, GN0902, GN1001]",20050306,303657,[170117]


In [19]:
from sklearn.feature_extraction.text import CountVectorizer

merge2['gnr_literal'] = merge2['gnr'].apply(lambda x : (' ').join(x))

count_vect = CountVectorizer(min_df=0, ngram_range=(1,2))
gnr_mat = count_vect.fit_transform(merge2['gnr_literal'])

gnr_mat.shape

(21425, 835)

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

gnr_sim = cosine_similarity(gnr_mat, gnr_mat)
gnr_sim[0]

array([1., 0., 0., ..., 0., 0., 0.])

- 노래 id가 주어지면 유사도 순으로 n개의 노래 추출

In [41]:
def find_sim_song(df, sim_matrix, song, top_n=10):
    
    title_song = df[df['song_id'] == song]
    title_index = title_song.index.values
    
    df["similarity"] = sim_matrix[title_index, :].reshape(-1,1)
    
    temp = df.sort_values(by="similarity", ascending=False)
    temp = temp[temp.index.values != title_index]
    
    final_index = temp.index.values[ : top_n]
    
    return df.iloc[final_index]

In [43]:
similar_songs = find_sim_song(merge2, gnr_sim, 525514, 100)
similar_songs[['song_id', 'similarity', 'issue_date']]

Unnamed: 0,song_id,similarity,issue_date
7508,626280,1.000000,20011030
9199,104126,1.000000,20040421
17851,65115,1.000000,20090824
15226,343958,1.000000,20140418
20176,645865,1.000000,20101102
...,...,...,...
21189,563769,0.774597,20141124
3417,460871,0.774597,20080930
14538,701978,0.774597,20101011
21009,300339,0.774597,19950714


## 노래 추천

- w2v로 추출한 태그에 해당하는 플레이리스트
- 세부 장르의 유사도가 높은 노래 리스트
- 히스토리(test 플레이리스트)의 발행 연도와 같은 연도에 발행한 노래