In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by
# later cells of this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import io 
import os 
import json 
import distutils.dir_util
from collections import Counter
from tensorflow.keras.models import save_model 
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,LSTM,Dense, SimpleRNN,Dropout

In [None]:
# arena_util.py
# -*- coding: utf-8 -*-

import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np


def write_json(data, fname):
    """Serialise ``data`` as UTF-8 JSON to ``./arena_data/<fname>``.

    Missing parent directories are created on every call.

    Args:
        data: JSON-serialisable object. NumPy scalars and arrays nested inside
            it are converted to native Python values.
        fname: Output path relative to ``./arena_data/``
            (for example ``"orig/train.json"``).

    Raises:
        TypeError: If ``data`` contains an object that cannot be encoded.
    """
    def _conv(o):
        # json.dumps only calls this hook for objects it cannot encode itself.
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        raise TypeError(
            f"Object of type {type(o).__name__} is not JSON serializable")

    # Serialise first so a failure does not leave a truncated file behind.
    json_str = json.dumps(data, ensure_ascii=False, default=_conv)

    parent = os.path.dirname(fname)
    # os.makedirs re-checks the filesystem each time; distutils' mkpath caches
    # created paths and skips directories that were deleted in the meantime.
    os.makedirs("./arena_data/" + parent, exist_ok=True)
    with io.open("./arena_data/" + fname, "w", encoding="utf8") as f:
        f.write(json_str)


def load_json(fname):
    """Read the UTF-8 encoded JSON file at ``fname`` and return the decoded object."""
    with open(fname, mode="r", encoding="utf8") as handle:
        return json.load(handle)


def debug_json(r):
    """Pretty-print ``r`` as JSON with a 4-space indent, leaving non-ASCII text unescaped."""
    rendered = json.dumps(r, indent=4, ensure_ascii=False)
    print(rendered)



In [None]:
# evaluate.py
# -*- coding: utf-8 -*-
# import fire
import numpy as np

# from arena_util import load_json


class CustomEvaluator:
    """Score playlist recommendations with nDCG.

    Songs are scored on the first 100 recommendations and tags on the first 10;
    the final score is ``0.85 * music_ndcg + 0.15 * tag_ndcg``.
    """

    def _idcg(self, l):
        """Return the DCG of a ranking whose first ``l`` positions are all hits."""
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        # Pre-compute the ideal DCG for 0..100 hits.
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        """Return the nDCG of the ranked list ``rec`` against the answers ``gt``.

        Args:
            gt: Ground-truth items (hashable, e.g. song ids or tag strings).
            rec: Recommended items, best first.

        Returns:
            ``0.0`` when ``gt`` is empty (nothing can be hit), otherwise
            DCG divided by the ideal DCG.
        """
        if not gt:
            # The ideal DCG would be 0 here; avoid dividing by it.
            return 0.0

        relevant = set(gt)  # constant-time membership instead of scanning a list
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in relevant:
                dcg += 1.0 / np.log(i + 2)

        # Cap the ideal at the largest pre-computed size so an answer list
        # longer than the table cannot index past its end.
        ideal_hits = min(len(gt), len(self._idcgs) - 1)
        return dcg / self._idcgs[ideal_hits]

    def _eval(self, gt_fname, rec_fname):
        """Compute ``(music_ndcg, tag_ndcg, score)`` averaged over the recommendation file.

        Args:
            gt_fname: Path of the JSON answers file (playlists with ``id``,
                ``songs`` and ``tags``).
            rec_fname: Path of the JSON recommendations file with the same keys.

        Raises:
            ValueError: If the recommendations file holds no playlists.
            KeyError: If a recommended playlist id is missing from the answers.
        """
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)

        if not rec_playlists:
            raise ValueError("recommendation file contains no playlists")

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        """Print the music nDCG, tag nDCG and combined score.

        Any exception raised while scoring is printed instead of propagated.
        """
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(e)


# if __name__ == "__main__":
#     fire.Fire(CustomEvaluator)


In [None]:
# -*- coding: utf-8 -*-
import copy
import random
import numpy as np


class ArenaSplitter:
    """Split playlists into train/validation sets and build masked validation questions."""

    def _split_data(self, playlists):
        """Return ``(train, val)``: the first 80% of ``playlists`` and the remainder."""
        cut = int(len(playlists) * 0.80)
        return playlists[:cut], playlists[cut:]

    def _mask(self, playlists, mask_cols, del_cols):
        """Build question/answer copies of ``playlists``.

        Args:
            playlists: List of playlist dicts.
            mask_cols: Keys whose list is split at random. ``len // 2`` items
                stay in the question and the rest go to the answer.
            del_cols: Keys emptied in the question. The answer keeps at most
                100 items for ``'songs'`` and 10 for ``'tags'``.

        Returns:
            ``(questions, answers)``, two deep-copied lists aligned with
            ``playlists``. Split items keep their original Python types.
        """
        q_pl = copy.deepcopy(playlists)
        a_pl = copy.deepcopy(playlists)

        for i in range(len(playlists)):
            for del_col in del_cols:
                q_pl[i][del_col] = []
                if del_col == 'songs':
                    a_pl[i][del_col] = a_pl[i][del_col][:100]
                elif del_col == 'tags':
                    a_pl[i][del_col] = a_pl[i][del_col][:10]

            for col in mask_cols:
                mask_len = len(playlists[i][col])
                mask = np.full(mask_len, False)
                mask[:mask_len//2] = True
                np.random.shuffle(mask)

                # Select with plain comprehensions rather than NumPy fancy
                # indexing so ids stay int and tags stay str.
                q_pl[i][col] = [item for item, keep in zip(q_pl[i][col], mask) if keep]
                a_pl[i][col] = [item for item, keep in zip(a_pl[i][col], mask) if not keep]

        return q_pl, a_pl

    def _mask_data(self, playlists):
        """Create shuffled question/answer lists covering four masking scenarios.

        The input is cut by position into song-only (30%), song-and-tags (50%),
        tags-only (15%) and title-only (5%) groups. Each group is masked, then
        questions and answers are shuffled with the same permutation.
        """
        playlists = copy.deepcopy(playlists)
        tot = len(playlists)
        song_only = playlists[:int(tot * 0.3)]
        song_and_tags = playlists[int(tot * 0.3):int(tot * 0.8)]
        tags_only = playlists[int(tot * 0.8):int(tot * 0.95)]
        title_only = playlists[int(tot * 0.95):]

        print(f"Total: {len(playlists)}, "
              f"Song only: {len(song_only)}, "
              f"Song & Tags: {len(song_and_tags)}, "
              f"Tags only: {len(tags_only)}, "
              f"Title only: {len(title_only)}")

        song_q, song_a = self._mask(song_only, ['songs'], ['tags'])
        songtag_q, songtag_a = self._mask(song_and_tags, ['songs', 'tags'], [])
        tag_q, tag_a = self._mask(tags_only, ['tags'], ['songs'])
        title_q, title_a = self._mask(title_only, [], ['songs', 'tags'])

        q = song_q + songtag_q + tag_q + title_q
        a = song_a + songtag_a + tag_a + title_a

        shuffle_indices = np.arange(len(q))
        np.random.shuffle(shuffle_indices)

        # Apply one permutation to both lists so q[k] and a[k] stay paired.
        q = [q[idx] for idx in shuffle_indices]
        a = [a[idx] for idx in shuffle_indices]

        return q, a

    def run(self, fname):
        """Read playlists from ``fname``, split them and write all outputs via ``write_json``.

        Writes ``orig/train.json``, ``orig/val.json``, ``questions/val.json``
        and ``answers/val.json``.
        """
        random.seed(777)
        # The masking helpers draw from NumPy's global RNG, so it must be
        # seeded as well for the masked validation set to be reproducible.
        np.random.seed(777)

        print("Reading data...\n")
        playlists = load_json(fname)
        random.shuffle(playlists)
        print(f"Total playlists: {len(playlists)}")

        print("Splitting data...")
        train, val = self._split_data(playlists)

        print("Original train...")
        write_json(train, "orig/train.json")
        print("Original val...")
        write_json(val, "orig/val.json")

        print("Masked val...")
        val_q, val_a = self._mask_data(val)
        write_json(val_q, "questions/val.json")
        write_json(val_a, "answers/val.json")

In [None]:
# Split the raw playlist dump on Google Drive into train/validation files and
# masked validation questions/answers (written by ArenaSplitter.run via write_json).
split = ArenaSplitter()
split.run("/content/drive/MyDrive/KUBIG 2021-2/추천시스템 프로젝트/멜론데이터/train.json")

Reading data...

Total playlists: 115071
Splitting data...
Original train...
Original val...
Masked val...
Total: 23015, Song only: 6904, Song & Tags: 11508, Tags only: 3452, Title only: 1151


In [None]:
def load_train_json(fname, min_like_cnt=10):
    """Load training playlists, keeping only sufficiently liked ones.

    Args:
        fname: Path of the JSON file *without* the ``.json`` extension.
        min_like_cnt: Only rows whose ``like_cnt`` is strictly greater than
            this value are kept. The default of 10 matches the previously
            hard-coded threshold.

    Returns:
        DataFrame sorted by ``like_cnt`` in descending order.
    """
    df = pd.read_json(fname + '.json', typ='frame', encoding="utf-8")
    df = df.sort_values(by=['like_cnt'], ascending=False)
    df = df[df['like_cnt'] > min_like_cnt]
    print('load_train_json')
    return df

In [None]:
def load_val_json(fname):
    """Load the playlists stored at ``fname + '.json'``, most-liked first.

    No ``like_cnt`` filter is applied here; every row is returned.
    """
    path = f"{fname}.json"
    frame = pd.read_json(path, typ='frame', encoding="utf-8")
    ordered = frame.sort_values(by=['like_cnt'], ascending=False)
    print('load_val_json')
    return ordered

In [None]:
def most_popular(df,song_num,tag_num=1):
    """Select the songs and tags that occur often enough to become features.

    Args:
        df: DataFrame with list-valued ``songs`` and ``tags`` columns.
        song_num: A song is kept when it occurs in the playlists strictly
            more than ``song_num`` times.
        tag_num: A tag is kept when it occurs strictly more than ``tag_num``
            times. The default of 1 matches the previously hard-coded threshold.

    Returns:
        ``(columns, song_len, tag_len)``, where ``columns`` lists the kept
        songs followed by the kept tags (each in first-seen order) and the
        two lengths give how many of each were kept.
    """
    song_counts = Counter(song for plist in df['songs'] for song in plist)
    tag_counts = Counter(tag for plist in df['tags'] for tag in plist)

    # A dict keeps insertion order: songs first, then tags.
    popular = {song: cnt for song, cnt in song_counts.items() if cnt > song_num}
    song_len = len(popular)
    popular.update((tag, cnt) for tag, cnt in tag_counts.items() if cnt > tag_num)
    tag_len = len(popular) - song_len

    print('most_popular')
    return list(popular), song_len, tag_len


In [None]:
def create_zero(column_name):
    """Return an empty DataFrame whose columns are ``column_name``."""
    # Use the argument; the previous body read an undefined global ``col``.
    zero_df = pd.DataFrame(columns=column_name)
    return zero_df

In [None]:
def create_onehot(df,column_name):
    """Encode each playlist as a 0/1 row over the ``column_name`` vocabulary.

    Args:
        df: DataFrame with an ``id`` column and list-valued ``tags`` and
            ``songs`` columns.
        column_name: List of song ids and tag strings to use as columns.

    Returns:
        Float DataFrame of shape ``(len(df), len(column_name))`` indexed by
        ``df['id']``. A cell is 1 when the playlist contains that song or tag.
    """
    # Map each column label to its first position once, instead of calling
    # list.index for every item.
    col_pos = {}
    for pos, name in enumerate(column_name):
        col_pos.setdefault(name, pos)

    matrix = np.zeros((len(df), len(column_name)))
    for row, (tags, songs) in enumerate(zip(df['tags'], df['songs'])):
        # Walk tags and songs independently. Pairing them with zip stops at
        # the shorter list, which drops most songs and encodes nothing at all
        # for playlists that have only tags or only songs.
        for items in (tags, songs):
            for item in items:
                pos = col_pos.get(item)
                if pos is not None:
                    matrix[row, pos] = 1

    return pd.DataFrame(matrix, columns=column_name, index=df['id'])
            

In [None]:
def deep_learing(column_name,train_onehot,val_onehot):
    """Train a dense autoencoder on ``train_onehot`` and reconstruct ``val_onehot``.

    Args:
        column_name: Feature labels; only their count is used, as the input
            and output width of the network.
        train_onehot: Training matrix, used as both input and target.
        val_onehot: Matrix to score after training.

    Returns:
        The model's predictions for ``val_onehot`` (sigmoid outputs, one
        column per entry of ``column_name``).
    """
    n_features = len(column_name)
    encoding_dim = 64

    input_plist = Input(shape=(n_features,))
    # Feed the dropout output into the encoder. Previously the Dropout layer
    # was built but the encoder read the raw input, so it had no effect.
    dropped = Dropout(0.2)(input_plist)
    encoded = Dense(encoding_dim, activation='relu')(dropped)
    encoded = Dense(36, activation='relu')(encoded)
    decoded = Dense(n_features, activation='sigmoid')(encoded)
    autoencoder = Model(input_plist, decoded)

    autoencoder.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    autoencoder.fit(train_onehot, train_onehot, epochs=5, batch_size=64, validation_split=0.1)

    predict_plist = autoencoder.predict(val_onehot)
    print('deep_learning')
    return predict_plist


In [None]:
def result(df_id, column_name, song_len, tag_len, pre):
    """Turn a prediction matrix into ranked song/tag recommendations.

    Args:
        df_id: Iterable of playlist ids, aligned with the rows of ``pre``.
        column_name: Feature labels; the first ``song_len`` are songs and the
            remainder are tags.
        song_len: Number of song columns.
        tag_len: Number of tag columns (not used in the computation; kept for
            interface compatibility).
        pre: 2-D array of scores with one column per entry of ``column_name``.

    Returns:
        List of dicts ``{'id', 'songs', 'tags'}`` holding up to 100 songs and
        10 tags each, ordered from highest to lowest score.
    """
    df_id = list(df_id)
    ori_song = column_name[:song_len]
    ori_tag = column_name[song_len:]
    print('ok')
    song_predict = pre[:, :song_len]
    tag_predict = pre[:, song_len:]
    print('ok')

    recommendations = []
    for n, playlist_id in enumerate(df_id):
        # argsort is ascending, so take the tail and reverse it to put the
        # best-scored item first. The evaluation metric is rank-sensitive.
        top_songs = song_predict[n].argsort()[-100:][::-1]
        top_tags = tag_predict[n].argsort()[-10:][::-1]

        recommendations.append({
            'id': playlist_id,
            'songs': [ori_song[idx] for idx in top_songs],
            'tags': [ori_tag[idx] for idx in top_tags],
        })
    print('result')
    return recommendations


In [None]:
# Load the training split (load_train_json keeps playlists with like_cnt > 10)
# and build the feature columns: songs occurring more than 13 times, followed
# by the tags kept by most_popular. t_song / t_tag are the counts of each.
train_df=load_train_json('/content/arena_data/orig/train')
t_col,t_song, t_tag=most_popular(train_df,13)
print(len(t_col))

load_train_json
most_popular
35880


In [None]:
# Encode the training playlists as 0/1 rows over the t_col columns (indexed by playlist id).
train_one=create_onehot(train_df, t_col)

In [None]:
# Load the masked validation questions and encode them over the same t_col
# columns as the training matrix.
val_df=load_val_json('/content/arena_data/questions/val')
val_one=create_onehot(val_df, t_col)

load_val_json


In [None]:
# Fit the autoencoder on the training matrix and get its predictions for the validation rows.
pred = deep_learing(t_col,train_one, val_one)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
deep_learning


In [None]:
# Convert the prediction matrix into per-playlist recommendations
# (up to 100 songs and 10 tags each, keyed by the validation playlist ids).
result_1 = result(val_df['id'], t_col, t_song, t_tag, pred)

ok
ok
result


In [None]:
# Preview only the first few recommendations; displaying the whole list would
# dump one dict per validation playlist into the cell output.
result_1[:3]

In [None]:
# Tabulate the recommendations (one row per playlist with its id, songs and tags) for a quick look.
df_result=pd.DataFrame(result_1)
print(df_result)

           id  ...                                           tags
0      102123  ...  [휴식, 힐링, 새벽, 잔잔한, 팝, 감성, 드라이브, 힙합, 발라드, 기분전환]
1       11762  ...     [까페, 기분전환, OST, CCM, 일렉, 랩, 팝, 힙합, 락, 발라드]
2       56212  ...     [까페, 기분전환, OST, CCM, 일렉, 랩, 팝, 힙합, 락, 발라드]
3       49159  ...     [까페, 기분전환, OST, CCM, 일렉, 랩, 팝, 힙합, 락, 발라드]
4       46039  ...     [까페, 기분전환, OST, CCM, 일렉, 랩, 팝, 힙합, 락, 발라드]
...       ...  ...                                            ...
23010   16644  ...     [까페, 기분전환, OST, CCM, 일렉, 랩, 팝, 힙합, 락, 발라드]
23011  150668  ...     [까페, 기분전환, OST, CCM, 일렉, 랩, 팝, 힙합, 락, 발라드]
23012   98054  ...     [까페, 기분전환, OST, CCM, 일렉, 랩, 팝, 힙합, 락, 발라드]
23013   83540  ...     [까페, 기분전환, OST, CCM, 일렉, 랩, 팝, 힙합, 락, 발라드]
23014  102549  ...   [매장음악, OST, CCM, 일렉, 기분전환, 랩, 팝, 힙합, 락, 발라드]

[23015 rows x 3 columns]


In [None]:
# Save the recommendations to ./arena_data/results/results.json and score them
# against the held-out validation answers. Both paths passed to evaluate() are
# relative to the current working directory.
answers = result_1
write_json(answers, "results/results.json")
evaluator = CustomEvaluator()
evaluator.evaluate("arena_data/answers/val.json", "arena_data/results/results.json")

Music nDCG: 0.00854883
Tag nDCG: 0.11458
Score: 0.0244536
