In [1]:
import json
import statistics
import numpy as np
import pandas as pd
import tensorflow as tf
from KGE.models.translating_based.TransE import TransE

	Show the summary	
		- KG
		    - number of hrt triplets
		    - number of entities
		    - number of entity types (in the type dict)
		    - number of entities grouped by type (in the type dict)
		    - number of relations
		
		- For the train, validation, and test sets, each separately
		    - number of hrt triplets
		    - number of has_interest hrt triplets
		    - number of distinct users
    		- number of distinct items

In [None]:
def show_kg_summary():
    """Print summary statistics of the KKBOX knowledge graph.

    Reads ``./data/KKBOX/kgdata_all.csv`` (uses its ``h``, ``r`` and ``t``
    columns) and ``./data/KKBOX/type_dict.json`` (a mapping whose values are
    entity types), then prints:

    - number of hrt triplets (rows of the KG file)
    - number of distinct entities appearing as head or tail
    - number of distinct entity types in the type dict
    - number of type-dict entries per entity type
    - number of distinct relations

    Returns
    -------
        None. Everything is written to stdout.
    """
    kg = pd.read_csv('./data/KKBOX/kgdata_all.csv')

    with open('./data/KKBOX/type_dict.json') as f:
        type_dict = json.load(f)

    print('Summary of KG\n'
          '-------------')

    # - number of hrt triplets
    print('number of hrt triplets: ', kg.shape[0])

    # - number of entities: distinct values over the head and tail columns.
    # Series.append was removed in pandas 2.0, so stack the columns with
    # pd.concat instead.
    head_and_tail = pd.concat([kg['h'], kg['t']], ignore_index=True)
    print('number of entities: ', head_and_tail.nunique())

    # Count the entries of each type in a single pass over the dict rather
    # than rescanning every entity once per type.
    entity_count_by_type = {}
    for entity_type in type_dict.values():
        entity_count_by_type[entity_type] = entity_count_by_type.get(entity_type, 0) + 1

    # - number of entity type(in dict)
    print('number of entity type: ', len(entity_count_by_type))

    # - number of entities group by type(in dict)
    print('number of entities group by type: ', entity_count_by_type)

    # - number of relations
    print('number of relations: ', kg['r'].nunique())

In [None]:
def show_data_summary():
    """Print summary statistics of the train, validation and test splits.

    Reads ``./data/KKBOX/train_data.csv``, ``valid_data.csv`` and
    ``test_data.csv`` from the same folder (uses their ``h``, ``r`` and ``t``
    columns) plus ``./data/KKBOX/type_dict.json`` (entity -> entity type),
    then prints one dict per statistic, keyed by ``'train'``,
    ``'validation'`` and ``'test'``:

    - number of hrt triplets
    - number of triplets whose relation is ``'has_interest'``
    - number of distinct head/tail entities typed ``'member'`` (users)
    - number of distinct head/tail entities typed ``'song'`` (items)

    Returns
    -------
        None. Everything is written to stdout.
    """
    splits = {
        'train': pd.read_csv('./data/KKBOX/train_data.csv'),
        'validation': pd.read_csv('./data/KKBOX/valid_data.csv'),
        'test': pd.read_csv('./data/KKBOX/test_data.csv'),
    }

    with open('./data/KKBOX/type_dict.json') as f:
        type_dict = json.load(f)

    print('Summary of train, validation, test data\n'
          '---------------------------------------')

    # - number of hrt triplets
    N_hrt_triple = {name: len(df) for name, df in splits.items()}
    print('number of hrt triplets: ', N_hrt_triple)

    # - number of has_interest hrt triplet
    N_interest_hrt_triple = {
        name: int((df['r'] == 'has_interest').sum()) for name, df in splits.items()
    }
    print('number of has_interest hrt triplets: ', N_interest_hrt_triple)

    # Type of every distinct entity that appears as head or tail in a split.
    # Series.append was removed in pandas 2.0, so stack the columns with
    # pd.concat instead. Entities missing from type_dict map to None and are
    # therefore counted as neither user nor item.
    entity_types = {
        name: [
            type_dict.get(ent)
            for ent in pd.concat([df['h'], df['t']], ignore_index=True).unique()
        ]
        for name, df in splits.items()
    }

    # - number of distinct user
    N_user = {name: types.count('member') for name, types in entity_types.items()}
    print('number of distinct user: ', N_user)

    # - number of distinct item
    N_item = {name: types.count('song') for name, types in entity_types.items()}
    print('number of distinct item: ', N_item)

In [None]:
# Print the knowledge-graph summary, a blank separator line, then the
# per-split (train / validation / test) summary.
show_kg_summary()
print()
show_data_summary()

In [2]:
# Read the data needed before model training: the train / validation / test
# CSV files converted to NumPy arrays, plus the metadata JSON.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# './data/KKBOX/train_index_data.csv' — the *_index_data.csv files must exist
# before this cell can run; the step that creates them is not in this notebook.
train = pd.read_csv('./data/KKBOX/train_index_data.csv').values
valid = pd.read_csv('./data/KKBOX/valid_index_data.csv').values
test = pd.read_csv('./data/KKBOX/test_index_data.csv').values

# `metadata` is later indexed with the keys 'ent2ind' and 'rel2ind' by recommend().
with open('./data/KKBOX/metadata.json') as f:
    metadata = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: './data/KKBOX/train_index_data.csv'

In [None]:
# Initialize the TransE model object.
# NOTE(review): the parameter meanings noted below are inferred from their
# names only — confirm against the KGE package documentation.
model = TransE(
    embedding_params={"embedding_size": 10},  # presumably the embedding dimensionality
    negative_ratio=4,  # presumably negatives sampled per positive triplet — TODO confirm
    corrupt_side="h+t"  # presumably corrupts both head and tail — TODO confirm
)

In [None]:
# Train the model on the `train` array with `valid` as validation data.
# The log path './tensorboard_logs/eb10lr0001' is the same directory the
# evaluation cells below write their metric scalars to.
# NOTE(review): early stopping looks disabled (early_stopping_rounds=None) and
# best weights are not restored — confirm semantics against the KGE docs.
model.train(train_X=train, val_X=valid, metadata=metadata, epochs=100, batch_size=30000,
            early_stopping_rounds=None, restore_best_weight=False,
            optimizer=tf.optimizers.Adam(learning_rate=0.0001),
            seed=12345, log_path="./tensorboard_logs/eb10lr0001", log_projector=True)

In [None]:
def batch(iterable, n = 1):
    """Yield consecutive lists of items drawn from ``iterable``.

    A list is emitted each time it reaches exactly ``n`` items; whatever is
    left over when the input is exhausted is emitted as a final, shorter list.
    Nothing is yielded for an empty input.
    """
    pending = []
    for element in iterable:
        pending.append(element)
        if len(pending) != n:
            continue
        # The chunk is full: hand it out and start collecting a new one.
        yield pending
        pending = []
    # Flush the trailing partial chunk, if any.
    if len(pending) > 0:
        yield pending
        
def recommend(user_list, top_k=25, batch_size=100):
    '''
    Recommend the nearest songs for each user in the input user list.

    Logic:
        1. user id -> user embedding
        2. query = user embedding + has_interest embedding
        3. rank every song embedding by L2 distance to the query and keep
           the ``top_k`` nearest songs

    Reads the module-level ``metadata`` (keys 'ent2ind' and 'rel2ind') and
    ``model`` (``model_weights`` keys 'ent_emb' and 'rel_emb'), and loads the
    song list from './data/KKBOX/entity_groupby_type.json'.

        Parameter
        ---------
            user_list: list of user id
            top_k: number of songs to recommend per user (default 25)
            batch_size: number of users scored together (default 100)

        Return
        ------
            dict: user id -> tensor of the recommended song ids, nearest first
                  (evaluate() decodes the elements as bytes)

        Raises
        ------
            KeyError: if a user or a song is missing from metadata['ent2ind']
    '''
    ent2ind = metadata['ent2ind']
    ent_emb = model.model_weights['ent_emb']

    # Everything below is identical for every batch, so compute it once
    # instead of re-reading the JSON file and re-gathering the song
    # embeddings for each batch of users.

    # has_interest embedding (1 * embedding_size)
    has_interest_index = metadata['rel2ind']['has_interest']
    has_interest_emb = model.model_weights['rel_emb'][has_interest_index]

    with open('./data/KKBOX/entity_groupby_type.json') as f:
        entity_groupby_type = json.load(f)
    song_entities = entity_groupby_type['song']

    # songs embedding (total_songs * embedding_size)
    # Index directly rather than with .get(): an unknown entity then raises a
    # KeyError naming it, instead of passing None on to the embedding lookup.
    song_index = [ent2ind[ent] for ent in song_entities]
    songs_emb = tf.nn.embedding_lookup(ent_emb, song_index)

    # Song ids as a tensor, used to map ranked positions back to song ids.
    song_ent = tf.convert_to_tensor(np.array(song_entities))

    test_users_rec_music = {}
    for users in batch(user_list, batch_size):
        # users embedding (batch_users * embedding_size)
        users_index = [ent2ind[user] for user in users]
        users_emb = tf.nn.embedding_lookup(ent_emb, users_index)

        # query point for each user (batch_users * embedding_size)
        compute_songs_emb = users_emb + has_interest_emb

        # Distance from every user's query point to every song embedding
        # (batch_users * total_songs). Computed one user at a time so the
        # intermediate stays at total_songs * embedding_size.
        distances = [
            tf.norm(tf.subtract(songs_emb, compute_songs_emb[i]), ord=2, axis=1)
            for i in range(compute_songs_emb.shape[0])
        ]

        # Positions of each user's top_k nearest songs (batch_users * top_k)
        top_songs_index = tf.argsort(distances)[:, :top_k].numpy().tolist()

        # song position -> song id (batch_users * top_k)
        top_songs = tf.nn.embedding_lookup(song_ent, top_songs_index)

        # Pair each user with their row of recommended songs.
        test_users_rec_music.update(dict(zip(users, top_songs)))

    return test_users_rec_music

In [None]:
# NDCG

def DCG(rec_list, ans_list):
    """Discounted cumulative gain of ``rec_list`` with binary relevance.

    An item is relevant (relevance 1) when it occurs in ``ans_list`` and
    irrelevant (relevance 0) otherwise. The item at 0-based position ``i``
    contributes ``(2**relevance - 1) / log2(i + 2)``.

    Returns 0 for an empty ``rec_list``.
    """
    return sum(
        (2 ** (1 if item in ans_list else 0) - 1) / np.log2(position + 2)
        for position, item in enumerate(rec_list)
    )

def IDCG(rec_list, ans_list):
    """DCG of ``rec_list`` after moving its relevant items to the front.

    The recommended items found in ``ans_list`` are placed first, followed by
    the remaining ones, each group keeping its original order; the DCG of
    that reordered list is returned.

    NOTE(review): the ideal ordering is built from the recommended items
    only, not from the full ``ans_list``. The resulting NDCG therefore
    measures ordering quality within the recommended list — confirm this is
    the intended definition.
    """
    relevant = [music for music in rec_list if music in ans_list]
    irrelevant = [music for music in rec_list if music not in ans_list]
    return DCG(relevant + irrelevant, ans_list)

def NDCG(rec_list, ans_list):
    """Normalized DCG: ``DCG / IDCG``, or 0 when either value is 0."""
    gain = DCG(rec_list, ans_list)
    ideal_gain = IDCG(rec_list, ans_list)
    # Guard against dividing by zero when nothing relevant was recommended.
    if gain != 0 and ideal_gain != 0:
        return gain / ideal_gain
    return 0
    
def intersection(list1, list2):
    """Return the distinct elements present in both lists, as a list.

    The result comes from a set intersection, so duplicates are dropped and
    the order of the returned elements is not defined.
    """
    common = set(list1).intersection(set(list2))
    return list(common)
    
def evaluate(test_users_rec_music, ground_truth=None, top_k=25):
    '''
    Evaluate the recommend result

        Parameters
        ----------
            test_users_rec_music(dict): recommended songs for each user;
                each value must support ``.numpy().tolist()`` and yield bytes
                (as returned by recommend())
            ground_truth(dict): user -> list of songs the user is interested
                in. When omitted, the module-level ``user_and_hasInterestItem``
                is used, which keeps existing calls working.
            top_k(int): number of recommendations per user, used as the
                precision denominator (default 25)

        Returns
        -------
            metric_result(dict): mean over users of 'hit', 'recall',
                'precision' and 'ndcg'

        Raises
        ------
            statistics.StatisticsError: if ``test_users_rec_music`` is empty
    '''
    if ground_truth is None:
        # Backward-compatible fallback to the notebook-level variable that
        # the evaluation cells assign before calling evaluate().
        ground_truth = user_and_hasInterestItem

    if not test_users_rec_music:
        # statistics.mean would raise the same exception type further down,
        # but with a message that does not point at the real cause.
        raise statistics.StatisticsError(
            'evaluate() needs recommendations for at least one user')

    hit_list = []
    recall_list = []
    precision_list = []
    ndcg_list = []
    for user, rec_music in test_users_rec_music.items():
        ans_music_list = ground_truth[user]
        rec_music_list = [x.decode() for x in rec_music.numpy().tolist()]

        # True positives: distinct recommended songs the user is interested in.
        true_positive = len(intersection(rec_music_list, ans_music_list))

        # hit: whether at least one recommendation is correct
        hit_list.append(1 if true_positive >= 1 else 0)
        # precision: correct recommendations / number of recommendations
        precision_list.append(true_positive / top_k)
        # recall: correct recommendations / songs the user is interested in.
        # The numerator counts distinct songs, so the denominator must too;
        # otherwise duplicated ground-truth rows deflate recall.
        recall_list.append(true_positive / len(set(ans_music_list)))
        ndcg_list.append(NDCG(rec_music_list, ans_music_list))

    metric_result = {
        'hit': statistics.mean(hit_list),
        'recall': statistics.mean(recall_list),
        'precision': statistics.mean(precision_list),
        'ndcg': statistics.mean(ndcg_list)
    }

    return metric_result
    

In [None]:
# Generate the test data: the distinct head entities (users to recommend for)
# and, per head, the list of its tail entities used as the ground truth.
# NOTE(review): unlike the training cell, no `r == 'has_interest'` filter is
# applied here — presumably the test file only holds has_interest triplets;
# verify against the data.
test_data = pd.read_csv('./data/KKBOX/test_data.csv')
test_users = test_data['h'].unique().tolist()
# evaluate() reads this module-level name as its ground truth.
user_and_hasInterestItem = test_data.groupby('h')['t'].apply(list).to_dict()

# Preview the first 10 rows.
test_data[:10]

In [None]:
# Recommend and evaluate on the TEST data.
# evaluate() takes its ground truth from the module-level
# `user_and_hasInterestItem`, so the test-data cell must be the most recent
# cell to have assigned it when this runs.

test_users_rec_music = recommend(test_users)
test_evaluate_result = evaluate(test_users_rec_music)

In [None]:
# Write the test-set metrics to the TensorBoard log, one scalar per metric.
summary_writer = tf.summary.create_file_writer('./tensorboard_logs/eb10lr0001')
with summary_writer.as_default():
    for tag, metric in (
        ('test-hit', 'hit'),
        ('test-recall', 'recall'),
        ('test-precision', 'precision'),
        ('test-ndcg', 'ndcg'),
    ):
        tf.summary.scalar(tag, test_evaluate_result[metric], step=0)

In [None]:
# Recommend and evaluate on the TRAINING data (has_interest triplets only).
train_data = pd.read_csv('./data/KKBOX/train_data.csv')
train_data = train_data[train_data['r'].eq('has_interest')]
train_users = train_data['h'].unique().tolist()
# evaluate() reads this module-level name as its ground truth, so assigning it
# here replaces the mapping set by any previously run evaluation cell.
user_and_hasInterestItem = train_data.groupby('h')['t'].apply(list).to_dict()

train_users_rec_music = recommend(train_users)
train_evaluate_result = evaluate(train_users_rec_music)

# Write the training-set metrics to the TensorBoard log.
summary_writer = tf.summary.create_file_writer('./tensorboard_logs/eb10lr0001')
with summary_writer.as_default():
    for tag, metric in (
        ('train-hit', 'hit'),
        ('train-recall', 'recall'),
        ('train-precision', 'precision'),
        ('train-ndcg', 'ndcg'),
    ):
        tf.summary.scalar(tag, train_evaluate_result[metric], step=0)

In [None]:
# Recommend and evaluate on the VALIDATION data.
valid_data = pd.read_csv('./data/KKBOX/valid_data.csv')
valid_users = valid_data['h'].unique().tolist()
# evaluate() reads this module-level name as its ground truth, so assigning it
# here replaces the mapping set by any previously run evaluation cell.
user_and_hasInterestItem = valid_data.groupby('h')['t'].apply(list).to_dict()

valid_users_rec_music = recommend(valid_users)
valid_evaluate_result = evaluate(valid_users_rec_music)

# Write the validation-set metrics to the TensorBoard log.
summary_writer = tf.summary.create_file_writer('./tensorboard_logs/eb10lr0001')
with summary_writer.as_default():
    for tag, metric in (
        ('valid-hit', 'hit'),
        ('valid-recall', 'recall'),
        ('valid-precision', 'precision'),
        ('valid-ndcg', 'ndcg'),
    ):
        tf.summary.scalar(tag, valid_evaluate_result[metric], step=0)