In [1]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from KGE.data_utils import index_kg, convert_kg_to_index
from KGE.models.translating_based.TransE import TransE
from sklearn.model_selection import train_test_split

In [4]:
# Load the interest triples from CSV into a NumPy array and preview the
# first 10 rows (the preview shows uid / 'has_interest' / sid rows).
interest_data = pd.read_csv('./data/KKBOX/kgdata_interest.csv').to_numpy()
interest_data[:10]

array([['uid_FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=',
        'has_interest',
        'sid_BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik='],
       ['uid_Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=',
        'has_interest',
        'sid_bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM='],
       ['uid_Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=',
        'has_interest',
        'sid_JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY='],
       ['uid_Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=',
        'has_interest',
        'sid_2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs='],
       ['uid_FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=',
        'has_interest',
        'sid_3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc='],
       ['uid_FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=',
        'has_interest',
        'sid_3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU='],
       ['uid_Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=',
        'has_interest',
        'sid_VkILU0H1h3NMmk9MQrXouNudGk5n8Ls5c

In [2]:
def kg_train_val_test_split(data, test_size, val_size):
    '''
        Split triples into train / valid / test sets, one head at a time.

        Rows are grouped by their first column (the head). Every group is
        split on its own with ``train_test_split``, first into train/test and
        then the train part into train/valid.

        Parameters
        ----------
            data: 2-D array-like of rows to be split; column 0 is the grouping key
            test_size: if float, represent the proportion of each group to include in the test split
                    if int, represents the absolute number of test samples per group
            val_size: same as test_size, but is applied to the train part of each group

        Returns
        -------
            train, valid, test: 2-D numpy arrays.
            A split that receives no rows is returned as an empty array of
            shape (0, n_columns) instead of raising.

        Notes
        -----
            * A head with a single row sends that row to ``test`` only.
            * If only one row of a head is left after the test split, that row
              goes to ``valid``; such heads therefore have no row in ``train``.
            * ``random_state`` is the group's position in its loop, so the split
              is deterministic for a given input.
    '''
    data = np.asarray(data)
    n_columns = data.shape[1]

    def _stack(parts):
        # np.vstack raises ValueError on an empty list, which happens e.g.
        # when every head has a single row (nothing reaches train/valid).
        if not parts:
            return np.empty((0, n_columns), dtype=data.dtype)
        return np.vstack(parts)

    # sort by first column so that each head's rows are contiguous
    sorted_data = data[np.argsort(data[:, 0])]

    # start offset and row count of every head's contiguous run
    _, starts, counts = np.unique(sorted_data[:, 0], return_index=True, return_counts=True)

    # train / test split, per head
    train_origin = []
    test = []
    for i, (start, n_rows) in enumerate(zip(starts, counts)):
        if n_rows > 1:
            tr, te = train_test_split(sorted_data[start:start + n_rows, :], test_size=test_size, random_state=i)
            train_origin.append(tr)
            test.append(te)
        else:
            # only one row for this head: it goes to test
            test.append(sorted_data[start, :])

    # train / valid split of what is left, per head
    train = []
    valid = []
    for j, group in enumerate(train_origin):
        if len(group) > 1:
            tr, va = train_test_split(group, test_size=val_size, random_state=j)
            train.append(tr)
            valid.append(va)
        else:
            # only one row left for this head: it goes to valid
            valid.append(group)

    return _stack(train), _stack(valid), _stack(test)

In [None]:
# Split the interest triples head by head into train / valid / test.
train, valid, test = kg_train_val_test_split(
    interest_data,
    test_size=0.33,
    val_size=0.1,
)

In [6]:
# Read the remaining (non-interest) triples into a NumPy array and preview
# the first 10 rows (the preview shows sid / 'length' / duration-bucket rows).
other_data = pd.read_csv('./data/KKBOX/kgdata_other.csv').to_numpy()
other_data[:10]

array([['sid_CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=', 'length',
        '>=3min'],
       ['sid_o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=', 'length',
        '>=3min'],
       ['sid_DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=', 'length',
        '>=3min'],
       ['sid_dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=', 'length',
        '>=3min'],
       ['sid_W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=', 'length',
        '<3min'],
       ['sid_kKJ2JNU5h8rphyW21ovC+RZU+yEHPM+3w85J37p7vEQ=', 'length',
        '>=3min'],
       ['sid_N9vbanw7BSMoUgdfJlgX1aZPE1XZg8OS1wf88AQEcMc=', 'length',
        '>=3min'],
       ['sid_GsCpr618xfveHYJdo+E5SybrpR906tsjLMeKyrCNw8s=', 'length',
        '>=3min'],
       ['sid_oTi7oINPX+rxoGp+3O6llSltQTl80jDqHoULfRoLcG4=', 'length',
        '>=3min'],
       ['sid_btcG03OHY3GNKWccPP0auvtSbhxog/kllIIOx5grE/k=', 'length',
        '>=3min']], dtype=object)

In [7]:
# Concatenate the interest train split with all of kgdata_other to form the
# final training set (valid and test keep interest triples only).
# NOTE: this rebinds `train` from itself, so re-running this cell without
# re-running the split cell appends other_data a second time.
train = np.concatenate((train, other_data))

In [8]:
# Write the label-based splits (before indexing) to CSV with columns h, r, t.
for raw_csv_path, raw_triples in (
    ('./data/KKBOX/train_data.csv', train),
    ('./data/KKBOX/valid_data.csv', valid),
    ('./data/KKBOX/test_data.csv', test),
):
    pd.DataFrame(raw_triples, columns=['h', 'r', 't']).to_csv(raw_csv_path, index=False)

In [9]:
# Build the indexing metadata from the training triples only; its "ent2ind"
# and "rel2ind" entries are used below to convert every split.
# NOTE(review): entities that occur only in valid/test presumably get no
# index here — confirm how convert_kg_to_index treats unseen labels.
metadata = index_kg(train)

In [10]:
# Persist the indexing metadata as JSON so the label <-> index mappings can
# be reloaded later without re-indexing the training data.

with open('./data/KKBOX/metadata.json', 'w') as f:
    json.dump(metadata, f)

In [11]:
# Convert each split from labels to integer indices using the mappings built
# from the training data. Each name is rebound in place, so after this cell
# train / valid / test hold index triples, not the original labels.
train = convert_kg_to_index(train, metadata["ent2ind"], metadata["rel2ind"])
valid = convert_kg_to_index(valid, metadata["ent2ind"], metadata["rel2ind"])
test = convert_kg_to_index(test, metadata["ent2ind"], metadata["rel2ind"])

In [12]:
# Preview the first 10 indexed training triples as a sanity check.
train[:10]

array([[2505856,       5, 1931828],
       [2505856,       5, 1537330],
       [2505856,       5, 1603992],
       [2505856,       5, 2286732],
       [2505856,       5,  760905],
       [2505856,       5, 1072362],
       [2505856,       5, 1596949],
       [2505856,       5,  395076],
       [2505856,       5, 1939255],
       [2505856,       5, 1722423]])

In [20]:
# Write the index-based splits (after indexing) to CSV with columns h, r, t.
for indexed_csv_path, indexed_triples in (
    ('./data/KKBOX/train_index_data.csv', train),
    ('./data/KKBOX/valid_index_data.csv', valid),
    ('./data/KKBOX/test_index_data.csv', test),
):
    pd.DataFrame(indexed_triples, columns=['h', 'r', 't']).to_csv(indexed_csv_path, index=False)