In [1]:
import numpy as np
import pandas as pd
from KGE.data_utils import index_kg, convert_kg_to_index
from KGE.models.translating_based.TransE import TransE
from sklearn.model_selection import train_test_split

In [2]:
# load data
interest_data = np.loadtxt('./data/KKBOX/kgdata_interest.csv', dtype=str, delimiter=',')
interest_data

array([['FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=', 'has_interest',
        'BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik='],
       ['Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=', 'has_interest',
        'bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM='],
       ['Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=', 'has_interest',
        'JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY='],
       ...,
       ['ZxbVmt3Kh/XOH+h58c2Kdj6SjFZk+wnUO006IgWzMQE=', 'has_interest',
        '750RprmFfLV0bymtDH88g24pLZGVi5VpBAI300P6UOA='],
       ['0aH4Hd3ziPSRHClRX8rkeOEaAG5EPPkW1mKGCdXEok0=', 'has_interest',
        'G8wgqObgeAMER/rVCIlgcNeQ8mm0CzF/GsxiMK8TTnA='],
       ['0aH4Hd3ziPSRHClRX8rkeOEaAG5EPPkW1mKGCdXEok0=', 'has_interest',
        'Ju0VGkjWeBUZCd7r5Az2hUImhMoWxWLUicOedsmvG0g=']], dtype='<U44')

In [3]:
# 總user數
len(np.unique(interest_data[:, 0]))

27113

In [4]:
# 有興趣的音樂<3首的user數
sum(pd.DataFrame(interest_data, columns=['h', 'r', 't']).groupby('h').t.nunique() < 3)

2237

In [5]:
def kg_train_val_test_split(data, test_size, val_size):
    '''
        Parameters
        ----------
            data: the data to be split
            test_size: if float, represent the proportion of the dataset to include in the test split
                    if int, represents the absolute number of test samples
            val_size: same as test_size, but is the proportion of the train data

        Returns
        -------
            train, valid, test
    '''
   
    # sort by first column
    sorted_data = data[np.argsort(data[:,0])]

    # get unique user index and count
    unique_head, index, count = np.unique(sorted_data[:,0], return_index=True, return_counts=True)

    # train test split
    train_origin = [] 
    test = []
    
    for i in range(len(index)):
        if count[i]>1:
            tr, te = train_test_split(sorted_data[index[i]:index[i] + count[i], :], test_size=test_size, random_state=i)
            train_origin.append(tr)
            test.append(te)
        else: #只有一筆
            test.append(sorted_data[index[i],:])
            
    # train val split
    valid = []
    train = []

    for j in range(len(train_origin)):
        if len(train_origin[j])>1:
            tr, va = train_test_split(train_origin[j], test_size=val_size, random_state=j)
            train.append(tr)
            valid.append(va)
        else: #只有一筆
            valid.append(train_origin[j])
    
    
    return np.vstack(train), np.vstack(valid), np.vstack(test)

In [6]:
train, valid, test = kg_train_val_test_split(interest_data, 0.33, 0.1)

In [9]:
# read other data
other_data = pd.read_csv('./data/KKBOX/kgdata_other.csv').to_numpy()

In [10]:
# concate kgdata_interest & kgdata_other as train
train = np.concatenate((train, other_data))

In [7]:
# index the kg data (編號entity and relation)
metadata = index_kg(train)

In [13]:
# conver kg into index
atrain = convert_kg_to_index(train, metadata["ent2ind"], metadata["rel2ind"])
avalid = convert_kg_to_index(valid, metadata["ent2ind"], metadata["rel2ind"])
atest = convert_kg_to_index(test, metadata["ent2ind"], metadata["rel2ind"])

In [14]:
atrain, avalid, atest

(array([[0, 0, 148898],
        [0, 0, 114694],
        [0, 0, 120528],
        ...,
        [63286, None, None],
        [105948, None, None],
        [189926, None, None]], dtype=object),
 array([[0, 0, 143938],
        [0, 0, 18410],
        [0, 0, 57315],
        ...,
        [198151, 0, 60464],
        [198151, 0, 50262],
        [None, 0, 186645]], dtype=object),
 array([[0, 0, 169230],
        [0, 0, 163583],
        [0, 0, 18591],
        ...,
        [198151, 0, 118130],
        [198151, 0, 163248],
        [None, 0, 173084]], dtype=object))

In [15]:
atrain.dtype, avalid.dtype

(dtype('O'), dtype('O'))

In [16]:
valid[-4:-1]

array([['zzqc2ja7z10FtSpagYVcAZXg/gPRq7wcDZuNFj+zJSU=', 'has_interest',
        'gvHR5iZ5GWVYZBMTQFo/k0LB3Hh2RDUNhDgOCzICosE='],
       ['zzqc2ja7z10FtSpagYVcAZXg/gPRq7wcDZuNFj+zJSU=', 'has_interest',
        'HTruKDEG1ZXlvNkAqiEUH5GU+DqtOugK9hpaALyZM3k='],
       ['zzqc2ja7z10FtSpagYVcAZXg/gPRq7wcDZuNFj+zJSU=', 'has_interest',
        'EBgGLa95GjiZUikvWXoX8bAF5d6SIs4hdsveXxYvSAE=']], dtype='<U44')

In [17]:
metadata['ent2ind']['zzqc2ja7z10FtSpagYVcAZXg/gPRq7wcDZuNFj+zJSU=']

198151

In [21]:
# initialized TransE model object
model = TransE(
    embedding_params={"embedding_size": 32},
    negative_ratio=10,
    corrupt_side="h+t",
)

In [22]:
# train the model
model.train(train_X=atrain, val_X=avalid, metadata=metadata, epochs=10, batch_size=64,
            log_path="./tensorboard_logs", log_projector=True)

INFO:root:[2021-11-29 15:33:06.791058] Preparing for training...
INFO:root:[2021-11-29 15:33:06.808068] - Calculating number of batch...
INFO:root:[2021-11-29 15:33:06.811050] - Setting data iterator...


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).