In [1]:
import torch
from torch_geometric.data import HeteroData
from torch_geometric.loader import NeighborLoader

import pandas as pd
import numpy as np

import os
import json

from utils.hake_dataset import *

from hake.models import HAKE
from hake.data import DataReader, TrainDataset, BatchType, BidirectionalOneShotIterator
from torch.utils.data import DataLoader

import warnings
# Silence every warning for the rest of the kernel session: no category,
# module or message filter is passed, so the 'ignore' action applies globally.
# NOTE(review): this also hides deprecation/runtime warnings from torch and
# the project modules — consider narrowing the filter.
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root folders: general pipeline data, and everything HAKE-related
DATA_PATH = './data'
HAKE_PATH = './hake'

# Folders under DATA_PATH: entity id mapping, edge index, graph data
ID_MAPPING = os.path.join(DATA_PATH, 'entity_id_map')
EDGE_INDEX = os.path.join(DATA_PATH, 'edge_index')
GRAPH_DATA = os.path.join(DATA_PATH, 'graph_data')

# Folders under HAKE_PATH: HAKE input data and saved embeddings
HAKE_DATA = os.path.join(HAKE_PATH, 'data/yfinance_kge')
HAKE_EMBEDDINGS = os.path.join(HAKE_PATH, 'embeddings/yfinance_kge')

# Create every folder up front; exist_ok=True makes re-running this cell safe
for _folder in (ID_MAPPING, EDGE_INDEX, GRAPH_DATA, HAKE_PATH, HAKE_DATA, HAKE_EMBEDDINGS):
    os.makedirs(_folder, exist_ok=True)

# 1. Create the input data for the HAKE model

In [3]:
# Prepare the triples input from the entity id mapping and edge index folders
hake_triples = make_hake_triples(
    id_mapping_dir=ID_MAPPING,
    edge_index_dir=EDGE_INDEX,
)

In [4]:
# Convert the triples into a HAKE-compatible dataset, passing HAKE_DATA as
# the output directory
make_hake_dataset(triples=hake_triples, out_dir=HAKE_DATA)

# 2. Use HAKE to generate node embeddings

In [5]:
# Make sure (possibly empty) valid/test split files exist next to the training
# data. Presumably DataReader expects them to be present — confirm against
# hake.data.DataReader.
# The paths are built from HAKE_DATA so they cannot drift from the configured
# folder, and pure Python is used instead of a shell `touch` for portability.
for split_file in ('valid.txt', 'test.txt'):
    # Append mode creates the file when it is missing and never truncates an
    # existing one.
    with open(os.path.join(HAKE_DATA, split_file), 'a'):
        pass

In [6]:
# Point the HAKE DataReader at the HAKE_DATA folder
data_reader = DataReader(HAKE_DATA)

# Sizes of the entity and relation dictionaries; used below to build the model
num_entity, num_relation = len(data_reader.entity_dict), len(data_reader.relation_dict)

print(f"Num entities: {num_entity}")
print(f"Num relations: {num_relation}")

Num entities: 20050
Num relations: 4


In [7]:
# -----------------------------
# Hyperparameters
# -----------------------------
MODEL = 'HAKE'
HIDDEN_DIM = 200
GAMMA = 12.0
BATCH_SIZE = 512
NEGATIVE_SAMPLE_SIZE = 256
LEARNING_RATE = 0.0001
MAX_STEPS = 10000
MODULUS_WEIGHT = 1.0
PHASE_WEIGHT = 0.5
ADVERSARIAL_TEMPERATURE = 1.0
CPU_NUM = 4

# Seed the random number generators before the model and data loaders are
# created, so that a Restart & Run All starts from the same random state.
# NOTE(review): with multi-process data loading (CPU_NUM > 0) batch order and
# per-worker sampling may still vary between runs — confirm if bit-exact
# reproducibility is required.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [8]:
# -----------------------------
# Model
# -----------------------------
# Prefer the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build the HAKE model from the dataset sizes and hyperparameters, then move
# it onto the selected device
hake_model = HAKE(num_entity, num_relation, HIDDEN_DIM, GAMMA, MODULUS_WEIGHT, PHASE_WEIGHT)
model = hake_model.to(device)

In [9]:
# -----------------------------
# DataLoader & iterator
# -----------------------------
def make_train_dataloader(batch_type):
    """Build the shuffled training DataLoader for one batch type.

    The head and tail loaders differ only in the batch type handed to
    ``TrainDataset``; every other setting comes from the notebook constants.

    Args:
        batch_type: ``BatchType`` member (``HEAD_BATCH`` or ``TAIL_BATCH``)
            forwarded to ``TrainDataset``.

    Returns:
        A ``DataLoader`` over
        ``TrainDataset(data_reader, NEGATIVE_SAMPLE_SIZE, batch_type)`` using
        ``BATCH_SIZE``, ``CPU_NUM`` workers and ``TrainDataset.collate_fn``.
    """
    return DataLoader(
        TrainDataset(data_reader, NEGATIVE_SAMPLE_SIZE, batch_type),
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=CPU_NUM,
        collate_fn=TrainDataset.collate_fn,
    )


train_dataloader_head = make_train_dataloader(BatchType.HEAD_BATCH)
train_dataloader_tail = make_train_dataloader(BatchType.TAIL_BATCH)

# Single iterator built from both loaders; this is what train_step consumes
train_iterator = BidirectionalOneShotIterator(train_dataloader_head, train_dataloader_tail)

In [10]:
# -----------------------------
# Optimizer
# -----------------------------
# Adam over all model parameters, using the configured learning rate
trainable_params = model.parameters()
optimizer = torch.optim.Adam(trainable_params, lr=LEARNING_RATE)

In [11]:
# -----------------------------
# Training loop
# -----------------------------
# The only setting passed to train_step through `args` is
# `adversarial_temperature`. It does not change between steps, so the holder
# object is built once here instead of creating a new throwaway class on
# every iteration.
train_args = type('args', (), {'adversarial_temperature': ADVERSARIAL_TEMPERATURE})

# Print the loss once every LOG_EVERY steps
LOG_EVERY = 100

for step in range(MAX_STEPS):
    # One optimisation step; the returned log is indexed by 'loss' below
    log = model.train_step(model, optimizer, train_iterator, args=train_args)

    if step % LOG_EVERY == 0:
        print(f"Step {step}: loss={log['loss']:.4f}")

Step 0: loss=3.3814
Step 100: loss=2.6990
Step 200: loss=2.0413
Step 300: loss=1.4444
Step 400: loss=0.9912
Step 500: loss=0.7444
Step 600: loss=0.6481
Step 700: loss=0.6025
Step 800: loss=0.5162
Step 900: loss=0.5062
Step 1000: loss=0.4742
Step 1100: loss=0.3992
Step 1200: loss=0.4020
Step 1300: loss=0.3902
Step 1400: loss=0.3796
Step 1500: loss=0.3407
Step 1600: loss=0.3336
Step 1700: loss=0.3264
Step 1800: loss=0.2852
Step 1900: loss=0.2985
Step 2000: loss=0.2720
Step 2100: loss=0.2716
Step 2200: loss=0.2596
Step 2300: loss=0.2462
Step 2400: loss=0.2509
Step 2500: loss=0.2442
Step 2600: loss=0.2177
Step 2700: loss=0.2283
Step 2800: loss=0.2243
Step 2900: loss=0.2169
Step 3000: loss=0.2067
Step 3100: loss=0.1909
Step 3200: loss=0.1993
Step 3300: loss=0.1956
Step 3400: loss=0.1918
Step 3500: loss=0.1873
Step 3600: loss=0.1837
Step 3700: loss=0.1768
Step 3800: loss=0.1748
Step 3900: loss=0.1769
Step 4000: loss=0.1575
Step 4100: loss=0.1616
Step 4200: loss=0.1540
Step 4300: loss=0.1672


In [12]:
# -----------------------------
# Save embeddings
# -----------------------------
# Each embedding is detached from autograd and moved to the CPU before it is
# serialized into the HAKE_EMBEDDINGS folder.
entity_weights = model.entity_embedding.detach().cpu()
torch.save(entity_weights, os.path.join(HAKE_EMBEDDINGS, 'entity_embedding.pt'))

relation_weights = model.relation_embedding.detach().cpu()
torch.save(relation_weights, os.path.join(HAKE_EMBEDDINGS, 'relation_embedding.pt'))

print("Embeddings saved!")

Embeddings saved!
