In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import random
import json
import os
import pickle as pkl
import torch
import openai
from dotenv import load_dotenv
load_dotenv()
from collections import defaultdict

### Task:
Notebook này là để thử nghiệm các phương pháp trích xuất subgraph. Cụ thể:   
- Depth và width được giữ nguyên cho các phương pháp  
- Các phương pháp được chạy bao gồm:  
    + BFS  
    + Cho LLM repharase nhiều phiên bản query tính score từng cái rồi gộp lại  
    + Cho LLM phân ra thành các sub objective theo prompt của PoG, score từng cái rồi gộp lại
    + Cho LLM phân ra nhỏ query theo các relation trong query, score từng cái rồi gộp lại.

### Khởi tạo model

In [None]:
with open("knowledge_graph/KG_data/FB15k-237-betae/id2ent.pkl", "rb") as f:
    id2ent = pkl.load(f)
with open("knowledge_graph/KG_data/FB15k-237-betae/id2rel.pkl", "rb") as f:
    id2rel = pkl.load(f)
with open("knowledge_graph/KG_data/FB15k-237-betae/FB15k_mid2name.txt", "r") as f:
    ent2name = {}
    for line in f:
        mid, name = line.strip().split("\t")
        ent2name[mid] = name

with open("knowledge_graph/queries/train_2c_id.pkl", "rb") as f:
    query_2_hop = pkl.load(f)
with open("knowledge_graph/queries/train_1c_id.pkl", "rb") as f:
    query_1_hop = pkl.load(f)
with open("knowledge_graph/queries/train_3c_id.pkl", "rb") as f:
    query_3_hop = pkl.load(f)

### Inspect queries

In [5]:
query_2_hop[0]

{'query_type': ('e', ('r', 'r')),
 'raw_query': (4582, (133, 17)),
 'named_query': ('Franklin',
  ('-/people/person/places_lived./people/place_lived/location',
   '-/award/award_nominee/award_nominations./award/award_nomination/award_nominee')),
 'transformed_query': ['Who are the award nominees that lived in places associated with Franklin?',
  'Which award nominees have resided in locations tied to Franklin?',
  'Can you tell me the names of award nominees who lived in locations related to Franklin?'],
 'answers_id': [2403, 1221, 137, 1589, 4093],
 'answers': ['Vince_Gill',
  'Albert_Lee',
  'Keith_Urban',
  'Carrie_Underwood',
  'John_Travolta']}

### Config

In [6]:
from expand_subgraph import ExpandSubgraph
from utils import  extract_numbers, extract_strings,\
                    extract_notations

import argparse
from load_data import DataLoader
import numpy as np

In [None]:
class Config:
    data_path = 'knowledge_graph/KG_data/FB15k-237-betae'
    seed = 1234
    k = 9 # beams
    depth = 8 # max depth of subgraph
    gpu = 0
    fact_ratio = 0.75
    val_num = -1 # how many triples are used as the validate set
    epoch = 200
    layer = 6
    batchsize = 16
    cpu = 1
    weight = ''
    add_manual_edges = False
    remove_1hop_edges = True
    only_eval = False
    not_shuffle_train = False
    device = "cuda:0"

args = Config()

loader = DataLoader(args, mode='train')
loader.shuffle_train()
# val_loader = DataLoader(args, mode='valid')
# test_loader = DataLoader(args, mode='test')

train_graph = loader.train_graph
train_graph_homo = list(set([(h,t) for (h,r,t) in train_graph]))
# test_data = np.concatenate([np.array(test_data, dtype=np.int32), loader.idd_data], 0, dtype=np.int32)
# train_graph = np.array(train_graph, dtype=np.int32)
args.n_ent = loader.n_ent
args.n_rel = loader.n_rel


==> removing 1-hop links...
==> done
==> removing 1-hop links...
==> done


---

In [None]:
all_subgraph = defaultdict(list)
def print_statistics(stats, label):
    print(f"{label} Statistics:")
    for key, value in stats.items():
        if key in ["mean", "std_dev"]:
            print(f"  {key}: {value:.4f}" if isinstance(value, float) else f"  {key}: {value}")
def calculate_overall_score(query_set, train_sampler, method="default", label=None):
    overall_score = 0
    for query in query_set:
        if method == "default":
            topk_node, _, subgraph = train_sampler.sampleSubgraph(query)
        elif method == "bfs":
            topk_node, _, subgraph = train_sampler.sampleSubgraphBFS(query)
        s = 0
        all_subgraph[label].append({
            "query": query,
            "subgraph": subgraph
        })
        answers = set(query["answers_id"])
        topk_node_set = set(topk_node)
        precision = len(answers & topk_node_set) / len(answers) if len(answers) > 0 else 0
        overall_score += precision
    overall_score /= len(query_set)
    return overall_score

#### P2.s0 approach: 

In [None]:
train_sampler = ExpandSubgraph(args.n_ent, args.n_rel,
                               train_graph_homo,train_graph,
                               args=args)

query_1_hop_stat = []
query_2_hop_stat = []
query_3_hop_stat = []

for _ in range(1):
    loader.shuffle_train()
    train_sampler.updateEdges(loader.train_graph)
    random.shuffle(query_1_hop)
    random.shuffle(query_2_hop)
    random.shuffle(query_3_hop)
    query_1_hop_stat.append(calculate_overall_score(query_1_hop[:100], train_sampler, label = "p2s2"))
    query_2_hop_stat.append(calculate_overall_score(query_2_hop[:100], train_sampler, label = "p2s2"))
    query_3_hop_stat.append(calculate_overall_score(query_3_hop[:100], train_sampler, label = "p2s2"))
    print(f"epoch {_}")

==> removing 1-hop links...
==> done


In [None]:
from utils import calculate_statistics


stats_1_hop = calculate_statistics(query_1_hop_stat)
print_statistics(stats_1_hop, "1-Hop Query")
stats_2_hop = calculate_statistics(query_2_hop_stat)
print_statistics(stats_2_hop, "2-Hop Query")
stats_3_hop = calculate_statistics(query_3_hop_stat)
print_statistics(stats_3_hop, "3-Hop Query")

1-Hop Query Statistics:
  mean: 0.7939
  std_dev: 0.0058
2-Hop Query Statistics:
  mean: 0.3045
  std_dev: 0.0031
3-Hop Query Statistics:
  mean: 0.1924
  std_dev: 0.0049


#### Vanila BFS

In [None]:
train_sampler = ExpandSubgraph(args.n_ent, args.n_rel,
                               train_graph_homo,train_graph,
                               args=args)

query_1_hop_stat_bfs = []
query_2_hop_stat_bfs = []
query_3_hop_stat_bfs = []

for _ in range(2):
    loader.shuffle_train()
    train_sampler.updateEdges(loader.train_graph)
    random.shuffle(query_1_hop)
    random.shuffle(query_2_hop)
    random.shuffle(query_3_hop)
    query_1_hop_stat_bfs.append(calculate_overall_score(query_1_hop[:50], train_sampler, "bfs", label="bfs"))
    query_2_hop_stat_bfs.append(calculate_overall_score(query_2_hop[:50], train_sampler, "bfs", label="bfs"))
    query_3_hop_stat_bfs.append(calculate_overall_score(query_3_hop[:50], train_sampler, "bfs", label="bfs"))
    print(f"epoch {_}")

In [None]:
stats_1_hop_bfs = calculate_statistics(query_1_hop_stat_bfs)
print_statistics(stats_1_hop_bfs, "1-Hop Query")
stats_2_hop_bfs = calculate_statistics(query_2_hop_stat_bfs)
print_statistics(stats_2_hop_bfs, "2-Hop Query")
stats_3_hop_bfs = calculate_statistics(query_3_hop_stat_bfs)
print_statistics(stats_3_hop_bfs, "3-Hop Query")

1-Hop Query Statistics:
  mean: 0.7970
  std_dev: 0.0118
1-Hop Query Statistics:
  mean: 0.7970
  std_dev: 0.0118
2-Hop Query Statistics:
  mean: 0.4025
  std_dev: 0.0118
3-Hop Query Statistics:
  mean: 0.4431
  std_dev: 0.0123


In [36]:
train_sampler = ExpandSubgraph(args.n_ent, args.n_rel,
                               train_graph_homo,train_graph,
                               args=args)
for _ in range(3):
    random.shuffle(query_1_hop)
    random.shuffle(query_2_hop)
    random.shuffle(query_3_hop)
    calculate_overall_score(query_1_hop[0:1], train_sampler, "bfs", label = "p2s2_final")
    calculate_overall_score(query_2_hop[0:1], train_sampler, "bfs", label = "p2s2_final")
    calculate_overall_score(query_3_hop[0:1], train_sampler, "bfs", label = "p2s2_final")


In [40]:
sums = 0 
for a in all_subgraph['p2s2_final']:
    subgraph = a['subgraph']
    sums +=  len(np.unique(subgraph[:, [0, 2]].flatten()))

avg =sums / len(all_subgraph)
print("Average unique elements:", avg)

Average unique elements: 113169.0


### Sub-objective approach

In [None]:
train_sampler = ExpandSubgraph(args.n_ent, args.n_rel,
                               train_graph_homo,train_graph,
                               args=args, use_sub_objectives_a=True)

query_1_hop_stat_a = []
query_2_hop_stat_a = []
query_3_hop_stat_a = []

for _ in range(2):
    loader.shuffle_train()
    train_sampler.updateEdges(loader.train_graph)
    random.shuffle(query_1_hop)
    random.shuffle(query_2_hop)
    random.shuffle(query_3_hop)
    query_1_hop_stat_a.append(calculate_overall_score(query_1_hop[:50], train_sampler, label = "subojective"))
    query_2_hop_stat_a.append(calculate_overall_score(query_2_hop[:50], train_sampler, label = "subojective"))
    query_3_hop_stat_a.append(calculate_overall_score(query_3_hop[:50], train_sampler, label = "subojective"))
    print(f"epoch {_}")

openai error, retry: 1 validation error for SubobjectiveOutput
  Invalid JSON: EOF while parsing a string at line 1 column 169 [type=json_invalid, input_value='{"res":["Identify the aw... the awards he has been', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
epoch 0
epoch 1


In [None]:
from utils import calculate_statistics


stats_1_hop_a = calculate_statistics(query_1_hop_stat_a)
print_statistics(stats_1_hop_a, "1-Hop Query")
stats_2_hop_a = calculate_statistics(query_2_hop_stat_a)
print_statistics(stats_2_hop_a, "2-Hop Query")
stats_3_hop_a = calculate_statistics(query_3_hop_stat_a)
print_statistics(stats_3_hop_a, "3-Hop Query")

1-Hop Query Statistics:
  mean: 0.2864
  std_dev: 0.0002
2-Hop Query Statistics:
  mean: 0.0949
  std_dev: 0.0367
3-Hop Query Statistics:
  mean: 0.0416
  std_dev: 0.0312


### Sub-objective approach 2

In [None]:
train_sampler = ExpandSubgraph(args.n_ent, args.n_rel,
                               train_graph_homo,train_graph,
                               args=args, use_sub_objectives_b=True)

query_1_hop_stat_b = []
query_2_hop_stat_b = []
query_3_hop_stat_b = []

for _ in range(2):
    loader.shuffle_train()
    train_sampler.updateEdges(loader.train_graph)
    random.shuffle(query_1_hop)
    random.shuffle(query_2_hop)
    random.shuffle(query_3_hop)
    query_1_hop_stat_b.append(calculate_overall_score(query_1_hop[:50], train_sampler, label = "subojective"))
    query_2_hop_stat_b.append(calculate_overall_score(query_2_hop[:50], train_sampler, label = "subojective"))
    query_3_hop_stat_b.append(calculate_overall_score(query_3_hop[:50], train_sampler, label = "subojective"))
    print(f"epoch {_}")

==> removing 1-hop links...
==> done
epoch 0
==> removing 1-hop links...
==> done
epoch 1


In [None]:
from utils import calculate_statistics


stats_1_hop_b = calculate_statistics(query_1_hop_stat_b)
print_statistics(stats_1_hop_b, "1-Hop Query")
stats_2_hop_b = calculate_statistics(query_2_hop_stat_b)
print_statistics(stats_2_hop_b, "2-Hop Query")
stats_3_hop_b = calculate_statistics(query_3_hop_stat_b)
print_statistics(stats_3_hop_b, "3-Hop Query")

1-Hop Query Statistics:
  mean: 0.2926
  std_dev: 0.0399
2-Hop Query Statistics:
  mean: 0.1010
  std_dev: 0.0069
3-Hop Query Statistics:
  mean: 0.0475
  std_dev: 0.0114


### Analyzing the size of retrieved subgraph

In [None]:
avg_size = 0
for i in range(len(all_subgraph["p2s2"])):
    subgraph = all_subgraph["p2s2"][i]["subgraph"]
    # print(subgraph)
    # print(subgraph[:,[0,2]].flatten())
    number_of_nodes = np.unique(subgraph[:,[0,2]].flatten()).shape[0]
    # print(number_of_nodes)
    avg_size += number_of_nodes
    # break
avg_size /= len(all_subgraph["p2s2"])
avg_size

38.31111111111111

### output result

In [None]:
import json
results = {
    "mine":{
        "1-hop": stats_1_hop,
        "2-hop": stats_2_hop,
        "3-hop": stats_3_hop
    },
    "bfs": {
        "1-hop": stats_1_hop_bfs,
        "2-hop": stats_2_hop_bfs,
        "3-hop": stats_3_hop_bfs
    },
    "subobjective": {
        "1-hop": stats_1_hop_a,
        "2-hop": stats_2_hop_a,
        "3-hop": stats_3_hop_a
    }
}


In [None]:
with open("results/expand_subgraph/expand_subgraph_results_2_12.json", "w") as f:
    json.dump(results, f, indent=4)

## Show result:

In [None]:
import json

In [None]:
with open("results/expand_subgraph/expand_subgraph_results.json", "r") as f:
    results = json.load(f)
results.keys()

dict_keys(['mine', 'bfs'])

In [None]:
results['bfs']['3-hop']

{'count': 3,
 'mean': 0.44306987267385384,
 'median': 0.451781993885975,
 'std_dev': 0.012320799975220113,
 'min': 0.42564563024961144,
 'max': 0.451781993885975,
 '25th_percentile': 0.43871381206779325,
 '75th_percentile': 0.451781993885975}

In [None]:
results['sub_objective_a']={
    '1-hop': {
        'mean': 0.2864,
        'std_dev': 0.0002,
    },
    '2-hop': {
        'mean': 0.0949,
        'std_dev': 0.0367,
    },
    '3-hop':{
        'mean': 0.0416,
        'std_dev': 0.0312
    }
}
results['sub_objective_b']={
    '1-hop': {
        'mean': 0.2926,
        'std_dev': 0.0399
    },
    '2-hop': {
        'mean': 0.1010,
        'std_dev': 0.0069
    },
    '3-hop':{
        'mean': 0.0475,
        'std_dev': 0.0114
    }
}



In [None]:
config_dict = {
    "data_path": args.data_path,
    "seed": args.seed,
    "k": args.k,
    "depth": args.depth,
    "gpu": args.gpu,
    "fact_ratio": args.fact_ratio,
    "val_num": args.val_num,
    "epoch": args.epoch,
    "layer": args.layer,
    "batchsize": args.batchsize,
    "cpu": args.cpu,
    "weight": args.weight,
    "add_manual_edges": args.add_manual_edges,
    "remove_1hop_edges": args.remove_1hop_edges,
    "only_eval": args.only_eval,
    "not_shuffle_train": args.not_shuffle_train,
    "device": args.device
}