In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sentence_transformers import SentenceTransformer, util
import random
import json
import os
import pickle as pkl
import torch
import openai
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
# Sentence encoder used to embed both the relation strings and the queries.
model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')

# All three FB15k-237 files are read from one directory; keeping the path in a
# single constant prevents the individual loads from drifting apart (the name
# file was previously opened from an absolute "/knowledge_graph/..." path).
KG_DIR = "../knowledge_graph/KG_data/FB15k-237-betae"

# NOTE: unpickling runs arbitrary code -- only load files from a trusted source.
with open(f"{KG_DIR}/id2ent.pkl", "rb") as f:
    id2ent = pkl.load(f)  # entity id -> Freebase mid
with open(f"{KG_DIR}/id2rel.pkl", "rb") as f:
    id2rel = pkl.load(f)  # relation id -> relation string

# Freebase mid -> display name, parsed from a tab-separated text file.
# Encoding is assumed to be UTF-8 -- TODO confirm against the data file.
ent2name = {}
with open(f"{KG_DIR}/FB15k_mid2name.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Split on the first tab only so a name containing a tab stays intact.
        mid, name = line.split("\t", 1)
        ent2name[mid] = name



In [4]:
# Relation strings in the mapping's iteration order; show the first five.
rels = [*id2rel.values()]
rels[:5]

['+/location/country/form_of_government',
 '-/location/country/form_of_government',
 '+/tv/tv_program/regular_cast./tv/regular_tv_appearance/actor',
 '-/tv/tv_program/regular_cast./tv/regular_tv_appearance/actor',
 '+/media_common/netflix_genre/titles']

### Tạo embedding cho các relations

In [5]:
rel_embeddings = model.encode(rels)

Nhận xét:  
+ Direct và inverse relation thường có độ khớp gần như nhau.  

Original query:

In [6]:
query = "Which awards has David Copperfield received for his performances in films?"

#### Query split:  
Query có thể có nhiều phần và cần phải qua nhiều hop mới đạt được kết quả. Do đó, ta có thể tách query thành các sub-query và xử lí từng phần.  
+ Splitting queries: Ta có thể đưa LLM làm (PoG có bước này)
+ Compose results: 
    + Sử dụng mean
    + Sử dụng reciprocal rank
    + 

In [7]:
split_queries = [
    "What films did David Copperfield perform in?",
    "What awards has David Copperfield won?"
]

In [8]:
# Overrides the list of sub-queries defined in the previous cell with the
# original, unsplit question, so the following cells score one query string.
split_queries = "Which awards has David Copperfield received for his performances in films?"

In [9]:
# Embed the query text with the same encoder that produced `rel_embeddings`.
query_emb = model.encode(split_queries)

# Dot-product score of the query embedding(s) against every relation embedding.
scores = util.dot_score(query_emb, rel_embeddings)

### Thử combine 2 relation để so khớp với query.

In [10]:
def fuse_mean(scores):
    """Fuse per-query score rows by averaging them column-wise.

    Args:
        scores: 2-D tensor of shape (n_queries, n_candidates).

    Returns:
        Tensor of shape (1, n_candidates) with the mean score of every
        candidate. The result stays on the device of ``scores`` and keeps its
        floating-point dtype (integer input is promoted to the default float
        dtype). An input with zero rows yields all zeros.
    """
    n_rows, n_cols = scores.shape[0], scores.shape[1]
    if not scores.is_floating_point():
        # Integer scores cannot hold a mean; promote before reducing.
        scores = scores.to(torch.get_default_dtype())
    if n_rows == 0:
        # Nothing to average: keep the historical "all zeros" result instead
        # of dividing by zero.
        return scores.new_zeros((1, n_cols))
    return scores.sum(dim=0, keepdim=True) / n_rows

scores = fuse_mean(scores)

In [11]:
def fuse_rrf(scores, k=10):
    """Fuse per-query score rows with Reciprocal Rank Fusion (RRF).

    Each row is turned into 1-based ranks (highest score = rank 1) and every
    candidate receives ``sum(1 / (k + rank))`` over all rows.

    Args:
        scores: 2-D tensor of shape (n_queries, n_candidates).
        k: constant added to each rank before taking the reciprocal.

    Returns:
        Float tensor of shape (1, n_candidates). Entry ``j`` is the fused
        score of candidate column ``j`` of ``scores``, so the result can be
        indexed with the same positions as the candidate list.
    """
    n_candidates = scores.shape[1]
    fused = torch.zeros(1, n_candidates, device=scores.device)
    # Rank values 1..n, handed out in descending-score order below.
    rank_values = torch.arange(1, n_candidates + 1, device=scores.device)
    for row in scores.unbind(dim=0):
        order = torch.argsort(row, descending=True)
        # Scatter the ranks back to the candidates' original columns. The
        # previous implementation emitted scores in rank order of the first
        # row, which broke the column <-> candidate correspondence.
        ranks = torch.empty_like(order)
        ranks[order] = rank_values
        fused[0] += 1.0 / (k + ranks)
    return fused
scores = fuse_rrf(scores)

In [12]:
# scores[0]

In [13]:



def compare_rel_query_and_return_topk(rels, query, fuse_func=fuse_mean, k=5):
    """Score candidate relations against one or more queries and show the top ``k``.

    Args:
        rels: a single relation string or a sequence of relation strings.
        query: a query string, or a sequence of sub-query strings. With
            several sub-queries the score matrix has one row per sub-query
            and ``fuse_func`` combines the rows.
        fuse_func: callable reducing the (n_queries, n_rels) score matrix to
            a (1, n_rels) tensor.
        k: number of best matches to report.

    Returns:
        The result of ``return_top_k`` for the computed scores.
    """
    # A bare string is one item; any other iterable (tuple, dict view, ...) is
    # materialised as a list so it can be indexed when reporting results.
    rels = [rels] if isinstance(rels, str) else list(rels)
    # Same normalisation for the query so that a list of sub-queries is encoded
    # as separate sentences rather than wrapped into a nested list.
    queries = [query] if isinstance(query, str) else list(query)
    rels_emb = model.encode(rels)
    query_emb = model.encode(queries)
    scores = util.dot_score(query_emb, rels_emb)
    return return_top_k(scores, rels, k=k, fuse_func=fuse_func)

def return_top_k(scores, rels, k=5, fuse_func=fuse_mean):
    """Print the ``k`` best-scoring relations after fusing the score rows.

    Args:
        scores: score tensor handed to ``fuse_func``.
        rels: sequence of relations, indexed by the column positions of the
            fused scores.
        k: how many entries to print.
        fuse_func: callable applied to ``scores`` before ranking; its result
            is read as ``[0, idx]``.

    Returns:
        None. The ranking is written to stdout.
    """
    fused = fuse_func(scores)
    ranking = torch.sort(fused, descending=True).indices
    separator = "-" * 50
    for position in ranking.tolist()[0][:k]:
        print(f"Index: {position}")
        relation = rels[position]
        print(f"Relation: {relation}")
        print(f"Score: {fused[0, position].item()}")
        print(separator)

In [14]:
# Candidate "composed" relations: plain string concatenation of two relation
# paths, tried in both orders.
c_rel1 = rels[13] + rels[6]
c_rel2 = rels[6] + rels[13]
c_rels = [c_rel1, c_rel2]  # rels[13] is the film-performance relation, rels[6] the award-winner relation

In [15]:
compare_rel_query_and_return_topk(c_rels+rels, split_queries, fuse_func=fuse_mean, k=5)

Index: 15
Relation: -/film/actor/film./film/performance/film
Score: 93.86505126953125
--------------------------------------------------
Index: 0
Relation: -/film/actor/film./film/performance/film+/award/award_winner/awards_won./award/award_honor/award_winner
Score: 93.60128021240234
--------------------------------------------------
Index: 207
Relation: -/film/director/film
Score: 93.42400360107422
--------------------------------------------------
Index: 1
Relation: +/award/award_winner/awards_won./award/award_honor/award_winner-/film/actor/film./film/performance/film
Score: 93.12854766845703
--------------------------------------------------
Index: 14
Relation: +/film/actor/film./film/performance/film
Score: 92.56756591796875
--------------------------------------------------


Các relation liên quan tới film dominate


In [16]:
rels[6], rels[13]

('+/award/award_winner/awards_won./award/award_honor/award_winner',
 '-/film/actor/film./film/performance/film')

### Inspect queries

In [None]:
# Load the pickled training queries and inspect the first record.
# NOTE(review): this path is absolute (rooted at "/"), unlike the
# "../knowledge_graph/..." paths used for the KG files -- confirm it is intended.
with open("/knowledge_graph/queries/train_2c_id.pkl", "rb") as f:
    train_queries = pkl.load(f)

# Rebinds `query` (until now the question string) to the first query record.
query= train_queries[0]
query

{'query_type': ('e', ('r', 'r')),
 'raw_query': (4582, (133, 17)),
 'named_query': ('Franklin',
  ('-/people/person/places_lived./people/place_lived/location',
   '-/award/award_nominee/award_nominations./award/award_nomination/award_nominee')),
 'transformed_query': ['Who are the award nominees that have lived in places associated with Franklin?',
  'What are the names of the individuals who were nominated for awards and lived in locations linked to Franklin?',
  'Can you list the award nominees who have resided in locations related to Franklin?'],
 'answers': ['Vince_Gill',
  'Albert_Lee',
  'Keith_Urban',
  'Carrie_Underwood',
  'John_Travolta']}

In [40]:
# Reload the training queries from a different pickle file; this replaces the
# `train_queries` and `query` bound by the previous cell.
with open("data/queries/train_2c_id1.pkl", "rb") as f:
    train_queries = pkl.load(f)

# First record, displayed as the cell output.
query= train_queries[0]
query

{'query_type': ('e', ('r', 'r')),
 'raw_query': ('Franklin',
  ('-/people/person/places_lived./people/place_lived/location',
   '-/award/award_nominee/award_nominations./award/award_nomination/award_nominee')),
 'transformed_query': ['Who are the award nominees that have lived in places associated with Franklin?',
  'What are the names of the individuals who were nominated for awards and lived in locations linked to Franklin?',
  'Can you list the award nominees who have resided in locations related to Franklin?'],
 'answers': ['Vince_Gill',
  'Albert_Lee',
  'Keith_Urban',
  'Carrie_Underwood',
  'John_Travolta']}

### Test pipeline extract subgraph

In [18]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Re-creates `model` from the same checkpoint as the earlier load cell and moves it to `device`.
model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b').to(device)

In [20]:
from expand_subgraph import ExpandSubgraph
from gen_query import  extract_numbers, extract_strings, extract_notations

In [39]:
expander = ExpandSubgraph(model, util, query)
expander.expand_subgraph()

depth: 0
1 entities to expand.
10 triplets found.
Selected 10 triplets to expand.
Subgraph now has 10 triplets.
depth: 1
9 entities to expand.
8710 triplets found.


KeyboardInterrupt: 

In [38]:
expander.evaluate_subgraph()

subgraph has 50 triplets.


(0.6, 0.0)

In [32]:
expander.answers_id

[2403, 1221, 137, 1589, 4093]

In [26]:
id2ent[2403]

'/m/02fn5r'

In [None]:
print(len(expander.subgraph))

50


In [None]:
# Reverse lookups (value -> key) for the three mapping tables. When several
# keys share the same value, the entry iterated last wins.
name2ent = {name: mid for mid, name in ent2name.items()}
ent2id = {ent: idx for idx, ent in id2ent.items()}
rel2id = {rel: idx for idx, rel in id2rel.items()}

In [None]:
query

{'query_type': ('e', ('r', 'r')),
 'raw_query': ('Franklin',
  ('-/people/person/places_lived./people/place_lived/location',
   '-/award/award_nominee/award_nominations./award/award_nomination/award_nominee')),
 'transformed_query': ['Who are the award nominees that have lived in places associated with Franklin?',
  'What are the names of the individuals who were nominated for awards and lived in locations linked to Franklin?',
  'Can you list the award nominees who have resided in locations related to Franklin?'],
 'answers': ['Vince_Gill',
  'Albert_Lee',
  'Keith_Urban',
  'Carrie_Underwood',
  'John_Travolta']}

In [None]:
extract_strings(train_queries[0]['raw_query'])

['Franklin',
 '-/people/person/places_lived./people/place_lived/location',
 '-/award/award_nominee/award_nominations./award/award_nomination/award_nominee']

In [None]:
train_queries[1]

{'query_type': ('e', ('r', 'r')),
 'raw_query': ('String',
  ('+/music/performance_role/track_performances./music/track_contribution/role',
   '+/dataworld/gardening_hint/split_to')),
 'transformed_query': ['What roles do performers play in the track performances related to String?',
  'What are the different contributions of musicians in performances of the String genre?',
  'In the context of String, what are the various performance roles for track contributions?'],
 'answers': ['plucked_string_instruments',
  'Percussion',
  'Lead_guitar',
  'Electric_piano',
  'Horn']}

In [None]:
name2ent['string']

'/m/0680x0'