In [15]:
import os.path as osp
import torch
from tqdm import tqdm
import pandas as pd

In [16]:
def split_dataset(df, x):
    # Calculate the number of rows per subset
    total_rows = len(df)
    subset_size = total_rows // x

    # Create empty list to hold subsets
    subsets = []

    # Split the DataFrame into subsets
    for i in range(x):
        start_index = i * subset_size
        end_index = (i + 1) * subset_size if i < x - 1 else total_rows
        subset = df.iloc[start_index:end_index].reset_index(drop=True)
        subsets.append(subset)

    # Now subsets[i] contains the i-th subset
    # You can access each subset like subsets[0], subsets[1], etc.

    # Example usage: Print the first few rows of each subset
    for i, subset in enumerate(subsets):
        print(f"Subset {i+1} - Rows: {len(subset)}")
        print(subset.head())
        print()  # Separate subsets with a blank line
    return subsets

In [17]:
df = pd.read_csv("../prime/raw/kg.csv")

  df = pd.read_csv("../prime/raw/kg.csv")


In [18]:
df.head(5)

Unnamed: 0,relation,display_relation,x_index,x_id,x_type,x_name,x_source,y_index,y_id,y_type,y_name,y_source
0,protein_protein,ppi,0,9796,gene/protein,PHYHIP,NCBI,8889,56992,gene/protein,KIF15,NCBI
1,protein_protein,ppi,1,7918,gene/protein,GPANK1,NCBI,2798,9240,gene/protein,PNMA1,NCBI
2,protein_protein,ppi,2,8233,gene/protein,ZRSR2,NCBI,5646,23548,gene/protein,TTC33,NCBI
3,protein_protein,ppi,3,4899,gene/protein,NRF1,NCBI,11592,11253,gene/protein,MAN1B1,NCBI
4,protein_protein,ppi,4,5297,gene/protein,PI4KA,NCBI,2122,8601,gene/protein,RGS20,NCBI


In [20]:
# get all types to string
df = df.astype(str)
df.dtypes

relation            object
display_relation    object
x_index             object
x_id                object
x_type              object
x_name              object
x_source            object
y_index             object
y_id                object
y_type              object
y_name              object
y_source            object
dtype: object

# Get nodes

In [22]:
df_nodes = df[["x_index", "x_type", "x_name", "x_source"]]
df_nodes = df_nodes.rename(columns={"x_index": "y_index", "x_type": "y_type", "x_name": "y_name", "x_source": "y_source"})

# append the same columns with y_ prefix to df_nodes
df_nodes = pd.concat([df_nodes, df_nodes[["y_index", "y_type", "y_name", "y_source"]]], axis=0)

df_nodes = df_nodes.rename(columns={"y_index": "ID", "y_type": "type", "y_name": "name", "y_source": "source"})

print("Size of df_nodes before drop duplicates: ", df_nodes.shape[0])
# keep unique rows of df_nodes
df_nodes = df_nodes.drop_duplicates(ignore_index=False)
print("Size of df_nodes after drop duplicates: ", df_nodes.shape[0])

df_nodes['name'] = df_nodes['name'].str.replace("'", " ")
df_nodes

Size of df_nodes before drop duplicates:  16200996
Size of df_nodes after drop duplicates:  129375


Unnamed: 0,ID,type,name,source
0,0,gene/protein,PHYHIP,NCBI
1,1,gene/protein,GPANK1,NCBI
2,2,gene/protein,ZRSR2,NCBI
3,3,gene/protein,NRF1,NCBI
4,4,gene/protein,PI4KA,NCBI
...,...,...,...,...
6505728,129370,pathway,Ion transport by P-type ATPases,REACTOME
6505729,129371,pathway,Inhibition of voltage gated Ca2+ channels via...,REACTOME
6548634,129372,anatomy,anatomical entity,UBERON
6562425,129373,anatomy,multi-cellular organism,UBERON


## Get embeddings

In [25]:
emb_model = "text-embedding-ada-002"
candidate_emb_path = osp.join("../data/embedding/", 'candidate_emb_dict.pt')
print(candidate_emb_path)

../data/embedding/candidate_emb_dict.pt


In [26]:
if osp.exists(candidate_emb_path):
    candidate_emb_dict = torch.load(candidate_emb_path)
    print(f'Loaded candidate_emb_dict from {candidate_emb_path}!')
else:
    print(f'candidate_emb_dict not found in {candidate_emb_path}!')

  candidate_emb_dict = torch.load(candidate_emb_path)


Loaded candidate_emb_dict from ../data/embedding/candidate_emb_dict.pt!


In [27]:
list_id = []
list_emb = []
for item in tqdm(candidate_emb_dict):
    list_id.append(item)
    list_emb.append(candidate_emb_dict[item].numpy().tolist()[0])

len_emb = len(list_emb[0])
print(len_emb)

df_emb = pd.DataFrame(zip(list_id, list_emb), columns=["ID", "embedding"])
df_emb["ID"] = df_emb["ID"].astype(str)
df_emb

100%|██████████| 129375/129375 [00:14<00:00, 8651.62it/s] 


1536


Unnamed: 0,ID,embedding
0,0,"[-0.0497407391667366, -0.008042690344154835, -..."
1,1,"[-0.03097713552415371, 0.01564742438495159, 0...."
2,2,"[-0.029452160000801086, 0.011456595733761787, ..."
3,3,"[-0.03204813227057457, -0.013203300535678864, ..."
4,4,"[-0.03254737704992294, -0.01226135902106762, -..."
...,...,...
129370,129370,"[0.007171083241701126, 0.0017606967594474554, ..."
129371,129371,"[-0.011782837100327015, -0.01597079448401928, ..."
129372,129372,"[0.006585611030459404, 0.008282791823148727, 0..."
129373,129373,"[-0.02508579194545746, -0.006438905373215675, ..."


In [28]:
df_nodes_final = pd.merge(df_nodes, df_emb, on="ID", how="left")

In [30]:
df_nodes_final.head(5)

Unnamed: 0,ID,type,name,source,embedding
0,0,gene/protein,PHYHIP,NCBI,"[-0.0497407391667366, -0.008042690344154835, -..."
1,1,gene/protein,GPANK1,NCBI,"[-0.03097713552415371, 0.01564742438495159, 0...."
2,2,gene/protein,ZRSR2,NCBI,"[-0.029452160000801086, 0.011456595733761787, ..."
3,3,gene/protein,NRF1,NCBI,"[-0.03204813227057457, -0.013203300535678864, ..."
4,4,gene/protein,PI4KA,NCBI,"[-0.03254737704992294, -0.01226135902106762, -..."


In [31]:
df_datasets = split_dataset(df_nodes_final, 10)

Subset 1 - Rows: 12937
  ID          type    name source  \
0  0  gene/protein  PHYHIP   NCBI   
1  1  gene/protein  GPANK1   NCBI   
2  2  gene/protein   ZRSR2   NCBI   
3  3  gene/protein    NRF1   NCBI   
4  4  gene/protein   PI4KA   NCBI   

                                           embedding  
0  [-0.0497407391667366, -0.008042690344154835, -...  
1  [-0.03097713552415371, 0.01564742438495159, 0....  
2  [-0.029452160000801086, 0.011456595733761787, ...  
3  [-0.03204813227057457, -0.013203300535678864, ...  
4  [-0.03254737704992294, -0.01226135902106762, -...  

Subset 2 - Rows: 12937
      ID          type      name source  \
0  12937  gene/protein     TACR3   NCBI   
1  12938  gene/protein    SH2D1B   NCBI   
2  12939  gene/protein  PLA2G12A   NCBI   
3  12940  gene/protein   PRPF38A   NCBI   
4  12941  gene/protein    TRIM65   NCBI   

                                           embedding  
0  [-0.04068923741579056, 0.007761648390442133, -...  
1  [-0.04108533263206482, -0.00

In [None]:
for i, item in enumerate(df_datasets):
    item.to_csv(f"../prime/new/kg_nodes_{i}.csv", index=False)

# Get edges

In [None]:
df_edges = df[["relation", "display_relation"]]

# keep unique rows of df_nodes
df_edges = df_edges.drop_duplicates(ignore_index=False)

df_edges

So we want to use display relation as those are the true edges.

# Get node relation node list

In [None]:
df_relations = df[["x_index", "display_relation", "relation", "y_index"]]
df_relations["display_relation"] = df_relations["display_relation"].str.replace(" ", "_")
df_relations

In [None]:
df_relations.dtypes

In [None]:
df_datasets = split_dataset(df_relations, 10)

for i, item in enumerate(df_datasets):
    item.to_csv(f"../prime/new/kg_relations_{i}.csv", index=False)

In [None]:
print("Done!")