In [1]:
import os
import os.path as osp

import pandas as pd
import torch
from tqdm import tqdm

In [2]:
def split_dataset(df, x, verbose=True):
    """Split a DataFrame into ``x`` consecutive, row-wise subsets.

    Every subset holds ``len(df) // x`` rows except the last one, which also
    receives the remainder rows. When ``x`` exceeds ``len(df)`` the first
    ``x - 1`` subsets are therefore empty and the last one holds every row.
    Each subset gets a fresh 0-based index; ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    x : int
        Number of subsets to produce; must be at least 1.
    verbose : bool, default True
        When True, print the row count and ``head()`` of every subset.

    Returns
    -------
    list of pandas.DataFrame
        The ``x`` subsets, in original row order.

    Raises
    ------
    ValueError
        If ``x`` is smaller than 1.
    """
    # x == 0 would raise a bare ZeroDivisionError below, and x < 0 would
    # silently return an empty list (dropping every row), so reject both.
    if x < 1:
        raise ValueError(f"x must be a positive integer, got {x!r}")

    total_rows = len(df)
    subset_size = total_rows // x

    subsets = []
    for i in range(x):
        start_index = i * subset_size
        # The last subset absorbs the rows left over by the integer division.
        end_index = start_index + subset_size if i < x - 1 else total_rows
        subsets.append(df.iloc[start_index:end_index].reset_index(drop=True))

    if verbose:
        # Preview each subset: size, first rows, then a blank separator line.
        for number, subset in enumerate(subsets, start=1):
            print(f"Subset {number} - Rows: {len(subset)}")
            print(subset.head())
            print()
    return subsets

In [None]:
# Load the raw kg.csv table; the path is relative to the current working directory.
df = pd.read_csv("../prime/raw/kg.csv")

In [None]:
# Preview the first five rows of the loaded table.
df.head(5)

In [None]:
# Cast every column to str (the node IDs are later merged against string IDs),
# then display the resulting dtypes to confirm the conversion.
df = df.astype(str)
df.dtypes

# Get nodes

In [None]:
# Build the node table from BOTH endpoints of every row in df.
# The x_* and y_* column groups are renamed to one shared schema and stacked, so a
# node that only ever appears as a y endpoint is still included.
node_columns = ["ID", "type", "name", "source"]
x_columns = ["x_index", "x_type", "x_name", "x_source"]
y_columns = ["y_index", "y_type", "y_name", "y_source"]

df_nodes = pd.concat(
    [
        df[x_columns].rename(columns=dict(zip(x_columns, node_columns))),
        df[y_columns].rename(columns=dict(zip(y_columns, node_columns))),
    ],
    axis=0,
)

print("Size of df_nodes before drop duplicates: ", df_nodes.shape[0])
# Keep one row per distinct (ID, type, name, source) combination; first occurrence wins.
df_nodes = df_nodes.drop_duplicates(ignore_index=False)
print("Size of df_nodes after drop duplicates: ", df_nodes.shape[0])

# Replace apostrophes in names with a space. assign() returns a new frame, so no
# write goes into a filtered view (avoids pandas' SettingWithCopyWarning).
df_nodes = df_nodes.assign(name=df_nodes["name"].str.replace("'", " ", regex=False))
df_nodes

In [None]:
# List the distinct values of the node "type" column.
df_nodes['type'].unique()

## Get embeddings

In [None]:
# Embedding model name; NOTE(review): not referenced by any other cell visible here.
emb_model = "text-embedding-ada-002"
# Location of the saved embedding dictionary, relative to the working directory.
candidate_emb_path = osp.join("../data/embedding/", 'candidate_emb_dict.pt')
print(candidate_emb_path)

In [None]:
# Load the saved embedding dictionary. Later cells index into
# `candidate_emb_dict`, so a missing file is raised as an error here instead of
# surfacing later as a NameError.
if not osp.exists(candidate_emb_path):
    raise FileNotFoundError(f'candidate_emb_dict not found in {candidate_emb_path}!')

# NOTE(review): torch.load unpickles the file; only load files from a trusted source.
candidate_emb_dict = torch.load(candidate_emb_path)
print(f'Loaded candidate_emb_dict from {candidate_emb_path}!')

In [None]:
# Keys of the embedding dictionary, in the dictionary's own iteration order.
list_id = list(candidate_emb_dict)
# For each key, convert the tensor to nested lists and keep its first row.
# assumes each tensor has a leading dimension of size 1 — TODO confirm
list_emb = [candidate_emb_dict[key].numpy().tolist()[0] for key in tqdm(list_id)]

# Length of one embedding vector, printed as a sanity check.
len_emb = len(list_emb[0])
print(len_emb)

# One row per key; IDs are cast to str so they match the string IDs used for merging.
df_emb = pd.DataFrame({"ID": list_id, "embedding": list_emb})
df_emb["ID"] = df_emb["ID"].astype(str)
df_emb

In [None]:
# Attach embeddings to nodes by ID; a left join keeps nodes that have no embedding.
df_nodes_final = df_nodes.merge(df_emb, how="left", on="ID")

In [None]:
# Preview the merged node table.
df_nodes_final.head(5)

In [None]:
# Split the node table into 10 consecutive chunks (the last chunk takes the remainder).
df_datasets = split_dataset(df_nodes_final, 10)

In [None]:
# Write each node chunk to its own CSV. The output directory is created first
# because DataFrame.to_csv does not create missing parent directories.
nodes_out_dir = "../prime/new"
os.makedirs(nodes_out_dir, exist_ok=True)
for i, item in enumerate(df_datasets):
    item.to_csv(f"{nodes_out_dir}/kg_nodes_{i}.csv", index=False)

# Get edges

In [None]:
# Distinct (relation, display_relation) pairs found in df; original row labels are kept.
df_edges = df.loc[:, ["relation", "display_relation"]].drop_duplicates(ignore_index=False)

df_edges

So we want to use `display_relation`, as those are the true edges.

# Get node-relation-node list

In [None]:
# Edge list: the x_index / y_index endpoints plus both relation labels.
# .copy() makes df_relations an independent frame, so the column assignment below
# does not write into a slice of df (avoids pandas' SettingWithCopyWarning).
df_relations = df[["x_index", "display_relation", "relation", "y_index"]].copy()
# Replace spaces with underscores in the display relation labels (literal match).
df_relations["display_relation"] = df_relations["display_relation"].str.replace(" ", "_", regex=False)
df_relations

In [None]:
# List the distinct display relation labels after the underscore replacement.
df_relations["display_relation"].unique()

In [None]:
# Show the column dtypes of the relation table.
df_relations.dtypes

In [None]:
# Split the relation table into 10 consecutive chunks and write one CSV per chunk.
df_datasets = split_dataset(df_relations, 10)

# The output directory is created first because DataFrame.to_csv does not create
# missing parent directories.
relations_out_dir = "../prime/new"
os.makedirs(relations_out_dir, exist_ok=True)
for i, item in enumerate(df_datasets):
    item.to_csv(f"{relations_out_dir}/kg_relations_{i}.csv", index=False)

In [None]:
# Completion marker: reaching this cell means every cell above ran.
print("Done!")