In [1]:
import os
from typing import List

import numpy as np
import pandas as pd

from kbc_pul.project_info import data_dir

%load_ext autoreload
%autoreload 2


# Explore how many entities overlap between the argument positions (E1/E2) of every pair of relations

In [2]:
# Name of the knowledge-base dataset; used below as a sub-directory of data_dir.
dataset_name: str = "yago3_10"
# Name of the data partition to inspect.
# NOTE(review): no cell visible here reads this variable (the load cell hard-codes 'train.csv') — confirm whether it is still needed.
data_partition_name: str = "train"

In [3]:
# Directory from which the triple file 'train.csv' of the selected dataset is read below.
cleaned_csv_dir: str = os.path.join(data_dir, dataset_name, 'cleaned_csv')


In [4]:
# Load the training triples. Despite the '.csv' extension the file is read as
# tab-separated and header-less, so the three column names are supplied explicitly.
train_tsv_input_file = os.path.join(cleaned_csv_dir, 'train.csv')
train_df = pd.read_csv(
    train_tsv_input_file,
    sep="\t",
    header=None,
    names=["E1", "Rel", "E2"],
)
train_df.head()

Unnamed: 0,E1,Rel,E2
0,'e50_Cent',actedin,'eGet_Rich_or_Die_Tryin_(film)'
1,'e50_Cent',actedin,'eRighteous_Kill'
2,'eA._K._Hangal',actedin,'eAap_Ki_Kasam'
3,'eA._K._Hangal',actedin,'eAbhimaan_(1973_film)'
4,'eA._K._Hangal',actedin,'eAmar_Deep_(1979_film)'


In [5]:
# Show the distinct relation names that occur in the "Rel" column of the training triples.
train_df["Rel"].unique()

array(['actedin', 'created', 'dealswith', 'diedin', 'directed', 'edited',
       'exports', 'graduatedfrom', 'happenedin', 'hasacademicadvisor',
       'hascapital', 'haschild', 'hascurrency', 'hasgender',
       'hasmusicalrole', 'hasneighbor', 'hasofficiallanguage',
       'haswebsite', 'haswonprize', 'imports', 'influences',
       'isaffiliatedto', 'iscitizenof', 'isconnectedto', 'isinterestedin',
       'isknownfor', 'isleaderof', 'islocatedin', 'ismarriedto',
       'ispoliticianof', 'livesin', 'owns', 'participatedin', 'playsfor',
       'wasbornin', 'worksat', 'wrotemusicfor'], dtype=object)

In [6]:
# Relations included in the overlap analysis: every distinct value of the "Rel"
# column, converted from the array returned by unique() to a plain Python list.
relations_to_check: List[str] = train_df["Rel"].unique().tolist()
relations_to_check

['actedin',
 'created',
 'dealswith',
 'diedin',
 'directed',
 'edited',
 'exports',
 'graduatedfrom',
 'happenedin',
 'hasacademicadvisor',
 'hascapital',
 'haschild',
 'hascurrency',
 'hasgender',
 'hasmusicalrole',
 'hasneighbor',
 'hasofficiallanguage',
 'haswebsite',
 'haswonprize',
 'imports',
 'influences',
 'isaffiliatedto',
 'iscitizenof',
 'isconnectedto',
 'isinterestedin',
 'isknownfor',
 'isleaderof',
 'islocatedin',
 'ismarriedto',
 'ispoliticianof',
 'livesin',
 'owns',
 'participatedin',
 'playsfor',
 'wasbornin',
 'worksat',
 'wrotemusicfor']

In [7]:
# Collect one row per compared pair of relation argument slots. Row layout:
#   [R1, R1 position, R2, R2 position, |intersection|, |R1 slot entities|, |R2 slot entities|]
data_rows: List = []

# Extract the distinct E1 / E2 entities of every relation exactly once.
# Doing this inside the pair loop would re-filter the whole of train_df
# (and recompute unique()) for every pair of relations.
relation_entity_sets = {}
for relation_name in relations_to_check:
    relation_df: pd.DataFrame = train_df[train_df["Rel"] == relation_name]
    relation_entity_sets[relation_name] = (
        relation_df["E1"].unique(),
        relation_df["E2"].unique(),
    )

for relation_index, relation_name in enumerate(relations_to_check):
    relation_ent1_set, relation_ent2_set = relation_entity_sets[relation_name]

    # Within a single relation only the E1-vs-E2 comparison is recorded.
    # assume_unique=True is valid because both arrays come from Series.unique().
    relation_ent_set_intersection = np.intersect1d(
        relation_ent1_set, relation_ent2_set, assume_unique=True
    )
    data_rows.append([
        relation_name, "E1",
        relation_name, "E2",
        len(relation_ent_set_intersection),
        len(relation_ent1_set),
        len(relation_ent2_set),
    ])

    # Only later relations are compared, so each unordered relation pair is visited once.
    for other_relation_name in relations_to_check[relation_index + 1:]:
        other_relation_ent1_set, other_relation_ent2_set = relation_entity_sets[other_relation_name]

        # All four slot combinations, in the order (E1,E1), (E1,E2), (E2,E1), (E2,E2).
        for relation_ent_set, relation_ent_arg_pos in [(relation_ent1_set, "E1"), (relation_ent2_set, "E2")]:
            for other_relation_ent_set, other_relation_ent_arg_pos in [
                (other_relation_ent1_set, "E1"),
                (other_relation_ent2_set, "E2"),
            ]:
                ent_set_intersection = np.intersect1d(
                    relation_ent_set, other_relation_ent_set, assume_unique=True
                )
                data_rows.append([
                    relation_name, relation_ent_arg_pos,
                    other_relation_name, other_relation_ent_arg_pos,
                    len(ent_set_intersection),
                    len(relation_ent_set),
                    len(other_relation_ent_set),
                ])


In [8]:
# Prints a fixed marker when this cell runs (presumably to signal that the long-running overlap loop above has finished).
print("DONE")


DONE


In [9]:
# Turn the collected rows into a table. Column meaning follows the row layout built above:
# relation 1 + its argument position, relation 2 + its argument position,
# size of the intersection, and the number of distinct entities in each slot.
entity_counts_columns = ["R1", "R1E", "R2", "R2E", "inter", "n_R1E", "n_R2E"]
entity_counts_df = pd.DataFrame(data_rows, columns=entity_counts_columns)
entity_counts_df.head()

Unnamed: 0,R1,R1E,R2,R2E,inter,n_R1E,n_R2E
0,actedin,E1,actedin,E2,0,5402,6754
1,actedin,E1,created,E1,537,5402,2850
2,actedin,E1,created,E2,5,5402,4425
3,actedin,E2,created,E1,0,6754,2850
4,actedin,E2,created,E2,3925,6754,4425


In [10]:
# Keep only the slot pairs that share at least one entity (non-zero intersection size).
has_overlap = entity_counts_df["inter"] != 0
entity_counts_df = entity_counts_df.loc[has_overlap]

In [11]:
# Preview the table after the rows with an empty intersection ("inter" == 0) were dropped.
entity_counts_df.head()

Unnamed: 0,R1,R1E,R2,R2E,inter,n_R1E,n_R2E
1,actedin,E1,created,E1,537,5402,2850
2,actedin,E1,created,E2,5,5402,4425
4,actedin,E2,created,E2,3925,6754,4425
9,actedin,E1,diedin,E1,1443,5402,9243
13,actedin,E1,directed,E1,470,5402,1558


In [12]:
# Destination of the overlap table:
#   <data_dir>/<dataset_name>/data_stats/overlapping_entity_counts.tsv
entity_counts_filename = os.path.join(
    data_dir,
    dataset_name,
    'data_stats',
    'overlapping_entity_counts.tsv'
)
# DataFrame.to_csv does not create missing parent directories, so make sure the
# 'data_stats' directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(entity_counts_filename), exist_ok=True)
# Write the table tab-separated and without the row index.
entity_counts_df.to_csv(entity_counts_filename,
                        sep="\t", index=False)