In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Input triple files. The cells below select one by list position, so the
# order of the variants matters.
data_path = [
    './data/DBpedia_combin/DBpedia_duplicates_filterOntology_dropMaxRelationship_filterGeo' + variant + '.csv'
    for variant in (
        '',
        '_sampleSeed10',
        '_dropLessHalfEntAndRela',
        '_dropMost5PersentEntAndRela',
    )
]
# Geo file path and the root directory that receives the generated splits.
geo_path = './data/DBpedia_combin/' + 'DBpedia-geo.csv'
outdir = './final_dataset'

In [3]:
def count_scale(df_data):
    """Summarise the size of a triple table.

    Args:
        df_data: DataFrame with at least the columns 'head', 'relationship'
            and 'tail'; any other columns are ignored.

    Returns:
        Tuple ``(triplets_num, entity_num, rela_num)``: the row count, the
        number of distinct values over 'head' and 'tail' combined, and the
        number of distinct 'relationship' values. A missing value counts as
        one distinct value.
    """
    endpoints = pd.concat([df_data['head'], df_data['tail']])
    entity_num = endpoints.nunique(dropna=False)
    rela_num = df_data['relationship'].nunique(dropna=False)
    triplets_num = len(df_data)
    return triplets_num, entity_num, rela_num

## 生成数据集 S-DBpedia

In [23]:
# Build the S-DBpedia split: shuffle all triples, cut them 80/10/10 into
# train/valid/test, then drop every valid/test triple whose head, tail or
# relationship never occurs in train.
# NOTE: reads `data_path[0]` and `outdir`; later cells rebind both names, so
# the first configuration cell must be the one in effect when this runs.
df = pd.read_csv(data_path[0], sep='\t', names=['head', 'relationship', 'tail'])
# sample() gets no random_state, so the shuffle is reproducible only because
# the global NumPy seed is fixed right before it.
np.random.seed(99)
df = df.sample(frac=1.0).reset_index(drop=True)
val_index = round(0.8 * df.shape[0])   # first row of the validation slice
test_index = round(0.9 * df.shape[0])  # first row of the test slice
df_train = df.iloc[:val_index]
df_val = df.iloc[val_index:test_index]
df_test = df.iloc[test_index:]
# Entities and relationships seen in the training split.
ent = pd.concat([df_train['head'], df_train['tail']]).unique()
relationship = df_train['relationship'].unique()
print(f'df_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
# Membership test via isin(): same rows as left-merging flag columns and
# keeping the flagged rows, without building the intermediate frames.
df_val = df_val[
    df_val['head'].isin(ent)
    & df_val['tail'].isin(ent)
    & df_val['relationship'].isin(relationship)
]
df_test = df_test[
    df_test['head'].isin(ent)
    & df_test['tail'].isin(ent)
    & df_test['relationship'].isin(relationship)
]
print(f'after filter:\ndf_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
print(f'final:\ndf_train: {count_scale(df_train)}\ndf_val: {count_scale(df_val)}\ndf_test: {count_scale(df_test)}')
# to_csv() does not create missing directories, and os.path.join() yields the
# same path whether or not `outdir` ends with a slash.
dataset_dir = os.path.join(outdir, 'S-DBpedia')
os.makedirs(dataset_dir, exist_ok=True)
df_train.to_csv(os.path.join(dataset_dir, 'train.csv'), sep='\t', header=False, index=False)
df_val.to_csv(os.path.join(dataset_dir, 'valid.csv'), sep='\t', header=False, index=False)
df_test.to_csv(os.path.join(dataset_dir, 'test.csv'), sep='\t', header=False, index=False)

df_val: 227574 (227574, 241719, 230)
df_test: 227574  (227574, 241526, 222)
after filter:
df_val: 195104 (195104, 206618, 208)
df_test: 195106  (195106, 206578, 209)
final:
df_train: (1820591, 879610, 305)
df_val: (195104, 206618, 208)
df_test: (195106, 206578, 209)


## 生成数据集 S-DBpedia-large

In [24]:
# Build the S-DBpedia-large split: shuffle all triples, cut them 80/10/10 into
# train/valid/test, then drop every valid/test triple whose head, tail or
# relationship never occurs in train.
# NOTE: reads `data_path[1]` and `outdir`; later cells rebind both names, so
# the first configuration cell must be the one in effect when this runs.
df = pd.read_csv(data_path[1], sep='\t', names=['head', 'relationship', 'tail'])
# sample() gets no random_state, so the shuffle is reproducible only because
# the global NumPy seed is fixed right before it.
np.random.seed(99)
df = df.sample(frac=1.0).reset_index(drop=True)
val_index = round(0.8 * df.shape[0])   # first row of the validation slice
test_index = round(0.9 * df.shape[0])  # first row of the test slice
df_train = df.iloc[:val_index]
df_val = df.iloc[val_index:test_index]
df_test = df.iloc[test_index:]
# Entities and relationships seen in the training split.
ent = pd.concat([df_train['head'], df_train['tail']]).unique()
relationship = df_train['relationship'].unique()
print(f'df_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
# Membership test via isin(): same rows as left-merging flag columns and
# keeping the flagged rows, without building the intermediate frames.
df_val = df_val[
    df_val['head'].isin(ent)
    & df_val['tail'].isin(ent)
    & df_val['relationship'].isin(relationship)
]
df_test = df_test[
    df_test['head'].isin(ent)
    & df_test['tail'].isin(ent)
    & df_test['relationship'].isin(relationship)
]
print(f'after filter:\ndf_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
print(f'final:\ndf_train: {count_scale(df_train)}\ndf_val: {count_scale(df_val)}\ndf_test: {count_scale(df_test)}')
# to_csv() does not create missing directories, and os.path.join() yields the
# same path whether or not `outdir` ends with a slash.
dataset_dir = os.path.join(outdir, 'S-DBpedia-large')
os.makedirs(dataset_dir, exist_ok=True)
df_train.to_csv(os.path.join(dataset_dir, 'train.csv'), sep='\t', header=False, index=False)
df_val.to_csv(os.path.join(dataset_dir, 'valid.csv'), sep='\t', header=False, index=False)
df_test.to_csv(os.path.join(dataset_dir, 'test.csv'), sep='\t', header=False, index=False)

df_val: 113787 (113787, 134846, 214)
df_test: 113787  (113787, 134826, 208)
after filter:
df_val: 69577 (69577, 83960, 179)
df_test: 69603  (69603, 83937, 185)
final:
df_train: (910296, 650120, 278)
df_val: (69577, 83960, 179)
df_test: (69603, 83937, 185)


## 生成数据集 halfER

In [5]:
# Build the DBpedia-halfER split: shuffle all triples, cut them 80/10/10 into
# train/valid/test, then drop every valid/test triple whose head, tail or
# relationship never occurs in train.
# NOTE: reads `data_path[2]` and `outdir`; later cells rebind both names, so
# the first configuration cell must be the one in effect when this runs.
df = pd.read_csv(data_path[2], sep='\t', names=['head', 'relationship', 'tail'])
# sample() gets no random_state, so the shuffle is reproducible only because
# the global NumPy seed is fixed right before it.
np.random.seed(99)
df = df.sample(frac=1.0).reset_index(drop=True)
val_index = round(0.8 * df.shape[0])   # first row of the validation slice
test_index = round(0.9 * df.shape[0])  # first row of the test slice
df_train = df.iloc[:val_index]
df_val = df.iloc[val_index:test_index]
df_test = df.iloc[test_index:]
# Entities and relationships seen in the training split.
ent = pd.concat([df_train['head'], df_train['tail']]).unique()
relationship = df_train['relationship'].unique()
print(f'df_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
# Membership test via isin(): same rows as left-merging flag columns and
# keeping the flagged rows, without building the intermediate frames.
df_val = df_val[
    df_val['head'].isin(ent)
    & df_val['tail'].isin(ent)
    & df_val['relationship'].isin(relationship)
]
df_test = df_test[
    df_test['head'].isin(ent)
    & df_test['tail'].isin(ent)
    & df_test['relationship'].isin(relationship)
]
print(f'after filter:\ndf_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
print(f'final:\ndf_train: {count_scale(df_train)}\ndf_val: {count_scale(df_val)}\ndf_test: {count_scale(df_test)}')
# to_csv() does not create missing directories, and os.path.join() yields the
# same path whether or not `outdir` ends with a slash.
dataset_dir = os.path.join(outdir, 'DBpedia-halfER')
os.makedirs(dataset_dir, exist_ok=True)
df_train.to_csv(os.path.join(dataset_dir, 'train.csv'), sep='\t', header=False, index=False)
df_val.to_csv(os.path.join(dataset_dir, 'valid.csv'), sep='\t', header=False, index=False)
df_test.to_csv(os.path.join(dataset_dir, 'test.csv'), sep='\t', header=False, index=False)

df_val: 152621 (152621, 157847, 154)
df_test: 152622  (152622, 157852, 152)
after filter:
df_val: 148177 (148177, 154463, 152)
df_test: 148240  (148240, 154479, 152)
final:
df_train: (1220973, 461379, 157)
df_val: (148177, 154463, 152)
df_test: (148240, 154479, 152)


## 生成数据集 DPM5

In [4]:
# Build the DBpedia-DPM5 split: shuffle all triples, cut them 80/10/10 into
# train/valid/test, then drop every valid/test triple whose head, tail or
# relationship never occurs in train.
# NOTE: reads `data_path[3]` and `outdir`; later cells rebind both names, so
# the first configuration cell must be the one in effect when this runs.
df = pd.read_csv(data_path[3], sep='\t', names=['head', 'relationship', 'tail'])
# sample() gets no random_state, so the shuffle is reproducible only because
# the global NumPy seed is fixed right before it.
np.random.seed(99)
df = df.sample(frac=1.0).reset_index(drop=True)
val_index = round(0.8 * df.shape[0])   # first row of the validation slice
test_index = round(0.9 * df.shape[0])  # first row of the test slice
df_train = df.iloc[:val_index]
df_val = df.iloc[val_index:test_index]
df_test = df.iloc[test_index:]
# Entities and relationships seen in the training split.
ent = pd.concat([df_train['head'], df_train['tail']]).unique()
relationship = df_train['relationship'].unique()
print(f'df_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
# Membership test via isin(): same rows as left-merging flag columns and
# keeping the flagged rows, without building the intermediate frames.
df_val = df_val[
    df_val['head'].isin(ent)
    & df_val['tail'].isin(ent)
    & df_val['relationship'].isin(relationship)
]
df_test = df_test[
    df_test['head'].isin(ent)
    & df_test['tail'].isin(ent)
    & df_test['relationship'].isin(relationship)
]
print(f'after filter:\ndf_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
print(f'final:\ndf_train: {count_scale(df_train)}\ndf_val: {count_scale(df_val)}\ndf_test: {count_scale(df_test)}')
# to_csv() does not create missing directories, and os.path.join() yields the
# same path whether or not `outdir` ends with a slash.
dataset_dir = os.path.join(outdir, 'DBpedia-DPM5')
os.makedirs(dataset_dir, exist_ok=True)
df_train.to_csv(os.path.join(dataset_dir, 'train.csv'), sep='\t', header=False, index=False)
df_val.to_csv(os.path.join(dataset_dir, 'valid.csv'), sep='\t', header=False, index=False)
df_test.to_csv(os.path.join(dataset_dir, 'test.csv'), sep='\t', header=False, index=False)

df_val: 3323 (3323, 6375, 154)
df_test: 3324  (3324, 6375, 158)
after filter:
df_val: 621 (621, 1211, 82)
df_test: 566  (566, 1091, 86)
final:
df_train: (26589, 40827, 250)
df_val: (621, 1211, 82)
df_test: (566, 1091, 86)


In [9]:
# Rebuild the DPM5 split once per shuffle seed; each seed is written to its
# own 'DBpedia-DPM5-<seed>' directory.
# NOTE: reads `data_path[3]` and `outdir`; later cells rebind both names, so
# the first configuration cell must be the one in effect when this runs.
seeds = [10, 45, 68, 77]
# The input file does not depend on the seed, so parse it once up front.
df_dpm5_raw = pd.read_csv(data_path[3], sep='\t', names=['head', 'relationship', 'tail'])
for seed in seeds:
    print(f'seed:{seed}')
    # sample() gets no random_state, so the shuffle is reproducible only
    # because the global NumPy seed is fixed right before it.
    np.random.seed(seed)
    df = df_dpm5_raw.sample(frac=1.0).reset_index(drop=True)
    val_index = round(0.8 * df.shape[0])   # first row of the validation slice
    test_index = round(0.9 * df.shape[0])  # first row of the test slice
    df_train = df.iloc[:val_index]
    df_val = df.iloc[val_index:test_index]
    df_test = df.iloc[test_index:]
    # Entities and relationships seen in the training split.
    ent = pd.concat([df_train['head'], df_train['tail']]).unique()
    relationship = df_train['relationship'].unique()
    print(f'df_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
    # Keep a triple only when its head, tail and relationship all occur in
    # train; isin() replaces the left merges with flag columns.
    df_val = df_val[
        df_val['head'].isin(ent)
        & df_val['tail'].isin(ent)
        & df_val['relationship'].isin(relationship)
    ]
    df_test = df_test[
        df_test['head'].isin(ent)
        & df_test['tail'].isin(ent)
        & df_test['relationship'].isin(relationship)
    ]
    print(f'after filter:\ndf_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
    print(f'final:\ndf_train: {count_scale(df_train)}\ndf_val: {count_scale(df_val)}\ndf_test: {count_scale(df_test)}')
    # to_csv() does not create missing directories, and os.path.join() yields
    # the same path whether or not `outdir` ends with a slash.
    dataset_dir = os.path.join(outdir, f'DBpedia-DPM5-{seed}')
    os.makedirs(dataset_dir, exist_ok=True)
    df_train.to_csv(os.path.join(dataset_dir, 'train.csv'), sep='\t', header=False, index=False)
    df_val.to_csv(os.path.join(dataset_dir, 'valid.csv'), sep='\t', header=False, index=False)
    df_test.to_csv(os.path.join(dataset_dir, 'test.csv'), sep='\t', header=False, index=False)

seed:10
df_val: 3323 (3323, 6324, 158)
df_test: 3324  (3324, 6322, 154)
after filter:
df_val: 625 (625, 1213, 83)
df_test: 589  (589, 1138, 81)
final:
df_train: (26589, 40903, 256)
df_val: (625, 1213, 83)
df_test: (589, 1138, 81)
seed:45
df_val: 3323 (3323, 6365, 156)
df_test: 3324  (3324, 6366, 168)
after filter:
df_val: 623 (623, 1202, 79)
df_test: 602  (602, 1162, 90)
final:
df_train: (26589, 40853, 248)
df_val: (623, 1202, 79)
df_test: (602, 1162, 90)
seed:68
df_val: 3323 (3323, 6344, 160)
df_test: 3324  (3324, 6353, 160)
after filter:
df_val: 588 (588, 1141, 82)
df_test: 617  (617, 1184, 89)
final:
df_train: (26589, 40888, 251)
df_val: (588, 1141, 82)
df_test: (617, 1184, 89)
seed:77
df_val: 3323 (3323, 6344, 157)
df_test: 3324  (3324, 6359, 156)
after filter:
df_val: 558 (558, 1083, 90)
df_test: 596  (596, 1152, 86)
final:
df_train: (26589, 40807, 253)
df_val: (558, 1083, 90)
df_test: (596, 1152, 86)


## 生成数据集 S-DBpedia-GTnE系列

In [4]:
# Configuration for the S-DBpedia-GTnE series. This rebinds `data_path` and
# `outdir`; `data_path[k]` and `dataset_names[k]` belong together, so both
# lists are generated from the same sequence of thresholds.
data_path = [
    f'./data/DBpedia_combin/DBpedia_duplicates_filterOntology_dropMaxRelationship_filterGeo_get{n}Ent.csv'
    for n in (5, 10, 20, 50)
]
outdir = './final_dataset/'
dataset_names = ['S-DBpedia-GT' + str(n) + 'E' for n in (5, 10, 20, 50)]

In [5]:
# Build one 80/10/10 split per input file of the current series; valid/test
# triples whose head, tail or relationship never occurs in train are dropped.
# `data_path` and `dataset_names` are parallel lists set by the configuration
# cell that precedes this one; fail early if they are out of sync.
assert len(data_path) == len(dataset_names), 'data_path and dataset_names must have the same length'
for p, dataset_name in zip(data_path, dataset_names):
    df = pd.read_csv(p, sep='\t', names=['head', 'relationship', 'tail'])
    # sample() gets no random_state, so the shuffle is reproducible only
    # because the global NumPy seed is fixed right before it.
    np.random.seed(99)
    df = df.sample(frac=1.0).reset_index(drop=True)
    val_index = round(0.8 * df.shape[0])   # first row of the validation slice
    test_index = round(0.9 * df.shape[0])  # first row of the test slice
    df_train = df.iloc[:val_index]
    df_val = df.iloc[val_index:test_index]
    df_test = df.iloc[test_index:]
    # Entities and relationships seen in the training split.
    ent = pd.concat([df_train['head'], df_train['tail']]).unique()
    relationship = df_train['relationship'].unique()
    print(f'df_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
    # Keep a triple only when its head, tail and relationship all occur in
    # train; isin() replaces the left merges with flag columns.
    df_val = df_val[
        df_val['head'].isin(ent)
        & df_val['tail'].isin(ent)
        & df_val['relationship'].isin(relationship)
    ]
    df_test = df_test[
        df_test['head'].isin(ent)
        & df_test['tail'].isin(ent)
        & df_test['relationship'].isin(relationship)
    ]
    print(f'after filter:\ndf_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
    print(f'final:\ndf_train: {count_scale(df_train)}\ndf_val: {count_scale(df_val)}\ndf_test: {count_scale(df_test)}')
    # os.path.join() yields the intended path whether or not `outdir` ends
    # with a slash (plain concatenation glued the dataset name onto the
    # directory name when it did not); to_csv() does not create directories.
    dataset_dir = os.path.join(outdir, dataset_name)
    os.makedirs(dataset_dir, exist_ok=True)
    df_train.to_csv(os.path.join(dataset_dir, 'train.csv'), sep='\t', header=False, index=False)
    df_val.to_csv(os.path.join(dataset_dir, 'valid.csv'), sep='\t', header=False, index=False)
    df_test.to_csv(os.path.join(dataset_dir, 'test.csv'), sep='\t', header=False, index=False)

df_val: 16103 (16103, 18329, 126)
df_test: 16103  (16103, 18353, 128)
after filter:
df_val: 15393 (15393, 17570, 123)
df_test: 15339  (15339, 17539, 125)
final:
df_train: (128823, 44187, 169)
df_val: (15393, 17570, 123)
df_test: (15339, 17539, 125)
df_val: 33448 (33448, 37527, 159)
df_test: 33448  (33448, 37244, 169)
after filter:
df_val: 32375 (32375, 36406, 158)
df_test: 32381  (32381, 36171, 159)
final:
df_train: (267585, 90523, 229)
df_val: (32375, 36406, 158)
df_test: (32381, 36171, 159)
df_val: 67767 (67767, 72138, 186)
df_test: 67768  (67768, 72237, 189)
after filter:
df_val: 66260 (66260, 70772, 177)
df_test: 66355  (66355, 70925, 184)
final:
df_train: (542141, 183706, 253)
df_val: (66260, 70772, 177)
df_test: (66355, 70925, 184)
df_val: 152705 (152705, 157967, 201)
df_test: 152705  (152705, 157980, 202)
after filter:
df_val: 148239 (148239, 154584, 197)
df_test: 148475  (148475, 154750, 199)
final:
df_train: (1221639, 461516, 281)
df_val: (148239, 154584, 197)
df_test: (148475, 154750, 199)

## 生成数据集 RDM系列

In [6]:
# Configuration for the RDM series. This rebinds `data_path`, `outdir` and
# `dataset_names`; `data_path[k]` is written out under `dataset_names[k]`.
data_path = [
    f'./data/DBpedia_combin/DBpedia_duplicates_filterOntology_dropMaxRelationship_filterGeo_sample{tag}.csv'
    for tag in (12, 25)
]
outdir = './final_dataset/'
dataset_names = ['S-DBpedia_' + size for size in ('small', 'medium')]

In [8]:
# Build one 80/10/10 split per input file of the current series; valid/test
# triples whose head, tail or relationship never occurs in train are dropped.
# `data_path` and `dataset_names` are parallel lists set by the configuration
# cell that precedes this one; fail early if they are out of sync.
assert len(data_path) == len(dataset_names), 'data_path and dataset_names must have the same length'
for p, dataset_name in zip(data_path, dataset_names):
    df = pd.read_csv(p, sep='\t', names=['head', 'relationship', 'tail'])
    # sample() gets no random_state, so the shuffle is reproducible only
    # because the global NumPy seed is fixed right before it.
    np.random.seed(99)
    df = df.sample(frac=1.0).reset_index(drop=True)
    val_index = round(0.8 * df.shape[0])   # first row of the validation slice
    test_index = round(0.9 * df.shape[0])  # first row of the test slice
    df_train = df.iloc[:val_index]
    df_val = df.iloc[val_index:test_index]
    df_test = df.iloc[test_index:]
    # Entities and relationships seen in the training split.
    ent = pd.concat([df_train['head'], df_train['tail']]).unique()
    relationship = df_train['relationship'].unique()
    print(f'df_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
    # Keep a triple only when its head, tail and relationship all occur in
    # train; isin() replaces the left merges with flag columns.
    df_val = df_val[
        df_val['head'].isin(ent)
        & df_val['tail'].isin(ent)
        & df_val['relationship'].isin(relationship)
    ]
    df_test = df_test[
        df_test['head'].isin(ent)
        & df_test['tail'].isin(ent)
        & df_test['relationship'].isin(relationship)
    ]
    print(f'after filter:\ndf_val: {df_val.shape[0]} {count_scale(df_val)}\ndf_test: {df_test.shape[0]}  {count_scale(df_test)}')
    print(f'final:\ndf_train: {count_scale(df_train)}\ndf_val: {count_scale(df_val)}\ndf_test: {count_scale(df_test)}')
    # os.path.join() yields the intended path whether or not `outdir` ends
    # with a slash (plain concatenation glued the dataset name onto the
    # directory name when it did not); to_csv() does not create directories.
    dataset_dir = os.path.join(outdir, dataset_name)
    os.makedirs(dataset_dir, exist_ok=True)
    df_train.to_csv(os.path.join(dataset_dir, 'train.csv'), sep='\t', header=False, index=False)
    df_val.to_csv(os.path.join(dataset_dir, 'valid.csv'), sep='\t', header=False, index=False)
    df_test.to_csv(os.path.join(dataset_dir, 'test.csv'), sep='\t', header=False, index=False)

df_val: 28446 (28446, 39234, 148)
df_test: 28447  (28447, 39225, 155)
after filter:
df_val: 6026 (6026, 8986, 96)
df_test: 6107  (6107, 9066, 102)
final:
df_train: (227574, 241869, 221)
df_val: (6026, 8986, 96)
df_test: (6107, 9066, 102)
df_val: 56894 (56894, 73271, 176)
df_test: 56893  (56893, 73286, 174)
after filter:
df_val: 21108 (21108, 28534, 128)
df_test: 21439  (21439, 28975, 132)
final:
df_train: (455148, 413157, 247)
df_val: (21108, 28534, 128)
df_test: (21439, 28975, 132)
