In [1]:
import pandas as pd

Available embedding models: 'autosf', 'boxe', 'compgcn', 'complex', 'complexliteral', 'conve', 'convkb', 'cooccurrencefiltered', 'cp', 'crosse', 'distma', 'distmult', 'distmultliteral', 'distmultliteralgated', 'ermlp', 'ermlpe', 'fixed', 'hole', 'inductivenodepiece', 'inductivenodepiecegnn', 'kg2e', 'mure', 'nodepiece', 'ntn', 'pairre', 'proje', 'quate', 'rescal', 'rgcn', 'rotate', 'se', 'simple', 'toruse', 'transd', 'transe', 'transf', 'transh', 'transr', 'tucker', 'um'

In [2]:
from sklearn.model_selection import train_test_split

### Functions for custom split of triples

In [18]:
def get_entities(triples):
    entities = set()
    for l in triples:
        e1, _, e2 = l.split("\t")
        entities.update({e1, e2})
    return entities

def get_relations(triples):
    relations = set()
    for l in triples:
        relations.add(l.split('\t')[1])
    return relations

def filter_triples(triples, train, valid_entities, valid_relations):
    remaining, removed = [], []
    for l in triples:
        e1, r, e2 = l.split('\t')
        if e1 in valid_entities and e2 in valid_entities and r in valid_relations:
            remaining.append(l)
        else:
            removed.append(l)
    return train+removed, remaining

def write_to_file(storage_path, data):
    with open(storage_path, 'w') as file:
        file.writelines(data)

def split_data(kg, file_name = 'drkg.tsv'):
    path = "./datasets/"+kg
    with open(path+f"/{file_name}", 'r') as file:
        data = file.readlines()
        data = list(filter(lambda x: len(x.split('\t')) == 3, data))
        print(len(data))
        
    train, temp_test = train_test_split(data, test_size=0.2, random_state=142)
    train_entities = get_entities(train)
    train_relations = get_relations(train)
    
    train, test = filter_triples(temp_test, train, train_entities, train_relations)
    
    train, temp_valid = train_test_split(train, test_size=0.1, random_state=142)
    train_entities = get_entities(train)
    train_relations = get_relations(train)
    
    train, valid = filter_triples(temp_valid, train, train_entities, train_relations)
    
    print(f'\nStatistics train: {100*float(len(train))/len(data)}%, valid: {100*float(len(valid))/len(data)}%,\
          test: {100*float(len(test))/len(data)}%')
    storage_paths = ['./datasets/'+kg+'/train.txt', './datasets/'+kg+'/valid.txt', './datasets/'+kg+'/test.txt']
    for path, d in zip(storage_paths, [train, valid, test]):
        write_to_file(path, d)
    print('\n Done saving data')

### Creating triples.txt, a file containing all triples (train+test+valid). It helps split the data into train, test, valid in a way that all entities and relations that appear in test.txt and valid.txt also appear in train.txt.

In [None]:
#train = pd.read_csv('datasets/openbiolink/train_sample.csv', header=None)
#test = pd.read_csv('datasets/openbiolink/test_sample.csv', header=None)
#valid = pd.read_csv('datasets/openbiolink/val_sample.csv', header=None)
#
#train=list(map(lambda x: '\t'.join(x[0].split('\t')[:3])+'\n', (train.values).tolist()))
#test=list(map(lambda x: '\t'.join(x[0].split('\t')[:3])+'\n', (test.values).tolist()))
#valid=list(map(lambda x: '\t'.join(x[0].split('\t')[:3])+'\n', (valid.values).tolist()))

In [None]:
#with open('datasets/openbiolink/triples.txt', 'w') as file:
#    file.writelines(train+test+valid)

In [19]:
def merge_train_test_valid_data(kg):

    with open(f'datasets/{kg}/train.txt') as file:
        train = file.readlines()

    with open(f'datasets/{kg}/test.txt') as file:
        test = file.readlines()

    with open(f'datasets/{kg}/valid.txt') as file:
        valid = file.readlines()

    with open(f'datasets/{kg}/triples.txt', 'w') as file:
        file.writelines(train+test+valid)
    print('Donne creating triples.txt !')

In [15]:
merge_train_test_valid_data('yago3')

Donne creating triples.txt !


In [16]:
merge_train_test_valid_data('fb15k')

Donne creating triples.txt !


In [18]:
merge_train_test_valid_data('wn18rr')

Donne creating triples.txt !


### Splitting into train, test and validation as described above

In [20]:
split_data('wn18rr', 'triples.txt')

93003

Statistics train: 84.29943120114405%, valid: 4.504155779921078%,          test: 11.196413018934873%

 Done saving data


In [21]:
split_data('fb15k', 'triples.txt')

310116

Statistics train: 72.18943879064608%, valid: 7.934450334713462%,          test: 19.876110874640457%

 Done saving data


In [22]:
split_data('yago3', 'triples.txt')

1089040

Statistics train: 72.87978402997135%, valid: 7.726437963711159%,          test: 19.39377800631749%

 Done saving data


In [24]:
split_data('drkg')

5874261

Statistics train: 72.17558089434569%, valid: 7.946957072557723%,          test: 19.877462033096588%

 Done saving data


In [25]:
split_data('carcinogenesis', 'triples.txt')

96939

Statistics train: 77.3733997668637%, valid: 6.324595879883225%,          test: 16.302004353253075%

 Done saving data


In [26]:
split_data('mutagenesis', 'triples.txt')

62067

Statistics train: 77.21655630206068%, valid: 6.3624792562875605%,          test: 16.420964441651762%

 Done saving data


In [27]:
split_data('openbiolink', 'triples.txt')

4563405

Statistics train: 73.07898378513413%, valid: 7.684371647925179%,          test: 19.236644566940694%

 Done saving data


### Here we replace the white space by an underscore in entity and relation names to avoid wrong splits in rule learning systems

In [29]:
def remove_space(data_type, kg='drkg'):
    """ Only for the DRKG 
    - The function loads and modifies train, valid, and test files removing white spaces in entity and relation names
    """
    with open(f'./datasets/{kg}/'+data_type+'.txt') as file:
        data = file.readlines()
    new_data = []
    for triple in data:
        e1,r,e2 = triple.split('\t')
        e1 = e1.replace(' ', '_')
        e2 = e2.replace(' ', '_')
        r = r.replace(' ', '_')
        new_data.append('\t'.join([e1, r, e2]))
    with open(f'./datasets/{kg}/'+data_type+'.txt', 'w') as file:
        file.writelines(new_data)

In [30]:
remove_space('train')

In [31]:
remove_space('test')

In [32]:
remove_space('valid')

### Counting the number of triples in train, valid and test splits. The numbers of relations and entities are also printed.

In [18]:
def statistics(kg):
    for data_type in ['train', 'valid', 'test']:
        print(f'\n****** {data_type.upper()} ******\n')
        with open(f'./datasets/{kg}/{data_type}.txt') as file:
            data = list(map(str.strip, file.readlines()))

        print("triples: ", len(data))
        entities = set()
        for line in data:
            entities.update(set(line.split('\t')[0::2]))
        relations = set(map(lambda x: x.split('\t')[1], data))
        print("entities: ", len(entities))
        print("relations: ", len(relations))

In [19]:
statistics('carcinogenesis')


****** TRAIN ******

triples:  75005
entities:  23645
relations:  24

****** VALID ******

triples:  6131
entities:  7007
relations:  20

****** TEST ******

triples:  15803
entities:  14107
relations:  22


In [20]:
statistics('mutagenesis')


****** TRAIN ******

triples:  47926
entities:  15260
relations:  14

****** VALID ******

triples:  3949
entities:  4611
relations:  10

****** TEST ******

triples:  10192
entities:  9163
relations:  12


In [21]:
statistics('openbiolink')


****** TRAIN ******

triples:  3334890
entities:  184635
relations:  28

****** VALID ******

triples:  350669
entities:  47963
relations:  28

****** TEST ******

triples:  877846
entities:  68272
relations:  28


In [22]:
statistics('drkg')


****** TRAIN ******

triples:  4239782
entities:  97238
relations:  107

****** VALID ******

triples:  466825
entities:  46072
relations:  107

****** TEST ******

triples:  1167654
entities:  58591
relations:  107


In [23]:
statistics('yago3')


****** TRAIN ******

triples:  793690
entities:  123182
relations:  37

****** VALID ******

triples:  84144
entities:  65695
relations:  36

****** TEST ******

triples:  211206
entities:  95693
relations:  37


In [24]:
statistics('wn18rr')


****** TRAIN ******

triples:  78401
entities:  40943
relations:  11

****** VALID ******

triples:  4189
entities:  6384
relations:  11

****** TEST ******

triples:  10413
entities:  12659
relations:  11


In [25]:
statistics('fb15k')


****** TRAIN ******

triples:  223871
entities:  14541
relations:  237

****** VALID ******

triples:  24606
entities:  11112
relations:  237

****** TEST ******

triples:  61639
entities:  13166
relations:  237
