In [None]:
import pandas as pd
import numpy as np

# Load the tab-separated knowledge-graph table; it must contain a 'Relation' column.
Bioteque = pd.read_table('BioKG_Ryu.txt')

# One row per distinct relation label (first-appearance order), numbered 0..n-1.
relations = (
    Bioteque[['Relation']]
    .drop_duplicates()
    .assign(ID=lambda unique_rel: range(len(unique_rel)))
)

# Attach the integer relation ID to every row of the graph table.
Bioteque = Bioteque.merge(relations, on='Relation', how='inner')

In [2]:
# Headerless (Node 1, Node 2, relation ID) triples, tab-separated.
Bioteque.to_csv(
    'BKG_file.txt',
    columns=['Node 1', 'Node 2', 'ID'],
    sep='\t',
    index=False,
    header=None,
)

# Relation label -> ID lookup table; this one keeps its header row.
relations.to_csv('BKG_Relation.txt', sep='\t', index=False)

In [None]:
# Load the tab-separated DDI table (later cells read its 'Relation', 'Node 1' and 'Node 2' columns).
DDI = pd.read_table('Ryu_DDI.txt')

In [4]:
# Drop the first 9 characters of each 'Relation' string and parse the remainder as an integer.
# NOTE(review): assumes every label has a fixed 9-character prefix followed by digits — confirm against the data file.
DDI['ID'] = DDI['Relation'].str.slice(start=9).astype(int)

In [5]:
# Keep only the node pair and the integer class label; the original 'Relation' text is dropped here.
DDI = DDI.loc[:, ['Node 1', 'Node 2', 'ID']]

In [None]:
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, train_test_split

# Stratified 5-fold cross-validation over the DDI class label ('ID').
k_fold = 5
skf = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=42)

# One (train, valid, test) triple of DataFrames per outer fold.
folds = []

for train_index, test_index in skf.split(DDI, DDI['ID']):
    train_set = DDI.iloc[train_index]
    test_set = DDI.iloc[test_index]

    # Carve a validation set out of the training portion: 25% of the remaining
    # 4/5 of the data, i.e. roughly a 60/20/20 train/valid/test split overall.
    train_set, val_set = train_test_split(train_set, test_size=0.25, stratify=train_set['ID'], random_state=42)

    folds.append((train_set.reset_index(drop=True), val_set.reset_index(drop=True), test_set.reset_index(drop=True)))

# Write each fold to ./iFold_<k>/{train,valid,test}.txt as headerless TSV.
for i, (train_set, val_set, test_set) in enumerate(folds):
    iFold = 'iFold_'+str(i+1)
    # to_csv does not create missing parent directories and raises OSError
    # when the target folder is absent, so make sure it exists first.
    Path(f'./{iFold}').mkdir(parents=True, exist_ok=True)
    train_set.to_csv(f'./{iFold}/train.txt', sep = '\t', index = False, header = None)
    val_set.to_csv(f'./{iFold}/valid.txt', sep = '\t', index = False, header = None)
    test_set.to_csv(f'./{iFold}/test.txt', sep = '\t', index = False, header = None)


# Preview the three partitions of the first fold.
train_set, val_set, test_set = folds[0]
for heading, partition in (
    ("Train Set:\n", train_set),
    ("Validation Set:\n", val_set),
    ("Test Set:\n", test_set),
):
    print(heading, partition)

Train Set:
         Node 1  Node 2  ID
0          176    1005   2
1          981     359   7
2          889     233  27
3          279     782  50
4          377      53  34
...        ...     ...  ..
114827     372      37   8
114828     205     340  77
114829     974    1200   2
114830     205    1626  77
114831     318    1146  80

[114832 rows x 3 columns]
Validation Set:
        Node 1  Node 2  ID
0        1483     983   2
1         344      32   8
2         526      35   2
3         704     677   7
4         657     702  12
...       ...     ...  ..
38273     707      97   8
38274    1024     479   2
38275     565     638  27
38276      75     139   3
38277     290     277   8

[38278 rows x 3 columns]
Test Set:
        Node 1  Node 2  ID
0           0     124   1
1           0     133   1
2           0     211   1
3           0     441   0
4           0    1197   2
...       ...     ...  ..
38273    1571    1482   7
38274    1572    1314   7
38275    1572    1315   7
38276    15

In [8]:
# Number of rows per class ID, most frequent first (the defaults, spelled out).
# This shows how small the rarest classes are, which matters for the stratified splits.
DDI['ID'].value_counts(sort=True, ascending=False)

ID
2     60936
8     34146
7     23546
5      9350
30     8395
      ...  
18       11
65       11
72       10
49        7
73        6
Name: count, Length: 86, dtype: int64

In [9]:
# Load the tab-separated node-type table; later cells use its third column, named 'Type'.
Node_Type = pd.read_table('Node_Type.txt')

In [10]:
# Distinct values of the third column (position 2), kept as a one-column DataFrame
# so that an ID column can be added next to it.
nodetype = Node_Type.iloc[:, 2].drop_duplicates().to_frame()

In [11]:
# Number the distinct types 0..n-1 in their first-appearance order.
n_types = len(nodetype)
nodetype['Type_ID'] = range(n_types)

In [12]:
# Attach the integer Type_ID to every node row by joining on the shared 'Type' column.
Node_Type = Node_Type.merge(nodetype, on='Type', how='inner')

In [13]:
# Write one Type_ID per line, in Node_Type row order, with no header and no index column.
Node_Type.to_csv('entity.txt', columns=['Type_ID'], header=None, index=False)