In [1]:
import pandas as pd
import numpy as np

# Load the tab-separated knowledge-graph edge list.
Bioteque = pd.read_csv('BioKG_DrugBank.txt', sep='\t')

# Lookup table: one row per distinct 'Relation' value, numbered 0..n-1 in
# order of first appearance (drop_duplicates keeps the first occurrence).
relations = (
    Bioteque[['Relation']]
    .drop_duplicates()
    .assign(ID=lambda distinct: range(len(distinct)))
)

# Attach the integer relation ID to every edge by joining on the relation name.
Bioteque = Bioteque.merge(relations, on='Relation', how='inner')

In [2]:
# Export the edges as headerless, tab-separated (Node 1, Node 2, relation ID) rows.
edge_triples = Bioteque[['Node 1', 'Node 2', 'ID']]
edge_triples.to_csv('BKG_file.txt', sep='\t', index=False, header=None)

# Export the relation-name -> ID lookup table; unlike the edge file, this one
# keeps its header row.
relations.to_csv('BKG_Relation.txt', sep='\t', index=False)

In [3]:
# Load the tab-separated DDI table; later cells read its 'Node 1', 'Node 2'
# and 'Relation' columns.
DDI = pd.read_csv('DrugBank_DDI.txt', sep='\t')

In [4]:
# Derive the integer label from the 'Relation' string: drop its first nine
# characters and parse whatever remains as an int.
# NOTE(review): presumably those nine characters are a fixed textual prefix in
# front of the number — confirm against DrugBank_DDI.txt.
relation_suffix = DDI['Relation'].str.slice(start=9)
DDI['ID'] = relation_suffix.astype(int)

In [5]:
# Keep only the two node columns and the integer label. The 'Relation' column
# is dropped here, so the cell that parses it cannot be re-run after this one
# without reloading DDI.
DDI = DDI.loc[:, ['Node 1', 'Node 2', 'ID']]

In [None]:
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, train_test_split

# Stratified k-fold split of the DDI rows, stratified on the 'ID' label.
# Each fold holds out 1/k_fold of the rows as the test set; the remaining rows
# are split 75/25 into train/validation, again stratified on 'ID'. With
# k_fold = 5 that gives a 60% / 20% / 20% train / validation / test split.
k_fold = 5
skf = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=42)

# One (train, validation, test) tuple of freshly re-indexed frames per fold.
folds = []

for train_index, test_index in skf.split(DDI, DDI['ID']):
    train_set = DDI.iloc[train_index]
    test_set = DDI.iloc[test_index]

    # Carve the validation set out of this fold's training portion.
    train_set, val_set = train_test_split(
        train_set,
        test_size=0.25,
        stratify=train_set['ID'],
        random_state=42,
    )

    folds.append(
        tuple(part.reset_index(drop=True) for part in (train_set, val_set, test_set))
    )

# Write every fold to ./iFold_<n>/{train,valid,test}.txt as headerless,
# tab-separated rows.
for fold_number, (train_set, val_set, test_set) in enumerate(folds, start=1):
    fold_dir = Path(f'./iFold_{fold_number}')
    # to_csv does not create missing parent directories, so make sure the
    # fold directory exists before writing into it.
    fold_dir.mkdir(parents=True, exist_ok=True)

    train_set.to_csv(fold_dir / 'train.txt', sep='\t', index=False, header=None)
    val_set.to_csv(fold_dir / 'valid.txt', sep='\t', index=False, header=None)
    test_set.to_csv(fold_dir / 'test.txt', sep='\t', index=False, header=None)

# Show the first fold as a sanity check.
train_set, val_set, test_set = folds[0]
print("Train Set:\n", train_set)
print("Validation Set:\n", val_set)
print("Test Set:\n", test_set)

Train Set:
         Node 1  Node 2  ID
0          929    1266  20
1          407    1761  22
2         1764      32  65
3          228     436   9
4          379    1129  48
...        ...     ...  ..
370360     482      52  23
370361     555     574  48
370362     189    1800  22
370363    1401    1517  31
370364     730    1232  20

[370365 rows x 3 columns]
Validation Set:
         Node 1  Node 2  ID
0         1262     337   9
1         1692    1696  23
2          810    1268   9
3          404    1458  70
4          216    1986   9
...        ...     ...  ..
123451     387    1164  16
123452    1942    1785  24
123453    1414    1850  24
123454    1661    1108  14
123455    1599     409   2

[123456 rows x 3 columns]
Test Set:
         Node 1  Node 2  ID
0            0      10   0
1            0      58   1
2            0     119   2
3            0     130   0
4            0     149   0
...        ...     ...  ..
123451    2080     852  11
123452    2080    1475  11
123453    2080 

In [8]:
# Class balance: number of DDI rows per relation ID, most frequent first.
id_counts = DDI['ID'].value_counts()
id_counts

ID
9      119017
24      99692
20      39990
48      39313
14      38324
        ...  
206         5
172         5
207         5
155         5
130         5
Name: count, Length: 221, dtype: int64

In [9]:
# Load the tab-separated node-type table; a later cell joins on its 'Type' column.
Node_Type = pd.read_csv('Node_Type.txt', sep='\t')

In [10]:
# One-column frame holding each distinct node type once, in order of first
# appearance (the original row index of each first occurrence is kept).
# The column is selected by name, not by position: the later merge joins on
# 'Type', so a positional lookup would only stay correct while that column
# happens to be the third one in Node_Type.txt.
nodetype = Node_Type[['Type']].drop_duplicates()

In [11]:
# Number the distinct node types 0..n-1 in their current row order.
nodetype = nodetype.assign(Type_ID=range(len(nodetype)))

In [12]:
# Attach each row's numeric Type_ID by joining on the 'Type' column.
# Node_Type is rebound to the merged frame, so run this cell only once after
# Node_Type is loaded: a second run would find 'Type_ID' on both sides and
# produce suffixed 'Type_ID_x' / 'Type_ID_y' columns instead.
Node_Type = Node_Type.merge(nodetype, on='Type', how='inner')

In [13]:
# Write one Type_ID per line (no header, no index), in Node_Type row order.
node_type_ids = Node_Type[['Type_ID']]
node_type_ids.to_csv('entity.txt', header=None, index=False)