In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from MSI.load_msi_data import LoadData

In [2]:
'''DataLoader'''
# Instantiate the project's MSI loader (imported from MSI.load_msi_data).
dataloader = LoadData()

# get_dict
# Only the drug id<->name mappings are requested in the cells shown here;
# the protein / indication / biological-function lookups are left disabled.
# protein_id2name, protein_name2id = dataloader.get_dict(type='protein')
drug_id2name, drug_name2id = dataloader.get_dict(type='drug')
# indication_id2name, indication_name2id = dataloader.get_dict(type='indication')
# biof_id2name, biof_name2id = dataloader.get_dict(type='biological_function')

Keep only the drug pairs in DCDB / C_DCDB / DB_DDI / TWOSIDES_DDI whose drugs both appear in MSI

In [3]:
# Drug identifiers known to MSI: the keys of the id -> name mapping.
# NOTE(review): presumably drug_id2name is a plain dict — confirm against LoadData.get_dict.
msi_drug_list = list(drug_id2name.keys())

In [4]:
# Load the four original label tables. Three are tab-separated; DB_DDI is a
# comma-separated file whose 'Y' column is dropped right after loading.
dcdb_dual = pd.read_csv('ori_labels/DCDB_dual.tsv', sep='\t')
c_dcdb_dual = pd.read_csv('ori_labels/C_DCDB_dual.tsv', sep='\t')
db_ddi = pd.read_csv('ori_labels/DB_DDI.csv').drop(columns=['Y'])
twosides_ddi = pd.read_csv('ori_labels/TWOSIDES_DDI.tsv', sep='\t')

In [5]:
def leave_common_drugs(df, drug_list=None):
    """Keep only the rows of ``df`` whose two drugs are both in the MSI drug list.

    The same boolean mask drives both the printed row count and the returned
    frame, so the reported number always matches what is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table with ``drug_1`` and ``drug_2`` columns.
    drug_list : iterable, optional
        Drug identifiers to keep. When omitted, the notebook-level
        ``msi_drug_list`` is used.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` where both ``drug_1`` and ``drug_2`` are in the
        drug list. The original index labels are retained.
    """
    if drug_list is None:
        drug_list = msi_drug_list
    # A set gives constant-time membership instead of scanning a list per row.
    allowed = set(drug_list)
    keep = df['drug_1'].isin(allowed) & df['drug_2'].isin(allowed)
    print(f'Original number of rows: {len(df)}')
    print(f'Filtered number of rows: {int(keep.sum())}')
    return df[keep]

In [6]:
# Restrict every label table to pairs whose drugs are both in MSI.
# Each name is rebound to its filtered frame, so the unfiltered tables are
# no longer reachable after this cell runs.
dcdb_dual = leave_common_drugs(dcdb_dual)
c_dcdb_dual = leave_common_drugs(c_dcdb_dual)
db_ddi = leave_common_drugs(db_ddi)
twosides_ddi = leave_common_drugs(twosides_ddi)

Original number of rows: 598
Filtered number of rows: 455
Original number of rows: 9092
Filtered number of rows: 4221
Original number of rows: 191808
Filtered number of rows: 122040
Original number of rows: 18530
Filtered number of rows: 15710


Save DCDB, C_DCDB

In [7]:
# Write the MSI-filtered DCDB and C_DCDB tables as tab-separated files,
# without the index column.
dcdb_dual.to_csv('MSI/data/labels/DCDB_msi.tsv', sep='\t', index=None)
c_dcdb_dual.to_csv('MSI/data/labels/C_DCDB_msi.tsv', sep='\t', index=None)

Combine DCDB, C_DCDB

In [8]:
# Reload the MSI-filtered DCDB and C_DCDB tables from disk.
dcdb_dual = pd.read_csv('MSI/data/labels/DCDB_msi.tsv', sep='\t')
c_dcdb_dual = pd.read_csv('MSI/data/labels/C_DCDB_msi.tsv', sep='\t')

# Represent every row as an unordered pair built from its first two columns,
# so (a, b) and (b, a) compare equal. frozenset is hashable, which allows the
# constant-time lookup below.
dcdb_drug_set = [
    frozenset(pair) for pair in zip(dcdb_dual.iloc[:, 0], dcdb_dual.iloc[:, 1])
]
c_dcdb_drug_set = [
    frozenset(pair) for pair in zip(c_dcdb_dual.iloc[:, 0], c_dcdb_dual.iloc[:, 1])
]

# Row positions in DCDB whose pair does not already occur in C_DCDB.
c_dcdb_pair_lookup = set(c_dcdb_drug_set)
not_common_idx = [
    pos for pos, pair in enumerate(dcdb_drug_set) if pair not in c_dcdb_pair_lookup
]

# All of C_DCDB followed by the DCDB-only rows, renumbered from 0.
dc = pd.concat(
    [c_dcdb_dual, dcdb_dual.iloc[not_common_idx]], axis=0
).reset_index(drop=True)

In [9]:
# Persist the combined DCDB + C_DCDB table (tab-separated, no index column).
dc.to_csv('MSI/data/labels/DC_combined_msi.tsv', sep='\t', index=None)

Keep only the drug-drug interaction pairs that do not appear in the combined DC set (DDI − DC)

In [10]:
# Reload the combined DCDB + C_DCDB table written earlier.
dc = pd.read_csv('MSI/data/labels/DC_combined_msi.tsv', sep='\t')

# Re-apply the MSI drug filter to both DDI tables. In the recorded output
# the row counts are unchanged by these two calls (122040 and 15710).
db_ddi = leave_common_drugs(db_ddi)
twosides_ddi = leave_common_drugs(twosides_ddi)

Original number of rows: 122040
Filtered number of rows: 122040
Original number of rows: 15710
Filtered number of rows: 15710


In [11]:
def leave_unique_pair(dc, ddi):
    """Return the rows of ``ddi`` whose drug pair does not occur in ``dc``.

    A pair is taken from the first two columns of each frame and compared
    without regard to order, so (a, b) matches (b, a).

    Parameters
    ----------
    dc : pandas.DataFrame
        Table whose pairs should be excluded.
    ddi : pandas.DataFrame
        Table to filter.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of ``ddi`` in their original order, with the index
        reset to 0..n-1.
    """
    def _unordered_pairs(frame):
        # One frozenset per row, built from the first two columns by position.
        if frame.empty:
            return []
        return [frozenset(pair) for pair in zip(frame.iloc[:, 0], frame.iloc[:, 1])]

    # Hash lookup replaces a scan over a list of sets for every ddi row.
    dc_pairs = set(_unordered_pairs(dc))
    keep_positions = [
        pos for pos, pair in enumerate(_unordered_pairs(ddi)) if pair not in dc_pairs
    ]
    return ddi.iloc[keep_positions].reset_index(drop=True)

In [12]:
# Drop from each DDI table every pair that also appears in the combined
# DC table; both names are rebound to the reduced frames.
db_ddi = leave_unique_pair(dc, db_ddi)
twosides_ddi = leave_unique_pair(dc, twosides_ddi)

In [13]:
# Row counts of the two DDI tables after removing pairs shared with dc.
print(len(db_ddi))
print(len(twosides_ddi))

120599
15300


In [14]:
# Write the final DDI tables as tab-separated files without the index column.
db_ddi.to_csv('MSI/data/labels/DB_DDI_msi.tsv', sep='\t', index=None)
twosides_ddi.to_csv('MSI/data/labels/TWOSIDES_DDI_msi.tsv', sep='\t', index=None)