In [1]:
### drug+protein random-only
import json
import os
import random
import shutil
from tqdm import tqdm
import torch
import csv
import jsonlines

def readJSONL(fp):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        fp: Path of the UTF-8 encoded JSON Lines file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    res = []
    with open(fp, "r", encoding='utf-8') as f:
        # Iterate the file object directly so the raw lines are not all
        # buffered in a second list before decoding.
        for line in f:
            # Blank lines (typically a trailing newline at the end of the
            # file) carry no record and would make json.loads raise.
            if not line.strip():
                continue
            res.append(json.loads(line))
    return res

def writeJSONL(instance, fp):
    """Write every item of ``instance`` to ``fp`` as JSON Lines.

    The file is opened in ``'w'`` mode through ``jsonlines.open`` and the
    items are written in iteration order, one per line.
    """
    with jsonlines.open(fp, 'w') as writer:
        writer.write_all(instance)



def readCSV_data(fp):
    """Read the first three columns of a CSV file as a list of row dicts.

    The first row is treated as the header; only its first three names are
    kept (and printed). Every following row becomes ``{header: value}`` for
    those columns. Rows with fewer cells than the kept headers are reported
    and skipped.

    Args:
        fp: Path of the CSV file.

    Returns:
        A list of dicts with string keys and values, one per accepted data
        row, in file order. An empty file yields an empty list.
    """
    # NOTE: the file is decoded as ASCII with errors='ignore', so non-ASCII
    # characters are silently dropped from the parsed values.
    # newline='' leaves newline handling to the csv module, which its
    # documentation requires for correct parsing of quoted fields.
    with open(fp, mode='r', encoding='us-ascii', errors='ignore', newline='') as file:
        reader = csv.reader(file)

        header_row = next(reader, None)
        if header_row is None:
            # Empty file: there is no header row to read.
            return []
        headers = header_row[:3]  # keep the first three column names
        print('headers:', headers)

        data_list = []
        # Stream the remaining rows instead of materialising the whole file.
        for values in reader:
            if len(values) < len(headers):
                print(f"Warning: Skipped line due to mismatch in length: {values}")
                continue
            # zip stops at the shorter input, i.e. at the kept headers.
            data_list.append(dict(zip(headers, values)))
        return data_list

def readCSV_desc(fp):
    """Read the first three columns of a description CSV as a list of row dicts.

    The parsing rules are the same as in ``readCSV_data``: the first row is
    the header, only its first three names are kept (and printed), and rows
    with fewer cells than the kept headers are reported and skipped.

    Args:
        fp: Path of the CSV file.

    Returns:
        A list of dicts with string keys and values, one per accepted data
        row, in file order. An empty file yields an empty list.
    """
    # NOTE: the file is decoded as ASCII with errors='ignore', so non-ASCII
    # characters are silently dropped from the parsed values.
    # newline='' leaves newline handling to the csv module, which its
    # documentation requires for correct parsing of quoted fields (free-text
    # descriptions may contain embedded line breaks).
    with open(fp, mode='r', encoding='us-ascii', errors='ignore', newline='') as file:
        reader = csv.reader(file)

        header_row = next(reader, None)
        if header_row is None:
            # Empty file: there is no header row to read.
            return []
        headers = header_row[:3]  # keep the first three column names
        print('headers:', headers)

        data_list = []
        # Stream the remaining rows instead of materialising the whole file.
        for values in reader:
            if len(values) < len(headers):
                print(f"Warning: Skipped line due to mismatch in length: {values}")
                continue
            # zip stops at the shorter input, i.e. at the kept headers.
            data_list.append(dict(zip(headers, values)))

    return data_list
    
def filter_DBID_by_smiles(data_list, key_value):
    """Return the records whose ``'smiles'`` value equals ``key_value``.

    Records without a ``'smiles'`` key are compared as ``None``. The input
    order is preserved.
    """
    matches = []
    for record in data_list:
        if record.get('smiles') == key_value:
            matches.append(record)
    return matches

def filter_desc_by_DBID(data_list, key_value):
    """Return the records whose ``'DBID'`` value equals ``key_value``.

    Records without a ``'DBID'`` key are compared as ``None``. The input
    order is preserved.
    """
    return list(filter(lambda record: record.get('DBID') == key_value, data_list))

def filter_desc_by_sequence(data_list, key_value):
    """Return the records whose ``'Sequence'`` value equals ``key_value``.

    Records without a ``'Sequence'`` key are compared as ``None``. The input
    order is preserved.
    """
    selected = (entry for entry in data_list if entry.get('Sequence') == key_value)
    return list(selected)
    
def _index_first_by_key(records, key):
    """Map each string value stored under ``key`` to the first record carrying it."""
    index = {}
    for record in records:
        value = record.get(key)
        # Lookups are always done with CSV cells (strings), so only string
        # values can ever match; skipping the rest also keeps unhashable
        # JSON values out of the dict.
        if isinstance(value, str):
            index.setdefault(value, record)
    return index


def _build_dti_items(examples, smiles_index, desc_index, protein_index, progress_desc):
    """Join interaction rows with their drug and protein descriptions.

    Rows for which the drug id, the drug description or the protein
    description cannot be resolved are dropped.
    """
    items = []
    for example in tqdm(examples, total=len(examples), desc=progress_desc):
        # Label 1 means "advise" is the positive text; anything else "negative".
        if int(float(example["Y"])) == 1:
            pos = "advise"
            neg = ["negative"]
        else:
            pos = "negative"
            neg = ["advise"]

        smiles_row = smiles_index.get(example["SMILES"])
        protein_dict = protein_index.get(example["Protein"])
        if smiles_row is None or protein_dict is None:
            continue

        drug_DBID = smiles_row["drugbank_id"]
        drug_desc_dict = desc_index.get(drug_DBID)
        if drug_desc_dict is None:
            continue

        protein_text = f"'Protein names':{protein_dict['Protein names']}, 'Gene Names':{protein_dict['Gene Names']}, 'Sequence':{protein_dict['Sequence']}, 'Features':{protein_dict['Features']}, 'Keywords':{protein_dict['Keywords']}"

        items.append({
            "drug1_id": drug_DBID,
            "target_sequence": example["Protein"],
            "drug1_name": drug_desc_dict["Drugname"],
            "drug1_smile": example["SMILES"],
            "drug1_desc": drug_desc_dict["Description"],
            "protein_desc": protein_text,
            "pos": pos,
            "neg": neg,
        })
    return items


def human_transform(input_dir, output_dir, desc_path, smile_path, protein_desc_path):
    """Convert ``train.csv`` / ``test.csv`` into description-enriched JSONL files.

    An interaction row (columns SMILES, Protein, Y) is kept only when
    (1) its SMILES string exactly equals a ``smiles`` value in ``smile_path``,
    (2) the ``drugbank_id`` of that row has a ``DBID`` entry in ``desc_path``
    and (3) its protein string exactly equals a ``Sequence`` value in
    ``protein_desc_path``. When several reference rows match, the first one
    is used. Rows failing any lookup are dropped (an earlier variant, removed
    here, emitted them with empty drug fields instead).

    The kept items are shuffled with a fixed seed and written to
    ``train.jsonl`` and ``test.jsonl`` inside ``output_dir``.

    Args:
        input_dir: Directory containing ``train.csv`` and ``test.csv``.
        output_dir: Directory for the JSONL output; created if missing.
        desc_path: CSV with columns DBID, Drugname, Description.
        smile_path: CSV with columns drugbank_id, name, smiles.
        protein_desc_path: JSONL file of protein description records.
    """
    # Seeds 50, 100 and 500 are listed in the original note; presumably they
    # were used for different runs — TODO confirm.
    random.seed(500)
    os.makedirs(output_dir, exist_ok=True)
    train_path = os.path.join(input_dir, "train.csv")
    test_path = os.path.join(input_dir, "test.csv")
    ou_train_path = os.path.join(output_dir, "train.jsonl")
    ou_test_path = os.path.join(output_dir, "test.jsonl")

    # Columns: SMILES, Protein, Y
    train_json = readCSV_data(train_path)
    test_json = readCSV_data(test_path)
    # Columns: DBID, Drugname, Description
    desc_json = readCSV_desc(desc_path)
    # Columns: drugbank_id, name, smiles
    smile_json = readCSV_data(smile_path)
    print('train length', len(train_json))
    print('test length', len(test_json))

    protein_desc = readJSONL(protein_desc_path)

    # Build the lookup tables once so every row is resolved with dict lookups
    # instead of rescanning each reference list per row.
    smiles_index = _index_first_by_key(smile_json, 'smiles')
    desc_index = _index_first_by_key(desc_json, 'DBID')
    protein_index = _index_first_by_key(protein_desc, 'Sequence')

    train_results = _build_dti_items(
        train_json, smiles_index, desc_index, protein_index, "Processing Train dataset")
    random.shuffle(train_results)
    print("Original train_length", len(train_json))
    print("Filtered train_length", len(train_results))
    writeJSONL(train_results, ou_train_path)

    test_results = _build_dti_items(
        test_json, smiles_index, desc_index, protein_index, "Processing Test dataset")
    random.shuffle(test_results)
    # These counts describe the test split, so they are labelled as such.
    print("Original test_length", len(test_json))
    print("Filtered test_length", len(test_results))
    writeJSONL(test_results, ou_test_path)
        
        
    


if __name__ == "__main__":
    # Raw split to convert and the destination of the converted files.
    # NOTE(review): the input directory is the "cold" split while the output
    # directory is named "random_only" — confirm this pairing is intended.
    input_dir = "/root/autodl-tmp/dataset_dti/datasets/datasets_ori/human/cold/"
    output_dir = "/root/autodl-tmp/dataset_dti/datasets/datasets_tmp_all/human_tmp/random_only/fold0/"

    # Reference tables: drug descriptions, drug SMILES, protein descriptions.
    desc_path = "/root/autodl-tmp/dataset_dti/datasets/datasets_tmp/Drug_description_expand_upload.csv"
    smile_path = "/root/autodl-tmp/dataset_dti/datasets/datasets_tmp/df_drugbank_smiles_filtered.csv"
    protein_desc_path = "/root/autodl-tmp/dataset_dti/datasets/datasets_tmp_all/protein_desc/protein_desc.jsonl"

    human_transform(
        input_dir=input_dir,
        output_dir=output_dir,
        desc_path=desc_path,
        smile_path=smile_path,
        protein_desc_path=protein_desc_path,
    )

    print("All OK!")


headers: ['SMILES', 'Protein', 'Y']
headers: ['SMILES', 'Protein', 'Y']
headers: ['DBID', 'Drugname', 'Description']
headers: ['drugbank_id', 'name', 'smiles']
train length 3453
test length 311


Processing Train dataset: 100%|██████████| 3453/3453 [08:16<00:00,  6.96it/s]


Original train_length 3453
Filtered train_length 22


Processing Test dataset: 100%|██████████| 311/311 [00:44<00:00,  7.03it/s]


Original train_length 311
Filtered train_length 1
All OK!


In [4]:
### drug+protein cluster-only
#### cluster-only (corrected): only the target-domain files (target_train / target_test) are processed


import json
import os
import random
import shutil
from tqdm import tqdm
import torch
import csv
import jsonlines

def readJSONL(fp):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        fp: Path of the UTF-8 encoded JSON Lines file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    res = []
    with open(fp, "r", encoding='utf-8') as f:
        # Iterate the file object directly so the raw lines are not all
        # buffered in a second list before decoding.
        for line in f:
            # Blank lines (typically a trailing newline at the end of the
            # file) carry no record and would make json.loads raise.
            if not line.strip():
                continue
            res.append(json.loads(line))
    return res

def writeJSONL(instance, fp):
    """Write every item of ``instance`` to ``fp`` as JSON Lines.

    The file is opened in ``'w'`` mode through ``jsonlines.open`` and the
    items are written in iteration order, one per line.
    """
    with jsonlines.open(fp, 'w') as writer:
        writer.write_all(instance)



def readCSV_data(fp):
    """Read the first three columns of a CSV file as a list of row dicts.

    The first row is treated as the header; only its first three names are
    kept (and printed). Every following row becomes ``{header: value}`` for
    those columns. Rows with fewer cells than the kept headers are reported
    and skipped.

    Args:
        fp: Path of the CSV file.

    Returns:
        A list of dicts with string keys and values, one per accepted data
        row, in file order. An empty file yields an empty list.
    """
    # NOTE: the file is decoded as ASCII with errors='ignore', so non-ASCII
    # characters are silently dropped from the parsed values.
    # newline='' leaves newline handling to the csv module, which its
    # documentation requires for correct parsing of quoted fields.
    with open(fp, mode='r', encoding='us-ascii', errors='ignore', newline='') as file:
        reader = csv.reader(file)

        header_row = next(reader, None)
        if header_row is None:
            # Empty file: there is no header row to read.
            return []
        headers = header_row[:3]  # keep the first three column names
        print('headers:', headers)

        data_list = []
        # Stream the remaining rows instead of materialising the whole file.
        for values in reader:
            if len(values) < len(headers):
                print(f"Warning: Skipped line due to mismatch in length: {values}")
                continue
            # zip stops at the shorter input, i.e. at the kept headers.
            data_list.append(dict(zip(headers, values)))
        return data_list

def readCSV_desc(fp):
    """Read the first three columns of a description CSV as a list of row dicts.

    The parsing rules are the same as in ``readCSV_data``: the first row is
    the header, only its first three names are kept (and printed), and rows
    with fewer cells than the kept headers are reported and skipped.

    Args:
        fp: Path of the CSV file.

    Returns:
        A list of dicts with string keys and values, one per accepted data
        row, in file order. An empty file yields an empty list.
    """
    # NOTE: the file is decoded as ASCII with errors='ignore', so non-ASCII
    # characters are silently dropped from the parsed values.
    # newline='' leaves newline handling to the csv module, which its
    # documentation requires for correct parsing of quoted fields (free-text
    # descriptions may contain embedded line breaks).
    with open(fp, mode='r', encoding='us-ascii', errors='ignore', newline='') as file:
        reader = csv.reader(file)

        header_row = next(reader, None)
        if header_row is None:
            # Empty file: there is no header row to read.
            return []
        headers = header_row[:3]  # keep the first three column names
        print('headers:', headers)

        data_list = []
        # Stream the remaining rows instead of materialising the whole file.
        for values in reader:
            if len(values) < len(headers):
                print(f"Warning: Skipped line due to mismatch in length: {values}")
                continue
            # zip stops at the shorter input, i.e. at the kept headers.
            data_list.append(dict(zip(headers, values)))

    return data_list
    
def filter_DBID_by_smiles(data_list, key_value):
    """Return the records whose ``'smiles'`` value equals ``key_value``.

    Records without a ``'smiles'`` key are compared as ``None``. The input
    order is preserved.
    """
    matches = []
    for record in data_list:
        if record.get('smiles') == key_value:
            matches.append(record)
    return matches

def filter_desc_by_DBID(data_list, key_value):
    """Return the records whose ``'DBID'`` value equals ``key_value``.

    Records without a ``'DBID'`` key are compared as ``None``. The input
    order is preserved.
    """
    return list(filter(lambda record: record.get('DBID') == key_value, data_list))

def filter_desc_by_sequence(data_list, key_value):
    """Return the records whose ``'Sequence'`` value equals ``key_value``.

    Records without a ``'Sequence'`` key are compared as ``None``. The input
    order is preserved.
    """
    selected = (entry for entry in data_list if entry.get('Sequence') == key_value)
    return list(selected)

def _index_first_by_key(records, key):
    """Map each string value stored under ``key`` to the first record carrying it."""
    index = {}
    for record in records:
        value = record.get(key)
        # Lookups are always done with CSV cells (strings), so only string
        # values can ever match; skipping the rest also keeps unhashable
        # JSON values out of the dict.
        if isinstance(value, str):
            index.setdefault(value, record)
    return index


def _build_dti_items(examples, smiles_index, desc_index, protein_index, progress_desc):
    """Join interaction rows with their drug and protein descriptions.

    Rows for which the drug id, the drug description or the protein
    description cannot be resolved are dropped.
    """
    items = []
    for example in tqdm(examples, total=len(examples), desc=progress_desc):
        # Label 1 means "advise" is the positive text; anything else "negative".
        if int(float(example["Y"])) == 1:
            pos = "advise"
            neg = ["negative"]
        else:
            pos = "negative"
            neg = ["advise"]

        smiles_row = smiles_index.get(example["SMILES"])
        protein_dict = protein_index.get(example["Protein"])
        if smiles_row is None or protein_dict is None:
            continue

        drug_DBID = smiles_row["drugbank_id"]
        drug_desc_dict = desc_index.get(drug_DBID)
        if drug_desc_dict is None:
            continue

        protein_text = f"'Protein names':{protein_dict['Protein names']}, 'Gene Names':{protein_dict['Gene Names']}, 'Sequence':{protein_dict['Sequence']}, 'Features':{protein_dict['Features']}, 'Keywords':{protein_dict['Keywords']}"

        items.append({
            "drug1_id": drug_DBID,
            "target_sequence": example["Protein"],
            "drug1_name": drug_desc_dict["Drugname"],
            "drug1_smile": example["SMILES"],
            "drug1_desc": drug_desc_dict["Description"],
            "protein_desc": protein_text,
            "pos": pos,
            "neg": neg,
        })
    return items


def human_transform(input_dir, output_dir, desc_path, smile_path, protein_desc_path):
    """Convert ``target_train.csv`` / ``target_test.csv`` into enriched JSONL files.

    Only the target-domain files are read; ``source_train.csv`` is not used.
    An interaction row (columns SMILES, Protein, Y) is kept only when
    (1) its SMILES string exactly equals a ``smiles`` value in ``smile_path``,
    (2) the ``drugbank_id`` of that row has a ``DBID`` entry in ``desc_path``
    and (3) its protein string exactly equals a ``Sequence`` value in
    ``protein_desc_path``. When several reference rows match, the first one
    is used. Rows failing any lookup are dropped.

    The kept items are shuffled with a fixed seed and written to
    ``train.jsonl`` and ``test.jsonl`` inside ``output_dir``.

    Args:
        input_dir: Directory containing ``target_train.csv`` and
            ``target_test.csv``.
        output_dir: Directory for the JSONL output; created if missing.
        desc_path: CSV with columns DBID, Drugname, Description.
        smile_path: CSV with columns drugbank_id, name, smiles.
        protein_desc_path: JSONL file of protein description records.
    """
    # Seeds 50, 100 and 500 are listed in the original note; presumably they
    # were used for different runs — TODO confirm.
    random.seed(500)
    os.makedirs(output_dir, exist_ok=True)
    train2_path = os.path.join(input_dir, "target_train.csv")
    test_path = os.path.join(input_dir, "target_test.csv")

    ou_train_path = os.path.join(output_dir, "train.jsonl")
    ou_test_path = os.path.join(output_dir, "test.jsonl")

    # Columns: SMILES, Protein, Y
    train2_json = readCSV_data(train2_path)
    test_json = readCSV_data(test_path)
    # Columns: DBID, Drugname, Description
    desc_json = readCSV_desc(desc_path)
    # Columns: drugbank_id, name, smiles
    smile_json = readCSV_data(smile_path)
    print('train2 length', len(train2_json))
    print('test length', len(test_json))

    protein_desc = readJSONL(protein_desc_path)

    # Build the lookup tables once so every row is resolved with dict lookups
    # instead of rescanning each reference list per row.
    smiles_index = _index_first_by_key(smile_json, 'smiles')
    desc_index = _index_first_by_key(desc_json, 'DBID')
    protein_index = _index_first_by_key(protein_desc, 'Sequence')

    train_results = _build_dti_items(
        train2_json, smiles_index, desc_index, protein_index, "Processing Train2 dataset")
    random.shuffle(train_results)
    print("Original train_length", len(train2_json))
    print("Filtered train_length", len(train_results))
    writeJSONL(train_results, ou_train_path)

    test_results = _build_dti_items(
        test_json, smiles_index, desc_index, protein_index, "Processing Test dataset")
    random.shuffle(test_results)
    # These counts describe the test split, so they are labelled as such.
    print("Original test_length", len(test_json))
    print("Filtered test_length", len(test_results))
    writeJSONL(test_results, ou_test_path)
        
        
    


if __name__ == "__main__":
    # Raw cluster split to convert and the destination of the converted files.
    input_dir = "/root/autodl-tmp/dataset_dti/datasets/datasets_ori/biosnap/cluster/"
    output_dir = "/root/autodl-tmp/dataset_dti/datasets/datasets_tmp_all/biosnap_tmp/cluster_only_correct_0107/fold2/"

    # Reference tables: drug descriptions, drug SMILES, protein descriptions.
    desc_path = "/root/autodl-tmp/dataset_dti/datasets/datasets_tmp/Drug_description_expand_upload.csv"
    smile_path = "/root/autodl-tmp/dataset_dti/datasets/datasets_tmp/df_drugbank_smiles_filtered.csv"
    protein_desc_path = "/root/autodl-tmp/dataset_dti/datasets/datasets_tmp_all/protein_desc/protein_desc.jsonl"

    human_transform(
        input_dir=input_dir,
        output_dir=output_dir,
        desc_path=desc_path,
        smile_path=smile_path,
        protein_desc_path=protein_desc_path,
    )

    print("All OK!")



headers: ['SMILES', 'Protein', 'Y']
headers: ['SMILES', 'Protein', 'Y']
headers: ['DBID', 'Drugname', 'Description']
headers: ['drugbank_id', 'name', 'smiles']
train2 length 3628
test length 907


Processing Train2 dataset: 100%|██████████| 3628/3628 [08:44<00:00,  6.92it/s]


Original train_length 3628
Filtered train_length 1608


Processing Test dataset: 100%|██████████| 907/907 [02:11<00:00,  6.88it/s]


Original train_length 907
Filtered train_length 419
All OK!


In [19]:
#### Dataset filtering: keep only the test samples whose drug and protein both appear in the training set

import json
import pandas as pd

# This cell uses os and jsonlines; import them here so it also runs in a
# fresh kernel instead of relying on an earlier cell having imported them.
import os

import jsonlines

# Input train/test JSONL files and the location of the filtered test set.
train_file_path = '/root/autodl-tmp/dataset_dti/datasets/datasets_tmp_all/biosnap_tmp/random_only/fold2/train.jsonl'
test_file_path = '/root/autodl-tmp/dataset_dti/datasets/datasets_tmp_all/biosnap_tmp/random_only/fold2/test.jsonl'
new_test_file_dir = '/root/autodl-tmp/dataset_dti/datasets/datasets_tmp_all/biosnap_tmp/random_only_exist/fold2/'
new_test_data_path = os.path.join(new_test_file_dir, 'test.jsonl')
os.makedirs(new_test_file_dir, exist_ok=True)


def readJSONL(fp):
    """Load a JSON Lines file into a list, skipping blank lines."""
    res = []
    with open(fp, "r", encoding='utf-8') as f:
        for line in f:
            # A blank (e.g. trailing) line carries no record and would make
            # json.loads raise.
            if not line.strip():
                continue
            res.append(json.loads(line))
    return res


def writeJSONL(instance, fp):
    """Write every item of ``instance`` to ``fp``, one JSON document per line."""
    with jsonlines.open(fp, 'w') as f:
        for sample in instance:
            f.write(sample)


train_json = readJSONL(train_file_path)
test_json = readJSONL(test_file_path)

# Collect the drug ids and protein sequences present in the training split.
train_ids = set()
train_sequences = set()
for data in train_json:
    train_ids.add(data['drug1_id'])
    train_sequences.add(data['target_sequence'])

# Keep only the test samples whose drug AND protein both occur in training.
new_test_data = []
for data in test_json:
    if data['drug1_id'] in train_ids and data['target_sequence'] in train_sequences:
        new_test_data.append(data)
print("new length", len(new_test_data))

# Write the filtered samples to the new test.jsonl file.
writeJSONL(new_test_data, new_test_data_path)

print(f"新测试集已创建，包含 {len(new_test_data)} 条记录.")

new length 2198
新测试集已创建，包含 2198 条记录.


In [23]:
import json
import os
import pandas as pd
from datasets import Dataset, Features, Value, Sequence
from tqdm import tqdm
from data_utils import *


def make_arrow(data_list, output_dir):
    """Build a ``datasets.Dataset`` from row dicts and save it to ``output_dir``.

    Args:
        data_list: Non-empty list of dicts. The keys of the first dict define
            the columns and must match the schema declared below (``text``,
            ``text_pos``, ``text_neg``, ``type``).
        output_dir: Directory the dataset is saved to; created if missing.

    Raises:
        IndexError: If ``data_list`` is empty.
        KeyError: If a row lacks one of the first row's keys.
    """
    features = Features({
        'text': Value('string'),
        'text_pos': Value('string'),
        'text_neg': Sequence(Value('string')),
        'type': Value('string')
    })

    # Pivot the row dicts into one list per column.
    columns = {key: [row[key] for row in data_list] for key in data_list[0]}

    # Pass the declared schema explicitly; it was previously built but never
    # used, leaving the column types to inference.
    arrow_dataset = Dataset.from_dict(columns, features=features)

    # Save to disk.
    os.makedirs(output_dir, exist_ok=True)
    arrow_dataset.save_to_disk(str(output_dir))

def tmp2piccolo(root_path, output_dir):
    """Turn a JSONL file of drug-target samples into a prompt dataset on disk.

    Each sample with a non-empty ``drug1_id`` becomes one row holding the
    prompt text, the sample's ``pos`` / ``neg`` labels and the fixed type
    ``"cls_contrast"``. The rows are handed to ``make_arrow`` for saving.

    Args:
        root_path: Path of the input JSONL file.
        output_dir: Directory passed on to ``make_arrow``.
    """
    records = readJSONL(root_path)

    data_list = []
    for example in tqdm(records, desc="Processing"):
        # Samples with an empty drug id are left out. (A disabled variant
        # used to emit a SMILES/sequence-only prompt for them instead.)
        if not example["drug1_id"]:
            continue

        # The literal's own indentation and trailing spaces are part of the
        # emitted text, so its lines must not be re-indented.
        prompt = f"""
            Try to figure out drug-target interaction between the drug and the target. 
            The drug name is {example["drug1_name"]}, the drug smiles is {example["drug1_smile"]} and the drug description is {example["drug1_desc"]}. 
            The target protein description is {example["protein_desc"]}
            Please think step by step!
            """
        data_list.append({
            "text": prompt.replace("\n", " "),
            "text_pos": example["pos"],
            "text_neg": example["neg"],
            "type": "cls_contrast",
        })
    print("length", len(data_list))

    # The banner announces that the arrow files are about to be written.
    print("----------------------------- 开始写arrow文件 ---------------------------------")
    make_arrow(data_list, output_dir)
    print("OK!!!")

if __name__ == "__main__":
    # Input JSONL of samples and the directory for the saved dataset.
    root_path = "/root/autodl-tmp/dataset_dti/datasets/datasets_tmp_all/human_tmp_only_protein/cold_only_retrieval_balanced/fold0/train_add.jsonl"
    output_dir = "/root/autodl-tmp/dataset_dti/datasets/datasets_piccolo_all/human_balanced_cold_only_protein_p3/fold0/"
    tmp2piccolo(root_path=root_path, output_dir=output_dir)

Processing: 100%|██████████| 8222/8222 [00:00<00:00, 446964.78it/s]

length 8222
----------------------------- 开始写arrow文件 ---------------------------------





Saving the dataset (0/1 shards):   0%|          | 0/8222 [00:00<?, ? examples/s]

OK!!!
