In [None]:
from torch.utils.data import Dataset
import torch
import torch.nn as nn
import numpy as np
import os

class RefConf(Dataset):
    """Dataset pairing every entry of ``md_dir`` with its '_A' and '_B' structure files.

    A structure file belongs to an MD file when its name starts with the MD
    file name up to the first '.', followed by '_A' (start structure) or '_B'
    (final structure).

    Attributes:
        file_info: joined path of every entry in ``md_dir``, in ``os.listdir`` order.
        start_structure: flat list of every '_A' match found, in discovery order.
        final_structure: flat list of every '_B' match found, in discovery order.

    The two flat lists are kept for inspection. They are only index-aligned
    with ``file_info`` when every MD file has exactly one '_A' and one '_B'
    match, so ``__getitem__`` uses a per-file pairing instead.
    """

    def __init__(self, md_dir, structure_dir):
        """Scan both directories and record the matches for every MD file.

        Args:
            md_dir: directory whose entries define the dataset items.
            structure_dir: directory holding the '<stem>_A*' / '<stem>_B*' files.
        """
        self.file_info = []
        self.start_structure = []
        self.final_structure = []
        # Index-aligned with file_info: (first '_A' path or None, first '_B' path or None).
        self._pairs = []

        # List the structure directory once instead of once per MD file.
        structure_names = os.listdir(structure_dir)

        for file in os.listdir(md_dir):
            self.file_info.append(os.path.join(md_dir, file))
            stem = file.split('.')[0]
            first_start = None
            first_final = None
            for name in structure_names:
                # Test the bare file name; the joined path is kept in a separate variable.
                if name.startswith(stem + '_A'):
                    path = os.path.join(structure_dir, name)
                    self.start_structure.append(path)
                    if first_start is None:
                        first_start = path
                elif name.startswith(stem + '_B'):
                    path = os.path.join(structure_dir, name)
                    self.final_structure.append(path)
                    if first_final is None:
                        first_final = path
            self._pairs.append((first_start, first_final))
        print(len(self.file_info))
        print(len(self.start_structure))

    def __len__(self):
        """Return the number of entries found in ``md_dir``."""
        return len(self.file_info)

    def _paired_paths(self, index):
        """Return the (start, final) structure paths belonging to item ``index``."""
        pairs = getattr(self, '_pairs', None)
        if pairs is None:
            # Instance created without __init__: fall back to the positional lists.
            return self.start_structure[index], self.final_structure[index]
        start_path, final_path = pairs[index]
        if start_path is None or final_path is None:
            missing = '_A' if start_path is None else '_B'
            raise FileNotFoundError(
                "no '%s' structure file found for %s" % (missing, self.file_info[index])
            )
        return start_path, final_path

    @staticmethod
    def _load_coords(path):
        """Read columns 6, 7, 8 of a whitespace-separated file, scaled by 1/10."""
        with open(path, 'r') as f:
            lines = f.readlines()

        # NOTE(review): presumably x/y/z of whitespace-separated PDB ATOM records,
        # with the division by 10 converting Angstrom to nm - TODO confirm.
        coords = np.genfromtxt(lines, usecols=(6, 7, 8))

        # Print the offending file when any value could not be parsed as a number.
        if np.any(np.isnan(coords)):
            print(path)

        # Wrap in a list to add the leading axis of length 1.
        return torch.from_numpy(np.array([coords])) / 10

    def __getitem__(self, index):
        """Return ``(structure_a, structure_b)`` tensors for item ``index``.

        Each tensor holds the three selected columns of the matching file
        divided by 10, with a leading axis of length 1.

        Raises:
            IndexError: if ``index`` is out of range.
            FileNotFoundError: if the MD file has no '_A' or no '_B' match.
        """
        start_path, final_path = self._paired_paths(index)
        structure_a = self._load_coords(start_path)
        structure_b = self._load_coords(final_path)
        return structure_a, structure_b
    
import pickle

data_dir = 'Dir to gromacs'
ref_dir = 'Dir to structures'
refset = RefConf(data_dir, ref_dir)
a = refset.file_info
b = refset.start_structure
print(a)
print(b)


def _file_name(path):
    # Last path component, whatever the directory depth and whichever of
    # the two separators (backslash or slash) the path was built with.
    return os.path.basename(path.replace('\\', '/'))


# Names derived from the MD files (text before the first '.') and from the
# start structures (text before the first '_A').
lista = [_file_name(i).split('.')[0] for i in a]
listb = [_file_name(i).split('_A')[0] for i in b]
print(lista)
print(listb)

# MD names for which no start structure was found.
result = list(set(lista) - set(listb))
print(result)

# Load every (start, final) pair and persist the list.
ref_list = [refset[i] for i in range(len(refset))]
print(len(ref_list))
with open("ref_conf_nano.pkl", 'wb') as f:
    pickle.dump(ref_list, f)


In [None]:
import pickle
from torch.utils.data import Dataset
import torch
import numpy as np
import os
import torch.nn as nn
import numpy as np
from Dict import AMINOSET, AMINO_ACID_MAP

# NOTE(review): not referenced anywhere in the visible cells of this notebook -
# presumably a leftover from another module; confirm before removing.
MAX_BATCH_SIZE = 512


class MD_Dataset(Dataset):
    """Dataset yielding one coordinate tensor per file found in ``md_dir``.

    Each file is read as text. Its second line is parsed as an integer count
    ``L``; starting at the third line the file is walked in steps of ``L + 3``
    lines, and the first ``L`` lines of each step are one frame.
    NOTE(review): this matches the layout of a multi-frame GROMACS .gro file
    (title, atom count, atoms, box line) - presumably the input format; confirm.
    """

    def __init__(self, md_dir):
        """Collect the joined path of every entry in ``md_dir``."""
        self.file_info = [os.path.join(md_dir, entry) for entry in os.listdir(md_dir)]

    def __len__(self):
        """Return the number of collected files."""
        return len(self.file_info)

    def __getitem__(self, index):
        """Return the stacked frames of file ``index`` as a tensor.

        Whitespace-separated columns 3, 4 and 5 of every frame line are kept;
        the frames are stacked along a new leading axis.
        """
        path = self.file_info[index]
        print(path)
        with open(path, 'r') as handle:
            rows = handle.readlines()

        n_rows = int(rows[1])
        stride = n_rows + 3
        frames = [
            np.genfromtxt(rows[start:start + n_rows], usecols=(3, 4, 5))
            for start in range(2, len(rows), stride)
        ]
        return torch.from_numpy(np.array(frames))

data_dir = 'Dir to gromacs'
dataset = MD_Dataset(data_dir)
print(len(dataset))

# Load every trajectory tensor, in dataset order.
md_list = [dataset[idx] for idx in range(len(dataset))]
print(md_list[0].shape)

# Persist the list of tensors for the merge step.
with open("md_4000.pkl", 'wb') as out_file:
    pickle.dump(md_list, out_file)
print(len(md_list))

In [None]:
import pickle
from torch.utils.data import Dataset
import torch
import numpy as np
import os

import torch.nn as nn
import numpy as np
from Dict import AMINOSET, AMINO_ACID_MAP

# NOTE(review): not referenced anywhere in the visible cells of this notebook -
# presumably a leftover from another module; confirm before removing.
MAX_BATCH_SIZE = 512


class NAME_Dataset(Dataset):
    """Dataset yielding, for every entry of ``md_dir``, its file name up to the first '.'."""

    def __init__(self, md_dir):
        """Collect the joined path of every entry in ``md_dir``."""
        self.file_info = []
        for file in os.listdir(md_dir):
            file_path = os.path.join(md_dir, file)
            self.file_info.append(file_path)

    def __len__(self):
        """Return the number of collected files."""
        return len(self.file_info)

    def __getitem__(self, index):
        """Print and return the name of file ``index`` (text before the first '.').

        The name is taken from the last path component, so the result does not
        depend on how deep ``md_dir`` is or on which separator the path uses.
        """
        file = self.file_info[index]
        # Normalise backslashes so the basename is found on any platform.
        base_name = os.path.basename(file.replace('\\', '/'))
        file_name = base_name.split('.')[0]
        print(file_name)
        return file_name
data_dir = 'Dir to gromacs'
dataset = NAME_Dataset(data_dir)
print(len(dataset))

# Collect every name, in dataset order.
name_list = [dataset[idx] for idx in range(len(dataset))]
print(len(name_list))

# Persist the names for the merge step.
with open("name.pkl", 'wb') as out_file:
    pickle.dump(name_list, out_file)
print(len(name_list))

In [None]:
from torch.utils.data import Dataset
import os


import torch.nn as nn
import numpy as np
from Dict import AMINOSET, AMINO_ACID_MAP

class AA_Dataset(Dataset):
    """Dataset yielding one sequence string per file found in ``md_dir``.

    The second line of each file is parsed as an integer count ``L``. For each
    of the ``L`` lines that follow, characters 5-7 are looked up in
    ``AMINO_ACID_MAP`` and the results are concatenated.
    NOTE(review): characters 5-7 are presumably the residue-name column of a
    GROMACS .gro atom line, with one atom line per residue - confirm.
    """

    def __init__(self, md_dir):
        """Collect the joined path of every entry in ``md_dir``."""
        self.file_info = [os.path.join(md_dir, entry) for entry in os.listdir(md_dir)]

    def __len__(self):
        """Return the number of collected files."""
        return len(self.file_info)

    def __getitem__(self, index):
        """Return the mapped sequence string of file ``index``.

        Raises:
            KeyError: if a three-character key is missing from ``AMINO_ACID_MAP``.
        """
        with open(self.file_info[index], 'r') as handle:
            rows = handle.readlines()
        n_rows = int(rows[1])
        letters = [AMINO_ACID_MAP[row[5:8]] for row in rows[2:2 + n_rows]]
        return ''.join(letters)

# This cell's own import block does not import pickle; import it here so the
# cell no longer depends on an earlier cell having been run in the same kernel.
import pickle

data_dir = 'Dir to gromacs'
aa = AA_Dataset(data_dir)
print(len(aa))

# Collect every sequence string, in dataset order.
aa_list = [aa[i] for i in range(len(aa))]

# Persist the sequences for the embedding and merge steps.
with open("aa.pkl", 'wb') as f:
    pickle.dump(aa_list, f)


In [None]:
import pickle
import os
import esm
import torch
# Sequences written by the previous cell. pickle.load executes arbitrary code,
# so only ever point this at files produced by this notebook.
with open("aa.pkl", 'rb') as f:
    seq_list = pickle.load(f)

model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
model.eval()

# The converter does not depend on the sequence: build it once, not per iteration.
batch_converter = alphabet.get_batch_converter()

seq_esm = []
for protein_sequence in seq_list:
    data = [(0, protein_sequence)]
    batch_labels, batch_strs, batch_tokens = batch_converter(data)
    with torch.no_grad():
        # Only the layer-33 representations are read below, so contact
        # prediction (return_contacts=True) is not requested.
        results = model(batch_tokens, repr_layers=[33])
    token_representations = results["representations"][33]
    # NOTE(review): the mean runs over every position of axis 1, which
    # presumably includes the special begin/end tokens added by the converter -
    # confirm this pooling is intended before changing it, since downstream
    # data already depends on these values.
    sequence_rep = token_representations.mean(1)
    seq_esm.append(sequence_rep)

with open("esm.pkl", 'wb') as f:
    pickle.dump(seq_esm, f)

In [None]:
import pickle
import os
def _load_pickle(path):
    # pickle.load executes arbitrary code: only use on files produced by this notebook.
    with open(path, 'rb') as f:
        return pickle.load(f)


a = _load_pickle("md_4000.pkl")        # per-file MD tensors
b = _load_pickle("aa.pkl")             # per-file sequence strings
c = _load_pickle("esm.pkl")            # per-file ESM embeddings
d = _load_pickle("ref_conf_nano.pkl")  # per-file (start, final) structure pairs
e = _load_pickle("name.pkl")           # per-file names

# Build the lists directly. The previous copy loops used `esm` as a loop
# variable, which rebound the imported `esm` module name in the kernel.
protein_md = list(a)
protein_seq = list(b)
protein_esm = list(c)
ref_conf1 = [ref[0] for ref in d]
ref_conf2 = [ref[1] for ref in d]
protein_name = list(e)

# Every component must describe the same number of items. This does not verify
# that the items are in the same order across the pickles.
assert (
    len(protein_md)
    == len(protein_seq)
    == len(protein_esm)
    == len(ref_conf1)
    == len(ref_conf2)
    == len(protein_name)
), "dataset components have different lengths"

data_dict = {
    "MD": protein_md,
    "SEQ": protein_seq,
    "ESM": protein_esm,
    "REFCONF1": ref_conf1,
    "REFCONF2": ref_conf2,
    "NAME": protein_name
}

with open('dataset_4000_nano.pkl', 'wb') as f:
    pickle.dump(data_dict, f)