In [4]:
import pandas as pd
data = pd.read_csv('data_test/NPInter2.csv')
data.head(5)

Unnamed: 0,Type,Y,RNA_list,RNA_aa_code,target_aa_code,protein_list
0,Undirected,1,n121,UGUUGUUAUGUGUUGGUUAUGUGUUGAAUAUAAUGUCCUAUAAGCU...,MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNP...,Q13148
1,Undirected,1,n127,GGGCAUGGUGGCACAUGCCUGUAGUCCCAGCUACUCGGUGGGCUUA...,MPSKFSCRQLREAGQCFESFLVVRGLDMETDRERLRTIYNRDFKIS...,Q9HCE1
2,Undirected,1,n1315,GCCACAUGAUGAUAUCAAGGCUGUUGUGAUUCAGUUGGUUUGGCUA...,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,P35637
3,Undirected,1,n1315,GCCACAUGAUGAUAUCAAGGCUGUUGUGAUUCAGUUGGUUUGGCUA...,MASTDYSTYSQAAAQQGYSAYTAQPTQGYAQTTQAYGQQSYGTYGQ...,Q01844
4,Undirected,1,n1315,GCCACAUGAUGAUAUCAAGGCUGUUGUGAUUCAGUUGGUUUGGCUA...,MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNP...,Q13148


In [5]:
rna_seqs = data['RNA_aa_code']
rna_names = data['RNA_list']
rna_list = pd.DataFrame({'RNA_list': rna_names, 'RNA_aa_code': rna_seqs})
len(rna_seqs)
rna_filter = rna_list[rna_list['RNA_aa_code'].apply(lambda x: len(x) <= 600)]


In [6]:
rna_seqs.to_csv('data/rna_seq_noindex.csv',
                index=False,
                 
                )
rna_filter.to_csv('data/rna_seq_filter.csv',
                index=False,)


In [7]:
import os
import random
import pickle  # Python 3中使用pickle替代cPickle
import collections
import numpy as np
from pathlib import Path

# 定义核心函数（需确保one_hot函数已实现）
def one_hot(seq):
    """示例函数，需根据实际需求实现RNA序列的one-hot编码"""
    base_dict = {'A': [1,0,0,0], 'U': [0,1,0,0], 
                'C': [0,0,1,0], 'G': [0,0,0,1]}
    return np.array([base_dict.get(nuc, [0,0,0,0]) for nuc in seq])

# 核心处理逻辑
def process_rna_files(file_path):
    RNA_SS_data = collections.namedtuple('RNA_SS_data', 'seq ss_label length name pairs')
    all_files_list = []
    
    file_dir = Path(file_path)
    all_files = os.listdir(file_dir)
    random.seed(4)
    random.shuffle(all_files)

    for index, item_file in enumerate(all_files):
        file_path = file_dir / item_file
        
        # 使用Python原生文件操作替代awk
        with open(file_path, 'r') as f:
            lines = [line.strip().split() for line in f.readlines()]
        
        # 提取第二列数据（原t0）
        seq = ''.join([parts[1] for parts in lines if len(parts)>=2])
        
        # 提取第一列和第三列数据（原t1/t2）
        try:
            col1 = [parts[0] for parts in lines if len(parts)>=1]
            col3 = [parts[2] for parts in lines if len(parts)>=3]
            pair_dict_all_list = [
                [int(a)-1, int(b)-1] 
                for a, b in zip(col1, col3) 
                if b != '0'
            ]
        except (IndexError, ValueError):
            pair_dict_all_list = []

        # 处理序列特征
        try:
            one_hot_matrix = one_hot(seq.upper())
        except Exception as e:
            print(f"Error processing {item_file}: {str(e)}")
            continue

        # 构建数据结构
        seq_name = item_file
        seq_len = len(seq)
        
        if 0 < seq_len <= 600:
            # 创建标签矩阵
            ss_label = np.zeros((seq_len, 3), dtype=int)
            valid_pairs = [pair for pair in pair_dict_all_list if pair[0] < pair[1]]
            
            # 设置配对标签
            for i, j in valid_pairs:
                if i < seq_len and j < seq_len:
                    ss_label[i] = [0, 1, 0]  # 起始标记
                    ss_label[j] = [0, 0, 1]  # 结束标记
            
            # 填充未配对位置
            unpaired = np.where(ss_label.sum(axis=1) == 0)
            ss_label[unpaired] = [1, 0, 0]

            # 扩展到600长度
            one_hot_matrix_600 = np.zeros((600, 4))
            one_hot_matrix_600[:seq_len] = one_hot_matrix
            
            ss_label_600 = np.zeros((600, 3), dtype=int)
            ss_label_600[:seq_len] = ss_label

            # 保存样本
            sample = RNA_SS_data(
                seq=one_hot_matrix_600,
                ss_label=ss_label_600,
                length=seq_len,
                name=seq_name,
                pairs=valid_pairs
            )
            all_files_list.append(sample)

        if (index+1) % 100 == 0:
            print(f'Processed {index+1}/{len(all_files)} files')

    # 保存结果
    output_path = file_dir.parent / 'processed_data.pkl'
    with open(output_path, 'wb') as f:
        pickle.dump(all_files_list, f)
    
    print(f"Successfully processed {len(all_files_list)} sequences")
    return all_files_list

# 在Jupyter中直接运行
if __name__ == '__main__':
    file_path = './data/rna_seq_noindex.csv'  # 修改为你的实际路径
    result = process_rna_files(file_path)

NotADirectoryError: [WinError 267] 目录名称无效。: 'data\\rna_seq_noindex.csv'

In [None]:
import csv
import os

def csv_to_bpseq(csv_file_path):
    with open(csv_file_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # 跳过标题行
        for row in reader:
            rna_id = row[0]
            rna_seq = row[1]
            bpseq_content = []
            for i, base in enumerate(rna_seq, start=1):
                # 假设所有碱基都未配对，配对信息为 0
                bpseq_content.append(f"{i} {base} 0")
            # 生成 .bpseq 文件
            if not os.path.exists(f"./output/"):
                os.makedirs(f"./output/")
            if not os.path.exists(f"./output/{rna_id}.bpseq"):
                bpseq_file_path = f"./output/{rna_id}.bpseq"
            with open(bpseq_file_path, 'w') as bpseqfile:
                bpseqfile.write('\n'.join(bpseq_content))
            print(f"Generated {bpseq_file_path}")


# 使用示例
csv_file_path = './data_test/rna_seqs.csv'
csv_to_bpseq(csv_file_path)
    

Generated ./output/n121.bpseq
Generated ./output/n127.bpseq
Generated ./output/n1315.bpseq
Generated ./output/n1315.bpseq
Generated ./output/n1315.bpseq
Generated ./output/n1315.bpseq
Generated ./output/n1315.bpseq
Generated ./output/n1315.bpseq
Generated ./output/n1414.bpseq
Generated ./output/n1414.bpseq
Generated ./output/n1414.bpseq
Generated ./output/n1433.bpseq
Generated ./output/n1433.bpseq
Generated ./output/n1460.bpseq
Generated ./output/n1460.bpseq
Generated ./output/n1461.bpseq
Generated ./output/n1579.bpseq
Generated ./output/n1579.bpseq
Generated ./output/n177816.bpseq
Generated ./output/n177817.bpseq
Generated ./output/n177965.bpseq
Generated ./output/n178012.bpseq
Generated ./output/n178094.bpseq
Generated ./output/n178096.bpseq
Generated ./output/n178110.bpseq
Generated ./output/n178123.bpseq
Generated ./output/n178127.bpseq
Generated ./output/n178128.bpseq
Generated ./output/n178133.bpseq
Generated ./output/n178134.bpseq
Generated ./output/n178137.bpseq
Generated ./out

In [11]:
import csv

# 读取CSV文件
csv_file = './data/rna_seq_filter.csv'  # 替换为实际的CSV文件名
fasta_file = 'input.txt'   # 生成的FASTA文件名

with open(csv_file, 'r') as csvfile, open(fasta_file, 'w') as fastafile:
    reader = csv.reader(csvfile)
    next(reader)  # 跳过标题行
    for row in reader:
        name, seq = row
        fastafile.write(f">{name}\n{seq}\n")

In [None]:
%pip install git+https://github.com/uci-cbcl/UFold.git

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting git+https://github.com/uci-cbcl/UFold.git
  Cloning https://github.com/uci-cbcl/UFold.git to c:\users\lenovo\appdata\local\temp\pip-req-build-ziakigsh
  Resolved https://github.com/uci-cbcl/UFold.git to commit 3c92fa184ae66e385214f3e4c1da6cf9bfd667f5
Note: you may need to restart the kernel to use updated packages.


  Running command git clone --filter=blob:none --quiet https://github.com/uci-cbcl/UFold.git 'C:\Users\lenovo\AppData\Local\Temp\pip-req-build-ziakigsh'
ERROR: git+https://github.com/uci-cbcl/UFold.git does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.


In [12]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

预训练模型验证

In [13]:
torch.cuda.device_count()

1

In [14]:
from Network import U_Net as FCNNet
import torch
model = torch.load('models/ufold_train_pdbfinetune.pt',map_location={'cuda:3':'cuda:0'})

  model = torch.load('models/ufold_train_pdbfinetune.pt',map_location={'cuda:3':'cuda:0'})


In [15]:
model

OrderedDict([('Conv1.conv.0.weight',
              tensor([[[[-1.2613,  2.9505,  1.2870],
                        [-2.5900,  0.4352, -1.7522],
                        [-0.6872,  2.9316, -1.9149]],
              
                       [[ 0.4989, -2.9621, -0.7333],
                        [-1.0372, -7.7266, -2.5632],
                        [-5.7749, -2.6898, -1.2887]],
              
                       [[-1.7263, -3.3602,  4.1592],
                        [-4.3400, -2.3422, -1.7902],
                        [ 1.5990, -1.1365, -1.5634]],
              
                       ...,
              
                       [[-2.7477,  1.1181,  0.3436],
                        [-1.8447,  0.6551,  1.9094],
                        [-7.2187, -1.4224,  1.2643]],
              
                       [[ 1.4228,  0.7159, -1.0938],
                        [ 1.1072, -2.3389, -0.5264],
                        [-2.5329,  3.8633, -2.8268]],
              
                       [[-0.6508,  1.3449,  4

In [16]:
import torch

from Network import U_Net as FCNNet
Ufold = FCNNet(img_ch=17, output_ch=1)
ufold = Ufold.load_state_dict(model)
ufold

<All keys matched successfully>

In [None]:
seq = "GCCCCCAUCGUCUAACGGUUAGGACACCAGACUUUCAAUCUGACAACGAGAGUUCGACUCUCUCUGGGGGUA"


数据预处理

In [None]:
import numpy as np
import os
import collections
from torch.utils import data

# 定义 RNASSDataGenerator_input 类
class RNASSDataGenerator_input(object):
    def __init__(self, data_dir, split):
        self.data_dir = data_dir
        self.split = split
        self.load_data()

    def load_data(self):
        # 读取输入文件
        input_file = open(os.path.join(self.data_dir, '%s.txt' % self.split), 'r').readlines()
        # 提取序列名称
        self.data_name = np.array([itm.strip()[1:] for itm in input_file if itm.startswith('>')])
        # 提取RNA序列并进行处理
        self.seq = [itm.strip().upper().replace('T', 'U') for itm in input_file if itm.upper().startswith(('A', 'U', 'C', 'G', 'T'))]
        # 序列数量
        self.len = len(self.seq)
        # 每个序列的长度
        self.seq_length = np.array([len(item) for item in self.seq])
        # 将序列转换为One - Hot编码矩阵
        self.data_x = np.array([self.one_hot_600(item) for item in self.seq])
        # 最大序列长度
        self.seq_max_len = 600
        # 这里将 data_y 设为 data_x
        self.data_y = self.data_x

    def one_hot_600(self, seq_item):
        RNN_seq = seq_item
        BASES = 'AUCG'
        bases = np.array([base for base in BASES])
        # 生成One - Hot编码
        feat = np.concatenate(
            [[(bases == base.upper()).astype(int)] if str(base).upper() in BASES else np.array([[-1] * len(BASES)]) for base
             in RNN_seq])
        one_hot_matrix_600 = np.zeros((600, 4))
        one_hot_matrix_600[:len(seq_item), ] = feat
        return one_hot_matrix_600

    def get_one_sample(self, index):
        data_seq = self.data_x[index]
        data_len = self.seq_length[index]
        data_name = self.data_name[index]
        return data_seq, data_len, data_name

# 定义 Dataset_FCN_input 类，用于生成数据集
class Dataset_FCN_input(data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return self.data.len

    def __getitem__(self, index):
        data_seq, data_len, data_name = self.data.get_one_sample(index)
        return data_seq, data_len, data_name

# 数据预处理函数
def preprocess_fasta_file(data_dir, split):
    # 创建数据生成器实例
    data_generator = RNASSDataGenerator_input(data_dir, split)
    # 创建数据集实例
    dataset = Dataset_FCN_input(data_generator)
    return dataset

# 使用示例
if __name__ == "__main__":
    data_dir = 'data'  # FASTA文件所在目录
    split = 'input'    # FASTA文件名（不包含扩展名）
    dataset = 'data/input.fasta'
    # 打印数据集的长度
    print(f"Number of samples in dataset: {len(dataset)}")
    print(dataset[0])
    # 打印第一个样本的信息
    sample_seq, sample_len, sample_name = dataset[0]
    print(f"Sample name: {sample_name}")
    print(f"Sample length: {sample_len}")
    print(f"Sample sequence shape: {sample_seq.shape}")

Number of samples in dataset: 16
d


ValueError: not enough values to unpack (expected 3, got 1)

In [17]:
import numpy as np
import torch
from torch.utils import data

# 定义序列转换为One - Hot编码的函数，填充到600长度
def one_hot_600(seq_item):
    BASES = 'AUCG'
    bases = np.array([base for base in BASES])
    feat = np.concatenate(
        [[(bases == base.upper()).astype(int)] if str(base).upper() in BASES else np.array([[-1] * len(BASES)]) for base
         in seq_item])
    if len(seq_item) <= 600:
        one_hot_matrix_600 = np.zeros((600, 4))
    else:
        one_hot_matrix_600 = np.zeros((600, 4))
    one_hot_matrix_600[:len(seq_item), ] = feat
    return one_hot_matrix_600

# 定义FASTA数据生成器类
class FastaDataGenerator:
    def __init__(self, fasta_content):
        self.data_name = []
        self.seq = []
        self._parse_fasta(fasta_content)
        self.len = len(self.seq)
        self.seq_length = np.array([len(item) for item in self.seq])
        self.data_x = np.array([one_hot_600(item) for item in self.seq])

    def _parse_fasta(self, fasta_content):
        lines = fasta_content.strip().split('\n')
        current_name = None
        current_seq = []
        for line in lines:
            if line.startswith('>'):
                if current_name is not None:
                    self.data_name.append(current_name)
                    self.seq.append(''.join(current_seq))
                    current_seq = []
                current_name = line[1:]
            else:
                current_seq.append(line.strip())
        if current_name is not None:
            self.data_name.append(current_name)
            self.seq.append(''.join(current_seq))

    def get_one_sample(self, index):
        data_seq = self.data_x[index]
        data_len = self.seq_length[index]
        data_name = self.data_name[index]
        return data_seq, data_len, data_name

# 定义数据集类
class FastaDataset(data.Dataset):
    def __init__(self, data_generator):
        self.data_generator = data_generator

    def __len__(self):
        return self.data_generator.len

    def __getitem__(self, index):
        data_seq, data_len, data_name = self.data_generator.get_one_sample(index)
        return data_seq, data_len, data_name

# 示例FASTA内容
fasta_content = """
>n121
UGUUGUUAUGUGUUGGUUAUGUGUUGAAUAUAAUGUCCUAUAAGCUUCAGUUAGGUCAGGUUAUGUGUUGCUUAAAUCUUUUGUACCUUUACUGAUUUGUGUGAGAGAGUGUGUGUGUGUGUGUGUGUCUGUGUUUGCACGCGCACAUGUGCGUGCGUGCUUGUGCUUGUUUUAUUACUUGCUGAGAGGAUACUACAAACUCAAACAAUUAUUGUAGAUUUAGAAUUACCUUACUUAU
>n127
GGGCAUGGUGGCACAUGCCUGUAGUCCCAGCUACUCGGUGGGCUUAGGCAUGAGGAUCGCUUGAGCCCAGGAGUUCUGAGCUGUAGUGAGCUAUGACCAUCAGAUGUCUGCAUUAAGUUCAGCAUCAAUAUGGUGACCUCCCGGGAGUGGGGACCACCAGGUUGCCUAAGGAAGGGUGAACAGGCCCAGGUCUGAAACGGAACAGGUCAAAACUCCCAUGCUAAUCAGUAGUGGGACCGUGCCUGUGAAUAGCCACUGCACUUCAGCCUGGGCAACAUGGCCAGACCUCAUCUC
>n1315
GCCACAUGAUGAUAUCAAGGCUGUUGUGAUUCAGUUGGUUUGGCUAAGCCCAGGGACCUUUGGCCUGUUAAAGGUCUGUAAUCUUGGUGGGCGAUACAGAGUUAUGUGUGUUCACUGUAAGGGCAGACCAACAAGAACUUUUUCCUACUUUUGAGCUACCUCUUUUUAAUAGGGGUGAUUCUUCCAGUUGCUGGAGAGAAAUUGUGGUAACUGGAGUGAGAGAGUAGGAACAGGGCAUGUUCAGGGUAUCAGGGCCAAGGGUCCUAAAGGACUUAGCUUGUGUUAUGGCCACUGAGAGAUGAAACACAGAUCUUUGGUAAUCUGAUGGCU
"""

# 数据预处理
data_generator = FastaDataGenerator(fasta_content)
dataset = FastaDataset(data_generator)

# 打印数据集信息
for i in range(len(dataset)):
    data_seq, data_len, data_name = dataset[i]
    print(f"Sequence Name: {data_name}")
    print(f"Sequence Length: {data_len}")
    print(f"One - Hot Encoded Sequence Shape: {data_seq.shape}")
    print()

Sequence Name: n121
Sequence Length: 238
One - Hot Encoded Sequence Shape: (600, 4)

Sequence Name: n127
Sequence Length: 294
One - Hot Encoded Sequence Shape: (600, 4)

Sequence Name: n1315
Sequence Length: 330
One - Hot Encoded Sequence Shape: (600, 4)



In [18]:
dataset

<__main__.FastaDataset at 0x1eaa2f6a4e0>

NameError: name 'java' is not defined

In [4]:
!python ufold_predict.py --nc False


Welcome using UFold prediction tool!!!
Sequencing number:  0


OMP: Error #111: Memory allocation failed.
OMP: Error #111: Memory allocation failed.
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "d:\anaconda\envs\pytorch\Lib\multiprocessing\spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\anaconda\envs\pytorch\Lib\multiprocessing\spawn.py", line 131, in _main
    prepare(preparation_data)
  File "d:\anaconda\envs\pytorch\Lib\multiprocessing\spawn.py", line 246, in prepare
    _fixup_main_from_path(data['init_main_from_path'])
  File "d:\anaconda\envs\pytorch\Lib\multiprocessing\spawn.py", line 297, in _fixup_main_from_path
    main_content = runpy.run_path(main_path,
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<frozen runpy>", line 287, in run_path
  File "<frozen runpy>", line 98, in _run_module_code
  File "<frozen runpy>", line 88, in _run_code
  File "g:\文献阅读项目\UFold\ufold_predict.py", line 5, in <module>
    import torc