In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import re

# === 配置部分 ===
INPUT_FILE = "data/peptides.csv"  # 原始数据路径
OUTPUT_DIR = "data/splits"        # 训练验证集输出路径
VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY")  # 标准20种氨基酸
MIN_LENGTH = 10  # 你可以调节这个阈值

def is_valid_Sequence(seq):
    """只保留标准氨基酸组成的序列"""
    if not isinstance(seq, str):
        return False
    return all(residue in VALID_AMINO_ACIDS for residue in seq)

def load_and_clean_data(file_path):
    """加载CSV文件并清洗非法序列"""
    df = pd.read_csv(file_path)
    print(f"读取列名: {df.columns.tolist()}")

    df['Sequence'] = df['Sequence'].str.upper()
    df = df[df['Sequence'].apply(is_valid_Sequence)]
    df = df[df['Sequence'].str.len() >= MIN_LENGTH]  # 过滤长度
    return df

def save_split(df, output_dir, train_ratio=0.8):
    """划分训练/验证集并保存为txt文件"""
    # train_df, val_df = train_test_split(df, train_size=train_ratio, random_state=42, stratify=df['label']) 现在没有label
    train_df, val_df = train_test_split(df, train_size=train_ratio, random_state=42)

    os.makedirs(output_dir, exist_ok=True)

    train_path = os.path.join(output_dir, "train.txt")
    val_path = os.path.join(output_dir, "val.txt")

    train_df.to_csv(train_path, sep='\t', index=False, header=True)
    val_df.to_csv(val_path, sep='\t', index=False, header=True)

    print(f"✅ 数据已成功拆分并保存：\n - {train_path}\n - {val_path}")

if __name__ == "__main__":
    df = load_and_clean_data(INPUT_FILE)
    print(f"共载入 {len(df)} 条合法多肽序列（长度 ≥ {MIN_LENGTH}）。")
    save_split(df, OUTPUT_DIR)



In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np


In [2]:
# 示例多肽数据（你应替换为真实序列）
sequences = ['ACDEFGHIKLMNPQRSTVWY', 'MKTIIALSYIFCLVFAD', 'GAVLIMFWP']

# 获取全部字符（氨基酸）集合
all_chars = sorted(set(''.join(sequences)))
vocab_size = len(all_chars)

# 建立字符与索引的映射
char2idx = {ch: i for i, ch in enumerate(all_chars)}
idx2char = {i: ch for ch, i in char2idx.items()}


In [4]:
print(all_chars)
print(char2idx)

['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
{'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19}


In [5]:
class PeptideDataset(Dataset):
    def __init__(self, sequences, seq_length=20):
        self.data = []
        for seq in sequences:
            for i in range(len(seq) - seq_length):
                input_seq = seq[i:i+seq_length]
                target_seq = seq[i+1:i+seq_length+1]
                self.data.append((input_seq, target_seq))
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_seq, target_seq = self.data[idx]
        x = torch.zeros(self.seq_length, vocab_size)
        y = torch.zeros(self.seq_length, dtype=torch.long)
        for i, ch in enumerate(input_seq):
            x[i][char2idx[ch]] = 1.0
        for i, ch in enumerate(target_seq):
            y[i] = char2idx[ch]
        return x, y


In [6]:
class CharLSTMGenerator(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1):
        super(CharLSTMGenerator, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, hidden=None):
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden


In [8]:
# 超参数
seq_length = 20
hidden_dim = 128
num_layers = 1
batch_size = 16
epochs = 30
learning_rate = 0.003

# 数据加载
dataset = PeptideDataset(sequences, seq_length=seq_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 模型
model = CharLSTMGenerator(vocab_size, hidden_dim, vocab_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


ValueError: num_samples should be a positive integer value, but got num_samples=0