# Импорт зависимостей и установка библиотек



In [None]:
pip install numpy pandas torch torchvision torchaudio biopython matplotlib seaborn optuna


Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!sudo dpkg --configure -a
!sudo apt-get update
!sudo apt-get install -y dssp

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,517 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [3,448 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 http://security.ub

In [None]:
!pip install py3Dmol


Collecting py3Dmol
  Downloading py3Dmol-2.4.2-py2.py3-none-any.whl.metadata (1.9 kB)
Downloading py3Dmol-2.4.2-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: py3Dmol
Successfully installed py3Dmol-2.4.2


In [None]:
import os
import requests
import json
import time
import numpy as np
import torch
import torch.nn as nn
import math
import optuna
import matplotlib.pyplot as plt
from Bio import SeqIO
from Bio.PDB import PDBParser, PDBIO, Select
from Bio.PDB.DSSP import DSSP
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import py3Dmol

# Определение классов (модель, датасет, ранняя остановка), чтобы избежать ошибок с интеграцией Optuna

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to input embeddings.

    The encoding table is computed once for ``max_len`` positions and stored
    as a non-trainable buffer ``pe`` of shape ``(1, max_len, d_model)``.

    Args:
        d_model: Size of the embedding dimension. Both even and odd values
            are supported.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even
        # columns, so the cosine part must be trimmed to fit.
        pe[0, :, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        The table is sliced along dimension 1 and broadcast over dimension 0.
        """
        return x + self.pe[:, :x.size(1)]


class ProteinTransformer(nn.Module):
    """Transformer encoder producing per-position secondary-structure logits.

    Args:
        vocab_size: Number of distinct input token ids (id 0 is the padding
            index of the embedding).
        d_model: Embedding / hidden size.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        num_classes: Size of the output logit vector per position.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead),
            num_layers,
        )
        self.decoder = nn.Linear(d_model, num_classes)

    def forward(self, src):
        """Map a batch of token ids to logits with one vector per position."""
        embedded = self.pos_encoder(self.embedding(src))
        # The encoder is fed with the first two dimensions swapped and its
        # output is swapped back before the linear decoder.
        hidden = self.transformer_encoder(embedded.transpose(0, 1))
        return self.decoder(hidden.transpose(0, 1))


class ProteinDataset(Dataset):
    """Pair encoded sequences with encoded targets and serve them as tensors.

    Args:
        sequences: Indexable collection of integer-encoded sequences.
        targets: Indexable collection of integer-encoded targets, parallel
            to ``sequences``.
    """

    def __init__(self, sequences, targets):
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(sequence, target)`` for ``idx`` as ``torch.long`` tensors."""
        return tuple(
            torch.tensor(source[idx], dtype=torch.long)
            for source in (self.sequences, self.targets)
        )


class EarlyStopping:
    """Signal that training should stop once validation loss stops improving.

    Call the instance with each new validation loss. ``early_stop`` becomes
    ``True`` after ``patience`` consecutive calls without improvement.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        delta: Margin added to the best score before comparing; a new score
            below ``best_score + delta`` counts as no improvement.
    """

    def __init__(self, patience=3, delta=0.0):
        self.patience, self.delta = patience, delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, val_loss):
        # Scores are negated losses, so a larger score is better.
        current = -val_loss
        previous_best = self.best_score

        if previous_best is not None and current < previous_best + self.delta:
            self.counter += 1
            print(f"EarlyStopping: {self.counter}/{self.patience} (no improvement)")
            if self.counter >= self.patience:
                self.early_stop = True
            return

        # The very first call only records the score; later improvements
        # also reset the patience counter.
        if previous_best is not None:
            self.counter = 0
        self.best_score = current


# Код для скачивания и предобработки (UniProt + PDB), извлечения вторичной структуры

In [None]:
def download_uniprot_data(filename="uniprot_sprot.fasta"):
    """Download the reviewed UniProtKB entries as FASTA unless already cached.

    The response is streamed to a temporary ``<filename>.part`` file and
    renamed only after the download finishes, so an interrupted transfer
    never leaves a truncated file that a later call would treat as complete.

    Args:
        filename: Destination path of the FASTA file.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    url = 'https://rest.uniprot.org/uniprotkb/stream?query=(reviewed:true)&format=fasta'
    if os.path.exists(filename):
        print(f"Файл {filename} уже существует, пропускаем.")
        return

    print("Скачивание данных из UniProt...")
    partial_name = f"{filename}.part"
    try:
        # (connect timeout, read timeout) in seconds; stream=True avoids
        # holding the whole response body in memory.
        with requests.get(url, stream=True, timeout=(10, 300)) as response:
            if response.status_code != 200:
                print("Ошибка при скачивании данных:", response.status_code)
                return
            with open(partial_name, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        os.replace(partial_name, filename)
    finally:
        # Only present when the download failed part-way through.
        if os.path.exists(partial_name):
            os.remove(partial_name)
    print(f"Данные успешно скачаны и сохранены в {filename}")

def create_amino_acid_dict():
    """Return a mapping from the 20 amino-acid letters to integer codes 1..20.

    Code 0 is left unassigned here.
    """
    return {
        letter: code
        for code, letter in enumerate('ACDEFGHIKLMNPQRSTVWY', start=1)
    }

def encode_sequence(seq, aa_to_int, max_len=512):
    """Encode ``seq`` as a list of exactly ``max_len`` integer codes.

    Args:
        seq: Iterable of residue symbols.
        aa_to_int: Mapping from symbol to integer code; symbols missing from
            the mapping are encoded as 0.
        max_len: Length of the result; longer inputs are truncated and
            shorter ones are right-padded with 0.

    Returns:
        List of integer codes.
    """
    codes = [aa_to_int.get(symbol, 0) for symbol in seq][:max_len]
    codes.extend([0] * (max_len - len(codes)))
    return codes

def load_and_encode_sequences(fasta_file, max_len=512, limit=None):
    """Read a FASTA file and integer-encode its sequences.

    Args:
        fasta_file: Path of the FASTA file to parse.
        max_len: Fixed length each encoded sequence is padded/truncated to.
        limit: Maximum number of records to read; a falsy value (``None`` or
            0) reads every record.

    Returns:
        List of encoded sequences in file order.
    """
    aa_to_int = create_amino_acid_dict()
    encoded = []
    for index, record in enumerate(SeqIO.parse(fasta_file, "fasta")):
        if limit and index >= limit:
            break
        encoded.append(encode_sequence(str(record.seq), aa_to_int, max_len))
    return encoded

def extract_uniprot_ids(fasta_file, limit=None):
    """Collect the second ``|``-separated field of every FASTA description.

    Args:
        fasta_file: Path of the FASTA file to parse.
        limit: Maximum number of records to read; a falsy value (``None`` or
            0) reads every record.

    Returns:
        List with one entry per record read: the text between the first and
        second ``|`` of the description, or ``None`` when the description
        contains no ``|``.
    """
    ids = []
    for index, record in enumerate(SeqIO.parse(fasta_file, "fasta")):
        if limit and index >= limit:
            break
        fields = record.description.split("|")
        # A description without "|" splits into a single field.
        ids.append(fields[1] if len(fields) > 1 else None)
    return ids

def get_pdb_ids(uniprot_id):
    """Look up PDB entry identifiers that reference a UniProt accession.

    Sends an exact-match text query to the RCSB search service. Any failure
    (network error, timeout, unexpected HTTP status) is reported on stdout
    and yields an empty list, so a batch of lookups is not aborted by a
    single bad request.

    Args:
        uniprot_id: UniProt accession to search for.

    Returns:
        List of identifiers taken from the ``result_set`` of the response;
        empty when there are no hits (HTTP 204) or the request failed.
    """
    # NOTE(review): the query sets no pagination options, so presumably only
    # the service's default first page of hits is returned — confirm.
    url = "https://search.rcsb.org/rcsbsearch/v2/query?json="
    query = {
        "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
                "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
                "operator": "exact_match",
                "value": uniprot_id
            }
        },
        "return_type": "entry"
    }
    try:
        # json= serialises the body and sets the JSON Content-Type header;
        # the timeout keeps a stalled connection from hanging the caller.
        response = requests.post(url, json=query, timeout=30)
    except requests.RequestException as exc:
        print(f"Ошибка при запросе UniProt ID {uniprot_id}: {exc}")
        return []

    if response.status_code == 200:
        data = response.json()
        return [hit['identifier'] for hit in data.get('result_set', [])]
    if response.status_code != 204:
        print(f"Ошибка при запросе UniProt ID {uniprot_id}: {response.status_code}")
    return []

def download_pdb(pdb_id, save_dir='pdb_files'):
    """Download ``<pdb_id>.pdb`` from RCSB into ``save_dir`` unless cached.

    Args:
        pdb_id: PDB entry identifier used to build the download URL and the
            local file name.
        save_dir: Directory for downloaded files; created when missing.

    Returns:
        Path of the local file, or ``None`` when the server did not answer
        with HTTP 200.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_dir, exist_ok=True)
    pdb_file = os.path.join(save_dir, f"{pdb_id}.pdb")
    if os.path.exists(pdb_file):
        print(f"{pdb_id}.pdb уже существует.")
        return pdb_file

    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    r = requests.get(url, timeout=60)
    if r.status_code != 200:
        print(f"Не удалось скачать {pdb_id}, статус:", r.status_code)
        return None

    # Write the raw bytes: decoding to text and re-encoding with the locale
    # default could alter or reject the file contents.
    with open(pdb_file, 'wb') as f:
        f.write(r.content)
    print(f"Файл {pdb_id}.pdb сохранён.")
    return pdb_file

def get_secondary_structure(pdb_file, pdb_id):
    """Run DSSP on the first model of a PDB file and return its SS string.

    Residues of all chains are visited in the order the model yields them;
    residues that DSSP has no entry for are skipped.

    Args:
        pdb_file: Path of the PDB file.
        pdb_id: Identifier given to the parsed structure.

    Returns:
        String of secondary-structure codes ('H', 'E', 'T', ...), or
        ``None`` when DSSP processing raised an exception (the error is
        printed). Parsing errors of the PDB file itself are not caught.
    """
    model = PDBParser().get_structure(pdb_id, pdb_file)[0]
    try:
        dssp = DSSP(model, pdb_file)
        codes = []
        for residue in model.get_residues():
            key = (residue.get_parent().id, residue.get_id())
            if key in dssp:
                # Element 2 of a DSSP record is the secondary-structure code.
                codes.append(dssp[key][2])
        return ''.join(codes)
    except Exception as e:
        print(f"Ошибка при обработке {pdb_file}: {e}")
        return None

# Maps secondary-structure letters onto 4 integer classes.
ss_mapping = {
    'H': 0,  # Alpha helix
    'B': 1,  # Beta
    'E': 1,  # Beta
    'G': 0,  # 3-10 helix
    'I': 0,  # Pi helix
    'T': 2,  # Turn
    'S': 2,  # Bend
    '-': 3   # Coil
}


def encode_secondary_structure(ss_seq, max_len=512, pad_value=3):
    """Encode a secondary-structure string as ``max_len`` integer classes.

    Args:
        ss_seq: Iterable of secondary-structure letters; letters missing
            from ``ss_mapping`` are encoded as 3.
        max_len: Length of the result; longer inputs are truncated and
            shorter ones are right-padded.
        pad_value: Value used for padding. The default 3 keeps the historic
            behaviour, where padding is indistinguishable from the coil
            class; pass a value outside the class range (for example -100)
            to let a loss function ignore padding only.

    Returns:
        List of integer codes.
    """
    codes = [ss_mapping.get(letter, 3) for letter in ss_seq][:max_len]
    codes.extend([pad_value] * (max_len - len(codes)))
    return codes


# Демонстрация предобработки и извлечение вторичной структуры


In [None]:
max_seq_len = 512
sequence_limit = 1000
fasta_file = "uniprot_sprot.fasta"

# Step 1: fetch the UniProt FASTA file (skipped when it is already on disk).
download_uniprot_data(fasta_file)

# Step 2: integer-encode the sequences.
sequences = load_and_encode_sequences(fasta_file, max_len=max_seq_len, limit=sequence_limit)
print("Загружено и закодировано последовательностей:", len(sequences))
print("Пример кодировки:", sequences[0][:30])

# Step 3: pull the UniProt accessions out of the FASTA headers.
uniprot_ids = extract_uniprot_ids(fasta_file, limit=sequence_limit)
print("Количество UniProt ID:", len(uniprot_ids))
print("Примеры:", uniprot_ids[:5])

# Step 4: map UniProt -> PDB for the first 5 accessions only (demo size).
uniprot_to_pdb = {}
for uid in uniprot_ids[:5]:
    if not uid:
        uniprot_to_pdb[uid] = []
        continue
    uniprot_to_pdb[uid] = get_pdb_ids(uid)
    # Short pause between search requests.
    time.sleep(0.1)

print("UniProt -> PDB (первые 5):", uniprot_to_pdb)

# Download only the first PDB entry listed for each accession.
pdb_files = []
for uid, pdb_list in uniprot_to_pdb.items():
    if not pdb_list:
        pdb_files.append((uid, None, None))
        continue
    first_pdb_id = pdb_list[0]
    pdb_files.append((uid, first_pdb_id, download_pdb(first_pdb_id)))

print("Скачанные PDB-файлы:", pdb_files)

# Step 5: extract the secondary structure (demo); None marks missing files.
secondary_structures = [
    get_secondary_structure(path, pdb_id) if path else None
    for _, pdb_id, path in pdb_files
]

# Encode each structure string; entries without a structure stay None.
targets = [
    encode_secondary_structure(ss, max_len=max_seq_len) if ss else None
    for ss in secondary_structures
]

first_target = targets[0]
print("Пример код. вторичной структуры:", first_target[:30] if first_target else None)


Файл uniprot_sprot.fasta уже существует, пропускаем.
Загружено и закодировано последовательностей: 1000
Пример кодировки: [11, 16, 10, 4, 14, 9, 9, 6, 1, 3, 8, 8, 16, 9, 8, 10, 14, 8, 14, 12, 16, 8, 6, 9, 17, 17, 16, 13, 16, 17]
Количество UniProt ID: 1000
Примеры: ['A0A009IHW8', 'A0A023I7E1', 'A0A024B7W1', 'A0A024SC78', 'A0A024SH76']
UniProt -> PDB (первые 5): {'A0A009IHW8': ['7UWG', '7UXU', '8G83'], 'A0A023I7E1': ['4K35', '4K3A', '5XBZ', '5XC2'], 'A0A024B7W1': ['5GOZ', '5GP1', '5H30', '5H32', '5H37', '5IRE', '5IZ7', '5JMT', '5KQR', '5KQS'], 'A0A024SC78': ['4PSC', '4PSD', '4PSE'], 'A0A024SH76': []}
7UWG.pdb уже существует.
4K35.pdb уже существует.
5GOZ.pdb уже существует.
4PSC.pdb уже существует.
Скачанные PDB-файлы: [('A0A009IHW8', '7UWG', 'pdb_files/7UWG.pdb'), ('A0A023I7E1', '4K35', 'pdb_files/4K35.pdb'), ('A0A024B7W1', '5GOZ', 'pdb_files/5GOZ.pdb'), ('A0A024SC78', '4PSC', 'pdb_files/4PSC.pdb'), ('A0A024SH76', None, None)]




Пример код. вторичной структуры: [3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 3]


# Обучение модели + Ранняя остановка


In [None]:
# Keep only the (seq, tgt) pairs whose target is not None.
# zip stops at the shorter input, so sequences beyond len(targets) are unused.
labelled_pairs = [
    (seq, tgt) for seq, tgt in zip(sequences, targets) if tgt is not None
]
filtered_sequences = [seq for seq, _ in labelled_pairs]
filtered_targets = [tgt for _, tgt in labelled_pairs]

print("Всего пар для обучения:", len(filtered_sequences))

# Build the Dataset, then split it 80/20 into train and validation parts.
ds = ProteinDataset(filtered_sequences, filtered_targets)
train_size = int(0.8 * len(ds))
val_size = len(ds) - train_size
train_ds, val_ds = torch.utils.data.random_split(ds, [train_size, val_size])
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)

model = ProteinTransformer(
    vocab_size=21,
    d_model=128,
    nhead=8,
    num_layers=2,
    num_classes=4,
)
# NOTE(review): ss_mapping encodes '-' (coil) as 3 and targets are padded
# with 3 as well, so ignore_index=3 drops real coil residues from the loss
# together with the padding.
criterion = nn.CrossEntropyLoss(ignore_index=3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
early_stopper = EarlyStopping(patience=3)

def evaluate_model(model, loader, criterion):
    """Return the mean per-batch loss of ``model`` over ``loader``.

    The model is switched to evaluation mode and left in it; gradients are
    not tracked during the pass.

    Args:
        model: Callable module producing logits whose last dimension is the
            class dimension.
        loader: Sized iterable of ``(inputs, targets)`` batches.
        criterion: Loss taking ``(N, num_classes)`` logits and ``(N,)``
            targets.

    Returns:
        Sum of the batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: If ``loader`` yields no batches.
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for seq_batch, tgt_batch in loader:
            outputs = model(seq_batch)
            # Take the class count from the logits instead of hard-coding 4;
            # reshape also accepts non-contiguous tensors.
            logits = outputs.reshape(-1, outputs.size(-1))
            loss = criterion(logits, tgt_batch.reshape(-1))
            total_loss += loss.item()
    return total_loss / len(loader)

num_epochs = 5
for epoch in range(num_epochs):
    # evaluate_model leaves the model in eval mode, so re-enable training
    # mode at the start of every epoch.
    model.train()
    running_loss = 0
    for seq_batch, tgt_batch in train_loader:
        optimizer.zero_grad()
        logits = model(seq_batch)
        # Flatten to (positions, 4 classes) against flat targets.
        loss = criterion(logits.view(-1, 4), tgt_batch.view(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    val_loss = evaluate_model(model, val_loader, criterion)
    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Let the stopper record the validation loss and decide whether to halt.
    early_stopper(val_loss)
    if early_stopper.early_stop:
        print("Early stopping triggered!")
        break

print("Обучение завершено.")


Всего пар для обучения: 4




Epoch 1/5 | Train Loss: 1.6404, Val Loss: 1.1245
Epoch 2/5 | Train Loss: 1.4340, Val Loss: 1.1067
Epoch 3/5 | Train Loss: 1.0611, Val Loss: 1.3331
EarlyStopping: 1/3 (no improvement)
Epoch 4/5 | Train Loss: 1.1657, Val Loss: 1.2027
EarlyStopping: 2/3 (no improvement)
Epoch 5/5 | Train Loss: 1.1056, Val Loss: 1.0294
Обучение завершено.


# Пример использования Optuna для подбора гиперпараметров


In [None]:
def objective(trial):
    """Optuna objective: train a ProteinTransformer briefly, return val loss.

    Samples ``d_model``, ``nhead``, ``lr`` and ``num_layers`` from the trial,
    trains for at most 5 epochs with early stopping (patience 2) on the
    module-level dataset ``ds``, and returns the validation loss of the last
    epoch that ran.

    The train/validation split uses a fixed seed so that every trial is
    scored on the same validation subset; otherwise the differences between
    trials would partly reflect the split rather than the hyperparameters.

    Args:
        trial: Optuna trial used to sample hyperparameters.

    Returns:
        Validation loss after the final training epoch.
    """
    d_model = trial.suggest_categorical("d_model", [64, 128])
    nhead = trial.suggest_categorical("nhead", [4, 8])
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    num_layers = trial.suggest_int("num_layers", 1, 3)

    num_classes = 4
    model = ProteinTransformer(
        vocab_size=21,
        d_model=d_model,
        nhead=nhead,
        num_layers=num_layers,
        num_classes=num_classes
    )
    criterion = nn.CrossEntropyLoss(ignore_index=3)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Reuse the same ds, with an identical split in every trial.
    train_size = int(0.8 * len(ds))
    val_size = len(ds) - train_size
    split_rng = torch.Generator().manual_seed(0)
    train_ds, val_ds = torch.utils.data.random_split(
        ds, [train_size, val_size], generator=split_rng
    )
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)

    stopper = EarlyStopping(patience=2)
    max_epochs = 5
    for ep in range(max_epochs):
        model.train()
        for seq_batch, tgt_batch in train_loader:
            optimizer.zero_grad()
            out = model(seq_batch)
            loss = criterion(out.view(-1, num_classes), tgt_batch.view(-1))
            loss.backward()
            optimizer.step()

        val_loss = evaluate_model(model, val_loader, criterion)
        stopper(val_loss)
        if stopper.early_stop:
            break

    return val_loss

# Minimise the validation loss returned by objective over three trials.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=3)
for label, value in (
    ("Best params:", study.best_params),
    ("Best val_loss:", study.best_value),
):
    print(label, value)


[I 2024-12-31 17:18:58,502] A new study created in memory with name: no-name-e7bba648-6f4e-4d2f-a63c-025b0f02f483
[I 2024-12-31 17:18:59,485] Trial 0 finished with value: 1.4708925485610962 and parameters: {'d_model': 64, 'nhead': 4, 'lr': 0.00014421246209620473, 'num_layers': 1}. Best is trial 0 with value: 1.4708925485610962.


EarlyStopping: 1/2 (no improvement)


[I 2024-12-31 17:19:01,483] Trial 1 finished with value: 1.3603652715682983 and parameters: {'d_model': 64, 'nhead': 8, 'lr': 0.00015347248718592508, 'num_layers': 2}. Best is trial 1 with value: 1.3603652715682983.


EarlyStopping: 2/2 (no improvement)


[I 2024-12-31 17:19:03,163] Trial 2 finished with value: 1.0838634967803955 and parameters: {'d_model': 128, 'nhead': 4, 'lr': 0.00012578553015630815, 'num_layers': 1}. Best is trial 2 with value: 1.0838634967803955.


Best params: {'d_model': 128, 'nhead': 4, 'lr': 0.00012578553015630815, 'num_layers': 1}
Best val_loss: 1.0838634967803955


# Визуализация структуры через py3Dmol


In [None]:
class SecondaryStructureSelector(Select):
    """Write secondary-structure symbols into the B-factor column on save.

    Residues are consumed in the order ``accept_residue`` is called. The
    i-th accepted residue receives ``ord(ss_seq[i])`` as the B-factor of
    each of its atoms, and residues beyond ``len(ss_seq)`` are rejected, so
    they are omitted from the output.

    Args:
        ss_seq: String of secondary-structure symbols (H, E, ...), one per
            residue to keep.
    """

    def __init__(self, ss_seq):
        super().__init__()
        self.ss_seq = ss_seq
        self.counter = 0

    def accept_residue(self, residue):
        """Tag ``residue`` with the next symbol; reject it once none remain."""
        if self.counter >= len(self.ss_seq):
            return False
        code = ord(self.ss_seq[self.counter])
        self.counter += 1
        # Kept for backward compatibility with code reading residue.bfactor.
        residue.bfactor = code
        # B-factors are stored per atom, so the value has to be set on the
        # atoms for it to reach the saved file.
        for atom in residue.get_unpacked_list():
            atom.set_bfactor(code)
        return True

# Example predicted_ss and true_ss strings (placeholders, not model output).
predicted_ss = "HHHHHHHHHH"  # dummy
true_ss = "HHHHEEEECC"

# Use the first downloaded PDB file, if there is one at all.
pdb_file = pdb_files[0][2] if pdb_files and pdb_files[0][2] else None
if pdb_file:
    parser = PDBParser()
    structure = parser.get_structure("pdbid", pdb_file)

    io = PDBIO()
    io.set_structure(structure)
    io.save("predicted_structure.pdb", SecondaryStructureSelector(predicted_ss.replace('-', 'C')))
    io.set_structure(structure)
    io.save("true_structure.pdb", SecondaryStructureSelector(true_ss.replace('-', 'C')))

    with open("true_structure.pdb", "r") as f:
        true_pdb = f.read()
    with open("predicted_structure.pdb", "r") as f:
        pred_pdb = f.read()

    # The selector stores ord(symbol) in the B-factor column, so the colour
    # map is keyed by those ordinals: helices blue, strands red, turns and
    # bends yellow, coil green.
    ss_colors = {}
    for letters, color in (('HGI', 'blue'), ('EB', 'red'),
                           ('TS', 'yellow'), ('C', 'green')):
        for letter in letters:
            ss_colors[ord(letter)] = color

    view = py3Dmol.view(width=800, height=600)
    view.addModel(true_pdb, 'pdb')
    view.setStyle({'model': 0}, {'cartoon': {'color': 'grey', 'opacity': 0.5}})
    view.addModel(pred_pdb, 'pdb')
    view.setStyle({'model': 1}, {
        'cartoon': {'colorscheme': {
            'prop': 'b',
            'map': ss_colors
        }}
    })
    view.setBackgroundColor('0xFFFFFF')
    view.zoomTo()
    view.show()
else:
    print("Нет PDB-файла для визуализации.")


